from fastai.tabular import * # Quick access to tabular functionality
Tabular data should be in a Pandas DataFrame.
# Resolve the ADULT_SAMPLE dataset location via untar_data, then read its CSV.
path = untar_data(URLs.ADULT_SAMPLE)
csv_path = path/'adult.csv'
df = pd.read_csv(csv_path)
# Show the distinct values of the 'salary' column (used as the target below).
df['salary'].unique()
array(['>=50k', '<50k'], dtype=object)
# Import the helpers from fastai.utils.mem
from fastai.utils.mem import *
# Other function test: call gpu_with_max_free_mem() and display its result.
# NOTE(review): the result is a 2-tuple; presumably (GPU id, free memory) —
# confirm against the fastai.utils.mem documentation.
gpu_with_max_free_mem()
(0, 7812)
# Preview the first rows of the DataFrame.
# NOTE(review): the original comment said "test reduce_mem_usage(df)", but
# reduce_mem_usage is never called here — confirm whether that call is missing.
df.head()
age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | salary | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 49 | Private | 101320 | Assoc-acdm | 12.0 | Married-civ-spouse | NaN | Wife | White | Female | 0 | 1902 | 40 | United-States | >=50k |
1 | 44 | Private | 236746 | Masters | 14.0 | Divorced | Exec-managerial | Not-in-family | White | Male | 10520 | 0 | 45 | United-States | >=50k |
2 | 38 | Private | 96185 | HS-grad | NaN | Divorced | NaN | Unmarried | Black | Female | 0 | 0 | 32 | United-States | <50k |
3 | 38 | Self-emp-inc | 112847 | Prof-school | 15.0 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | United-States | >=50k |
4 | 42 | Self-emp-not-inc | 82297 | 7th-8th | NaN | Married-civ-spouse | Other-service | Wife | Black | Female | 0 | 0 | 50 | United-States | <50k |
# Column roles: the target, the categorical inputs and the continuous inputs.
dep_var = 'salary'
cat_names = [
    'workclass', 'education', 'marital-status',
    'occupation', 'relationship', 'race',
]
cont_names = ['age', 'fnlwgt', 'education-num']

# Preprocessing steps handed to the training TabularList.
procs = [FillMissing, Categorify, Normalize]

# Rows 800-999 are used as the validation split and are also passed in as the
# test set via add_test.
valid_idx = list(range(800, 1000))
col_kwargs = dict(path=path, cat_names=cat_names, cont_names=cont_names)

test = TabularList.from_df(df.iloc[800:1000].copy(), **col_kwargs)

# Data block pipeline: items -> split -> label -> attach test set -> DataBunch.
data = (
    TabularList.from_df(df, procs=procs, **col_kwargs)
    .split_by_idx(valid_idx)
    .label_from_df(cols=dep_var)
    .add_test(test)
    .databunch()
)

# Display ten rows of a batch.
data.show_batch(rows=10)
workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | target |
---|---|---|---|---|---|---|---|---|---|---|
Private | Bachelors | Never-married | Exec-managerial | Not-in-family | White | False | -1.1425 | -0.9280 | 1.1422 | <50k |
Private | HS-grad | Never-married | Other-service | Not-in-family | White | False | -0.5561 | 0.7244 | -0.4224 | <50k |
Private | Some-college | Never-married | Other-service | Own-child | White | False | -1.5090 | -0.1673 | -0.0312 | <50k |
Private | HS-grad | Divorced | Adm-clerical | Other-relative | Amer-Indian-Eskimo | False | 1.2763 | -0.8370 | -0.4224 | <50k |
Local-gov | Bachelors | Divorced | Transport-moving | Not-in-family | White | False | 0.2502 | -1.3617 | 1.1422 | <50k |
Private | Some-college | Married-civ-spouse | Prof-specialty | Husband | White | False | -0.8493 | -0.3286 | -0.0312 | >=50k |
Private | 11th | Never-married | Prof-specialty | Own-child | White | False | -1.5090 | 0.8521 | -1.2046 | <50k |
Private | 7th-8th | Married-civ-spouse | Tech-support | Husband | White | False | -0.2629 | 0.0550 | -2.3781 | <50k |
Private | HS-grad | Never-married | Transport-moving | Not-in-family | White | False | -0.8493 | -0.3286 | -0.4224 | <50k |
Private | Bachelors | Never-married | Adm-clerical | Not-in-family | White | False | -0.7760 | 1.3159 | 1.1422 | <50k |
# Create a tabular learner on `data` with layers=[200, 100], reporting accuracy.
learn = tabular_learner(data, layers=[200, 100], metrics=accuracy)
# Train for one epoch with a learning rate of 1e-2.
learn.fit(epochs=1, lr=1e-2)
epoch | train_loss | valid_loss | accuracy | time |
---|---|---|---|---|
0 | 0.357122 | 0.381649 | 0.790000 | 00:02 |
# Take the first row of the original DataFrame and run it through the learner.
row = df.iloc[0]
# The displayed result is a 3-tuple: a Category, a scalar tensor and a tensor of
# two values (presumably predicted class, class index and per-class
# probabilities — confirm against the Learner.predict documentation).
learn.predict(row)
(Category >=50k, tensor(1), tensor([0.3581, 0.6419]))