from fastai.tabular import *
Tabular data should be in a Pandas `DataFrame`.
Tabular数据应该存放在Pandas的`DataFrame`里。
# Obtain the ADULT_SAMPLE dataset location and load its CSV file.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')

# Column roles: the label column, the categorical inputs, the continuous inputs.
dep_var = 'salary'
cat_names = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
]
cont_names = [
    'age',
    'fnlwgt',
    'education-num',
]

# Preprocessing steps handed to the main TabularList below.
procs = [FillMissing, Categorify, Normalize]

# Rows 800..999 are held out for validation; the same rows are also
# wrapped (as a copy) in a separate TabularList that becomes the test set.
valid_idx = list(range(800, 1000))
test = TabularList.from_df(
    df.iloc[800:1000].copy(),
    path=path,
    cat_names=cat_names,
    cont_names=cont_names,
)

# Build the DataBunch one step at a time: items -> split -> labels -> test set.
all_items = TabularList.from_df(
    df,
    path=path,
    cat_names=cat_names,
    cont_names=cont_names,
    procs=procs,
)
split_items = all_items.split_by_idx(valid_idx)
labelled_items = split_items.label_from_df(cols=dep_var)
with_test = labelled_items.add_test(test)
data = with_test.databunch()

# Display ten rows of one batch as a sanity check.
data.show_batch(rows=10)
workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | target |
---|---|---|---|---|---|---|---|---|---|---|
Private | HS-grad | Never-married | Sales | Not-in-family | White | False | -1.2158 | 1.1004 | -0.4224 | <50k |
? | HS-grad | Widowed | ? | Not-in-family | White | False | 1.8627 | 0.0976 | -0.4224 | <50k |
Self-emp-not-inc | HS-grad | Never-married | Craft-repair | Own-child | Black | False | 0.0303 | 0.2092 | -0.4224 | <50k |
Private | HS-grad | Married-civ-spouse | Protective-serv | Husband | White | False | 1.5695 | -0.5938 | -0.4224 | <50k |
Private | HS-grad | Married-civ-spouse | Handlers-cleaners | Husband | White | False | -0.9959 | -0.0318 | -0.4224 | <50k |
Private | 10th | Married-civ-spouse | Farming-fishing | Wife | White | False | -0.7027 | 0.6071 | -1.5958 | <50k |
Private | HS-grad | Married-civ-spouse | Machine-op-inspct | Husband | White | False | 0.1036 | -0.0968 | -0.4224 | <50k |
Private | Some-college | Married-civ-spouse | Exec-managerial | Own-child | White | False | -0.7760 | -0.6653 | -0.0312 | >=50k |
State-gov | Some-college | Never-married | Tech-support | Own-child | White | False | -0.8493 | -1.4959 | -0.0312 | <50k |
Private | 11th | Never-married | Machine-op-inspct | Not-in-family | White | False | -1.0692 | -0.9516 | -1.2046 | <50k |
# Create a learner for `data`; layers=[200, 100] and the accuracy metric
# are passed straight through to tabular_learner.
learn = tabular_learner(
    data,
    layers=[200, 100],
    metrics=accuracy,
)
# Train for a single epoch with a learning rate of 1e-2.
learn.fit(epochs=1, lr=1e-2)
epoch | train_loss | valid_loss | accuracy |
---|---|---|---|
1 | 0.354604 | 0.378520 | 0.820000 |
# Take the first row of the DataFrame and ask the learner for a prediction on it.
row = df.iloc[0]
learn.predict(item=row)
(Category >=50k, tensor(1), tensor([0.4402, 0.5598]))