%load_ext autoreload
%autoreload 2
%matplotlib inline
from fastai.imports import *
from fastai.torch_imports import *
from fastai.dataset import *
from fastai.learner import *
from fastai.structured import *
from fastai.column_data import *
# Name of the target column, and the directory later handed to
# ColumnarModelData.from_data_frame.
dep = 'SalePrice'
PATH = "data/bulldozers/"

# Load the cached raw frame and the saved list of columns to keep.
df_raw = pd.read_feather('tmp/bulldozers-raw')
keep_cols = list(np.load('tmp/keep_cols.npy'))

# Floor YearMade at 1950, then derive `age` as sale year minus the
# (floored) year of manufacture.
df_raw.loc[df_raw['YearMade'] < 1950, 'YearMade'] = 1950
df_raw['age'] = df_raw['saleYear'] - df_raw['YearMade']

# Keep only the selected predictors, the new `age` feature and the target.
df_raw = df_raw[keep_cols + ['age', dep]].copy()
df_indep = df_raw.drop(dep, axis=1)

# Row counts: the last n_valid rows are used as the validation indices
# further down; everything before them is training data.
n_valid = 12000
n_trn = len(df_raw) - n_valid

# A predictor is a candidate categorical when its number of distinct values
# is below 1/50th of the training-row count.
cat_flds = [col for col in df_indep.columns
            if df_raw[col].nunique() < n_trn / 50]
' '.join(cat_flds)
'YearMade Coupler_System ProductSize fiProductClassDesc ModelID saleElapsed fiSecondaryDesc Enclosure fiModelDesc Hydraulics_Flow fiModelDescriptor Hydraulics Drive_System ProductGroupDesc ProductGroup state saleDay Track_Type saleDayofyear Stick_Length age'
# Take these fields back out of the categorical list; any column not in
# cat_flds ends up in cont_flds below.
for o in ('saleElapsed', 'saleDayofyear', 'saleDay', 'age', 'YearMade'):
    cat_flds.remove(o)

# Sanity check: list every non-categorical predictor whose dtype is not
# numeric (the recorded output is an empty list).
[col for col in df_indep.drop(cat_flds, axis=1).columns
 if not is_numeric_dtype(df_raw[col])]
[]
# Re-type each categorical field as an ordered pandas categorical.
for n in cat_flds:
    df_raw[n] = df_raw[n].astype('category').cat.as_ordered()

# Continuous fields are the remaining predictors, in their original column
# order.
cont_flds = [col for col in df_indep.columns if col not in cat_flds]
' '.join(cont_flds)
'YearMade saleElapsed SalesID MachineID saleDay saleDayofyear age'
# Reorder columns: categoricals first, then continuous fields, then the target.
df_raw = df_raw[cat_flds + cont_flds + [dep]]

# Pass `dep` instead of repeating the 'SalePrice' literal so the target
# column name is defined in exactly one place.
df, y, nas, mapper = proc_df(df_raw, dep, do_scale=True)

# Validation rows are those at positions n_trn .. len(df)-1.
val_idx = list(range(n_trn, len(df)))
md = ColumnarModelData.from_data_frame(PATH, val_idx, df, y,
                                       cat_flds=cat_flds, bs=64)
df.head()
 | Coupler_System | ProductSize | fiProductClassDesc | ModelID | fiSecondaryDesc | Enclosure | fiModelDesc | Hydraulics_Flow | fiModelDescriptor | Hydraulics | ... | state | Track_Type | Stick_Length | YearMade | saleElapsed | SalesID | MachineID | saleDay | saleDayofyear | age |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 59 | 644 | 41 | 3 | 950 | 0 | 0 | 1 | ... | 1 | 0 | 0 | 0.913196 | 0.397377 | -0.858580 | -0.496185 | -0.013101 | 1.352092 | -0.828814 |
1 | 0 | 4 | 62 | 11 | 55 | 3 | 1725 | 0 | 0 | 1 | ... | 33 | 0 | 0 | 0.405756 | -0.061496 | -0.858578 | -2.494936 | 1.173518 | -0.907472 | -0.430749 |
2 | 1 | 0 | 39 | 1542 | 0 | 6 | 331 | 3 | 0 | 4 | ... | 32 | 0 | 0 | 0.722906 | -0.075286 | -0.858577 | -1.775759 | 1.173518 | -1.187503 | -0.762470 |
3 | 0 | 6 | 8 | 110 | 0 | 3 | 3674 | 0 | 0 | 1 | ... | 44 | 0 | 0 | 0.722906 | 1.179600 | -0.858574 | -0.434096 | 0.342885 | -0.395690 | -0.298060 |
4 | 1 | 0 | 40 | 3540 | 0 | 1 | 4208 | 3 | 0 | 4 | ... | 32 | 0 | 0 | 1.103486 | 0.863382 | -0.858572 | -0.364020 | 0.817532 | 0.231967 | -0.828814 |
5 rows × 23 columns
def rmse(x, y):
    """Return the root-mean-squared difference between `x` and `y`.

    The element-wise difference must support ``** 2`` and expose a
    ``.mean()`` method (e.g. numpy arrays). The result is a plain float.
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())
# Per categorical field: its number of categories plus one.
# NOTE(review): the extra slot is presumably reserved for missing/unknown
# values — confirm against how proc_df encodes categoricals.
emb_c = {name: len(df_raw[name].cat.categories) + 1 for name in cat_flds}
emb_c
{'Coupler_System': 3, 'Drive_System': 5, 'Enclosure': 7, 'Hydraulics': 13, 'Hydraulics_Flow': 4, 'ModelID': 5219, 'ProductGroup': 7, 'ProductGroupDesc': 7, 'ProductSize': 7, 'Stick_Length': 30, 'Track_Type': 3, 'fiModelDesc': 5000, 'fiModelDescriptor': 140, 'fiProductClassDesc': 75, 'fiSecondaryDesc': 176, 'state': 54}
# One (cardinality, embedding width) pair per categorical field, in emb_c's
# order; the width is half the cardinality rounded up, capped at 50.
emb_szs = [(card, min(50, (card + 1) // 2)) for card in emb_c.values()]

# NOTE(review): `metrics` is assigned here but is not passed to get_learner
# or to the fit calls visible in this notebook — confirm whether intended.
metrics = [rmse]

# Range handed to the learner: 0 up to 20% above the largest target value.
y_range = (0, np.max(y) * 1.2)

# NOTE(review): the positional arguments after len(cont_flds) are presumably
# (emb_drop, out_sz, szs, drops) — confirm against the installed fastai.
m = md.get_learner(
    emb_szs, len(cont_flds), 0.05, 1, [500, 250], [0.5, 0.05],
    y_range=y_range, use_bn=True,
)
m.lr_find()
A Jupyter Widget
63%|██████▎ | 3812/6081 [00:28<00:14, 154.49it/s, loss=0.202]
# Plot from the learner's scheduler after the lr_find() run above.
# NOTE(review): 1300 is presumably the number of initial iterations to skip
# in the plot — confirm against the fastai version in use.
m.sched.plot(1300)
63%|██████▎ | 3812/6081 [00:40<00:23, 95.21it/s, loss=0.314]
# Learning rate and weight decay, reused by both fit calls.
lr = 1e-3
wd = 1e-7

# First training run: 2 cycles with cycle_len=1 and cycle_mult=2 (the
# recorded output shows three epochs).
m.fit(lr, 2, wd, cycle_len=1, cycle_mult=2)
A Jupyter Widget
[ 0. 0.06207 0.09731] [ 1. 0.06048 0.07684] [ 2. 0.05326 0.06389]
# Second training run on the same learner: 2 cycles with cycle_len=2 and
# cycle_mult=2 (the recorded output shows six epochs).
m.fit(lr, 2, wd, cycle_len=2, cycle_mult=2)
A Jupyter Widget
[ 0. 0.05471 0.0523 ] [ 1. 0.04767 0.0512 ] [ 2. 0.05249 0.05747] [ 3. 0.04643 0.05393] [ 4. 0.04984 0.04934] [ 5. 0.04277 0.04869]
# Square root of the last value printed for the final epoch above
# (0.04869, rounded to 0.0487).
# NOTE(review): presumably converts a validation MSE loss into an RMSE —
# confirm which loss function the learner uses.
math.sqrt(0.0487)
0.22068076490713912