Deep learning for Bulldozers

In [ ]:

%load_ext autoreload
%autoreload 2

In [ ]:

%matplotlib inline

from fastai.imports import *
from fastai.torch_imports import *
from fastai.dataset import *
from fastai.learner import *
from fastai.structured import *
from fastai.column_data import *

Load in our data from last lesson

In [ ]:

# Name of the dependent (target) column used throughout the notebook.
dep = 'SalePrice'
# Path handed to ColumnarModelData.from_data_frame further down.
PATH = "data/bulldozers/"
# Both artifacts are read from a tmp/ directory relative to the working directory (not from PATH).
df_raw = pd.read_feather('tmp/bulldozers-raw')
# NOTE(review): if keep_cols.npy stores an object-dtype array, recent NumPy releases
# refuse to load it without allow_pickle=True — confirm how the file was saved.
keep_cols = list(np.load('tmp/keep_cols.npy'))

In [ ]:

# Raise every YearMade below 1950 up to 1950, then derive the machine's age at sale.
df_raw['YearMade'] = df_raw.YearMade.clip(lower=1950)
df_raw['age'] = df_raw['saleYear'] - df_raw['YearMade']

# Narrow the frame to the retained columns plus the new 'age' feature and the target.
# NOTE(review): once this runs, any column outside keep_cols (the printed field lists
# suggest that includes saleYear) is gone from df_raw, so re-running this cell without
# reloading df_raw would fail — confirm.
df_raw = df_raw.loc[:, keep_cols + ['age', dep]].copy()
df_indep = df_raw.drop(columns=[dep])

# Size of the validation hold-out and of the remaining training portion.
n_valid = 12000
n_trn = len(df_raw) - n_valid

In [ ]:

# Candidate categorical fields: independent columns whose number of distinct values
# is below n_trn/50. Column order of df_indep is preserved.
cat_flds = [col for col, n_uniq in df_raw[df_indep.columns].nunique().items()
            if n_uniq < n_trn/50]
' '.join(cat_flds)

Out[ ]:

'YearMade Coupler_System ProductSize fiProductClassDesc ModelID saleElapsed fiSecondaryDesc Enclosure fiModelDesc Hydraulics_Flow fiModelDescriptor Hydraulics Drive_System ProductGroupDesc ProductGroup state saleDay Track_Type saleDayofyear Stick_Length age'

In [ ]:

# These fields passed the low-cardinality test above but are kept out of the
# categorical list (they end up in cont_flds instead). Filtering into a new list,
# rather than calling list.remove in a loop, keeps this cell re-runnable: a second
# execution no longer raises ValueError for names that are already gone.
cat_flds = [n for n in cat_flds
            if n not in ('saleElapsed', 'saleDayofyear', 'saleDay', 'age', 'YearMade')]
# Sanity check: every column that is NOT categorical must already be numeric.
# An empty list is the expected result.
[n for n in df_indep.drop(cat_flds,axis=1).columns if not is_numeric_dtype(df_raw[n])]

Out[ ]:

[]

In [ ]:

# Convert each categorical field to an ordered pandas categorical dtype.
for fld in cat_flds:
    df_raw[fld] = df_raw[fld].astype('category').cat.as_ordered()

# Every independent column that is not categorical is treated as continuous.
cont_flds = [fld for fld in df_indep.columns if fld not in cat_flds]
' '.join(cont_flds)

Out[ ]:

'YearMade saleElapsed SalesID MachineID saleDay saleDayofyear age'

In [ ]:

# Reorder columns: categorical fields first, then continuous fields, then the target.
df_raw = df_raw[cat_flds+cont_flds+[dep]]
# Pass `dep` instead of repeating the literal column name so the target is defined in
# exactly one place. nas and mapper are returned by proc_df but not used in the cells shown here.
df, y, nas, mapper = proc_df(df_raw, dep, do_scale=True)

# Row positions from n_trn to the end of the frame are held out for validation.
val_idx = list(range(n_trn, len(df)))

In [ ]:

# Build the model-data object from the processed frame and target with batch size 64.
# NOTE(review): presumably val_idx selects the validation rows and cat_flds names the
# columns to embed — confirm against ColumnarModelData.from_data_frame.
md = ColumnarModelData.from_data_frame(PATH, val_idx, df, y, cat_flds=cat_flds, bs=64)

In [ ]:

# Peek at the first rows of the frame returned by proc_df.
df.head()

Out[ ]:

	Coupler_System	ProductSize	fiProductClassDesc	ModelID	fiSecondaryDesc	Enclosure	fiModelDesc	Hydraulics_Flow	Hydraulics	...	state	YearMade	saleElapsed	SalesID	MachineID	saleDay	saleDayofyear	age
0	0	0	59	644	41	3	950	0	1	...	1	0.913196	0.397377	-0.858580	-0.496185	-0.013101	1.352092	-0.828814
1	0	4	62	11	55	3	1725	0	1	...	33	0.405756	-0.061496	-0.858578	-2.494936	1.173518	-0.907472	-0.430749
2	1	0	39	1542	0	6	331	3	4	...	32	0.722906	-0.075286	-0.858577	-1.775759	1.173518	-1.187503	-0.762470
3	0	6	8	110	0	3	3674	0	1	...	44	0.722906	1.179600	-0.858574	-0.434096	0.342885	-0.395690	-0.298060
4	1	0	40	3540	0	1	4208	3	4	...	32	1.103486	0.863382	-0.858572	-0.364020	0.817532	0.231967	-0.828814

5 rows × 23 columns

Model

In [ ]:

def rmse(x, y):
    """Return the root-mean-squared difference between `x` and `y`.

    The element-wise difference is squared, averaged with `.mean()`, and the
    square root of that average is returned as a Python float.
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

In [ ]:

# Per categorical field: its number of categories plus one.
# NOTE(review): the extra slot is presumably reserved for missing/unknown values — confirm.
emb_c = {fld: len(df_raw[fld].cat.categories) + 1 for fld in cat_flds}
emb_c

Out[ ]:

{'Coupler_System': 3,
 'Drive_System': 5,
 'Enclosure': 7,
 'Hydraulics': 13,
 'Hydraulics_Flow': 4,
 'ModelID': 5219,
 'ProductGroup': 7,
 'ProductGroupDesc': 7,
 'ProductSize': 7,
 'Stick_Length': 30,
 'Track_Type': 3,
 'fiModelDesc': 5000,
 'fiModelDescriptor': 140,
 'fiProductClassDesc': 75,
 'fiSecondaryDesc': 176,
 'state': 54}

In [ ]:

# One (cardinality, embedding width) pair per categorical field. Iterating cat_flds
# explicitly (instead of relying on dict iteration order) keeps the pairs in the same
# order as the categorical columns handed to ColumnarModelData. The width is half the
# cardinality, rounded up, capped at 50.
emb_szs = [(emb_c[n], min(50, (emb_c[n]+1)//2)) for n in cat_flds]
# NOTE(review): this list is not passed to any of the m.fit calls visible in this notebook.
metrics=[rmse]

In [ ]:

# Output range given to the learner: from 0 up to 20% above the largest target value.
y_range = (0, 1.2 * np.max(y))

In [ ]:

# Create the learner from the embedding sizes and the number of continuous fields.
# NOTE(review): the remaining positional arguments are presumably embedding dropout (0.05),
# output size (1), hidden layer sizes ([500,250]) and per-layer dropout ([0.5,0.05]) —
# confirm against the get_learner signature.
m = md.get_learner(emb_szs, len(cont_flds), 0.05, 1, [500,250], [0.5,0.05],
                   y_range=y_range, use_bn=True)

In [ ]:

# Run the learning-rate finder; its recorded schedule is plotted in the next cell via m.sched.
m.lr_find()

A Jupyter Widget

 63%|██████▎   | 3812/6081 [00:28<00:14, 154.49it/s, loss=0.202]

In [ ]:

# Plot what the learning-rate finder recorded.
# NOTE(review): 1300 is presumably the number of initial iterations to skip — confirm
# against the scheduler's plot signature.
m.sched.plot(1300)

 63%|██████▎   | 3812/6081 [00:40<00:23, 95.21it/s, loss=0.314]

In [ ]:

# Hyperparameters passed to the m.fit calls below: lr first, wd third
# (wd is presumably the weight decay — confirm).
lr = 1e-3
wd = 1e-7

In [ ]:

# First training run with the lr and wd chosen above.
# NOTE(review): presumably 2 cycles starting at 1 epoch and doubling each cycle — confirm
# against fit. The metrics list defined earlier is not passed here, which would explain
# why only three columns are printed.
m.fit(lr, 2, wd, cycle_len=1, cycle_mult=2)

A Jupyter Widget

[ 0.       0.06207  0.09731]                                     
[ 1.       0.06048  0.07684]                                     
[ 2.       0.05326  0.06389]

In [ ]:

# Continue training the same learner, this time with cycle_len=2.
# NOTE(review): presumably 2 cycles of 2 and then 4 epochs — confirm against fit.
m.fit(lr, 2, wd, cycle_len=2, cycle_mult=2)

A Jupyter Widget

[ 0.       0.05471  0.0523 ]                                     
[ 1.       0.04767  0.0512 ]                                     
[ 2.       0.05249  0.05747]                                     
[ 3.       0.04643  0.05393]                                     
[ 4.       0.04984  0.04934]                                     
[ 5.       0.04277  0.04869]

In [ ]:

# 0.0487 is typed in by hand from the last row of the training log above (0.04869, rounded);
# it is not read from the learner, so it goes stale if training is re-run.
# NOTE(review): presumably that value is a mean-squared-error validation loss, which would
# make this square root an RMSE — confirm.
math.sqrt(0.0487)

Out[ ]:

0.22068076490713912