%load_ext autoreload
%autoreload 2
%matplotlib inline
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics
set_plot_sizes(12,14,16)
PATH = "data/bulldozers/"
df_raw = pd.read_feather('tmp/bulldozers-raw')
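(This feather file was saved at the end of the previous lesson, with something along the lines of df_raw.to_feather('tmp/bulldozers-raw'); if it's missing, re-run that notebook first.)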
df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice')
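proc_df (from fastai.structured) pops off the dependent variable, median-fills numeric missing values (recording the medians in nas, plus *_na indicator columns), and replaces every categorical column with its category codes. A toy illustration of the numericalization step - an assumption about the behaviour, not proc_df's source:
demo = pd.DataFrame({'UsageBand': pd.Categorical(['Low', 'High', None, 'Medium'])})
demo['UsageBand'] = demo.UsageBand.cat.codes + 1  # NaN -> 0, categories -> 1..n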
def split_vals(a,n): return a[:n], a[n:]
n_valid = 12000
n_trn = len(df_trn)-n_valid
X_train, X_valid = split_vals(df_trn, n_trn)
y_train, y_valid = split_vals(y_trn, n_trn)
raw_train, raw_valid = split_vals(df_raw, n_trn)
def rmse(x,y): return math.sqrt(((x-y)**2).mean())  # SalePrice is already log-transformed, so this is the competition's RMSLE
def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
           m.score(X_train, y_train), m.score(X_valid, y_valid)]
    # [train rmse, valid rmse, train R^2, valid R^2, (OOB R^2 if available)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)
df_raw
SalesID | SalePrice | MachineID | ModelID | datasource | auctioneerID | YearMade | MachineHoursCurrentMeter | UsageBand | fiModelDesc | ... | saleDay | saleDayofweek | saleDayofyear | saleis_month_end | saleis_month_start | saleis_quarter_end | saleis_quarter_start | saleis_year_end | saleis_year_start | saleElapsed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1139246 | 11.097410 | 999089 | 3157 | 121 | 3.0 | 2004 | 68.0 | Low | 521D | ... | 16 | 3 | 320 | False | False | False | False | False | False | 6512 |
1 | 1139248 | 10.950807 | 117657 | 77 | 121 | 3.0 | 1996 | 4640.0 | Low | 950FII | ... | 26 | 4 | 86 | False | False | False | False | False | False | 5547 |
2 | 1139249 | 9.210340 | 434808 | 7009 | 121 | 3.0 | 2001 | 2838.0 | High | 226 | ... | 26 | 3 | 57 | False | False | False | False | False | False | 5518 |
3 | 1139251 | 10.558414 | 1026470 | 332 | 121 | 3.0 | 2001 | 3486.0 | High | PC120-6E | ... | 19 | 3 | 139 | False | False | False | False | False | False | 8157 |
4 | 1139253 | 9.305651 | 1057373 | 17311 | 121 | 3.0 | 2007 | 722.0 | Medium | S175 | ... | 23 | 3 | 204 | False | False | False | False | False | False | 7492 |
5 | 1139255 | 10.184900 | 1001274 | 4605 | 121 | 3.0 | 2004 | 508.0 | Low | 310G | ... | 18 | 3 | 353 | False | False | False | False | False | False | 7275 |
6 | 1139256 | 9.952278 | 772701 | 1937 | 121 | 3.0 | 1993 | 11540.0 | High | 790ELC | ... | 26 | 3 | 239 | False | False | False | False | False | False | 5700 |
7 | 1139261 | 10.203592 | 902002 | 3539 | 121 | 3.0 | 2001 | 4883.0 | High | 416D | ... | 17 | 3 | 321 | False | False | False | False | False | False | 6148 |
8 | 1139272 | 9.975808 | 1036251 | 36003 | 121 | 3.0 | 2008 | 302.0 | Low | 430HAG | ... | 27 | 3 | 239 | False | False | False | False | False | False | 7527 |
9 | 1139275 | 11.082143 | 1016474 | 3883 | 121 | 3.0 | 1000 | 20700.0 | Medium | 988B | ... | 9 | 3 | 221 | False | False | False | False | False | False | 6778 |
10 | 1139278 | 10.085809 | 1024998 | 4605 | 121 | 3.0 | 2004 | 1414.0 | Medium | 310G | ... | 21 | 3 | 234 | False | False | False | False | False | False | 7156 |
11 | 1139282 | 10.021271 | 319906 | 5255 | 121 | 3.0 | 1998 | 2764.0 | Low | D31E | ... | 24 | 3 | 236 | False | False | False | False | False | False | 6428 |
12 | 1139283 | 10.491274 | 1052214 | 2232 | 121 | 3.0 | 1998 | 0.0 | NaN | PC200LC6 | ... | 20 | 3 | 293 | False | False | False | False | False | False | 6120 |
13 | 1139284 | 10.325482 | 1068082 | 3542 | 121 | 3.0 | 2001 | 1921.0 | Medium | 420D | ... | 26 | 3 | 26 | False | False | False | False | False | False | 6218 |
14 | 1139290 | 10.239960 | 1058450 | 5162 | 121 | 3.0 | 2004 | 320.0 | Low | 214E | ... | 3 | 1 | 3 | False | False | False | False | False | False | 6195 |
15 | 1139291 | 9.852194 | 1004810 | 4604 | 121 | 3.0 | 1999 | 2450.0 | Medium | 310E | ... | 16 | 3 | 320 | False | False | False | False | False | False | 6512 |
16 | 1139292 | 9.510445 | 1026973 | 9510 | 121 | 3.0 | 1999 | 1972.0 | Low | 334 | ... | 14 | 3 | 165 | False | False | False | False | False | False | 6722 |
17 | 1139299 | 9.159047 | 1002713 | 21442 | 121 | 3.0 | 2003 | 0.0 | NaN | 45NX | ... | 28 | 3 | 28 | False | False | False | False | False | False | 7681 |
18 | 1139301 | 9.433484 | 125790 | 7040 | 121 | 3.0 | 2001 | 994.0 | Low | 302.5 | ... | 9 | 3 | 68 | False | False | False | False | False | False | 6260 |
19 | 1139304 | 9.350102 | 1011914 | 3177 | 121 | 3.0 | 1991 | 8005.0 | Medium | 580SUPER K | ... | 17 | 3 | 321 | False | False | False | False | False | False | 6148 |
20 | 1139311 | 10.621327 | 1014135 | 8867 | 121 | 3.0 | 2000 | 3259.0 | Medium | JS260 | ... | 18 | 3 | 138 | False | False | False | False | False | False | 6330 |
21 | 1139333 | 10.448715 | 999192 | 3350 | 121 | 3.0 | 1000 | 16328.0 | Medium | 120G | ... | 19 | 3 | 292 | False | False | False | False | False | False | 6484 |
22 | 1139344 | 10.165852 | 1044500 | 7040 | 121 | 3.0 | 2005 | 109.0 | Low | 302.5 | ... | 25 | 3 | 298 | False | False | False | False | False | False | 6855 |
23 | 1139346 | 11.198215 | 821452 | 85 | 121 | 3.0 | 1996 | 17033.0 | High | 966FII | ... | 19 | 3 | 292 | False | False | False | False | False | False | 6484 |
24 | 1139348 | 10.404263 | 294562 | 3542 | 121 | 3.0 | 2001 | 1877.0 | Medium | 420D | ... | 20 | 3 | 141 | False | False | False | False | False | False | 5602 |
25 | 1139351 | 9.433484 | 833838 | 7009 | 121 | 3.0 | 2003 | 1028.0 | Medium | 226 | ... | 9 | 3 | 68 | False | False | False | False | False | False | 6260 |
26 | 1139354 | 9.648595 | 565440 | 7040 | 121 | 3.0 | 2003 | 356.0 | Low | 302.5 | ... | 9 | 3 | 68 | False | False | False | False | False | False | 6260 |
27 | 1139356 | 10.878047 | 1004127 | 25458 | 121 | 3.0 | 2000 | 0.0 | NaN | EX550STD | ... | 22 | 3 | 53 | False | False | False | False | False | False | 6610 |
28 | 1139357 | 10.736397 | 44800 | 19167 | 121 | 3.0 | 2004 | 904.0 | Low | 685B | ... | 9 | 3 | 221 | False | False | False | False | False | False | 6778 |
29 | 1139358 | 11.396392 | 1018076 | 1333 | 121 | 3.0 | 1998 | 10466.0 | Medium | 345BL | ... | 1 | 3 | 152 | False | True | False | False | False | False | 6344 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
401095 | 6333259 | 9.259131 | 1872639 | 21437 | 149 | 1.0 | 2003 | NaN | NaN | 35N | ... | 14 | 2 | 348 | False | False | False | False | False | False | 8366 |
401096 | 6333260 | 9.210340 | 1816341 | 21437 | 149 | 2.0 | 2004 | NaN | NaN | 35N | ... | 15 | 3 | 258 | False | False | False | False | False | False | 8276 |
401097 | 6333261 | 9.047821 | 1843949 | 21437 | 149 | 1.0 | 2005 | NaN | NaN | 35N | ... | 28 | 4 | 301 | False | False | False | False | False | False | 8319 |
401098 | 6333262 | 9.259131 | 1791341 | 21437 | 149 | 2.0 | 2004 | NaN | NaN | 35N | ... | 16 | 1 | 228 | False | False | False | False | False | False | 8246 |
401099 | 6333263 | 9.305651 | 1833174 | 21437 | 149 | 1.0 | 2004 | NaN | NaN | 35N | ... | 14 | 2 | 348 | False | False | False | False | False | False | 8366 |
401100 | 6333264 | 9.259131 | 1791370 | 21437 | 149 | 2.0 | 2004 | NaN | NaN | 35N | ... | 16 | 1 | 228 | False | False | False | False | False | False | 8246 |
401101 | 6333270 | 9.210340 | 1799208 | 21437 | 149 | 1.0 | 2004 | NaN | NaN | 35N | ... | 14 | 2 | 348 | False | False | False | False | False | False | 8366 |
401102 | 6333272 | 9.259131 | 1927142 | 21437 | 149 | 2.0 | 2005 | NaN | NaN | 35N | ... | 16 | 1 | 228 | False | False | False | False | False | False | 8246 |
401103 | 6333273 | 9.433484 | 1789856 | 21437 | 149 | 2.0 | 2005 | NaN | NaN | 35N | ... | 15 | 3 | 258 | False | False | False | False | False | False | 8276 |
401104 | 6333275 | 9.259131 | 1924623 | 21437 | 149 | 2.0 | 2005 | NaN | NaN | 35N | ... | 16 | 1 | 228 | False | False | False | False | False | False | 8246 |
401105 | 6333276 | 9.210340 | 1835350 | 21437 | 149 | 2.0 | 2005 | NaN | NaN | 35N | ... | 25 | 1 | 298 | False | False | False | False | False | False | 8316 |
401106 | 6333278 | 9.259131 | 1944702 | 21437 | 149 | 2.0 | 2005 | NaN | NaN | 35N | ... | 16 | 1 | 228 | False | False | False | False | False | False | 8246 |
401107 | 6333279 | 9.433484 | 1866563 | 21437 | 149 | 2.0 | 2005 | NaN | NaN | 35N | ... | 15 | 3 | 258 | False | False | False | False | False | False | 8276 |
401108 | 6333280 | 9.259131 | 1851633 | 21437 | 149 | 2.0 | 2005 | NaN | NaN | 35N | ... | 16 | 1 | 228 | False | False | False | False | False | False | 8246 |
401109 | 6333281 | 9.259131 | 1798958 | 21437 | 149 | 2.0 | 2005 | NaN | NaN | 35N | ... | 16 | 1 | 228 | False | False | False | False | False | False | 8246 |
401110 | 6333282 | 9.259131 | 1878866 | 21437 | 149 | 2.0 | 2005 | NaN | NaN | 35N | ... | 15 | 3 | 258 | False | False | False | False | False | False | 8276 |
401111 | 6333283 | 9.210340 | 1874235 | 21437 | 149 | 2.0 | 2005 | NaN | NaN | 35N | ... | 25 | 1 | 298 | False | False | False | False | False | False | 8316 |
401112 | 6333284 | 9.259131 | 1887654 | 21437 | 149 | 2.0 | 2005 | NaN | NaN | 35N | ... | 25 | 1 | 298 | False | False | False | False | False | False | 8316 |
401113 | 6333285 | 9.259131 | 1817165 | 21437 | 149 | 2.0 | 2005 | NaN | NaN | 35N | ... | 25 | 1 | 298 | False | False | False | False | False | False | 8316 |
401114 | 6333287 | 9.433484 | 1918242 | 21437 | 149 | 2.0 | 2005 | NaN | NaN | 35N | ... | 15 | 1 | 319 | False | False | False | False | False | False | 8337 |
401115 | 6333290 | 9.210340 | 1843374 | 21437 | 149 | 2.0 | 2005 | NaN | NaN | 35N | ... | 25 | 1 | 298 | False | False | False | False | False | False | 8316 |
401116 | 6333302 | 9.047821 | 1825337 | 21437 | 149 | 2.0 | 2005 | NaN | NaN | 35N | ... | 25 | 1 | 298 | False | False | False | False | False | False | 8316 |
401117 | 6333307 | 9.210340 | 1821747 | 21437 | 149 | 2.0 | 2005 | NaN | NaN | 35N | ... | 25 | 1 | 298 | False | False | False | False | False | False | 8316 |
401118 | 6333311 | 9.159047 | 1828862 | 21437 | 149 | 2.0 | 2006 | NaN | NaN | 35N | ... | 25 | 1 | 298 | False | False | False | False | False | False | 8316 |
401119 | 6333335 | 9.047821 | 1798293 | 21435 | 149 | 2.0 | 2005 | NaN | NaN | 30NX | ... | 25 | 1 | 298 | False | False | False | False | False | False | 8316 |
401120 | 6333336 | 9.259131 | 1840702 | 21439 | 149 | 1.0 | 2005 | NaN | NaN | 35NX2 | ... | 2 | 2 | 306 | False | False | False | False | False | False | 8324 |
401121 | 6333337 | 9.305651 | 1830472 | 21439 | 149 | 1.0 | 2005 | NaN | NaN | 35NX2 | ... | 2 | 2 | 306 | False | False | False | False | False | False | 8324 |
401122 | 6333338 | 9.350102 | 1887659 | 21439 | 149 | 1.0 | 2005 | NaN | NaN | 35NX2 | ... | 2 | 2 | 306 | False | False | False | False | False | False | 8324 |
401123 | 6333341 | 9.104980 | 1903570 | 21435 | 149 | 2.0 | 2005 | NaN | NaN | 30NX | ... | 25 | 1 | 298 | False | False | False | False | False | False | 8316 |
401124 | 6333342 | 8.955448 | 1926965 | 21435 | 149 | 2.0 | 2005 | NaN | NaN | 30NX | ... | 25 | 1 | 298 | False | False | False | False | False | False | 8316 |
401125 rows × 65 columns
For model interpretation, there's no need to use the full dataset on each tree - using a subset is both faster and provides better interpretability (an overfit model won't show much variance across its trees).
set_rf_samples(50000)
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)
[0.2078231865448058, 0.24827834336192164, 0.90854271791930319, 0.88991563242710103, 0.89426780386728721]
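Under the hood, set_rf_samples (from fastai.structured, paired with reset_rf_samples) works by monkey-patching the private hook scikit-learn uses to draw each tree's bootstrap sample. A minimal sketch, assuming the old sklearn.ensemble.forest module layout (the hook is private and has moved in later sklearn versions):
from sklearn.ensemble import forest

def set_rf_samples(n):
    # Each tree now sees n rows sampled with replacement, instead of len(X) rows
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n))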
We saw how the model averages predictions across the trees to get an estimate - but how can we know the confidence of that estimate? One simple way is to use the standard deviation of the trees' predictions, instead of just their mean. This tells us the relative confidence of each prediction: for rows where the trees give very different results, we should be more cautious about using the prediction than for rows where they agree. Using the same example as in the last lesson when we looked at bagging:
%time preds = np.stack([t.predict(X_valid) for t in m.estimators_])
np.mean(preds[:,0]), np.std(preds[:,0])
CPU times: user 1.38 s, sys: 20 ms, total: 1.4 s
Wall time: 1.4 s
(9.1960278072006023, 0.21225113407342761)
When we use Python to loop through the trees like this, we're calculating each prediction in series, which is slow! We can use parallel processing to speed things up:
def get_preds(t): return t.predict(X_valid)
%time preds = np.stack(parallel_trees(m, get_preds))
np.mean(preds[:,0]), np.std(preds[:,0])
CPU times: user 84 ms, sys: 140 ms, total: 224 ms
Wall time: 415 ms
(9.1960278072006023, 0.21225113407342761)
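parallel_trees is another small fastai.structured helper: it just maps a function over the fitted trees using a process pool. A minimal sketch, assuming that implementation:
from concurrent.futures import ProcessPoolExecutor

def parallel_trees(m, fn, n_jobs=8):
    # Apply fn to each fitted tree across n_jobs worker processes
    return list(ProcessPoolExecutor(n_jobs).map(fn, m.estimators_))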
We can see that different trees are giving different estimates for this auction. To see how prediction confidence varies, we can add these statistics into our dataset.
x = raw_valid.copy()
x['pred_std'] = np.std(preds, axis=0)
x['pred'] = np.mean(preds, axis=0)
x.Enclosure.value_counts().plot.barh();
flds = ['Enclosure', 'SalePrice', 'pred', 'pred_std']
enc_summ = x[flds].groupby('Enclosure', as_index=False).mean()
enc_summ
Enclosure | SalePrice | pred | pred_std | |
---|---|---|---|---|
0 | EROPS | 9.849178 | 9.845237 | 0.276256 |
1 | EROPS AC | NaN | NaN | NaN |
2 | EROPS w AC | 10.623971 | 10.579465 | 0.261992 |
3 | NO ROPS | NaN | NaN | NaN |
4 | None or Unspecified | NaN | NaN | NaN |
5 | OROPS | 9.682064 | 9.684717 | 0.220889 |
enc_summ = enc_summ[~pd.isnull(enc_summ.SalePrice)]
enc_summ.plot('Enclosure', 'SalePrice', 'barh', xlim=(0,11));
enc_summ.plot('Enclosure', 'pred', 'barh', xerr='pred_std', alpha=0.6, xlim=(0,11));
Question: Why are the predictions nearly exactly right, but the error bars are quite wide?
raw_valid.ProductSize.value_counts().plot.barh();
flds = ['ProductSize', 'SalePrice', 'pred', 'pred_std']
summ = x[flds].groupby(flds[0]).mean()
summ
ProductSize | SalePrice | pred | pred_std |
---|---|---|---|
Compact | 9.735093 | 9.888354 | 0.339142 |
Large | 10.470589 | 10.392766 | 0.362407 |
Large / Medium | 10.691871 | 10.639858 | 0.295774 |
Medium | 10.681511 | 10.620441 | 0.285992 |
Mini | 9.535147 | 9.555066 | 0.250787 |
Small | 10.324448 | 10.322982 | 0.315314 |
(summ.pred_std/summ.pred).sort_values(ascending=False)
ProductSize
Large             0.034871
Compact           0.034297
Small             0.030545
Large / Medium    0.027799
Medium            0.026928
Mini              0.026247
dtype: float64
It's not normally enough just to know that a model can make accurate predictions - we also want to know how it's making predictions. The most important way to see this is with feature importance.
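rf_feat_importance is a thin wrapper around sklearn's impurity-based feature_importances_; a minimal sketch, assuming fastai.structured's one-liner:
def rf_feat_importance(m, df):
    # Pair every column name with the forest's impurity-based importance, biggest first
    return pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_}
                       ).sort_values('imp', ascending=False)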
fi = rf_feat_importance(m, df_trn); fi[:10]
cols | imp | |
---|---|---|
5 | YearMade | 0.178417 |
37 | Coupler_System | 0.114632 |
13 | ProductSize | 0.103073 |
14 | fiProductClassDesc | 0.081206 |
2 | ModelID | 0.060495 |
39 | Hydraulics_Flow | 0.051222 |
63 | saleElapsed | 0.050837 |
10 | fiSecondaryDesc | 0.038329 |
19 | Enclosure | 0.034592 |
8 | fiModelDesc | 0.030848 |
fi.plot('cols', 'imp', figsize=(10,6), legend=False);
def plot_fi(fi): return fi.plot('cols', 'imp', 'barh', figsize=(12,7), legend=False)
plot_fi(fi[:30]);
to_keep = fi[fi.imp>0.005].cols; len(to_keep)
24
df_keep = df_trn[to_keep].copy()
X_train, X_valid = split_vals(df_keep, n_trn)
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5,
n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)
[0.20685390156773095, 0.24454842802383558, 0.91015213846294174, 0.89319840835270514, 0.8942078920004991]
fi = rf_feat_importance(m, df_keep)
plot_fi(fi);
proc_df's optional max_n_cat argument will turn some categorical variables into new columns.
For example, the column ProductSize, which has 6 categories, gets turned into 6 new columns (one per category), and the original ProductSize column is removed.
This only happens to columns whose number of categories is no bigger than the value of the max_n_cat argument.
Some of these new columns may prove more important than the earlier single column that held every category.
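The effect is essentially one-hot encoding. A rough pandas equivalent, for illustration only (not proc_df's actual code):
sizes = pd.Series(['Mini', 'Large', 'Medium', 'Mini'], name='ProductSize').astype('category')
pd.get_dummies(sizes, prefix='ProductSize')  # one 0/1 column per category level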
df_trn2, y_trn, nas = proc_df(df_raw, 'SalePrice', max_n_cat=7)
X_train, X_valid = split_vals(df_trn2, n_trn)
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.6, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)
[0.2132925755978791, 0.25212838463780185, 0.90966193351324276, 0.88647501408921581, 0.89194147155121262]
fi = rf_feat_importance(m, df_trn2)
plot_fi(fi[:25]);
One thing that makes this harder to interpret is that there seem to be some variables with very similar meanings. Let's try to remove redundant features.
from scipy.cluster import hierarchy as hc
import scipy.stats
corr = np.round(scipy.stats.spearmanr(df_keep).correlation, 4)
corr_condensed = hc.distance.squareform(1-corr)
z = hc.linkage(corr_condensed, method='average')
fig = plt.figure(figsize=(16,10))
dendrogram = hc.dendrogram(z, labels=df_keep.columns, orientation='left', leaf_font_size=16)
plt.show()
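If we'd rather pull these groups out programmatically than read them off the plot, we can cut the tree at a small distance; the threshold 0.1 here (rank correlation above 0.9) is an illustrative choice, not one from the lesson:
cluster_ids = hc.fcluster(z, t=0.1, criterion='distance')
for i in np.unique(cluster_ids):
    grp = list(df_keep.columns[cluster_ids==i])
    if len(grp) > 1: print(grp)   # only show groups with more than one feature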
Let's try removing some of these related features to see if the model can be simplified without impacting the accuracy.
def get_oob(df):
    m = RandomForestRegressor(n_estimators=30, min_samples_leaf=5, max_features=0.6, n_jobs=-1, oob_score=True)
    x, _ = split_vals(df, n_trn)
    m.fit(x, y_train)
    return m.oob_score_
Here's our baseline.
get_oob(df_keep)
0.88999425494301454
Now we try removing each variable one at a time.
for c in ('saleYear', 'saleElapsed', 'fiModelDesc', 'fiBaseModel', 'Grouser_Tracks', 'Coupler_System'):
    print(c, get_oob(df_keep.drop(c, axis=1)))
saleYear 0.889037446375
saleElapsed 0.886210803445
fiModelDesc 0.888540591321
fiBaseModel 0.88893958239
Grouser_Tracks 0.890385236272
Coupler_System 0.889601052658
It looks like we can remove one variable from each of the tightly correlated groups without hurting the OOB score. Let's see what removing all three at once does.
to_drop = ['saleYear', 'fiBaseModel', 'Grouser_Tracks']
get_oob(df_keep.drop(to_drop, axis=1))
0.88858458047200739
Looking good! Let's use this dataframe from here. We'll save the list of columns so we can reuse it later.
df_keep.drop(to_drop, axis=1, inplace=True)
X_train, X_valid = split_vals(df_keep, n_trn)
np.save('tmp/keep_cols.npy', np.array(df_keep.columns))
keep_cols = np.load('tmp/keep_cols.npy')
df_keep = df_trn[keep_cols]
And let's see how this model looks on the full dataset.
reset_rf_samples()
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)
[0.12615142089579687, 0.22781819082173235, 0.96677727309424211, 0.90731173105384466, 0.9084359846323049]
from pdpbox import pdp
from plotnine import *
set_rf_samples(50000)
This next analysis will be a little easier if we use the 1-hot encoded categorical variables, so let's load them up again.
df_trn2, y_trn, nas = proc_df(df_raw, 'SalePrice', max_n_cat=7)
X_train, X_valid = split_vals(df_trn2, n_trn)
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.6, n_jobs=-1)
m.fit(X_train, y_train);
plot_fi(rf_feat_importance(m, df_trn2)[:10]);
df_raw.plot('YearMade', 'saleElapsed', 'scatter', alpha=0.01, figsize=(10,8));
x_all = get_sample(df_raw[df_raw.YearMade>1930], 500)
ggplot(x_all, aes('YearMade', 'SalePrice'))+stat_smooth(se=True, method='loess')
<ggplot: (8729550331912)>
x = get_sample(X_train[X_train.YearMade>1930], 500)
def plot_pdp(feat, clusters=None, feat_name=None):
    feat_name = feat_name or feat
    p = pdp.pdp_isolate(m, x, x.columns, feat)
    return pdp.pdp_plot(p, feat_name, plot_lines=True,
                        cluster=clusters is not None,
                        n_cluster_centers=clusters)
plot_pdp('YearMade')
plot_pdp('YearMade', clusters=5)
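For intuition, partial dependence can be computed by hand: fix YearMade to each candidate value for all 500 sampled rows, predict, and average the predictions. A minimal sketch of the idea (not pdpbox's actual algorithm):
pdp_years, pdp_vals = np.arange(1960, 2011, 5), []
for v in pdp_years:
    x_mod = x.copy()
    x_mod['YearMade'] = v                     # set every row's YearMade to v
    pdp_vals.append(m.predict(x_mod).mean())  # average prediction at that year
plt.plot(pdp_years, pdp_vals);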
feats = ['saleElapsed', 'YearMade']
p = pdp.pdp_interact(m, x, x.columns, feats)
pdp.pdp_interact_plot(p, feats)
plot_pdp(['Enclosure_EROPS w AC', 'Enclosure_EROPS', 'Enclosure_OROPS'], 5, 'Enclosure')
df_raw.loc[df_raw.YearMade<1950, 'YearMade'] = 1950  # clamp bogus years such as YearMade 1000
df_keep['age'] = df_raw['age'] = df_raw.saleYear-df_raw.YearMade
X_train, X_valid = split_vals(df_keep, n_trn)
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.6, n_jobs=-1)
m.fit(X_train, y_train)
plot_fi(rf_feat_importance(m, df_keep));
from treeinterpreter import treeinterpreter as ti
df_train, df_valid = split_vals(df_raw[df_keep.columns], n_trn)
row = X_valid.values[None,0]; row
array([[4364751, 2300944, 665, 172, 1.0, 1999, 3726.0, 3, 3232, 1111, 0, 63, 0, 5, 17, 35, 4, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 0, 0, 0, 3, 0, 0, 0, 2, 19, 29, 3, 2, 1, 0, 0, 0, 0, 0, 2010, 9, 37, 16, 3, 259, False, False, False, False, False, False, 7912, False, False]], dtype=object)
prediction, bias, contributions = ti.predict(m, row)
prediction[0], bias[0]
(9.1909688098736275, 10.10606580677884)
idxs = np.argsort(contributions[0])
[o for o in zip(df_keep.columns[idxs], df_valid.iloc[0][idxs], contributions[0][idxs])]
[('ProductSize', 'Mini', -0.54680742853695008),
 ('age', 11, -0.12507089451852943),
 ('fiProductClassDesc', 'Hydraulic Excavator, Track - 3.0 to 4.0 Metric Tons', -0.11143111128570773),
 ('fiModelDesc', 'KX1212', -0.065155113754146801),
 ('fiSecondaryDesc', nan, -0.055237427792181749),
 ('Enclosure', 'EROPS', -0.050467175593900217),
 ('fiModelDescriptor', nan, -0.042354676935508852),
 ('saleElapsed', 7912, -0.019642242073500914),
 ('saleDay', 16, -0.012812993479652724),
 ('Tire_Size', nan, -0.0029687660942271598),
 ('SalesID', 4364751, -0.0010443985823001434),
 ('saleDayofyear', 259, -0.00086540581130196688),
 ('Drive_System', nan, 0.0015385818526195915),
 ('Hydraulics', 'Standard', 0.0022411701338458821),
 ('state', 'Ohio', 0.0037587658190299409),
 ('ProductGroupDesc', 'Track Excavators', 0.0067688906745931197),
 ('ProductGroup', 'TEX', 0.014654732626326661),
 ('MachineID', 2300944, 0.015578052196894499),
 ('Hydraulics_Flow', nan, 0.028973749866174004),
 ('ModelID', 665, 0.038307429579276284),
 ('Coupler_System', nan, 0.052509808150765114),
 ('YearMade', 1999, 0.071829996446492878)]
contributions[0].sum()
-0.7383536391949419
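treeinterpreter's decomposition is exact: each prediction equals the bias (the mean training target at the tree roots, averaged over trees) plus the sum of that row's contributions. A quick sanity check:
np.allclose(prediction, bias + np.sum(contributions, axis=1))  # treeinterpreter's invariant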
Finally, let's check extrapolation: how different is the validation set from the training set? A direct test is to build a model that predicts whether a row comes from the validation set - if it succeeds, the two sets are easily told apart, usually via time-dependent columns.
df_ext = df_keep.copy()
df_ext['is_valid'] = 1
df_ext.loc[df_ext.index[:n_trn], 'is_valid'] = 0  # first n_trn rows are the training period
x, y, nas = proc_df(df_ext, 'is_valid')
m = RandomForestClassifier(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(x, y);
m.oob_score_
0.99998753505765037
fi = rf_feat_importance(m, x); fi[:10]
cols | imp | |
---|---|---|
9 | SalesID | 0.764744 |
5 | saleElapsed | 0.146162 |
11 | MachineID | 0.077919 |
8 | fiModelDesc | 0.002931 |
20 | saleDayofyear | 0.002569 |
0 | YearMade | 0.002358 |
22 | age | 0.001202 |
4 | ModelID | 0.000664 |
6 | fiSecondaryDesc | 0.000361 |
1 | Coupler_System | 0.000208 |
feats=['SalesID', 'saleElapsed', 'MachineID']
(X_train[feats]/1000).describe()
SalesID | saleElapsed | MachineID | |
---|---|---|---|
count | 389125.000000 | 389125.000000 | 389125.000000 |
mean | 1800.452485 | 5.599522 | 1206.796148 |
std | 595.627288 | 2.087862 | 430.850552 |
min | 1139.246000 | 0.000000 | 0.000000 |
25% | 1413.348000 | 4.232000 | 1087.016000 |
50% | 1632.093000 | 6.176000 | 1273.859000 |
75% | 2210.453000 | 7.328000 | 1458.661000 |
max | 4364.741000 | 8.381000 | 2313.821000 |
(X_valid[feats]/1000).describe()
SalesID | saleElapsed | MachineID | |
---|---|---|---|
count | 12000.000000 | 12000.000000 | 12000.000000 |
mean | 5786.967651 | 8.166793 | 1578.049709 |
std | 836.899608 | 0.289098 | 589.497173 |
min | 4364.751000 | 6.638000 | 0.830000 |
25% | 4408.580750 | 8.197000 | 1271.225250 |
50% | 6272.538500 | 8.276000 | 1825.317000 |
75% | 6291.792250 | 8.338000 | 1907.858000 |
max | 6333.342000 | 8.382000 | 2486.330000 |
The validation rows sit entirely above the training rows on SalesID and saleElapsed - these are time-ordered identifiers. Let's drop all three and see how distinguishable the sets remain.
x.drop(feats, axis=1, inplace=True)
m = RandomForestClassifier(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(x, y);
m.oob_score_
0.9789018385789966
fi = rf_feat_importance(m, x); fi[:10]
cols | imp | |
---|---|---|
19 | age | 0.233626 |
0 | YearMade | 0.188127 |
17 | saleDayofyear | 0.157429 |
4 | ModelID | 0.077623 |
7 | fiModelDesc | 0.061301 |
15 | saleDay | 0.056252 |
14 | state | 0.055201 |
3 | fiProductClassDesc | 0.035131 |
5 | fiSecondaryDesc | 0.023661 |
6 | Enclosure | 0.022409 |
set_rf_samples(50000)
feats=['SalesID', 'saleElapsed', 'MachineID', 'age', 'YearMade', 'saleDayofyear']
X_train, X_valid = split_vals(df_keep, n_trn)
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)
[0.21136509778791376, 0.2493668921196425, 0.90909393040946562, 0.88894821098056087, 0.89255408392415925]
for f in feats:
    df_subs = df_keep.drop(f, axis=1)
    X_train, X_valid = split_vals(df_subs, n_trn)
    m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
    m.fit(X_train, y_train)
    print(f)
    print_score(m)
SalesID
[0.20918653475938534, 0.2459966629213187, 0.9053273181678706, 0.89192968797265737, 0.89245205174299469]
saleElapsed
[0.2194124612957369, 0.2546442621643524, 0.90358104739129086, 0.8841980790762114, 0.88681881032219145]
MachineID
[0.206612984511148, 0.24446409479358033, 0.90312476862123559, 0.89327205732490311, 0.89501553584754967]
age
[0.21317740718919814, 0.2471719147150774, 0.90260198977488226, 0.89089460707372525, 0.89185129799503315]
YearMade
[0.21305398932040326, 0.2534570148977216, 0.90555219348567462, 0.88527538596974953, 0.89158854973045432]
saleDayofyear
[0.21320711524847227, 0.24629839782893828, 0.90881970943169987, 0.89166441133215968, 0.89272793857941679]
reset_rf_samples()
df_subs = df_keep.drop(['SalesID', 'MachineID', 'saleDayofyear'], axis=1)
X_train, X_valid = split_vals(df_subs, n_trn)
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)
[0.1418970082803121, 0.21779153679471935, 0.96040441863389681, 0.91529091848161925, 0.90918594039522138]
plot_fi(rf_feat_importance(m, X_train));
np.save('tmp/subs_cols.npy', np.array(df_subs.columns))
m = RandomForestRegressor(n_estimators=160, max_features=0.5, n_jobs=-1, oob_score=True)
%time m.fit(X_train, y_train)
print_score(m)
CPU times: user 6min 3s, sys: 2.75 s, total: 6min 6s
Wall time: 16.7 s
[0.08104912951128229, 0.2109679613161783, 0.9865755186304942, 0.92051576728916762, 0.9143700001430598]