%load_ext autoreload
%autoreload 2
%matplotlib inline
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
set_plot_sizes(12,14,16)
PATH = "data/bulldozers/"
df_raw = pd.read_feather('tmp/bulldozers-raw')
df_raw['age'] = df_raw.saleYear-df_raw.YearMade
df, y, nas, mapper = proc_df(df_raw, 'SalePrice', max_n_cat=10, do_scale=True)
def split_vals(a,n): return a[:n], a[n:]
n_valid = 12000
n_trn = len(df)-n_valid
y_train, y_valid = split_vals(y, n_trn)
raw_train, raw_valid = split_vals(df_raw, n_trn)
def rmse(x,y): return math.sqrt(((x-y)**2).mean())
df.describe().transpose()
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
SalesID | 401125.0 | 3.147483e-15 | 1.000001 | -0.858580 | -0.551518 | -0.308344 | 0.355322 | 4.855369 |
MachineID | 401125.0 | 2.895953e-15 | 1.000001 | -2.761738 | -0.292989 | 0.139657 | 0.567277 | 2.876309 |
ModelID | 401125.0 | -1.300773e-13 | 1.000001 | -1.102854 | -0.583548 | -0.367372 | 0.294819 | 4.871330 |
datasource | 401125.0 | -6.855920e-13 | 1.000001 | -1.524823 | -0.297450 | -0.297450 | 0.148868 | 4.165727 |
auctioneerID | 401125.0 | -1.064768e-13 | 1.000001 | -0.381738 | -0.321407 | -0.261075 | -0.140412 | 5.591096 |
YearMade | 401125.0 | 6.797770e-16 | 1.000001 | -3.081446 | 0.294188 | 0.328458 | 0.345593 | 0.390145 |
MachineHoursCurrentMeter | 401125.0 | 8.635210e-14 | 1.000001 | -0.074396 | -0.074396 | -0.074396 | -0.074396 | 150.038499 |
fiModelDesc | 401125.0 | 1.673709e+03 | 1263.331163 | 1.000000 | 631.000000 | 1395.000000 | 2292.000000 | 4999.000000 |
fiBaseModel | 401125.0 | 5.591654e+02 | 469.310266 | 1.000000 | 206.000000 | 406.000000 | 704.000000 | 1950.000000 |
fiSecondaryDesc | 401125.0 | 3.668504e+01 | 38.228243 | 0.000000 | 0.000000 | 29.000000 | 57.000000 | 175.000000 |
fiModelSeries | 401125.0 | 9.192686e+00 | 27.006859 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 122.000000 |
fiModelDescriptor | 401125.0 | 1.223343e+01 | 29.041950 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 139.000000 |
fiProductClassDesc | 401125.0 | 3.226222e+01 | 22.596618 | 1.000000 | 11.000000 | 35.000000 | 52.000000 | 74.000000 |
state | 401125.0 | 2.350832e+01 | 15.732898 | 1.000000 | 9.000000 | 22.000000 | 41.000000 | 53.000000 |
Hydraulics | 401125.0 | 4.253801e+00 | 4.809368 | 0.000000 | 1.000000 | 1.000000 | 12.000000 | 12.000000 |
Tire_Size | 401125.0 | 3.059847e+00 | 5.950781 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 17.000000 |
Undercarriage_Pad_Width | 401125.0 | 4.311895e+00 | 7.741894 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 19.000000 |
Stick_Length | 401125.0 | 6.535564e+00 | 11.923308 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 29.000000 |
saleYear | 401125.0 | -2.781277e-13 | 1.000001 | -2.623435 | -0.711783 | 0.330937 | 0.852297 | 1.199870 |
saleMonth | 401125.0 | 2.443335e-14 | 1.000001 | -1.578894 | -0.994879 | -0.118857 | 0.757165 | 1.633187 |
saleWeek | 401125.0 | -9.469515e-15 | 1.000001 | -1.702718 | -0.891252 | -0.079785 | 0.866926 | 1.813637 |
saleDay | 401125.0 | -1.649883e-15 | 1.000001 | -1.793028 | -0.843733 | -0.013101 | 0.817532 | 1.766827 |
saleDayofweek | 401125.0 | 5.134902e-14 | 1.000001 | -1.850225 | -0.427508 | 0.283850 | 0.283850 | 2.417924 |
saleDayofyear | 401125.0 | 3.377684e-15 | 1.000001 | -1.718597 | -0.926784 | -0.115659 | 0.878936 | 1.786624 |
saleis_month_end | 401125.0 | 3.491732e-14 | 1.000001 | -0.186900 | -0.186900 | -0.186900 | -0.186900 | 5.350455 |
saleis_month_start | 401125.0 | 8.367190e-14 | 1.000001 | -0.161180 | -0.161180 | -0.161180 | -0.161180 | 6.204230 |
saleis_quarter_end | 401125.0 | 4.067403e-14 | 1.000001 | -0.124679 | -0.124679 | -0.124679 | -0.124679 | 8.020586 |
saleis_quarter_start | 401125.0 | 1.443002e-14 | 1.000001 | -0.079495 | -0.079495 | -0.079495 | -0.079495 | 12.579339 |
saleis_year_end | 401125.0 | -1.624260e-14 | 1.000001 | -0.001579 | -0.001579 | -0.001579 | -0.001579 | 633.343509 |
saleis_year_start | 401125.0 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
Blade_Type_Angle | 401125.0 | 4.143347e-03 | 0.064235 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Blade_Type_Coal | 401125.0 | 2.742287e-05 | 0.005237 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Blade_Type_Landfill | 401125.0 | 6.232471e-05 | 0.007894 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Blade_Type_No | 401125.0 | 1.852290e-03 | 0.042998 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Blade_Type_None or Unspecified | 401125.0 | 2.849735e-02 | 0.166389 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Blade_Type_PAT | 401125.0 | 9.625927e-02 | 0.294947 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Blade_Type_Semi U | 401125.0 | 2.148208e-02 | 0.144985 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Blade_Type_Straight | 401125.0 | 3.321409e-02 | 0.179195 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Blade_Type_U | 401125.0 | 4.641945e-03 | 0.067974 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Blade_Type_VPAT | 401125.0 | 8.842630e-03 | 0.093619 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Blade_Type_nan | 401125.0 | 8.009773e-01 | 0.399266 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
Travel_Controls_1 Speed | 401125.0 | 2.492988e-05 | 0.004993 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Travel_Controls_2 Pedal | 401125.0 | 2.846993e-03 | 0.053281 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Travel_Controls_Differential Steer | 401125.0 | 1.216329e-02 | 0.109615 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Travel_Controls_Finger Tip | 401125.0 | 6.541602e-03 | 0.080615 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Travel_Controls_Lever | 401125.0 | 2.094110e-03 | 0.045714 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Travel_Controls_None or Unspecified | 401125.0 | 1.743172e-01 | 0.379383 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Travel_Controls_Pedal | 401125.0 | 1.037083e-03 | 0.032187 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Travel_Controls_nan | 401125.0 | 8.009748e-01 | 0.399268 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
Differential_Type_Limited Slip | 401125.0 | 2.817077e-03 | 0.053001 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Differential_Type_Locking | 401125.0 | 4.985977e-06 | 0.002233 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Differential_Type_No Spin | 401125.0 | 5.135556e-04 | 0.022656 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Differential_Type_Standard | 401125.0 | 1.697052e-01 | 0.375374 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Differential_Type_nan | 401125.0 | 8.269592e-01 | 0.378283 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
Steering_Controls_Command Control | 401125.0 | 1.338735e-03 | 0.036564 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Steering_Controls_Conventional | 401125.0 | 1.712160e-01 | 0.376698 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Steering_Controls_Four Wheel Standard | 401125.0 | 3.440324e-04 | 0.018545 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Steering_Controls_No | 401125.0 | 2.492988e-06 | 0.001579 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Steering_Controls_Wheel | 401125.0 | 3.490184e-05 | 0.005908 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Steering_Controls_nan | 401125.0 | 8.270639e-01 | 0.378193 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
192 rows × 8 columns
X_train, X_valid = split_vals(df, n_trn)
m = LinearRegression().fit(X_train, y_train)
m.score(X_valid, y_valid)
0.31399415664841568
m.score(X_train, y_train)
0.60966668735322171
preds = m.predict(X_valid)
rmse(preds, y_valid)
0.6197833720320716
plt.scatter(preds, y_valid, alpha=0.1, s=2);
keep_cols = list(np.load('tmp/keep_cols.npy'))
', '.join(keep_cols)
'YearMade, Coupler_System, ProductSize, fiProductClassDesc, ModelID, Hydraulics_Flow, saleElapsed, fiSecondaryDesc, Enclosure, fiModelDesc, SalesID, fiModelDescriptor, MachineID, ProductGroupDesc, ProductGroup, Drive_System, Hydraulics, state, saleDay, Tire_Size, saleDayofyear'
df_sub = df_raw[keep_cols+['age', 'SalePrice']]
df, y, nas, mapper = proc_df(df_sub, 'SalePrice', max_n_cat=10, do_scale=True)
X_train, X_valid = split_vals(df, n_trn)
m = LinearRegression().fit(X_train, y_train)
m.score(X_valid, y_valid)
0.67602943126238957
rmse(m.predict(X_valid), y_valid)
0.42592074374245537
from operator import itemgetter
sorted(list(zip(X_valid.columns, m.coef_)), key=itemgetter(1))
[('saleElapsed', -145.64883869187287), ('ProductSize_Compact', -0.54007475122954363), ('ProductSize_Mini', -0.50048058229134851), ('Drive_System_Two Wheel Drive', -0.26738317785805954), ('ProductGroupDesc_Skid Steer Loaders', -0.24922453667794642), ('ProductGroup_SSL', -0.24922453667794109), ('Enclosure_None or Unspecified', -0.24279691741993142), ('Enclosure_nan', -0.18422637193293984), ('Drive_System_nan', -0.17481049937861393), ('Hydraulics_Flow_Standard', -0.16350014357513842), ('Enclosure_EROPS AC', -0.13844787814745835), ('Enclosure_OROPS', -0.11883972857575564), ('ProductSize_nan', -0.091923065651293712), ('MachineID', -0.074919067087428459), ('Drive_System_Four Wheel Drive', -0.070763049128190758), ('Enclosure_EROPS', -0.054481426247864562), ('Hydraulics_Flow_High Flow', -0.04086147315765426), ('ProductGroupDesc_Track Type Tractors', -0.0040345196773952097), ('ProductGroup_TTT', -0.0040345196773863279), ('Coupler_System_nan', -0.0033786325283386986), ('Tire_Size', -0.0029100702750870511), ('saleDay', -0.0018416299990260399), ('SalesID', -0.00054830445029807606), ('state', -0.00024678007480360975), ('Coupler_System_None or Unspecified', -5.2370856060690585e-06), ('ProductGroupDesc_nan', 0.0), ('ProductGroup_nan', 0.0), ('fiModelDesc', 4.2934486685908269e-06), ('Hydraulics', 0.00040543370646162202), ('fiModelDescriptor', 0.0021681557153456765), ('fiSecondaryDesc', 0.0027035894408982131), ('Coupler_System_Yes', 0.0033838696139043556), ('Hydraulics_Flow_nan', 0.004228336726749049), ('fiProductClassDesc', 0.0056127773827328031), ('ModelID', 0.015501818523116384), ('ProductGroup_WL', 0.02704461444028361), ('ProductGroupDesc_Wheel Loader', 0.027044614440295156), ('ProductGroup_TEX', 0.052690045537289176), ('ProductGroupDesc_Track Excavators', 0.052690045537290064), ('ProductGroupDesc_Backhoe Loaders', 0.059547357765127318), ('ProductGroup_BL', 0.05954735776512976), ('ProductGroup_MG', 0.11397703861262443), ('ProductGroupDesc_Motor Graders', 0.1139770386126262), ('Drive_System_No', 0.17580439323345765), ('ProductSize_Small', 0.17871767761687707), ('Hydraulics_Flow_None or Unspecified', 0.20013328000603833), ('ProductSize_Medium', 0.25052367123948038), ('ProductSize_Large', 0.28820495005131264), ('Enclosure_EROPS w AC', 0.28910815935957146), ('Drive_System_All Wheel Drive', 0.33715233313139936), ('ProductSize_Large / Medium', 0.41503210026451676), ('Enclosure_NO ROPS', 0.44968416296443187), ('saleDayofyear', 7.1497706071154195), ('YearMade', 145.69251462398279), ('age', 182.96050934738616)]
m = LassoCV().fit(X_train, y_train)
m.score(X_valid, y_valid)
0.634974200475658
rmse(m.predict(X_valid), y_valid)
0.452103421164379
m.alpha_
0.015888143202334452
coefs = sorted(list(zip(X_valid.columns, m.coef_)), key=itemgetter(1))
coefs
[('ProductSize_Mini', -0.32283011739021739), ('ProductGroupDesc_Skid Steer Loaders', -0.29197866480532003), ('ProductSize_nan', -0.16481153369955914), ('Enclosure_OROPS', -0.10072067167233209), ('MachineID', -0.077025231732003333), ('age', -0.07146167736242677), ('saleDayofyear', -0.005420434471588245), ('ProductGroup_SSL', -0.00026018612542580649), ('state', -0.00020510032574012273), ('ModelID', -0.0), ('saleElapsed', -0.0), ('SalesID', -0.0), ('Hydraulics', -0.0), ('saleDay', -0.0), ('Coupler_System_None or Unspecified', -0.0), ('Coupler_System_Yes', -0.0), ('ProductSize_Compact', -0.0), ('ProductSize_Large', 0.0), ('ProductSize_Medium', 0.0), ('ProductSize_Small', 0.0), ('Hydraulics_Flow_High Flow', -0.0), ('Hydraulics_Flow_None or Unspecified', 0.0), ('Hydraulics_Flow_Standard', -0.0), ('Hydraulics_Flow_nan', 0.0), ('Enclosure_EROPS', 0.0), ('Enclosure_EROPS AC', -0.0), ('Enclosure_NO ROPS', 0.0), ('Enclosure_None or Unspecified', -0.0), ('Enclosure_nan', -0.0), ('ProductGroupDesc_Backhoe Loaders', -0.0), ('ProductGroupDesc_Track Excavators', 0.0), ('ProductGroupDesc_Track Type Tractors', 0.0), ('ProductGroupDesc_Wheel Loader', -0.0), ('ProductGroupDesc_nan', 0.0), ('ProductGroup_BL', -0.0), ('ProductGroup_MG', 0.0), ('ProductGroup_TEX', 0.0), ('ProductGroup_TTT', 0.0), ('ProductGroup_WL', -0.0), ('ProductGroup_nan', 0.0), ('Drive_System_All Wheel Drive', 0.0), ('Drive_System_Four Wheel Drive', 0.0), ('Drive_System_No', 0.0), ('Drive_System_Two Wheel Drive', -0.0), ('Drive_System_nan', -0.0), ('fiModelDesc', 5.509329640857254e-06), ('fiProductClassDesc', 0.0022891561283628222), ('fiSecondaryDesc', 0.0030514019161731946), ('fiModelDescriptor', 0.0034724295356978328), ('Tire_Size', 0.0095336232150601847), ('YearMade', 0.088383969701718024), ('ProductGroupDesc_Motor Graders', 0.11373349421170943), ('ProductSize_Large / Medium', 0.15991368864779024), ('Coupler_System_nan', 0.2023810180165819), ('Enclosure_EROPS w AC', 0.34461861098943347)]
skip = [n for n,c in coefs if abs(c)<0.01]
df.drop(skip, axis=1, inplace=True)
# for n,c in df.items():
# if '_' not in n: df[n+'2'] = df[n]**2
X_train, X_valid = split_vals(df, n_trn)
m = LassoCV().fit(X_train, y_train)
m.score(X_valid, y_valid)
0.63678168860529394
rmse(m.predict(X_valid), y_valid)
0.4509826980922721
coefs = sorted(list(zip(X_valid.columns, m.coef_)), key=itemgetter(1))
coefs
[('ProductSize_Mini', -0.77420353132504083), ('ProductSize_nan', -0.37649627504627403), ('ProductGroupDesc_Skid Steer Loaders', -0.31232507784064917), ('MachineID', -0.092899281127635194), ('Enclosure_OROPS', -0.034526809361612412), ('age', -0.024367657576137017), ('ProductSize_Large / Medium', 0.08968235308777571), ('YearMade', 0.15983396599807115), ('Enclosure_EROPS w AC', 0.40652833657308551), ('Coupler_System_nan', 0.42191856864836064), ('ProductGroupDesc_Motor Graders', 0.49456961585974812)]
np.savez(f'{PATH}tmp/regr_resid', m.predict(X_train), m.predict(X_valid))