%load_ext autoreload
%autoreload 2

%matplotlib inline

from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV

# Configure plot font sizes through the fastai helper (the three arguments are
# passed through unchanged; see set_plot_sizes for what each one controls).
set_plot_sizes(12,14,16)

# Dataset directory prefix; in this script it is used when saving the final
# predictions with np.savez.
PATH = "data/bulldozers/"

# Load the previously saved raw dataframe.
# NOTE(review): this path is relative to the working directory and does not go
# through PATH, unlike the np.savez call at the end of the script -- confirm
# which location is intended.
df_raw = pd.read_feather('tmp/bulldozers-raw')

# Derived feature: the saleYear column minus the YearMade column.
df_raw['age'] = df_raw.saleYear-df_raw.YearMade

# Preprocess the raw frame with 'SalePrice' as the target field. The call
# yields four values, bound here to df (used below as the model inputs),
# y (used below as the regression target), nas and mapper.
# NOTE(review): presumably max_n_cat=10 controls which categoricals get
# one-hot encoded and do_scale=True standardizes continuous columns -- confirm
# against fastai.structured.proc_df.
df, y, nas, mapper = proc_df(df_raw, 'SalePrice', max_n_cat=10, do_scale=True)

def split_vals(a, n):
    """Split `a` at position `n` and return the pair ``(a[:n], a[n:])``.

    `a` may be anything that supports slicing (list, array, DataFrame, ...).
    """
    head, tail = a[:n], a[n:]
    return head, tail
# Hold out the last n_valid rows for validation; every row before index n_trn
# is used for training.
n_valid = 12000
n_trn = len(df)-n_valid
# Split the target and the raw frame at the same row index.
# NOTE(review): this assumes y and df_raw have the same number of rows as df.
y_train, y_valid = split_vals(y, n_trn)
raw_train, raw_valid = split_vals(df_raw, n_trn)

def rmse(x, y):
    """Return the root-mean-squared difference between `x` and `y`.

    The elementwise difference ``x - y`` must support ``** 2`` and expose a
    ``.mean()`` method (e.g. NumPy arrays). The result is a Python float.
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

# Summary statistics of the processed features, transposed so that each
# column of df becomes a row.
df.describe().transpose()

# Split the features at the same row index used for the targets.
X_train, X_valid = split_vals(df, n_trn)

# Baseline model: scikit-learn LinearRegression fitted on every processed column.
m = LinearRegression().fit(X_train, y_train)
# score() is R^2 for scikit-learn regressors: validation first, then training.
m.score(X_valid, y_valid)

m.score(X_train, y_train)

preds = m.predict(X_valid)

# Root-mean-squared error of the validation predictions.
rmse(preds, y_valid)

# Predictions (x axis) against actual targets (y axis) on the validation rows,
# drawn with small, mostly transparent markers. The trailing ';' keeps a
# notebook from echoing the return value of scatter().
plt.scatter(preds, y_valid, alpha=0.1, s=2);

# Load a previously saved array of column names and show them as one string.
# NOTE(review): if this array was saved with object dtype, recent NumPy
# versions require allow_pickle=True in np.load -- confirm how it was written.
keep_cols = list(np.load('tmp/keep_cols.npy'))
', '.join(keep_cols)

# Restrict the raw frame to those columns plus the derived 'age' feature and
# the 'SalePrice' target.
df_sub = df_raw[keep_cols+['age', 'SalePrice']]

# Re-run the same preprocessing on the reduced frame. This rebinds df, y, nas
# and mapper.
# NOTE(review): y_train / y_valid are not re-split from the new y; the fits
# below reuse the targets from the earlier split.
df, y, nas, mapper = proc_df(df_sub, 'SalePrice', max_n_cat=10, do_scale=True)

X_train, X_valid = split_vals(df, n_trn)

# Refit the linear model on the reduced feature set and report validation
# R^2 and RMSE.
m = LinearRegression().fit(X_train, y_train)
m.score(X_valid, y_valid)

rmse(m.predict(X_valid), y_valid)

from operator import itemgetter

# (column name, coefficient) pairs sorted by coefficient, most negative first.
sorted(list(zip(X_valid.columns, m.coef_)), key=itemgetter(1))

# Fit a Lasso model whose regularization strength is chosen by
# cross-validation (scikit-learn LassoCV with default settings).
m = LassoCV().fit(X_train, y_train)
m.score(X_valid, y_valid)

rmse(m.predict(X_valid), y_valid)

# The regularization strength selected by cross-validation.
m.alpha_

# (column name, coefficient) pairs sorted by coefficient, most negative first.
coefs = sorted(list(zip(X_valid.columns, m.coef_)), key=itemgetter(1))
coefs

# Names of the columns whose coefficient magnitude is below 0.01.
skip = [n for n,c in coefs if abs(c)<0.01]

# Remove those columns from df in place. X_train / X_valid were split off
# before this drop and are not updated by this statement.
df.drop(skip, axis=1, inplace=True)

# Disabled experiment: would add a squared copy (name suffixed with '2') of
# every column whose name contains no underscore.
# for n,c in df.items():
#     if '_' not in n: df[n+'2'] = df[n]**2

# Split the current df into training and validation features again.
X_train, X_valid = split_vals(df, n_trn)

# Refit LassoCV on the current feature set and report validation R^2 and RMSE.
m = LassoCV().fit(X_train, y_train)
m.score(X_valid, y_valid)

rmse(m.predict(X_valid), y_valid)

# (column name, coefficient) pairs sorted by coefficient, most negative first.
coefs = sorted(list(zip(X_valid.columns, m.coef_)), key=itemgetter(1))
coefs

# Save the training and validation predictions to a .npz archive under PATH.
# Both arrays are passed positionally, so np.savez stores them under its
# default names (arr_0, arr_1).
# NOTE(review): the file is named 'regr_resid' but what is stored are the
# predictions, not residuals -- confirm intent.
np.savez(f'{PATH}tmp/regr_resid', m.predict(X_train), m.predict(X_valid))