#!/usr/bin/env python
# coding: utf-8

# In[222]:

import math
import time
from itertools import combinations

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn import model_selection, metrics, linear_model, decomposition, cross_decomposition


# In[236]:

hitters_dat = pd.read_csv('hitters.csv')
hitters_dat = hitters_dat.drop(hitters_dat.columns[0], axis=1)  # drop the unnamed row-name column
hitters_dat = hitters_dat.dropna(axis=0)

# Encode the three categorical columns as 0/1 indicators
hitters_dat['League'] = hitters_dat['League'].map({'N': 1., 'A': 0.})
hitters_dat['Division'] = hitters_dat['Division'].map({'W': 1., 'E': 0.})
hitters_dat['NewLeague'] = hitters_dat['NewLeague'].map({'N': 1., 'A': 0.})

#hitters_dat = (hitters_dat - hitters_dat.mean()) / hitters_dat.std()
hitters_dat.head()


# In[1]:

# Best subset selection.
# There are 2^19 ~= 5 * 10^5 possible subsets of the 19 features, so trying them
# all takes ages. Even capping the subset size at 4 leaves
# (19 choose 1) + ... + (19 choose 4) ~= 5 * 10^3 models to fit, which is why
# the cells below cap it at 3.

def n_choose_k(n, k):
    # exact integer arithmetic; np.math is deprecated and float division loses precision
    return math.factorial(n) // (math.factorial(n - k) * math.factorial(k))

def best_subset(n_predictors, target_column, data):
    predictors = data.drop(target_column, axis=1).columns
    top_models = []
    for i in range(1, n_predictors + 1):
        tick = time.time()
        cmbs = list(combinations(predictors, i))
        formulae = ['{} ~ {}'.format(target_column, ' + '.join(s)) for s in cmbs]
        models = [smf.ols(formula=f, data=data).fit() for f in formulae]
        top_models.append(max(models, key=lambda m: m.rsquared))
        tock = time.time()
        print('{}/{} : {} combinations, {:.2f} seconds'.format(
            i, n_predictors, n_choose_k(len(predictors), i), tock - tick))
    return pd.DataFrame({'model': top_models, 'n_predictors': range(1, n_predictors + 1)})


# In[238]:

# Select between models with different numbers of predictors using adjusted training metrics
best = best_subset(3, 'Salary', hitters_dat)
best['adj_R_sq'] = best['model'].map(lambda m: m.rsquared_adj)
best['bic'] = best['model'].map(lambda m: m.bic)
best['aic'] = best['model'].map(lambda m: m.aic)


# In[239]:

# Plot the selection criteria
_, (ax1, ax2, ax3) = plt.subplots(nrows=3, figsize=(15, 15))
sns.barplot(y='n_predictors', x='adj_R_sq', data=best, ax=ax1, orient='h')
sns.barplot(y='n_predictors', x='bic', data=best, ax=ax2, orient='h')
sns.barplot(y='n_predictors', x='aic', data=best, ax=ax3, orient='h')


# In[249]:

# Select between models with different numbers of predictors using the validation set approach
train, test = model_selection.train_test_split(hitters_dat, test_size=0.2)
best = best_subset(3, 'Salary', train)
best['MSE'] = best['model'].map(lambda m: metrics.mean_squared_error(test['Salary'], m.predict(test)))
best


# In[251]:

# Plot the error metric
sns.barplot(y='n_predictors', x='MSE', data=best, orient='h')


# In[309]:

# Select between models with different numbers of predictors using cross-validation
results = pd.DataFrame()
for train_idx, test_idx in model_selection.KFold(n_splits=10).split(hitters_dat):
    train = hitters_dat.iloc[train_idx]
    test = hitters_dat.iloc[test_idx]
    best = best_subset(3, 'Salary', train)
    mse = best['model'].map(lambda m: metrics.mean_squared_error(test['Salary'], m.predict(test)))
    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead
    results = pd.concat([results, mse.to_frame().T], ignore_index=True)
best['MSE'] = results.mean()
best


# In[310]:

# Plot the error metric
sns.barplot(y='n_predictors', x='MSE', data=best, orient='h')
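
# In[ ]:

# A minimal follow-up sketch, assuming the `best` frame from the
# cross-validation cell above: cross-validation only chooses the subset size,
# so once a size wins, refit best-subset at that size on the full data set.
best_k = int(best.loc[best['MSE'].idxmin(), 'n_predictors'])
final_model = best_subset(best_k, 'Salary', hitters_dat)['model'].iloc[-1]
print(final_model.params)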

# In[311]:

# Forward selection:
# there are only 1 + p(1 + p) / 2 possible models in this search space.

def fwd_selection(target_column, data):
    predictors = data.drop(target_column, axis=1).columns

    # Step 1: the best single-predictor model
    models_s1 = [(p, smf.ols(formula='{} ~ {}'.format(target_column, p), data=data).fit())
                 for p in predictors]
    predictor, model = max(models_s1, key=lambda tup: tup[1].rsquared)
    predictors = predictors.drop(predictor)
    formula = '{} ~ {}'.format(target_column, predictor)
    step_models = [model]

    # Steps 2..p: greedily add whichever remaining predictor most improves R^2
    while len(predictors) > 0:
        models = []
        for p in predictors:
            f = formula + ' + ' + p
            m = smf.ols(formula=f, data=data).fit()
            models.append((p, m))
        predictor, model = max(models, key=lambda tup: tup[1].rsquared)
        step_models.append(model)
        formula = formula + ' + ' + predictor
        predictors = predictors.drop(predictor)

    return pd.DataFrame({'model': step_models, 'n_predictors': range(1, len(step_models) + 1)})


# In[312]:

# Select between models with different numbers of predictors using adjusted training metrics
best = fwd_selection('Salary', hitters_dat)
best['adj_R_sq'] = best['model'].map(lambda m: m.rsquared_adj)
best['bic'] = best['model'].map(lambda m: m.bic)
best['aic'] = best['model'].map(lambda m: m.aic)


# In[313]:

# Plot the selection criteria
_, (ax1, ax2, ax3) = plt.subplots(nrows=3, figsize=(15, 15))
sns.barplot(y='n_predictors', x='adj_R_sq', data=best, ax=ax1, orient='h')
sns.barplot(y='n_predictors', x='bic', data=best, ax=ax2, orient='h')
sns.barplot(y='n_predictors', x='aic', data=best, ax=ax3, orient='h')


# In[314]:

# Select between models with different numbers of predictors using the validation set approach
train, test = model_selection.train_test_split(hitters_dat, test_size=0.2)
best = fwd_selection('Salary', train)
best['MSE'] = best['model'].map(lambda m: metrics.mean_squared_error(test['Salary'], m.predict(test)))
best


# In[315]:

# Plot the error metric
sns.barplot(y='n_predictors', x='MSE', data=best, orient='h')


# In[316]:

# Select between models with different numbers of predictors using cross-validation
results = pd.DataFrame()
for train_idx, test_idx in model_selection.KFold(n_splits=10).split(hitters_dat):
    train = hitters_dat.iloc[train_idx]
    test = hitters_dat.iloc[test_idx]
    best = fwd_selection('Salary', train)
    mse = best['model'].map(lambda m: metrics.mean_squared_error(test['Salary'], m.predict(test)))
    results = pd.concat([results, mse.to_frame().T], ignore_index=True)
best['MSE'] = results.mean()
best


# In[317]:

# Plot the error metric
sns.barplot(y='n_predictors', x='MSE', data=best, orient='h')
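
# In[ ]:

# A minimal sketch of a library alternative, assuming scikit-learn >= 0.24:
# SequentialFeatureSelector runs the same greedy forward search, but scores
# candidate features by cross-validated score rather than training R^2, so its
# picks can differ from fwd_selection above.
from sklearn.feature_selection import SequentialFeatureSelector

X, y = hitters_dat.drop('Salary', axis=1), hitters_dat['Salary']
sfs = SequentialFeatureSelector(linear_model.LinearRegression(),
                                n_features_to_select=3, direction='forward')
sfs.fit(X, y)
print(X.columns[sfs.get_support()])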

# In[334]:

# Backward selection:
# again 1 + p(1 + p) / 2 possible models in the search space.

def bwd_selection(target_column, data):
    predictors = data.drop(target_column, axis=1).columns

    # Step 1: the full model containing every predictor
    formula_s1 = '{} ~ {}'.format(target_column, ' + '.join(predictors))
    step_models = [smf.ols(formula=formula_s1, data=data).fit()]

    # Greedily drop whichever predictor's removal costs the least R^2
    while len(predictors) > 1:
        models = []
        for p in predictors:
            f = '{} ~ {}'.format(target_column, ' + '.join(predictors.drop(p)))
            m = smf.ols(formula=f, data=data).fit()
            models.append((p, m))
        p, m = max(models, key=lambda tup: tup[1].rsquared)
        step_models.append(m)
        predictors = predictors.drop(p)

    return pd.DataFrame({'model': step_models, 'n_predictors': range(len(step_models), 0, -1)})


# In[331]:

# Select between models with different numbers of predictors using adjusted training metrics
best = bwd_selection('Salary', hitters_dat)
best['adj_R_sq'] = best['model'].map(lambda m: m.rsquared_adj)
best['bic'] = best['model'].map(lambda m: m.bic)
best['aic'] = best['model'].map(lambda m: m.aic)


# In[333]:

# Plot the selection criteria
_, (ax1, ax2, ax3) = plt.subplots(nrows=3, figsize=(15, 15))
sns.barplot(y='n_predictors', x='adj_R_sq', data=best, ax=ax1, orient='h')
sns.barplot(y='n_predictors', x='bic', data=best, ax=ax2, orient='h')
sns.barplot(y='n_predictors', x='aic', data=best, ax=ax3, orient='h')


# In[336]:

# Select between models with different numbers of predictors using the validation set approach
train, test = model_selection.train_test_split(hitters_dat, test_size=0.2)
best = bwd_selection('Salary', train)
best['MSE'] = best['model'].map(lambda m: metrics.mean_squared_error(test['Salary'], m.predict(test)))
best


# In[338]:

# Plot the error metric
sns.barplot(y='n_predictors', x='MSE', data=best, orient='h')


# In[339]:

# Select between models with different numbers of predictors using cross-validation
results = pd.DataFrame()
for train_idx, test_idx in model_selection.KFold(n_splits=10).split(hitters_dat):
    train = hitters_dat.iloc[train_idx]
    test = hitters_dat.iloc[test_idx]
    best = bwd_selection('Salary', train)
    mse = best['model'].map(lambda m: metrics.mean_squared_error(test['Salary'], m.predict(test)))
    results = pd.concat([results, mse.to_frame().T], ignore_index=True)
best['MSE'] = results.mean()
best


# In[340]:

# Plot the error metric
sns.barplot(y='n_predictors', x='MSE', data=best, orient='h')


# In[382]:

# Ridge regression - cross-validation over the test MSE.
# The l2 penalty is scale-sensitive, so the standardisation commented out in
# the loading cell matters more here than it does for plain OLS.
results = pd.DataFrame()
alpha_range = np.linspace(10**-3, 3 * 10**2, num=1000)
for train_idx, test_idx in model_selection.KFold(n_splits=10).split(hitters_dat):
    train, test = hitters_dat.iloc[train_idx], hitters_dat.iloc[test_idx]
    errors = []
    for alpha in alpha_range:
        reg = linear_model.Ridge(alpha=alpha)
        reg.fit(train.drop('Salary', axis=1), train['Salary'])
        error = metrics.mean_squared_error(test['Salary'], reg.predict(test.drop('Salary', axis=1)))
        errors.append(error)
    results = pd.concat([results, pd.DataFrame(errors, index=alpha_range).T], axis=0, ignore_index=True)
df = pd.DataFrame({'lambda': alpha_range, 'MSE': results.mean()})
sns.scatterplot(x='lambda', y='MSE', data=df)


# In[384]:

# LASSO regression - cross-validation over the test MSE.
# The l1 penalty is scale-sensitive too; max_iter is raised so the coordinate
# descent converges at small alphas.
results = pd.DataFrame()
alpha_range = np.linspace(10**-2, 10**3, num=500)
for train_idx, test_idx in model_selection.KFold(n_splits=10).split(hitters_dat):
    train, test = hitters_dat.iloc[train_idx], hitters_dat.iloc[test_idx]
    errors = []
    for alpha in alpha_range:
        reg = linear_model.Lasso(alpha=alpha, max_iter=10000)
        reg.fit(train.drop('Salary', axis=1), train['Salary'])
        error = metrics.mean_squared_error(test['Salary'], reg.predict(test.drop('Salary', axis=1)))
        errors.append(error)
    results = pd.concat([results, pd.DataFrame(errors, index=alpha_range).T], axis=0, ignore_index=True)
df = pd.DataFrame({'lambda': alpha_range, 'MSE': results.mean()})
sns.scatterplot(x='lambda', y='MSE', data=df)
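
# In[ ]:

# A minimal sketch of a shortcut, assuming the same data frame as above:
# LassoCV folds the alpha search into the estimator itself, and wrapping it in
# a pipeline with StandardScaler handles the scale sensitivity noted above.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = hitters_dat.drop('Salary', axis=1), hitters_dat['Salary']
lasso_cv = make_pipeline(StandardScaler(), linear_model.LassoCV(cv=10, max_iter=10000))
lasso_cv.fit(X, y)
print(lasso_cv.named_steps['lassocv'].alpha_)  # the alpha chosen by 10-fold CV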

# In[387]:

# PCR - cross-validation over the test MSE
results = pd.DataFrame()
predictors = hitters_dat.drop('Salary', axis=1).columns
c_range = range(1, len(predictors) + 1)
for train_idx, test_idx in model_selection.KFold(n_splits=10).split(hitters_dat):
    train, test = hitters_dat.iloc[train_idx], hitters_dat.iloc[test_idx]
    errors = []
    for c in c_range:
        # fit the PCA rotation on the training fold only, then apply it to the test fold
        pca = decomposition.PCA(n_components=c)
        X = pca.fit_transform(train.drop('Salary', axis=1))
        reg = linear_model.LinearRegression()
        reg.fit(X, train['Salary'])
        test_X = pca.transform(test.drop('Salary', axis=1))
        error = metrics.mean_squared_error(test['Salary'], reg.predict(test_X))
        errors.append(error)
    results = pd.concat([results, pd.DataFrame(errors, index=c_range).T], axis=0, ignore_index=True)
df = pd.DataFrame({'n_components': c_range, 'MSE': results.mean()})
sns.barplot(x='n_components', y='MSE', data=df)


# In[388]:

# PLS regression - cross-validation over the test MSE
results = pd.DataFrame()
predictors = hitters_dat.drop('Salary', axis=1).columns
c_range = range(1, len(predictors) + 1)
for train_idx, test_idx in model_selection.KFold(n_splits=10).split(hitters_dat):
    train, test = hitters_dat.iloc[train_idx], hitters_dat.iloc[test_idx]
    errors = []
    for c in c_range:
        reg = cross_decomposition.PLSRegression(n_components=c)
        reg.fit(train.drop('Salary', axis=1), train['Salary'])
        error = metrics.mean_squared_error(test['Salary'], reg.predict(test.drop('Salary', axis=1)))
        errors.append(error)
    results = pd.concat([results, pd.DataFrame(errors, index=c_range).T], axis=0, ignore_index=True)
df = pd.DataFrame({'n_components': c_range, 'MSE': results.mean()})
sns.barplot(x='n_components', y='MSE', data=df)
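
# In[ ]:

# A closing sketch for reading the PCR curve above: the cumulative share of
# predictor variance captured by each number of components, from a PCA fit on
# all predictors. (PLS has no direct analogue, since its components are built
# using the response as well.)
pca = decomposition.PCA()
pca.fit(hitters_dat.drop('Salary', axis=1))
print(np.cumsum(pca.explained_variance_ratio_))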