# Standard libraries
from itertools import combinations
# Use rpy2 for loading R datasets
from rpy2.robjects.packages import importr
from rpy2.robjects.packages import data as rdata
from rpy2.robjects import pandas2ri
# Math and data processing
import numpy as np
import scipy as sp
import pandas as pd
# StatsModels
import statsmodels.api as sm
import statsmodels.formula.api as smf
# scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_val_score
from sklearn.preprocessing import scale
from sklearn.metrics import mean_squared_error
# Visualization
from IPython.display import display
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
mpl.style.use('ggplot')
# Hitters dataset is in R ISLR package
islr = importr('ISLR')
hitters_rdf = rdata(islr).fetch('Hitters')['Hitters']
hitters = pandas2ri.ri2py(hitters_rdf)
display(hitters.head(5))
print(hitters.info())
print(hitters.Salary.isnull().sum())
hitters.dropna(axis=0, inplace=True)
print(hitters.shape)
print(hitters.Salary.isnull().sum())
# Score function producing a linear regression fit and RSS for a feature set
def score_rss(data, feature_set, endog_name):
    """
    Score function based on RSS (residual sum of squares); returns
    (fitted model, negative RSS) so that higher scores are better.
    """
    # Use the statsmodels formula interface to avoid generating dummies manually
    reg = smf.ols(endog_name + ' ~ ' + '+'.join(feature_set), data).fit()
    rss = reg.mse_resid * reg.df_resid
    return reg, (-1) * rss
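As a quick sanity check (a minimal sketch; 'Hits' and 'Years' are just two arbitrary Hitters columns), mse_resid * df_resid should recover the ordinary residual sum of squares, which statsmodels also exposes directly as the ssr attribute:
# Sanity check (sketch): -score equals the sum of squared residuals,
# which statsmodels also reports directly as reg.ssr.
reg_check, neg_rss = score_rss(hitters, ['Hits', 'Years'], 'Salary')
assert np.isclose(-neg_rss, reg_check.ssr)
assert np.isclose(-neg_rss, (reg_check.resid ** 2).sum())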
# For a given number of features, select the best combination by score function
def select_k_best(data, k, endog_name, score_function):
    """
    Select the best k features out of all possible k-feature combinations.
    Not the same as sklearn.feature_selection.SelectKBest.
    data: data frame.
    k: int, number of features to select.
    endog_name: response name.
    score_function: a callable with parameters (data, feature_set, endog_name)
        returning a (model, score) pair.
    """
    features = data.columns.drop(endog_name)
    # Iterate over all feature combinations and compute scores
    models = []
    scores = []
    for feature_subset in combinations(features, k):
        model, score = score_function(data, feature_subset, endog_name)
        models.append(model)
        scores.append(score)
    # Return the best model, i.e. the one with the highest score
    return models[scores.index(max(scores))]
# Define features and response
response = 'Salary'
features = hitters.columns.drop([response])
# Find the best models with 1 to p features
print("\nSearch the best k features for k = 1 to 8, based on RSS.\nTaking long time...\n")
best_models = []
for k in range(1, 8+1):
best_model = select_k_best(hitters, k, response, score_rss)
best_models.append(best_model)
best_features = best_model.model.exog_names
best_features.remove('Intercept')
print("Best {} features: {}".format(k, best_features))
# Forward stepwise selection
def forward_select(data, endog_name, score_function):
    """
    Select features by the forward stepwise selection algorithm.
    data: data frame.
    endog_name: response name.
    score_function: a callable with parameters (data, feature_set, endog_name)
        returning a (model, score) pair.
    """
    features = data.columns.drop(endog_name)
    p = len(features)
    # Grow the model one feature at a time, from 1 to p
    selected_features = []  # start from the null model
    for j in range(1, p+1):
        feature_candidates = [xj for xj in features if xj not in selected_features]
        models = []
        scores = []
        # Loop over the remaining features at the j-th step
        for xj in feature_candidates:
            feature_subset = selected_features + [xj]
            model, score = score_function(data, feature_subset, endog_name)
            models.append(model)
            scores.append(score)
        # Add the best feature at the j-th step
        best_feature = feature_candidates[scores.index(max(scores))]
        selected_features += [best_feature]
        print("Best {} features: {}".format(j, selected_features))
# Define features and response
response = 'Salary'
features = hitters.columns.drop([response])
# Run forward stepwise selection on Hitters
print("\nForward stepwise select features based on RSS:\n")
forward_select(hitters, response, score_rss)
# Backward stepwise selection
def backward_select(data, endog_name, score_function):
    """
    Select features by the backward stepwise selection algorithm.
    data: data frame.
    endog_name: response name.
    score_function: a callable with parameters (data, feature_set, endog_name)
        returning a (model, score) pair.
    """
    features = list(data.columns.drop(endog_name))
    p = len(features)
    # Shrink the model one feature at a time, from p down to 1
    selected_features = features.copy()  # start from the full model
    for j in range(p, 0, -1):
        print("Best {} features: {}".format(j, selected_features))
        if j == 1:
            break
        feature_candidates = list(combinations(selected_features, j-1))
        models = []
        scores = []
        # Loop over the candidate (j-1)-feature subsets at the j-th step
        for feature_subset in feature_candidates:
            model, score = score_function(data, feature_subset, endog_name)
            models.append(model)
            scores.append(score)
        # Keep the best (j-1)-feature subset
        selected_features = list(feature_candidates[scores.index(max(scores))])
# Define features and response
response = 'Salary'
features = hitters.columns.drop([response])
# Run backward stepwise selection on Hitters
print("\nBackward stepwise select features based on RSS:\n")
backward_select(hitters, response, score_rss)
# Validation-set score function and forward selection
def score_rss_valset(data_train, data_test, feature_set, endog_name):
    """
    Score function based on RSS (residual sum of squares) computed on a
    held-out validation set; returns (fitted model, negative RSS).
    """
    # Use the statsmodels formula interface to avoid generating dummies manually
    reg = smf.ols(endog_name + ' ~ ' + '+'.join(feature_set), data_train).fit()
    y_pred = reg.predict(data_test[feature_set])
    y_test = data_test[endog_name].values
    rss = ((y_pred - y_test) ** 2).sum()
    return reg, (-1) * rss
def forward_select_valset(data_train, data_test, endog_name, score_function):
    """
    Select features by forward stepwise selection, scored on a validation set.
    data_train, data_test: data frames.
    endog_name: response name.
    score_function: a callable with parameters
        (data_train, data_test, feature_set, endog_name)
        returning a (model, score) pair.
    """
    features = data_train.columns.drop(endog_name)
    p = len(features)
    high_scores = []
    best_features = []
    # Grow the model one feature at a time, from 1 to p
    selected_features = []  # start from the null model
    for j in range(1, p+1):
        feature_candidates = [xj for xj in features if xj not in selected_features]
        models = []
        scores = []
        # Loop over the remaining features at the j-th step
        for xj in feature_candidates:
            feature_subset = selected_features + [xj]
            model, score = score_function(data_train, data_test, feature_subset, endog_name)
            models.append(model)
            scores.append(score)
        # Add the best feature at the j-th step
        highest_score = max(scores)
        high_scores.append(highest_score)
        best_feature = feature_candidates[scores.index(highest_score)]
        selected_features += [best_feature]
        # Store a copy: appending the list itself would alias every entry
        best_features.append(selected_features.copy())
    scores_df = pd.DataFrame(high_scores, index=range(1, p+1), columns=[''])
    scores_df.index.name = '# of x_j'
    best_score = max(high_scores)
    best_j = high_scores.index(best_score) + 1
    return scores_df, best_j, best_score, best_features
# Train-test split. Test set size is random with prob=0.5, as in the book.
np.random.seed(seed=51)
train_mask = np.random.choice([True, False], size=hitters.shape[0], replace=True)
test_mask = np.invert(train_mask)
print("Train set size = ", train_mask.sum())
print("Test set size = ", test_mask.sum())
Now run forward stepwise selection instead of the best subset selection used in the book, to save time: for p = 19 predictors it fits only p + (p-1) + ... + 1 = 190 models, versus the 2^19 - 1 = 524,287 models of an exhaustive search.
# Define features and response
response = 'Salary'
features = hitters.columns.drop([response])
# Run forward stepwise selection on Hitters
print("\nForward stepwise select features on training set, based on RSS. \n")
scores_valset, best_j_valset, best_score, best_features_valset = forward_select_valset(hitters[train_mask], hitters[test_mask], response, score_rss_valset)
print("Mean RSS over numbers of features:")
mean_rss_valset = scores_valset * (-1)
display(mean_rss_valset)
print("\nThe best model is the one having {} features:\n{}".format(best_j_valset, best_features_valset[best_j_valset-1]))
Finally, run forward stepwise selection on the full data set and pick out the best model with the number of variables chosen by the validation set approach above.
# Run forward stepwise selection on the full Hitters data set
print("\nForward stepwise select features on full data set, based on RSS. \n")
_, _, _, best_features_full = forward_select_valset(hitters, hitters, response, score_rss_valset)
print("The best {} features are:\n{}".format(best_j_valset, best_features_full[best_j_valset-1]))
# 10-fold CV splits (shuffle so random_state takes effect)
kf = KFold(n_splits=10, shuffle=True, random_state=55)
# Forward stepwise selection on each split
scores_cv_list = []
for train_index, test_index in kf.split(hitters):
    scores_df, _, _, _ = forward_select_valset(hitters.iloc[train_index], hitters.iloc[test_index], response, score_rss_valset)
    scores_cv_list.append(scores_df)
# Combine the scores of 10 folds
scores_cv = pd.concat(scores_cv_list, axis=1)
scores_cv.columns = range(1, 11)
scores_cv.columns.name = 'K-Fold'
# Compute mean RSS of 10 folds for each j = 1..p
mean_rss_cv = scores_cv.mean(axis=1) * (-1)
print(mean_rss_cv)
# Plots to compare validation set and cross-validation
fig = plt.figure(figsize=(10, 6))
ax1 = plt.subplot(121)
plt.plot(range(1, len(features)+1), rss_valset)
plt.title('Validation Set Approach')
plt.xlabel('Number of Predictors')
plt.ylabel('Test RSS of Best Model')
ax2 = plt.subplot(122)
plt.plot(range(1, len(features)+1), mean_rss_cv)
plt.title('Cross-Validation Approach')
plt.xlabel('Number of Predictors')
plt.ylabel('Mean Test RSS of Best Model')
plt.show()
# Generate dummy variables for qualitative variables
qual_vars = ['League', 'Division', 'NewLeague']
hitters_dummies = pd.get_dummies(hitters[qual_vars])
hitters_dummies.info()
# Define X, y features and response data for scikit-learn
dummy_vars = ['League_N', 'Division_W', 'NewLeague_N']
response = 'Salary'
y = hitters[response]
# Drop response and qualitative variables, and combine with dummy data frame
X = pd.concat([hitters.drop(qual_vars + [response], axis=1), hitters_dummies[dummy_vars]], axis=1)
features = X.columns  # coefficient labels must match X's columns, dummies included
X.info()
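As a side note, pd.get_dummies can do the drop-and-concat in one call; with drop_first=True it keeps one dummy per binary factor, which should match the League_N, Division_W, and NewLeague_N columns chosen above (a sketch, not the lab's original code):
# Roughly equivalent one-liner (sketch): one-hot encode all remaining
# qualitative columns, keeping one dummy per binary factor.
X_alt = pd.get_dummies(hitters.drop(columns=response), drop_first=True)
print(sorted(X_alt.columns) == sorted(X.columns))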
# Ridge regression on the full dataset, over 100 alphas from 10^10 down to 10^-2
alphas = 10**np.linspace(10, -2, 100)
ridge = Ridge()
coefs = []
for alpha in alphas:
    ridge.set_params(alpha=alpha*0.5)  # alpha/2 to align with R
    ridge.fit(scale(X), y)
    coefs.append(ridge.coef_)
# Plot
fig, ax = plt.subplots(figsize=(8,6))
plt.plot(alphas, coefs)
ax.set_xscale('log')
plt.xlim(1e10, 1e-2) # reverse axis
plt.xlabel('Regularization Parameter')
plt.ylabel('Coefficients')
plt.title('Ridge regression coefficients vs. regularization parameter.')
plt.show()
# Split Hitters data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)
Fit on training set with alpha = 4
ridge_a4 = Ridge(alpha=4, normalize=True)
ridge_a4.fit(X_train, y_train)
y_pred_a4 = ridge_a4.predict(X_test)
print('\nRidge regression coefficients:\n', pd.Series(ridge_a4.coef_, index=features))
print('\nMSE = ', mean_squared_error(y_test, y_pred_a4))
Fit on training set with alpha = 10^10
ridge_a1e10 = Ridge(alpha=10**10, normalize=True)
ridge_a1e10.fit(X_train, y_train)
y_pred_a1e10 = ridge_a1e10.predict(X_test)
print('\nRidge regression coefficients:\n', pd.Series(ridge_a1e10.coef_, index=features))
print('\nMSE = ', mean_squared_error(y_test, y_pred_a1e10))
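As alpha grows, ridge shrinks every coefficient toward zero, so this fit should behave much like the intercept-only null model; a quick comparison against predicting the training mean (a sketch):
# Null-model baseline: predict the mean training salary for every test case.
y_null = np.full(len(y_test), y_train.mean())
print('Null model MSE = ', mean_squared_error(y_test, y_null))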
Ridge regularization with cross-validation
ridgecv = RidgeCV(alphas=alphas*0.5, scoring='neg_mean_squared_error', normalize=True)
ridgecv.fit(X_train, y_train)
print("The best ridge regularization Alpha = ", ridgecv.alpha_)
ridge_cv = Ridge(alpha=ridgecv.alpha_, normalize=True)
ridge_cv.fit(X_train, y_train)
mse = mean_squared_error(y_test, ridge_cv.predict(X_test))
print("MSE = ", mse)
Ridge regression on full data set with CV-optimized alpha
ridge_cv.fit(X, y)
print(pd.Series(ridge_cv.coef_, index=features))
As expected, none of the coefficients are exactly zero - ridge regression does not perform variable selection!
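A one-line check of that claim (a sketch counting exactly-zero entries):
# Ridge shrinks coefficients but never zeroes them out exactly.
print('Exactly-zero coefficients:', (ridge_cv.coef_ == 0).sum())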
Lasso regression on the training set, over 100 alphas from 10^10 down to 10^-2.
lasso = Lasso(max_iter=10000, normalize=True)
coefs = []
for alpha in alphas:
    lasso.set_params(alpha=alpha)
    lasso.fit(scale(X_train), y_train)
    coefs.append(lasso.coef_)
# Plot
fig, ax = plt.subplots(figsize=(8, 6))
plt.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])
plt.xlabel('Regularization Parameter')
plt.ylabel('Regression Coefficients')
plt.title('Lasso regression coefficients vs. regularization parameter.')
plt.show()
Lasso regularization with cross-validation
lassocv = LassoCV(alphas=None, cv=10, max_iter=100000, normalize=True)
lassocv.fit(X_train, y_train)
print("The best Lasso regularization Alpha = ", ridgecv.alpha_)
lasso_cv = Lasso(max_iter=10000, normalize=True, alpha=lassocv.alpha_)
lasso_cv.fit(X_train, y_train)
mse = mean_squared_error(y_test, lasso_cv.predict(X_test))
print("MSE = ", mse)
pd.Series(lasso_cv.coef_, index=features)
Some coefficients are set to exactly zero by the lasso.
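The same check as for ridge makes the contrast explicit (a sketch):
# Unlike ridge, the lasso drives some coefficients to exactly zero,
# performing variable selection.
n_zero = (lasso_cv.coef_ == 0).sum()
print('{} of {} coefficients are exactly zero'.format(n_zero, len(lasso_cv.coef_)))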
pca = PCA()
X_reduced = pca.fit_transform(scale(X))
print("The sahpe of loading matrix:", pca.components_.shape)
print("\nHead of loading matrix:")
display(pd.DataFrame(pca.components_.T).loc[:4,:5])
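The rows of the loading matrix are the principal directions, which should be orthonormal; a quick check (a sketch using numpy's @ matrix product):
# The principal directions (rows of components_) form an orthonormal set.
k = pca.n_components_
print(np.allclose(pca.components_ @ pca.components_.T, np.eye(k)))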
# PCR with number of PCs from 0 to 19, with 10-fold CV
# For 0 PC case, regression on intercept only.
kf = KFold(n_splits=10, shuffle=True, random_state=1)
regr = LinearRegression()
mse = []
for n_pc in range(0, pca.n_components_ + 1):
    if n_pc == 0:
        X_regr = np.ones((len(y), 1))
    else:
        X_regr = X_reduced[:, :n_pc]
    scores = cross_val_score(regr, X_regr, y, cv=kf, scoring='neg_mean_squared_error')
    mse.append(scores.mean() * (-1))
# Find the n_pc with lowest MSE, or highest CV score
min_mse = min(mse)
min_mse_idx = mse.index(min_mse)
# Plot MSE vs. number of PCs
fig, ax = plt.subplots(figsize=(8, 6))
plt.plot(mse)
plt.xticks(range(20), range(20))
min_mse_marker, = plt.plot(min_mse_idx, min_mse, 'b*', markersize=15)
plt.xlabel('Number of Principal Components')
plt.ylabel('MSE')
plt.title('Principal Component Regression with 10-Fold Cross-Validation')
plt.legend([min_mse_marker], ['Best number of principal components'])
plt.show()
From the plot we also see that the cross-validation error is roughly the same when only one component is included in the model. This suggests that a model that uses just a small number of components might suffice.
evr = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)
print("Explained Variance Ratio:")
display(pd.Series([str(p) + ' %' for p in evr]))
Fit PCR on the training data and evaluate its test-set performance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)
X_train_reduced = pca.fit_transform(scale(X_train))
# PCR on training set with number of PCs from 0 to 19, with 10-fold CV
# For 0 PC case, regression on intercept only.
kf = KFold(n_splits=10, shuffle=True, random_state=1)
regr = LinearRegression()
mse = []
for n_pc in range(0, pca.n_components_ + 1):
    if n_pc == 0:
        X_regr = np.ones((len(y_train), 1))
    else:
        X_regr = X_train_reduced[:, :n_pc]
    scores = cross_val_score(regr, X_regr, y_train, cv=kf, scoring='neg_mean_squared_error')
    mse.append(scores.mean() * (-1))
# Find the n_pc with lowest MSE, or highest CV score
min_mse = min(mse)
min_mse_idx = mse.index(min_mse)
# Plot MSE vs. number of PCs
fig, ax = plt.subplots(figsize=(8, 6))
plt.plot(mse)
plt.xticks(range(20), range(20))
min_mse_marker, = plt.plot(min_mse_idx, min_mse, 'b*', markersize=15)
plt.xlabel('Number of Principal Components')
plt.ylabel('MSE')
plt.title('Principal Component Regression on Training Set with 10-Fold Cross-Validation')
plt.legend([min_mse_marker], ['Best number of principal components'])
plt.show()
The lowest cross-validation error occurs when seven components are used (the code below keeps the first seven principal components). Next, evaluate this PCR fit on the test set and compute the test MSE.
X_test_reduced = pca.transform(scale(X_test))[:,:7]
# Train regression model on training data
regr = LinearRegression()
regr.fit(X_train_reduced[:,:7], y_train)
# Prediction with test data
y_pred = regr.predict(X_test_reduced)
print("Test set MSE = ", mean_squared_error(y_test, y_pred))
kf = KFold(n_splits=10, shuffle=True, random_state=1)
mse = []
# PLS with 1 to 19 directions, scored by 10-fold CV on the training set
for i in range(1, 20):
    pls = PLSRegression(n_components=i)
    scores = cross_val_score(pls, scale(X_train), y_train, cv=kf, scoring='neg_mean_squared_error')
    mse.append(scores.mean() * (-1))
# Find the number of PLS directions with lowest MSE, or highest CV score
min_mse = min(mse)
min_mse_idx = mse.index(min_mse) + 1
# Plot MSE vs. number of PLS directions
fig, ax = plt.subplots(figsize=(8, 6))
plt.plot(range(1, 20), mse)
plt.xticks(range(20), range(20))
min_mse_marker, = plt.plot(min_mse_idx, min_mse, 'b*', markersize=15)
plt.xlabel('Number of PLS directions')
plt.ylabel('MSE')
plt.title('PLS on Training Set with 10-Fold Cross-Validation')
plt.legend([min_mse_marker], ['Best number of PLS directions'])
plt.show()
The lowest cross-validation MSE occurs with 2 PLS directions, which we now use to evaluate performance on the test set.
pls = PLSRegression(n_components=2)
pls.fit(scale(X_train), y_train)
print("Test set MSE = ", mean_squared_error(y_test, pls.predict(scale(X_test))))
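A design note: sklearn's PLSRegression standardizes internally by default (its scale parameter defaults to True), so the explicit scale() calls are largely redundant; a roughly equivalent, slightly leaner sketch:
# Roughly equivalent (sketch): rely on PLSRegression's built-in scaling
# (scale=True by default) instead of pre-scaling the inputs.
pls2 = PLSRegression(n_components=2, scale=True)
pls2.fit(X_train, y_train)
print("Test set MSE = ", mean_squared_error(y_test, pls2.predict(X_test)))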