# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glmnet as gln
from sklearn.preprocessing import scale
from sklearn import model_selection
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
%matplotlib inline
plt.style.use('seaborn-white')
# In R, I exported the dataset from package 'ISLR' to a csv file.
df = pd.read_csv('Data/Hitters.csv', index_col=0).dropna()
df.index.name = 'Player'
df.info()
df.head()
dummies = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
dummies.info()
print(dummies.head())
y = df.Salary
# Drop the column with the dependent variable (Salary), and columns for which we created dummy variables
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
# Define the feature set X.
X = pd.concat([X_, dummies[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
X.info()
X.head(5)
X_train = pd.read_csv('Data/Hitters_X_train.csv', index_col=0)
y_train = pd.read_csv('Data/Hitters_y_train.csv', index_col=0)
X_test = pd.read_csv('Data/Hitters_X_test.csv', index_col=0)
y_test = pd.read_csv('Data/Hitters_y_test.csv', index_col=0)
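If the exported CSV files are not at hand, a comparable random 50/50 split can also be made directly in Python. This is only a sketch: it will not reproduce the exact split used in the book (test_size and random_state are arbitrary choices), so the variables get an _alt suffix to avoid overwriting the split loaded above.
# Hedged alternative: a random 50/50 train/test split made in Python (will not match the book's split)
X_train_alt, X_test_alt, y_train_alt, y_test_alt = model_selection.train_test_split(
    X, y, test_size=0.5, random_state=1)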
The glmnet algorithms in R optimize the objective function using cyclical coordinate descent, while scikit-learn Ridge regression uses linear least squares with L2 regularization. They are rather different implementations, but the general principles are the same.
The glmnet() function in R optimizes:
$$ \frac{1}{N}\sum_{i=1}^N w_i\, l(y_i, \beta_0 + \beta^T x_i) + \lambda\left[\tfrac{1}{2}(1-\alpha)\|\beta\|_2^2 + \alpha\|\beta\|_1\right] $$
(See R documentation and https://cran.r-project.org/web/packages/glmnet/vignettes/glmnet_beta.pdf)
The function supports L1 and L2 regularization. For just Ridge regression we need to use $\alpha = 0$. This reduces the above cost function to
$$ \frac{1}{N}\sum_{i=1}^N w_i\, l(y_i, \beta_0 + \beta^T x_i) + \frac{\lambda}{2}\|\beta\|_2^2 $$
The sklearn Ridge() function optimizes:
$$ \|X\beta - y\|_2^2 + \alpha\|\beta\|_2^2 $$
which is equivalent to optimizing
$$ \frac{1}{N}\|X\beta - y\|_2^2 + \frac{\alpha}{N}\|\beta\|_2^2 $$
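As a quick numerical check of what Ridge() minimizes, the sketch below (illustrative only; the variable names are made up for this example) compares sklearn's solution to the closed-form ridge solution $(X^TX + \alpha I)^{-1}X^Ty$ on a small random problem, using fit_intercept=False so the two agree exactly.
# Sanity check: sklearn Ridge vs. the closed-form solution of ||y - Xb||^2 + alpha*||b||^2
rng = np.random.RandomState(0)
Xs, ys = rng.randn(50, 5), rng.randn(50)
alpha_check = 2.0
ridge_check = Ridge(alpha=alpha_check, fit_intercept=False).fit(Xs, ys)
closed_form = np.linalg.solve(Xs.T @ Xs + alpha_check * np.eye(5), Xs.T @ ys)
print(np.allclose(ridge_check.coef_, closed_form))  # expected: True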
alphas = 10**np.linspace(10,-2,100)*0.5
ridge = Ridge()
coefs = []
for a in alphas:
    ridge.set_params(alpha=a)
    ridge.fit(scale(X), y)
    coefs.append(ridge.coef_)
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.axis('tight')
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization');
The above plot shows that the Ridge coefficients get larger when we decrease alpha.
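A quick numerical confirmation: since alphas is in decreasing order, coefs[0] belongs to the largest alpha and coefs[-1] to the smallest, and the L2 norm of the coefficient vector grows accordingly.
# Coefficient norms at the two ends of the (decreasing) alpha grid
print('largest alpha :', np.linalg.norm(coefs[0]))
print('smallest alpha:', np.linalg.norm(coefs[-1]))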
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
# glmnet's lambda of 11498 converted to sklearn's alpha scale (alpha = n * lambda / 2, following the objectives above)
ridge2 = Ridge(alpha=len(X_)*11498/2)
ridge2.fit(scaler.transform(X_train), y_train)
pred = ridge2.predict(scaler.transform(X_test))
mean_squared_error(y_test, pred)
pd.Series(ridge2.coef_.flatten(), index=X.columns)
This big penalty shrinks the coefficients to a very large degree and makes the model more biased, resulting in a higher MSE.
ridge2.set_params(alpha=10**10)
ridge2.fit(scale(X_train), y_train)
pred = ridge2.predict(scale(X_test))
mean_squared_error(y_test, pred)
ridgecv = RidgeCV(alphas=alphas, scoring='neg_mean_squared_error')
ridgecv.fit(scale(X_train), y_train)
ridgecv.alpha_
ridge2.set_params(alpha=ridgecv.alpha_)
ridge2.fit(scale(X_train), y_train)
mean_squared_error(y_test, ridge2.predict(scale(X_test)))
pd.Series(ridge2.coef_.flatten(), index=X.columns)
This relatively new module is a wrapper for the Fortran library used in the R package glmnet. It gives mostly the exact same results as described in the book. However, the predict() method does not give you the regression coefficients for lambda values that are not in the lambda_path; it only returns the predicted values.
https://github.com/civisanalytics/python-glmnet
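If coefficients at a lambda value that is not on the path are needed, one possible workaround (a hypothetical helper, not part of the library's API) is to interpolate coef_path_ in log(lambda):
# Hypothetical helper: interpolate the fitted coefficient path at an arbitrary lambda value
def coefs_at_lambda(model, lam):
    # lambda_path_ is stored in decreasing order; np.interp needs increasing x, so reverse
    log_path = np.log(model.lambda_path_)[::-1]
    return np.array([np.interp(np.log(lam), log_path, row[::-1])
                     for row in model.coef_path_])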
grid = 10**np.linspace(10,-2,100)
ridge3 = gln.ElasticNet(alpha=0, lambda_path=grid)
ridge3.fit(X, y)
ridge3.lambda_path_[49]
print('Intercept: {:.3f}'.format(ridge3.intercept_path_[49]))
pd.Series(np.round(ridge3.coef_path_[:,49], decimals=3), index=X.columns)
np.sqrt(np.sum(ridge3.coef_path_[:,49]**2))
ridge3.lambda_path_[59]
print('Intercept: {:.3f}'.format(ridge3.intercept_path_[59]))
pd.Series(np.round(ridge3.coef_path_[:,59], decimals=3), index=X.columns)
np.sqrt(np.sum(ridge3.coef_path_[:,59]**2))
ridge4 = gln.ElasticNet(alpha=0, lambda_path=grid, scoring='mean_squared_error', tol=1e-12)
ridge4.fit(X_train, y_train.values.ravel())
# prediction using lambda = 4
pred = ridge4.predict(X_test, lamb=4)
mean_squared_error(y_test.values.ravel(), pred)
ridge5 = gln.ElasticNet(alpha=0, scoring='mean_squared_error')
ridge5.fit(X_train, y_train.values.ravel())
# Lambda with best CV performance
ridge5.lambda_max_
# Lambda larger than lambda_max_, but with a CV score within one standard error of the best score (the one-standard-error rule)
ridge5.lambda_best_
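The sketch below shows how lambda_best_ can be recovered from the reported CV scores, assuming it follows glmnet's one-standard-error rule (the largest lambda whose mean CV score is within one standard error of the best score); this is an illustration, not the library's actual code.
# Reconstructing the one-standard-error rule from the CV output (sketch)
scores = ridge5.cv_mean_score_                   # mean CV score per lambda (higher is better)
cutoff = scores.max() - ridge5.cv_standard_error_[np.argmax(scores)]
ose_idx = np.argmax(scores >= cutoff)            # first lambda on the (decreasing) path above the cutoff
print(ridge5.lambda_path_[ose_idx], ridge5.lambda_best_)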
plt.figure(figsize=(15,6))
plt.errorbar(np.log(ridge5.lambda_path_), -ridge5.cv_mean_score_, color='r', linestyle='None', marker='o',
markersize=5, yerr=ridge5.cv_standard_error_, ecolor='lightgrey', capsize=4)
for ref, txt in zip([ridge5.lambda_best_, ridge5.lambda_max_], ['Lambda best', 'Lambda max']):
    plt.axvline(x=np.log(ref), linestyle='dashed', color='lightgrey')
    plt.text(np.log(ref), .95*plt.gca().get_ylim()[1], txt, ha='center')
plt.xlabel('log(Lambda)')
plt.ylabel('Mean-Squared Error');
# MSE for lambda with best CV performance
pred = ridge5.predict(X_test, lamb=ridge5.lambda_max_)
mean_squared_error(y_test, pred)
ridge6 = gln.ElasticNet(alpha=0, scoring='mean_squared_error', n_splits=10)
ridge6.fit(X, y)
# These are not really close to the ones in the book.
pd.Series(ridge6.coef_path_[:,ridge6.lambda_max_inx_], index=X.columns)
For both glmnet in R and the sklearn Lasso() function, the standard L1 penalty is:
$$ \lambda\,\|\beta\|_1 $$
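This penalty is what drives some coefficients exactly to zero: in coordinate descent, each coefficient update is a soft-thresholding operation, sketched below for a single standardized predictor (a schematic illustration, not the actual solver code).
# Soft-thresholding: small coefficients are set exactly to zero, larger ones are shrunk
def soft_threshold(rho, penalty):
    return np.sign(rho) * max(abs(rho) - penalty, 0.0)
print(soft_threshold(0.3, 0.5))   # 0.0 -> coefficient dropped from the model
print(soft_threshold(1.3, 0.5))   # 0.8 -> coefficient shrunk towards zero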
lasso = Lasso(max_iter=10000)
coefs = []
for a in alphas*2:
    lasso.set_params(alpha=a)
    lasso.fit(scale(X_train), y_train.values.ravel())
    coefs.append(lasso.coef_)
ax = plt.gca()
ax.plot(alphas*2, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.axis('tight')
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Lasso coefficients as a function of the regularization');
lassocv = LassoCV(alphas=None, cv=10, max_iter=10000)
lassocv.fit(scale(X_train), y_train.values.ravel())
lassocv.alpha_
lasso.set_params(alpha=lassocv.alpha_)
lasso.fit(scale(X_train), y_train.values.ravel())
mean_squared_error(y_test, lasso.predict(scale(X_test)))
# Some of the coefficients are now reduced to exactly zero.
pd.Series(lasso.coef_, index=X.columns)
lasso2 = gln.ElasticNet(alpha=1, lambda_path=grid, scoring='mean_squared_error', n_splits=10)
lasso2.fit(X_train, y_train.values.ravel())
l1_norm = np.sum(np.abs(lasso2.coef_path_), axis=0)
plt.figure(figsize=(10,6))
plt.plot(l1_norm, lasso2.coef_path_.T)
plt.xlabel('L1 norm')
plt.ylabel('Coefficients');
lasso3 = gln.ElasticNet(alpha=1, scoring='mean_squared_error', n_splits=10)
lasso3.fit(X_train, y_train.values.ravel())
plt.figure(figsize=(15,6))
plt.errorbar(np.log(lasso3.lambda_path_), -lasso3.cv_mean_score_, color='r', linestyle='None', marker='o',
markersize=5, yerr=lasso3.cv_standard_error_, ecolor='lightgrey', capsize=4)
for ref, txt in zip([lasso3.lambda_best_, lasso3.lambda_max_], ['Lambda best', 'Lambda max']):
    plt.axvline(x=np.log(ref), linestyle='dashed', color='lightgrey')
    plt.text(np.log(ref), .95*plt.gca().get_ylim()[1], txt, ha='center')
plt.xlabel('log(Lambda)')
plt.ylabel('Mean-Squared Error');
pred = lasso3.predict(X_test, lamb=lasso3.lambda_max_)
mean_squared_error(y_test, pred)
lasso4 = gln.ElasticNet(alpha=1, lambda_path=grid, scoring='mean_squared_error', n_splits=10)
lasso4.fit(X, y)
# These are not really close to the ones in the book.
pd.Series(lasso4.coef_path_[:,lasso4.lambda_max_inx_], index=X.columns)
Scikit-learn does not have an implementation that combines PCA and regression the way the 'pls' package in R does. https://cran.r-project.org/web/packages/pls/vignettes/pls-manual.pdf
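One way to emulate PCR in scikit-learn is to chain the steps manually in a Pipeline (a sketch; the number of components is an arbitrary choice here, and the lab below instead applies PCA and LinearRegression separately).
# Hedged PCR sketch: standardize, project onto the first M principal components, then regress
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
pcr = Pipeline([('scale', StandardScaler()),
                ('pca', PCA(n_components=7)),
                ('regr', LinearRegression())])
pcr.fit(X_train, y_train)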
pca = PCA()
X_reduced = pca.fit_transform(scale(X))
print(pca.components_.shape)
pd.DataFrame(pca.components_.T).loc[:4,:5]
The above loadings are the same as in R.
print(X_reduced.shape)
pd.DataFrame(X_reduced).loc[:4,:5]
The above principal components are the same as in R.
# Variance explained by the principal components
np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)
# 10-fold CV, with shuffle
n = len(X_reduced)
kf_10 = KFold(n_splits=10, shuffle=True, random_state=1)
regr = LinearRegression()
mse = []
# Calculate MSE with only the intercept (no principal components in regression)
score = -1*cross_val_score(regr, np.ones((n,1)), y.ravel(), cv=kf_10, scoring='neg_mean_squared_error').mean()
mse.append(score)
# Calculate MSE using CV for the 19 principal components, adding one component at a time.
for i in np.arange(1, 20):
    score = -1*cross_val_score(regr, X_reduced[:,:i], y.ravel(), cv=kf_10, scoring='neg_mean_squared_error').mean()
    mse.append(score)
plt.plot(mse, '-v')
plt.xlabel('Number of principal components in regression')
plt.ylabel('MSE')
plt.title('Salary')
plt.xlim(left=-1);
The above plot indicates that the lowest cross-validation MSE is reached when doing regression on 18 components.
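Rather than reading the minimum off the plot, it can be located directly; position 0 of mse is the intercept-only model, so the index equals the number of components.
# Index of the smallest CV MSE (0 = intercept-only model)
print(np.argmin(mse))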
regr_test = LinearRegression()
regr_test.fit(X_reduced, y)
regr_test.coef_
pca2 = PCA()
X_reduced_train = pca2.fit_transform(scale(X_train))
n = len(X_reduced_train)
# 10-fold CV, with shuffle
kf_10 = KFold(n_splits=10, shuffle=True, random_state=1)
mse = []
# Calculate MSE with only the intercept (no principal components in regression)
score = -1*cross_val_score(regr, np.ones((n,1)), y_train, cv=kf_10, scoring='neg_mean_squared_error').mean()
mse.append(score)
# Calculate MSE using CV for the 19 principal components, adding one component at a time.
for i in np.arange(1, 20):
    score = -1*cross_val_score(regr, X_reduced_train[:,:i], y_train, cv=kf_10, scoring='neg_mean_squared_error').mean()
    mse.append(score)
plt.plot(np.array(mse), '-v')
plt.xlabel('Number of principal components in regression')
plt.ylabel('MSE')
plt.title('Salary')
plt.xlim(left=-1);
The above plot indicates that the lowest cross-validation MSE is reached when doing regression on 6 components.
X_reduced_test = pca2.transform(scale(X_test))[:,:7]
# Train regression model on training data
regr = LinearRegression()
regr.fit(X_reduced_train[:,:7], y_train)
# Prediction with test data
pred = regr.predict(X_reduced_test)
mean_squared_error(y_test, pred)
Scikit-learn's PLSRegression gives the same results as the pls package in R when using method='oscorespls'. In the lab exercise, the standard method is used, which is 'kernelpls'.
When doing this slightly different fit in R (shown below), the result is close to the one obtained using scikit-learn.
pls.fit=plsr(Salary~., data=Hitters, subset=train, scale=TRUE, validation="CV", method='oscorespls')
validationplot(pls.fit,val.type="MSEP", intercept = FALSE)
See documentation: http://scikit-learn.org/dev/modules/generated/sklearn.cross_decomposition.PLSRegression.html#sklearn.cross_decomposition.PLSRegression
n = len(X_train)
# 10-fold CV, with shuffle
kf_10 = KFold(n_splits=10, shuffle=True, random_state=1)
mse = []
for i in np.arange(1, 20):
    pls = PLSRegression(n_components=i)
    score = cross_val_score(pls, scale(X_train), y_train, cv=kf_10, scoring='neg_mean_squared_error').mean()
    mse.append(-score)
plt.plot(np.arange(1, 20), np.array(mse), '-v')
plt.xlabel('Number of PLS components in regression')
plt.ylabel('MSE')
plt.title('Salary')
plt.xlim(left=-1);
pls = PLSRegression(n_components=2)
pls.fit(scale(X_train), y_train)
mean_squared_error(y_test, pls.predict(scale(X_test)))