%load_ext watermark
%watermark -d -u -a 'Andreas Mueller, Kyle Kastner, Sebastian Raschka' -v -p numpy,scipy,matplotlib,scikit-learn
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
Linear models are useful when little data is available or for very large feature spaces as in text classification. In addition, they form a good case study for regularization.
All linear models for regression learn a coefficient parameter coef_
and an offset intercept_
to make predictions using a linear combination of features:
y_pred = x_test[0] * coef_[0] + ... + x_test[n_features-1] * coef_[n_features-1] + intercept_
The difference between the linear models for regression is what kind of restrictions or penalties are put on coef_
as regularization , in addition to fitting the training data well.
The most standard linear model is the 'ordinary least squares regression', often simply called 'linear regression'. It doesn't put any additional restrictions on coef_
, so when the number of features is large, it becomes ill-posed and the model overfits.
Let us generate a simple simulation, to see the behavior of these models.
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
X, y, true_coefficient = make_regression(n_samples=200, n_features=30, n_informative=10, noise=100, coef=True, random_state=5)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5, train_size=60)
print(X_train.shape)
print(y_train.shape)
from sklearn.linear_model import LinearRegression
linear_regression = LinearRegression().fit(X_train, y_train)
print("R^2 on training set: %f" % linear_regression.score(X_train, y_train))
print("R^2 on test set: %f" % linear_regression.score(X_test, y_test))
from sklearn.metrics import r2_score
print(r2_score(np.dot(X, true_coefficient), y))
plt.figure(figsize=(10, 5))
coefficient_sorting = np.argsort(true_coefficient)[::-1]
plt.plot(true_coefficient[coefficient_sorting], "o", label="true")
plt.plot(linear_regression.coef_[coefficient_sorting], "o", label="linear regression")
plt.legend()
from sklearn.learning_curve import learning_curve
def plot_learning_curve(est, X, y):
training_set_size, train_scores, test_scores = learning_curve(est, X, y, train_sizes=np.linspace(.1, 1, 20))
estimator_name = est.__class__.__name__
line = plt.plot(training_set_size, train_scores.mean(axis=1), '--', label="training scores " + estimator_name)
plt.plot(training_set_size, test_scores.mean(axis=1), '-', label="test scores " + estimator_name, c=line[0].get_color())
plt.xlabel('Training set size')
plt.legend(loc='best')
plt.ylim(-0.1, 1.1)
plt.figure()
plot_learning_curve(LinearRegression(), X, y)
The Ridge estimator is a simple regularization (called l2 penalty) of the ordinary LinearRegression. In particular, it has the benefit of being not computationally more expensive than the ordinary least square estimate.
$$ \text{min}_{w,b} \sum_i || w^\mathsf{T}x_i + b - y_i||^2 + \alpha ||w||_2^2$$The amount of regularization is set via the alpha
parameter of the Ridge.
from sklearn.linear_model import Ridge
ridge_models = {}
training_scores = []
test_scores = []
for alpha in [100, 10, 1, .01]:
ridge = Ridge(alpha=alpha).fit(X_train, y_train)
training_scores.append(ridge.score(X_train, y_train))
test_scores.append(ridge.score(X_test, y_test))
ridge_models[alpha] = ridge
plt.figure()
plt.plot(training_scores, label="training scores")
plt.plot(test_scores, label="test scores")
plt.xticks(range(4), [100, 10, 1, .01])
plt.legend(loc="best")
plt.figure(figsize=(10, 5))
plt.plot(true_coefficient[coefficient_sorting], "o", label="true", c='b')
for i, alpha in enumerate([100, 10, 1, .01]):
plt.plot(ridge_models[alpha].coef_[coefficient_sorting], "o", label="alpha = %.2f" % alpha, c=plt.cm.summer(i / 3.))
plt.legend(loc="best")
Tuning alpha is critical for performance.
plt.figure()
plot_learning_curve(LinearRegression(), X, y)
plot_learning_curve(Ridge(alpha=10), X, y)
The Lasso estimator is useful to impose sparsity on the coefficient. In other words, it is to be prefered if we believe that many of the features are not relevant. This is done via the so-called l1 penalty.
$$ \text{min}_{w, b} \sum_i || w^\mathsf{T}x_i + b - y_i||^2 + \alpha ||w||_1$$from sklearn.linear_model import Lasso
lasso_models = {}
training_scores = []
test_scores = []
for alpha in [30, 10, 1, .01]:
lasso = Lasso(alpha=alpha).fit(X_train, y_train)
training_scores.append(lasso.score(X_train, y_train))
test_scores.append(lasso.score(X_test, y_test))
lasso_models[alpha] = lasso
plt.figure()
plt.plot(training_scores, label="training scores")
plt.plot(test_scores, label="test scores")
plt.xticks(range(4), [30, 10, 1, .01])
plt.legend(loc="best")
plt.figure(figsize=(10, 5))
plt.plot(true_coefficient[coefficient_sorting], "o", label="true", c='b')
for i, alpha in enumerate([30, 10, 1, .01]):
plt.plot(lasso_models[alpha].coef_[coefficient_sorting], "o", label="alpha = %.2f" % alpha, c=plt.cm.summer(i / 3.))
plt.legend(loc="best")
plt.figure()
plot_learning_curve(LinearRegression(), X, y)
plot_learning_curve(Ridge(alpha=10), X, y)
plot_learning_curve(Lasso(alpha=10), X, y)
All linear models for classification learn a coefficient parameter coef_
and an offset intercept_
to make predictions using a linear combination of features:
y_pred = x_test[0] * coef_[0] + ... + x_test[n_features-1] * coef_[n_features-1] + intercept_ > 0
As you can see, this is very similar to regression, only that a threshold at zero is applied.
Again, the difference between the linear models for classification what kind of regularization is put on coef_
and intercept_
, but there are also minor differences in how the fit to the training set is measured (the so-called loss function).
The two most common models for linear classification are the linear SVM as implemented in LinearSVC and LogisticRegression.
A good intuition for regularization of linear classifiers is that with high regularization, it is enough if most of the points are classified correctly. But with less regularization, more importance is given to each individual data point.
This is illustrated using an linear SVM with different values of C
below.
from plots import plot_linear_svc_regularization
plot_linear_svc_regularization()
Similar to the Ridge/Lasso separation, you can set the 'penalty' parameter to 'l1' to enforce sparsity of the coefficients.
from sklearn.datasets import make_blobs
plt.figure()
X, y = make_blobs(random_state=42)
plt.scatter(X[:, 0], X[:, 1], c=y)
from sklearn.svm import LinearSVC
linear_svm = LinearSVC().fit(X, y)
print(linear_svm.coef_.shape)
print(linear_svm.intercept_.shape)
plt.scatter(X[:, 0], X[:, 1], c=y)
line = np.linspace(-15, 15)
for coef, intercept in zip(linear_svm.coef_, linear_svm.intercept_):
plt.plot(line, -(line * coef[0] + intercept) / coef[1])
plt.ylim(-10, 15)
plt.xlim(-10, 8)
Use LogisticRegression to classify digits, and grid-search the C parameter. How do you think the learning curves above change when you increase or decrease alpha? Try out to change the alpha parameter in ridge and lasso, and see if your intuition was correct.