#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
from scipy.linalg import inv, svd
from scipy.optimize import minimize
from scipy.sparse.linalg import lsqr
# note: load_boston was removed in scikit-learn 1.2, so this notebook needs an older version
from sklearn.datasets import load_boston
from sklearn.linear_model import Ridge as skRidge


# ### Implementation 1
# - based on scipy.sparse.linalg.lsqr, which minimizes ||Xw - y||^2 + damp^2 * ||w||^2,
#   so damp = sqrt(alpha) yields the ridge penalty
# - center the dataset and calculate the intercept manually
# - similar to scikit-learn solver='lsqr'

# In[2]:


class Ridge:
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        # center X and y so the intercept can be recovered afterwards
        X_mean = np.mean(X, axis=0)
        y_mean = np.mean(y)
        X_train = X - X_mean
        y_train = y - y_mean
        info = lsqr(X_train, y_train, damp=np.sqrt(self.alpha))
        self.coef_ = info[0]
        self.intercept_ = y_mean - np.dot(X_mean, self.coef_)
        return self

    def predict(self, X):
        y_pred = np.dot(X, self.coef_) + self.intercept_
        return y_pred


# In[3]:


X, y = load_boston(return_X_y=True)
for alpha in [0.1, 1, 10]:
    reg1 = Ridge(alpha=alpha).fit(X, y)
    reg2 = skRidge(alpha=alpha).fit(X, y)
    assert np.allclose(reg1.coef_, reg2.coef_)
    assert np.isclose(reg1.intercept_, reg2.intercept_)
    pred1 = reg1.predict(X)
    pred2 = reg2.predict(X)
    assert np.allclose(pred1, pred2)


# ### Implementation 2
# - based on the closed-form (normal equations) solution w = (X^T X + alpha * I)^-1 X^T y
# - center the dataset and calculate the intercept manually
# - similar to scikit-learn solver='cholesky' when n_features <= n_samples

# In[4]:


class Ridge:
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        X_mean = np.mean(X, axis=0)
        y_mean = np.mean(y)
        X_train = X - X_mean
        y_train = y - y_mean
        # regularized normal equations: (X^T X + alpha * I) w = X^T y
        A = np.dot(X_train.T, X_train) + np.diag(np.full(X_train.shape[1], self.alpha))
        Xy = np.dot(X_train.T, y_train)
        self.coef_ = np.dot(inv(A), Xy).ravel()
        self.intercept_ = y_mean - np.dot(X_mean, self.coef_)
        return self

    def predict(self, X):
        y_pred = np.dot(X, self.coef_) + self.intercept_
        return y_pred


# In[5]:


X, y = load_boston(return_X_y=True)
for alpha in [0.1, 1, 10]:
    reg1 = Ridge(alpha=alpha).fit(X, y)
    reg2 = skRidge(alpha=alpha).fit(X, y)
    assert np.allclose(reg1.coef_, reg2.coef_)
    assert np.isclose(reg1.intercept_, reg2.intercept_)
    pred1 = reg1.predict(X)
    pred2 = reg2.predict(X)
    assert np.allclose(pred1, pred2)


# ### Implementation 3
# - based on the dual (kernel) form w = X^T (X X^T + alpha * I)^-1 y, which only
#   requires an n_samples x n_samples inverse (see the sketch after this section)
# - center the dataset and calculate the intercept manually
# - similar to scikit-learn solver='cholesky' when n_features > n_samples
# - ref: https://www.ics.uci.edu/~welling/classnotes/papers_class/Kernel-Ridge.pdf
# - ref: https://stat.fandom.com/wiki/Kernel_Ridge_Regression

# In[6]:


class Ridge:
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        X_mean = np.mean(X, axis=0)
        y_mean = np.mean(y)
        X_train = X - X_mean
        y_train = y - y_mean
        # Gram matrix X X^T plus the ridge term, inverted in sample space
        A = np.dot(X_train, X_train.T) + np.diag(np.full(X_train.shape[0], self.alpha))
        self.coef_ = np.dot(np.dot(X_train.T, inv(A)), y_train).ravel()
        self.intercept_ = y_mean - np.dot(X_mean, self.coef_)
        return self

    def predict(self, X):
        y_pred = np.dot(X, self.coef_) + self.intercept_
        return y_pred


# In[7]:


X, y = load_boston(return_X_y=True)
for alpha in [0.1, 1, 10]:
    reg1 = Ridge(alpha=alpha).fit(X, y)
    reg2 = skRidge(alpha=alpha).fit(X, y)
    assert np.allclose(reg1.coef_, reg2.coef_)
    assert np.isclose(reg1.intercept_, reg2.intercept_)
    pred1 = reg1.predict(X)
    pred2 = reg2.predict(X)
    assert np.allclose(pred1, pred2)
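# Implementations 2 and 3 are algebraically equivalent via the push-through identity
# (X^T X + alpha * I)^-1 X^T = X^T (X X^T + alpha * I)^-1, which holds for any alpha > 0.
# A minimal sketch verifying it numerically; the random shape (20 x 5), seed, and
# alpha value below are arbitrary illustrative choices:

# In[ ]:


rng = np.random.default_rng(0)
Xs, a = rng.standard_normal((20, 5)), 1.0
primal = np.dot(inv(np.dot(Xs.T, Xs) + a * np.eye(5)), Xs.T)  # (X^T X + a*I)^-1 X^T
dual = np.dot(Xs.T, inv(np.dot(Xs, Xs.T) + a * np.eye(20)))   # X^T (X X^T + a*I)^-1
assert np.allclose(primal, dual)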
# ### Implementation 4
# - based on the SVD X = U S V^T, which gives w = V diag(s_i / (s_i^2 + alpha)) U^T y
# - center the dataset and calculate the intercept manually
# - similar to scikit-learn solver='svd'

# In[8]:


class Ridge:
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        X_mean = np.mean(X, axis=0)
        y_mean = np.mean(y)
        X_train = X - X_mean
        y_train = y - y_mean
        U, S, VT = svd(X_train, full_matrices=False)
        # shrink each singular direction by s / (s^2 + alpha)
        d = S / (np.square(S) + self.alpha)
        self.coef_ = np.dot(np.dot(np.dot(VT.T, np.diag(d)), U.T), y_train).ravel()
        self.intercept_ = y_mean - np.dot(X_mean, self.coef_)
        return self

    def predict(self, X):
        y_pred = np.dot(X, self.coef_) + self.intercept_
        return y_pred


# In[9]:


X, y = load_boston(return_X_y=True)
for alpha in [0.1, 1, 10]:
    reg1 = Ridge(alpha=alpha).fit(X, y)
    reg2 = skRidge(alpha=alpha).fit(X, y)
    assert np.allclose(reg1.coef_, reg2.coef_)
    assert np.isclose(reg1.intercept_, reg2.intercept_)
    pred1 = reg1.predict(X)
    pred2 = reg2.predict(X)
    assert np.allclose(pred1, pred2)


# ### Implementation 5
# - based on gradient descent: minimizes the cost
#   J(theta) = ||X theta - y||^2 / (2 * n) + alpha * ||theta||^2 / (2 * n)
#   with its analytic gradient via scipy.optimize.minimize (L-BFGS-B)
# - center the dataset and calculate the intercept manually

# In[10]:


def cost_gradient(theta, X, y, alpha):
    # residuals, regularized cost, and its gradient in one pass
    h = np.dot(X, theta) - y
    J = np.dot(h, h) / (2 * X.shape[0]) + alpha * np.dot(theta, theta) / (2 * X.shape[0])
    grad = np.dot(X.T, h) / X.shape[0] + alpha * theta / X.shape[0]
    return J, grad


class Ridge:
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        X_mean = np.mean(X, axis=0)
        y_mean = np.mean(y)
        X_train = X - X_mean
        y_train = y - y_mean
        theta = np.zeros(X_train.shape[1])
        res = minimize(fun=cost_gradient, jac=True, x0=theta,
                       args=(X_train, y_train, self.alpha), method='L-BFGS-B')
        self.coef_ = res.x
        self.intercept_ = y_mean - np.dot(X_mean, self.coef_)
        return self

    def predict(self, X):
        y_pred = np.dot(X, self.coef_) + self.intercept_
        return y_pred


# In[11]:


X, y = load_boston(return_X_y=True)
# center and scale the dataset to improve conditioning of the optimization
X = (X - X.mean(axis=0)) / X.std(axis=0)
for alpha in [0.1, 1, 10]:
    reg1 = Ridge(alpha=alpha).fit(X, y)
    reg2 = skRidge(alpha=alpha).fit(X, y)
    assert np.allclose(reg1.coef_, reg2.coef_, atol=1e-3)
    assert np.isclose(reg1.intercept_, reg2.intercept_)
    pred1 = reg1.predict(X)
    pred2 = reg2.predict(X)
    assert np.allclose(pred1, pred2, atol=1e-3)
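# Implementation 5 delegates the minimization to L-BFGS-B. For comparison, a plain
# fixed-step batch gradient-descent loop on the same cost; the learning rate and
# iteration count below are untuned illustrative guesses, and the max absolute
# difference from the scikit-learn coefficients is printed as a sanity check
# (X is already standardized here, so centering y alone suffices):

# In[ ]:


theta = np.zeros(X.shape[1])
for _ in range(20000):
    _, grad = cost_gradient(theta, X, y - np.mean(y), 1.0)
    theta -= 0.1 * grad
# with enough iterations the loop should approach the closed-form coefficients
print(np.max(np.abs(theta - skRidge(alpha=1.0).fit(X, y).coef_)))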