#!/usr/bin/env python
# coding: utf-8

# In[1]:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier as skSGDClassifier


# ### Implementation 1
# - scikit-learn loss = "hinge", penalty="l2"/"none"
# - similar to sklearn.svm.LinearSVC

# In[2]:

def _loss(x, y, coef, intercept):
    """Return the hinge loss of one sample.

    With ``z = y * (x . coef + intercept)`` the loss is ``1 - z`` when
    ``z <= 1`` and ``0`` otherwise.
    """
    p = np.dot(x, coef) + intercept
    z = p * y
    if z <= 1:
        return 1 - z
    return 0


def _grad(x, y, coef, intercept):
    """Return ``(coef_grad, intercept_grad)`` of the hinge loss for one sample."""
    p = np.dot(x, coef) + intercept
    z = p * y
    if z <= 1:
        dloss = -y
    else:
        dloss = 0
    # clip gradient (consistent with scikit-learn)
    dloss = np.clip(dloss, -1e12, 1e12)
    coef_grad = dloss * x
    intercept_grad = dloss
    return coef_grad, intercept_grad


# In[3]:

class SGDClassifier():
    """Linear classifier trained by plain per-sample SGD (one-vs-rest).

    The loss is not a constructor argument: ``fit`` calls the module-level
    ``_loss`` and ``_grad`` functions, resolved at call time, so redefining
    them (as the later cells of this file do) switches the loss being
    optimised.

    Parameters
    ----------
    penalty : str
        ``"l2"`` applies weight decay to the coefficients at every step; any
        other value disables regularisation.
    alpha : float
        Regularisation strength used when ``penalty == "l2"``.
    max_iter : int
        Maximum number of passes (epochs) over the training data per class.
    tol : float
        An epoch counts as "no improvement" when its summed loss is greater
        than ``best_loss - tol * n_samples``.
    shuffle : bool
        Whether to permute the samples before every epoch.
    random_state : int
        Seed for the ``numpy.random.RandomState`` that drives the shuffling.
    eta0 : float
        Initial learning rate. The step size is ``eta0 / t ** power_t``
        (the 'invscaling' schedule). With the default ``eta0=0`` every step
        is zero, so callers are expected to pass a positive value.
    power_t : float
        Exponent of the inverse-scaling schedule.
    n_iter_no_change : int
        Number of consecutive non-improving epochs that stops training.

    Attributes
    ----------
    classes_ : ndarray
        Sorted unique labels seen in ``fit``.
    coef_ : ndarray of shape (n_models, n_features)
        One row per fitted binary model.
    intercept_ : ndarray of shape (n_models,)
    n_iter_ : int
        Largest number of epochs run by any of the binary models.
    """

    def __init__(self, penalty="l2", alpha=0.0001, max_iter=1000, tol=1e-3,
                 shuffle=True, random_state=0,
                 # use learning_rate = 'invscaling' for simplicity
                 eta0=0, power_t=0.5, n_iter_no_change=5):
        self.penalty = penalty
        self.alpha = alpha
        self.max_iter = max_iter
        self.tol = tol
        self.shuffle = shuffle
        self.random_state = random_state
        self.eta0 = eta0
        self.power_t = power_t
        self.n_iter_no_change = n_iter_no_change

    def _encode(self, y):
        """Return ``(classes, y_train)`` with labels encoded as -1 / +1.

        ``y_train`` has one column per class (+1 where the sample belongs to
        that class). For exactly two classes only the column of the second
        class is kept, giving shape ``(n_samples, 1)``.
        """
        classes = np.unique(y)
        y_train = np.full((y.shape[0], len(classes)), -1)
        for i, c in enumerate(classes):
            y_train[y == c, i] = 1
        if len(classes) == 2:
            y_train = y_train[:, 1].reshape(-1, 1)
        return classes, y_train

    def fit(self, X, y):
        """Fit one binary model per column of the encoded targets.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        y : ndarray of shape (n_samples,)

        Returns
        -------
        self
        """
        self.classes_, y_train = self._encode(y)
        n_samples, n_features = X.shape[0], X.shape[1]
        # _encode yields a single column for two classes and one column per
        # class otherwise, so its width is the number of binary models.
        n_models = y_train.shape[1]
        coef = np.zeros((n_models, n_features))
        intercept = np.zeros(n_models)
        n_iter = 0
        rng = np.random.RandomState(self.random_state)
        for class_ind in range(n_models):
            # Shuffle per-class aliases only. Rebinding ``X`` itself would
            # leave it permuted for the next class, whose targets are taken
            # from ``y_train`` in the original row order, silently pairing
            # samples with the wrong labels.
            cur_X = X
            cur_y = y_train[:, class_ind]
            cur_coef = np.zeros(n_features)
            cur_intercept = 0
            best_loss = np.inf
            no_improvement_count = 0
            t = 1
            # Tracked explicitly so that max_iter == 0 does not leave the
            # loop variable undefined.
            epochs_run = 0
            for epoch in range(self.max_iter):
                epochs_run = epoch + 1
                # different from how data is shuffled in scikit-learn
                if self.shuffle:
                    ind = rng.permutation(n_samples)
                    cur_X, cur_y = cur_X[ind], cur_y[ind]
                sumloss = 0
                for x_i, y_i in zip(cur_X, cur_y):
                    # The loss is accumulated before the update, i.e. with
                    # the weights the sample was actually scored with.
                    sumloss += _loss(x_i, y_i, cur_coef, cur_intercept)
                    eta = self.eta0 / np.power(t, self.power_t)
                    coef_grad, intercept_grad = _grad(
                        x_i, y_i, cur_coef, cur_intercept)
                    if self.penalty == "l2":
                        # L2 weight decay; the intercept is not regularised.
                        cur_coef *= 1 - eta * self.alpha
                    cur_coef -= eta * coef_grad
                    cur_intercept -= eta * intercept_grad
                    t += 1
                if sumloss > best_loss - self.tol * n_samples:
                    no_improvement_count += 1
                else:
                    no_improvement_count = 0
                if no_improvement_count == self.n_iter_no_change:
                    break
                if sumloss < best_loss:
                    best_loss = sumloss
            coef[class_ind] = cur_coef
            intercept[class_ind] = cur_intercept
            n_iter = max(n_iter, epochs_run)
        self.coef_ = coef
        self.intercept_ = intercept
        self.n_iter_ = n_iter
        return self

    def decision_function(self, X):
        """Return the linear scores ``X @ coef_.T + intercept_``.

        The result is 1-D when a single binary model was fitted and has
        shape ``(n_samples, n_models)`` otherwise.
        """
        scores = np.dot(X, self.coef_.T) + self.intercept_
        if scores.shape[1] == 1:
            return scores.ravel()
        return scores

    def predict(self, X):
        """Return the predicted label from ``classes_`` for every row of X."""
        scores = self.decision_function(X)
        if len(scores.shape) == 1:
            # Binary case: a positive score selects the second class.
            indices = (scores > 0).astype(int)
        else:
            indices = np.argmax(scores, axis=1)
        return self.classes_[indices]


# In[4]:

# binary classification
X, y = load_iris(return_X_y=True)
X, y = X[y != 2], y[y != 2]
X = StandardScaler().fit_transform(X)
clf1 = SGDClassifier(eta0=0.1, shuffle=False).fit(X, y)
clf2 = skSGDClassifier(learning_rate='invscaling', eta0=0.1,
                       shuffle=False).fit(X, y)
assert np.allclose(clf1.coef_, clf2.coef_)
assert np.allclose(clf1.intercept_, clf2.intercept_)
prob1 = clf1.decision_function(X)
prob2 = clf2.decision_function(X)
assert np.allclose(prob1, prob2)
pred1 = clf1.predict(X)
pred2 = clf2.predict(X)
assert np.array_equal(pred1, pred2)


# In[5]:

# multiclass, shuffle=False, default penalty
# (no `penalty` is passed, so SGDClassifier above runs with its default
# penalty="l2", alpha=0.0001 -- this cell does not exercise penalty="none")
X, y = load_iris(return_X_y=True)
X = StandardScaler().fit_transform(X)
clf1 = SGDClassifier(eta0=0.1, shuffle=False).fit(X, y)
clf2 = skSGDClassifier(learning_rate='invscaling', eta0=0.1,
                       shuffle=False).fit(X, y)
assert np.allclose(clf1.coef_, clf2.coef_)
assert np.allclose(clf1.intercept_, clf2.intercept_)
prob1 = clf1.decision_function(X)
prob2 = clf2.decision_function(X)
assert np.allclose(prob1, prob2)
pred1 = clf1.predict(X)
pred2 = clf2.predict(X)
assert np.array_equal(pred1, pred2)


# In[6]:

# shuffle=False penalty="l2"
for alpha in [0.1, 1, 10]:
    X, y = load_iris(return_X_y=True)
    X = StandardScaler().fit_transform(X)
    clf1 = SGDClassifier(eta0=0.1, alpha=alpha, shuffle=False).fit(X, y)
    clf2 = skSGDClassifier(learning_rate='invscaling', eta0=0.1,
                           alpha=alpha, shuffle=False).fit(X, y)
    assert np.allclose(clf1.coef_, clf2.coef_)
    assert np.allclose(clf1.intercept_, clf2.intercept_)
    prob1 = clf1.decision_function(X)
    prob2 = clf2.decision_function(X)
    assert np.allclose(prob1, prob2)
    pred1 = clf1.predict(X)
    pred2 = clf2.predict(X)
    assert np.array_equal(pred1, pred2)


# ### Implementation 2
# - scikit-learn loss = "squared_hinge", penalty="l2"/"none"
# - similar to sklearn.svm.LinearSVC

# In[7]:

def _loss(x, y, coef, intercept):
    """Return the squared hinge loss of one sample.

    With ``z = 1 - y * (x . coef + intercept)`` the loss is ``z * z`` when
    ``z > 0`` and ``0`` otherwise.
    """
    p = np.dot(x, coef) + intercept
    z = 1 - p * y
    if z > 0:
        return z * z
    return 0


def _grad(x, y, coef, intercept):
    """Return ``(coef_grad, intercept_grad)`` of the squared hinge loss."""
    p = np.dot(x, coef) + intercept
    z = 1 - p * y
    if z > 0:
        dloss = -2 * y * z
    else:
        dloss = 0
    # clip gradient (consistent with scikit-learn)
    dloss = np.clip(dloss, -1e12, 1e12)
    coef_grad = dloss * x
    intercept_grad = dloss
    return coef_grad, intercept_grad


# In[8]:

# multiclass, shuffle=False, default penalty
# (no `penalty` is passed, so SGDClassifier above runs with its default
# penalty="l2", alpha=0.0001 -- this cell does not exercise penalty="none")
X, y = load_iris(return_X_y=True)
X = StandardScaler().fit_transform(X)
clf1 = SGDClassifier(eta0=0.1, shuffle=False).fit(X, y)
clf2 = skSGDClassifier(loss="squared_hinge", learning_rate='invscaling',
                       eta0=0.1, shuffle=False).fit(X, y)
assert np.allclose(clf1.coef_, clf2.coef_)
assert np.allclose(clf1.intercept_, clf2.intercept_)
prob1 = clf1.decision_function(X)
prob2 = clf2.decision_function(X)
assert np.allclose(prob1, prob2)
pred1 = clf1.predict(X)
pred2 = clf2.predict(X)
assert np.array_equal(pred1, pred2)


# In[9]:

# shuffle=False penalty="l2"
for alpha in [0.1, 1, 10]:
    X, y = load_iris(return_X_y=True)
    X = StandardScaler().fit_transform(X)
    clf1 = SGDClassifier(eta0=0.1, alpha=alpha, shuffle=False).fit(X, y)
    clf2 = skSGDClassifier(loss="squared_hinge", learning_rate='invscaling',
                           eta0=0.1, alpha=alpha, shuffle=False).fit(X, y)
    assert np.allclose(clf1.coef_, clf2.coef_)
    assert np.allclose(clf1.intercept_, clf2.intercept_)
    prob1 = clf1.decision_function(X)
    prob2 = clf2.decision_function(X)
    assert np.allclose(prob1, prob2)
    pred1 = clf1.predict(X)
    pred2 = clf2.predict(X)
    assert np.array_equal(pred1, pred2)


# ### Implementation 3
# - scikit-learn loss = "modified_huber", penalty="l2"/"none"

# In[10]:

def _loss(x, y, coef, intercept):
    """Return the modified Huber loss of one sample.

    With ``z = y * (x . coef + intercept)`` the loss is ``0`` for ``z > 1``,
    ``(1 - z) ** 2`` for ``-1 < z <= 1`` and ``-4 * z`` otherwise.
    """
    p = np.dot(x, coef) + intercept
    z = p * y
    if z > 1:
        return 0
    elif z > -1:
        return (1 - z) * (1 - z)
    else:
        return -4 * z


def _grad(x, y, coef, intercept):
    """Return ``(coef_grad, intercept_grad)`` of the modified Huber loss."""
    p = np.dot(x, coef) + intercept
    z = p * y
    if z > 1:
        dloss = 0
    elif z > -1:
        dloss = -2 * (1 - z) * y
    else:
        dloss = -4 * y
    # clip gradient (consistent with scikit-learn)
    dloss = np.clip(dloss, -1e12, 1e12)
    coef_grad = dloss * x
    intercept_grad = dloss
    return coef_grad, intercept_grad


# In[11]:

# multiclass, shuffle=False, default penalty
# (no `penalty` is passed, so SGDClassifier above runs with its default
# penalty="l2", alpha=0.0001 -- this cell does not exercise penalty="none")
X, y = load_iris(return_X_y=True)
X = StandardScaler().fit_transform(X)
clf1 = SGDClassifier(eta0=0.1, shuffle=False).fit(X, y)
clf2 = skSGDClassifier(loss="modified_huber", learning_rate='invscaling',
                       eta0=0.1, shuffle=False).fit(X, y)
assert np.allclose(clf1.coef_, clf2.coef_)
assert np.allclose(clf1.intercept_, clf2.intercept_)
prob1 = clf1.decision_function(X)
prob2 = clf2.decision_function(X)
assert np.allclose(prob1, prob2)
pred1 = clf1.predict(X)
pred2 = clf2.predict(X)
assert np.array_equal(pred1, pred2)


# In[12]:

# shuffle=False penalty="l2"
for alpha in [0.1, 1, 10]:
    X, y = load_iris(return_X_y=True)
    X = StandardScaler().fit_transform(X)
    clf1 = SGDClassifier(eta0=0.1, alpha=alpha, shuffle=False).fit(X, y)
    clf2 = skSGDClassifier(loss="modified_huber", learning_rate='invscaling',
                           eta0=0.1, alpha=alpha, shuffle=False).fit(X, y)
    assert np.allclose(clf1.coef_, clf2.coef_)
    assert np.allclose(clf1.intercept_, clf2.intercept_)
    prob1 = clf1.decision_function(X)
    prob2 = clf2.decision_function(X)
    assert np.allclose(prob1, prob2)
    pred1 = clf1.predict(X)
    pred2 = clf2.predict(X)
    assert np.array_equal(pred1, pred2)


# ### Implementation 4
# - scikit-learn loss = "log", penalty="l2"/"none"
# - similar to sklearn.linear_model.LogisticRegression
# NOTE(review): newer scikit-learn releases appear to have renamed this loss
# to "log_loss" -- confirm against the installed version before running.

# In[13]:

def _loss(x, y, coef, intercept):
    """Return the logistic loss of one sample.

    With ``z = y * (x . coef + intercept)`` the loss is ``log(1 + exp(-z))``,
    replaced by ``exp(-z)`` for ``z > 18`` and by ``-z`` for ``z < -18``.
    """
    p = np.dot(x, coef) + intercept
    z = p * y
    # follow scikit-learn
    if z > 18:
        return np.exp(-z)
    elif z < -18:
        return -z
    else:
        return np.log(1 + np.exp(-z))


def _grad(x, y, coef, intercept):
    """Return ``(coef_grad, intercept_grad)`` of the logistic loss.

    Uses the same ``|z| > 18`` cut-offs as ``_loss``.
    """
    p = np.dot(x, coef) + intercept
    z = p * y
    if z > 18:
        dloss = -np.exp(-z) * y
    elif z < -18:
        dloss = -y
    else:
        dloss = -y / (1 + np.exp(z))
    # clip gradient (consistent with scikit-learn)
    dloss = np.clip(dloss, -1e12, 1e12)
    coef_grad = dloss * x
    intercept_grad = dloss
    return coef_grad, intercept_grad


# In[14]:

# multiclass, shuffle=False, default penalty
# (no `penalty` is passed, so SGDClassifier above runs with its default
# penalty="l2", alpha=0.0001 -- this cell does not exercise penalty="none")
X, y = load_iris(return_X_y=True)
X = StandardScaler().fit_transform(X)
clf1 = SGDClassifier(eta0=0.1, shuffle=False).fit(X, y)
clf2 = skSGDClassifier(loss="log", learning_rate='invscaling', eta0=0.1,
                       shuffle=False).fit(X, y)
assert np.allclose(clf1.coef_, clf2.coef_)
assert np.allclose(clf1.intercept_, clf2.intercept_)
prob1 = clf1.decision_function(X)
prob2 = clf2.decision_function(X)
assert np.allclose(prob1, prob2)
pred1 = clf1.predict(X)
pred2 = clf2.predict(X)
assert np.array_equal(pred1, pred2)


# In[15]:

# shuffle=False penalty="l2"
for alpha in [0.1, 1, 10]:
    X, y = load_iris(return_X_y=True)
    X = StandardScaler().fit_transform(X)
    clf1 = SGDClassifier(eta0=0.1, alpha=alpha, shuffle=False).fit(X, y)
    clf2 = skSGDClassifier(loss="log", learning_rate='invscaling', eta0=0.1,
                           alpha=alpha, shuffle=False).fit(X, y)
    assert np.allclose(clf1.coef_, clf2.coef_)
    assert np.allclose(clf1.intercept_, clf2.intercept_)
    prob1 = clf1.decision_function(X)
    prob2 = clf2.decision_function(X)
    assert np.allclose(prob1, prob2)
    pred1 = clf1.predict(X)
    pred2 = clf2.predict(X)
    assert np.array_equal(pred1, pred2)