#!/usr/bin/env python # coding: utf-8 # In[1]: import numpy as np from scipy.special import expit, logsumexp from scipy.optimize import minimize from sklearn.datasets import load_iris, load_breast_cancer from sklearn.linear_model import LogisticRegression as skLogisticRegression # ### Implementation 1 # - convert multiclass classification problem to binary classification problem in a one-vs-all fashion # - based on gradient decent # - similar to sklearn multi_class='ovr' & solver='lbfgs' # - reference: http://fa.bianp.net/blog/2013/numerical-optimizers-for-logistic-regression/ # In[2]: class LogisticRegression(): def __init__(self, C=1.0): self.C = C def _encode(self, y): classes = np.unique(y) y_train = np.full((y.shape[0], len(classes)), -1) for i, c in enumerate(classes): y_train[y == c, i] = 1 if len(classes) == 2: y_train = y_train[:, 1].reshape(-1, 1) return classes, y_train @staticmethod def _cost_grad(w, X, y, alpha): def _log_logistic(x): if x > 0: return -np.log(1 + np.exp(-x)) else: return x - np.log(1 + np.exp(x)) yz = y * (np.dot(X, w[:-1]) + w[-1]) cost = -np.sum(np.vectorize(_log_logistic)(yz)) + 0.5 * alpha * np.dot(w[:-1], w[:-1]) grad = np.zeros(len(w)) t = (expit(yz) - 1) * y grad[:-1] = np.dot(X.T, t) + alpha * w[:-1] grad[-1] = np.sum(t) return cost, grad def _solve_lbfgs(self, X, y): result = np.zeros((y.shape[1], X.shape[1] + 1)) for i in range(y.shape[1]): cur_y = y[:, i] w0 = np.zeros(X.shape[1] + 1) res = minimize(fun=self._cost_grad, jac=True, x0=w0, args=(X, cur_y, 1 / self.C), method='L-BFGS-B') result[i] = res.x return result[:, :-1], result[:, -1] def fit(self, X, y): self.classes_, y_train = self._encode(y) self.coef_, self.intercept_ = self._solve_lbfgs(X, y_train) return self def decision_function(self, X): scores = np.dot(X, self.coef_.T) + self.intercept_ if scores.shape[1] == 1: return scores.ravel() else: return scores def predict(self, X): scores = self.decision_function(X) if len(scores.shape) == 1: indices = (scores > 0).astype(int) else: indices = np.argmax(scores, axis=1) return self.classes_[indices] def predict_proba(self, X): scores = self.decision_function(X) prob = expit(scores) if len(scores.shape) == 1: prob = np.vstack((1 - prob, prob)).T else: prob /= np.sum(prob, axis=1)[:, np.newaxis] return prob # In[3]: # binary classification for C in [0.1, 1, 10]: X, y = load_breast_cancer(return_X_y = True) clf1 = LogisticRegression(C=C).fit(X, y) clf2 = skLogisticRegression(C=C, multi_class="ovr", solver="lbfgs", # keep consisent with scipy default tol=1e-5, max_iter=15000).fit(X, y) assert clf1.coef_.shape == (1, X.shape[1]) assert np.allclose(clf1.coef_, clf2.coef_) assert np.allclose(clf1.intercept_, clf2.intercept_) prob1 = clf1.decision_function(X) prob2 = clf2.decision_function(X) assert np.allclose(prob1, prob2) prob1 = clf1.predict_proba(X) prob2 = clf2.predict_proba(X) assert np.allclose(prob1, prob2) pred1 = clf1.predict(X) pred2 = clf2.predict(X) assert np.array_equal(pred1, pred2) # In[4]: # multiclass classification for C in [0.1, 1, 10]: X, y = load_iris(return_X_y=True) clf1 = LogisticRegression(C=C).fit(X, y) clf2 = skLogisticRegression(C=C, multi_class="ovr", solver="lbfgs", # keep consisent with scipy default tol=1e-5, max_iter=15000).fit(X, y) assert clf1.coef_.shape == (len(np.unique(y)), X.shape[1]) assert np.allclose(clf1.coef_, clf2.coef_) assert np.allclose(clf1.intercept_, clf2.intercept_) prob1 = clf1.decision_function(X) prob2 = clf2.decision_function(X) assert np.allclose(prob1, prob2) prob1 = clf1.predict_proba(X) prob2 = clf2.predict_proba(X) assert np.allclose(prob1, prob2) pred1 = clf1.predict(X) pred2 = clf2.predict(X) assert np.array_equal(pred1, pred2) # In[5]: # penalty = 'none' X, y = load_iris(return_X_y=True) clf1 = LogisticRegression(C=np.inf).fit(X, y) clf2 = skLogisticRegression(penalty='none', multi_class="ovr", solver="lbfgs", # keep consisent with scipy default tol=1e-5, max_iter=15000).fit(X, y) assert clf1.coef_.shape == (len(np.unique(y)), X.shape[1]) assert np.allclose(clf1.coef_, clf2.coef_) assert np.allclose(clf1.intercept_, clf2.intercept_) prob1 = clf1.decision_function(X) prob2 = clf2.decision_function(X) assert np.allclose(prob1, prob2) prob1 = clf1.predict_proba(X) prob2 = clf2.predict_proba(X) assert np.allclose(prob1, prob2) pred1 = clf1.predict(X) pred2 = clf2.predict(X) assert np.array_equal(pred1, pred2) # ### Implementation 2 # - support multiclass classification problem directly # - based on gradient decent # - similar to sklearn multi_class='multinomial' & solver='lbfgs' # In[6]: class LogisticRegression(): def __init__(self, C=1.0): self.C = C def _encode(self, y): classes = np.unique(y) y_train = np.zeros((y.shape[0], len(classes))) for i, c in enumerate(classes): y_train[y == c, i] = 1 return classes, y_train @staticmethod def _cost_grad(w, X, y, alpha): w = w.reshape(y.shape[1], -1) p = np.dot(X, w[:, :-1].T) + w[:, -1] p -= logsumexp(p, axis=1)[:, np.newaxis] cost = -np.sum(y * p) + 0.5 * alpha * np.dot(w[:, :-1].ravel(), w[:, :-1].ravel()) grad = np.zeros_like(w) diff = np.exp(p) - y grad[:, :-1] = np.dot(diff.T, X) + alpha * w[:, :-1] grad[:, -1] = np.sum(diff, axis=0) return cost, grad.ravel() def _solve_lbfgs(self, X, y): w0 = np.zeros(y.shape[1] * (X.shape[1] + 1)) res = minimize(fun=self._cost_grad, jac=True, x0=w0, args=(X, y, 1 / self.C), method='L-BFGS-B') result = res.x.reshape(y.shape[1], -1) if y.shape[1] == 2: result = result[1][np.newaxis, :] return result[:, :-1], result[:, -1] def fit(self, X, y): self.classes_, y_train = self._encode(y) self.coef_, self.intercept_ = self._solve_lbfgs(X, y_train) return self def decision_function(self, X): scores = np.dot(X, self.coef_.T) + self.intercept_ if scores.shape[1] == 1: return scores.ravel() else: return scores def predict(self, X): scores = self.decision_function(X) if len(scores.shape) == 1: indices = (scores > 0).astype(int) else: indices = np.argmax(scores, axis=1) return self.classes_[indices] def predict_proba(self, X): scores = self.decision_function(X) if len(scores.shape) == 1: scores = np.c_[-scores, scores] scores -= np.max(scores, axis=1)[:, np.newaxis] prob = np.exp(scores) prob /= np.sum(prob, axis=1)[:, np.newaxis] return prob # In[7]: # binary classification for C in [0.1, 1, 10]: X, y = load_breast_cancer(return_X_y = True) clf1 = LogisticRegression(C=C).fit(X, y) clf2 = skLogisticRegression(C=C, multi_class="multinomial", solver="lbfgs", # keep consisent with scipy default tol=1e-5, max_iter=15000).fit(X, y) assert clf1.coef_.shape == (1, X.shape[1]) assert np.allclose(clf1.coef_, clf2.coef_) assert np.allclose(clf1.intercept_, clf2.intercept_) prob1 = clf1.decision_function(X) prob2 = clf2.decision_function(X) assert np.allclose(prob1, prob2) prob1 = clf1.predict_proba(X) prob2 = clf2.predict_proba(X) assert np.allclose(prob1, prob2) pred1 = clf1.predict(X) pred2 = clf2.predict(X) assert np.array_equal(pred1, pred2) # In[8]: # multiclass classification for C in [0.1, 1, 10]: X, y = load_iris(return_X_y=True) clf1 = LogisticRegression(C=C).fit(X, y) clf2 = skLogisticRegression(C=C, multi_class="multinomial", solver="lbfgs", # keep consisent with scipy default tol=1e-5, max_iter=15000).fit(X, y) assert clf1.coef_.shape == (len(np.unique(y)), X.shape[1]) assert np.allclose(clf1.coef_, clf2.coef_) assert np.allclose(clf1.intercept_, clf2.intercept_) prob1 = clf1.decision_function(X) prob2 = clf2.decision_function(X) assert np.allclose(prob1, prob2) prob1 = clf1.predict_proba(X) prob2 = clf2.predict_proba(X) assert np.allclose(prob1, prob2) pred1 = clf1.predict(X) pred2 = clf2.predict(X) assert np.array_equal(pred1, pred2) # In[9]: # penalty = 'none' X, y = load_iris(return_X_y=True) clf1 = LogisticRegression(C=np.inf).fit(X, y) clf2 = skLogisticRegression(penalty='none', multi_class="multinomial", solver="lbfgs", # keep consisent with scipy default tol=1e-5, max_iter=15000).fit(X, y) assert clf1.coef_.shape == (len(np.unique(y)), X.shape[1]) assert np.allclose(clf1.coef_, clf2.coef_) assert np.allclose(clf1.intercept_, clf2.intercept_) prob1 = clf1.decision_function(X) prob2 = clf2.decision_function(X) assert np.allclose(prob1, prob2) prob1 = clf1.predict_proba(X) prob2 = clf2.predict_proba(X) assert np.allclose(prob1, prob2) pred1 = clf1.predict(X) pred2 = clf2.predict(X) assert np.array_equal(pred1, pred2)