#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
from scipy.special import expit, logsumexp
from scipy.optimize import minimize
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.linear_model import LogisticRegression as skLogisticRegression


# ### Implementation 1
# - convert multiclass classification problem to binary classification problem in a one-vs-all fashion
# - based on gradient decent
# - similar to sklearn multi_class='ovr' & solver='lbfgs'
# - reference: http://fa.bianp.net/blog/2013/numerical-optimizers-for-logistic-regression/

# In[2]:


class LogisticRegression():
    def __init__(self, C=1.0):
        self.C = C

    def _encode(self, y):
        classes = np.unique(y)
        y_train = np.full((y.shape[0], len(classes)), -1)
        for i, c in enumerate(classes):
            y_train[y == c, i] = 1
        if len(classes) == 2:
            y_train = y_train[:, 1].reshape(-1, 1)
        return classes, y_train

    @staticmethod
    def _cost_grad(w, X, y, alpha):
        def _log_logistic(x):
            if x > 0:
                return -np.log(1 + np.exp(-x))
            else:
                return x - np.log(1 + np.exp(x))
        yz = y * (np.dot(X, w[:-1]) + w[-1])
        cost = -np.sum(np.vectorize(_log_logistic)(yz)) + 0.5 * alpha * np.dot(w[:-1], w[:-1])
        grad = np.zeros(len(w))
        t = (expit(yz) - 1) * y
        grad[:-1] = np.dot(X.T, t) + alpha * w[:-1]
        grad[-1] = np.sum(t)
        return cost, grad

    def _solve_lbfgs(self, X, y):
        result = np.zeros((y.shape[1], X.shape[1] + 1))
        for i in range(y.shape[1]):
            cur_y = y[:, i]
            w0 = np.zeros(X.shape[1] + 1)
            res = minimize(fun=self._cost_grad, jac=True, x0=w0,
                           args=(X, cur_y, 1 / self.C), method='L-BFGS-B')
            result[i] = res.x
        return result[:, :-1], result[:, -1]

    def fit(self, X, y):
        self.classes_, y_train = self._encode(y)
        self.coef_, self.intercept_ = self._solve_lbfgs(X, y_train)
        return self

    def decision_function(self, X):
        scores = np.dot(X, self.coef_.T) + self.intercept_
        if scores.shape[1] == 1:
            return scores.ravel()
        else:
            return scores

    def predict(self, X):
        scores = self.decision_function(X)
        if len(scores.shape) == 1:
            indices = (scores > 0).astype(int)
        else:
            indices = np.argmax(scores, axis=1)
        return self.classes_[indices]

    def predict_proba(self, X):
        scores = self.decision_function(X)
        prob = expit(scores)
        if len(scores.shape) == 1:
            prob = np.vstack((1 - prob, prob)).T
        else:
            prob /= np.sum(prob, axis=1)[:, np.newaxis]
        return prob


# In[3]:


# binary classification
for C in [0.1, 1, 10]:
    X, y = load_breast_cancer(return_X_y = True)
    clf1 = LogisticRegression(C=C).fit(X, y)
    clf2 = skLogisticRegression(C=C, multi_class="ovr", solver="lbfgs",
                                # keep consisent with scipy default
                                tol=1e-5, max_iter=15000).fit(X, y)
    assert clf1.coef_.shape == (1, X.shape[1])
    assert np.allclose(clf1.coef_, clf2.coef_)
    assert np.allclose(clf1.intercept_, clf2.intercept_)
    prob1 = clf1.decision_function(X)
    prob2 = clf2.decision_function(X)
    assert np.allclose(prob1, prob2)
    prob1 = clf1.predict_proba(X)
    prob2 = clf2.predict_proba(X)
    assert np.allclose(prob1, prob2)
    pred1 = clf1.predict(X)
    pred2 = clf2.predict(X)
    assert np.array_equal(pred1, pred2)


# In[4]:


# multiclass classification
for C in [0.1, 1, 10]:
    X, y = load_iris(return_X_y=True)
    clf1 = LogisticRegression(C=C).fit(X, y)
    clf2 = skLogisticRegression(C=C, multi_class="ovr", solver="lbfgs",
                                # keep consisent with scipy default
                                tol=1e-5, max_iter=15000).fit(X, y)
    assert clf1.coef_.shape == (len(np.unique(y)), X.shape[1])
    assert np.allclose(clf1.coef_, clf2.coef_)
    assert np.allclose(clf1.intercept_, clf2.intercept_)
    prob1 = clf1.decision_function(X)
    prob2 = clf2.decision_function(X)
    assert np.allclose(prob1, prob2)
    prob1 = clf1.predict_proba(X)
    prob2 = clf2.predict_proba(X)
    assert np.allclose(prob1, prob2)
    pred1 = clf1.predict(X)
    pred2 = clf2.predict(X)
    assert np.array_equal(pred1, pred2)


# In[5]:


# penalty = 'none'
X, y = load_iris(return_X_y=True)
clf1 = LogisticRegression(C=np.inf).fit(X, y)
clf2 = skLogisticRegression(penalty='none', multi_class="ovr", solver="lbfgs",
                            # keep consisent with scipy default
                            tol=1e-5, max_iter=15000).fit(X, y)
assert clf1.coef_.shape == (len(np.unique(y)), X.shape[1])
assert np.allclose(clf1.coef_, clf2.coef_)
assert np.allclose(clf1.intercept_, clf2.intercept_)
prob1 = clf1.decision_function(X)
prob2 = clf2.decision_function(X)
assert np.allclose(prob1, prob2)
prob1 = clf1.predict_proba(X)
prob2 = clf2.predict_proba(X)
assert np.allclose(prob1, prob2)
pred1 = clf1.predict(X)
pred2 = clf2.predict(X)
assert np.array_equal(pred1, pred2)


# ### Implementation 2
# - support multiclass classification problem directly
# - based on gradient decent
# - similar to sklearn multi_class='multinomial' & solver='lbfgs'

# In[6]:


class LogisticRegression():
    def __init__(self, C=1.0):
        self.C = C

    def _encode(self, y):
        classes = np.unique(y)
        y_train = np.zeros((y.shape[0], len(classes)))
        for i, c in enumerate(classes):
            y_train[y == c, i] = 1
        return classes, y_train

    @staticmethod
    def _cost_grad(w, X, y, alpha):
        w = w.reshape(y.shape[1], -1)
        p = np.dot(X, w[:, :-1].T) + w[:, -1]
        p -= logsumexp(p, axis=1)[:, np.newaxis]
        cost = -np.sum(y * p) + 0.5 * alpha * np.dot(w[:, :-1].ravel(), w[:, :-1].ravel())
        grad = np.zeros_like(w)
        diff = np.exp(p) - y
        grad[:, :-1] = np.dot(diff.T, X) + alpha * w[:, :-1]
        grad[:, -1] = np.sum(diff, axis=0)
        return cost, grad.ravel()

    def _solve_lbfgs(self, X, y):
        w0 = np.zeros(y.shape[1] * (X.shape[1] + 1))
        res = minimize(fun=self._cost_grad, jac=True, x0=w0,
                       args=(X, y, 1 / self.C), method='L-BFGS-B')
        result = res.x.reshape(y.shape[1], -1)
        if y.shape[1] == 2:
            result = result[1][np.newaxis, :]
        return result[:, :-1], result[:, -1]

    def fit(self, X, y):
        self.classes_, y_train = self._encode(y)
        self.coef_, self.intercept_ = self._solve_lbfgs(X, y_train)
        return self

    def decision_function(self, X):
        scores = np.dot(X, self.coef_.T) + self.intercept_
        if scores.shape[1] == 1:
            return scores.ravel()
        else:
            return scores

    def predict(self, X):
        scores = self.decision_function(X)
        if len(scores.shape) == 1:
            indices = (scores > 0).astype(int)
        else:
            indices = np.argmax(scores, axis=1)
        return self.classes_[indices]

    def predict_proba(self, X):
        scores = self.decision_function(X)
        if len(scores.shape) == 1:
            scores = np.c_[-scores, scores]
        scores -= np.max(scores, axis=1)[:, np.newaxis]
        prob = np.exp(scores)
        prob /= np.sum(prob, axis=1)[:, np.newaxis]
        return prob


# In[7]:


# binary classification
for C in [0.1, 1, 10]:
    X, y = load_breast_cancer(return_X_y = True)
    clf1 = LogisticRegression(C=C).fit(X, y)
    clf2 = skLogisticRegression(C=C, multi_class="multinomial", solver="lbfgs",
                                # keep consisent with scipy default
                                tol=1e-5, max_iter=15000).fit(X, y)
    assert clf1.coef_.shape == (1, X.shape[1])
    assert np.allclose(clf1.coef_, clf2.coef_)
    assert np.allclose(clf1.intercept_, clf2.intercept_)
    prob1 = clf1.decision_function(X)
    prob2 = clf2.decision_function(X)
    assert np.allclose(prob1, prob2)
    prob1 = clf1.predict_proba(X)
    prob2 = clf2.predict_proba(X)
    assert np.allclose(prob1, prob2)
    pred1 = clf1.predict(X)
    pred2 = clf2.predict(X)
    assert np.array_equal(pred1, pred2)


# In[8]:


# multiclass classification
for C in [0.1, 1, 10]:
    X, y = load_iris(return_X_y=True)
    clf1 = LogisticRegression(C=C).fit(X, y)
    clf2 = skLogisticRegression(C=C, multi_class="multinomial", solver="lbfgs",
                                # keep consisent with scipy default
                                tol=1e-5, max_iter=15000).fit(X, y)
    assert clf1.coef_.shape == (len(np.unique(y)), X.shape[1])
    assert np.allclose(clf1.coef_, clf2.coef_)
    assert np.allclose(clf1.intercept_, clf2.intercept_)
    prob1 = clf1.decision_function(X)
    prob2 = clf2.decision_function(X)
    assert np.allclose(prob1, prob2)
    prob1 = clf1.predict_proba(X)
    prob2 = clf2.predict_proba(X)
    assert np.allclose(prob1, prob2)
    pred1 = clf1.predict(X)
    pred2 = clf2.predict(X)
    assert np.array_equal(pred1, pred2)


# In[9]:


# penalty = 'none'
X, y = load_iris(return_X_y=True)
clf1 = LogisticRegression(C=np.inf).fit(X, y)
clf2 = skLogisticRegression(penalty='none', multi_class="multinomial", solver="lbfgs",
                            # keep consisent with scipy default
                            tol=1e-5, max_iter=15000).fit(X, y)
assert clf1.coef_.shape == (len(np.unique(y)), X.shape[1])
assert np.allclose(clf1.coef_, clf2.coef_)
assert np.allclose(clf1.intercept_, clf2.intercept_)
prob1 = clf1.decision_function(X)
prob2 = clf2.decision_function(X)
assert np.allclose(prob1, prob2)
prob1 = clf1.predict_proba(X)
prob2 = clf2.predict_proba(X)
assert np.allclose(prob1, prob2)
pred1 = clf1.predict(X)
pred2 = clf2.predict(X)
assert np.array_equal(pred1, pred2)