import dask.array as da
import numpy as np
from dask_glm.algorithms import (admm, gradient_descent,
newton, proximal_grad)
from dask_glm.families import Logistic
from dask_glm.utils import sigmoid, make_y
First, we will create some random data that fits nicely into the logistic family.
# turn off overflow warnings (exp overflow inside the sigmoid is expected and benign)
np.seterr(all='ignore')

# problem dimensions: N observations, p features, data split across `nchunks` chunks
N = 1000  # must be an int: 1e3 is a float, and array shapes/chunk sizes must be integers
p = 3
nchunks = 5

# random design matrix and a random "true" coefficient vector
X = da.random.random((N, p), chunks=(N // nchunks, p))
true_beta = np.random.random(p)

# simulate Bernoulli responses from the logistic model with coefficients true_beta
y = make_y(X, beta=true_beta, chunks=(N // nchunks,))

# add an intercept: append a column of ones so the model can fit a bias term
o = da.ones((X.shape[0], 1), chunks=(X.chunks[0], (1,)))
X_i = da.concatenate([X, o], axis=1)
Recall that when we "do logistic regression" we are solving an optimization problem (maximizing the appropriate log-likelihood function). Given input data $(X, y) \in \mathbb{R}^{n\times p}\times\{0, 1\}^n$, the gradient of our objective function at a point $\beta \in \mathbb{R}^p$ is given by
$$ X^T(\sigma(X\beta) - y), $$ where
$$ \sigma(x) = 1 / (1 + \exp(-x)) $$ is the sigmoid function.
As our objective function is convex, we will know we have found the global solution if the gradient at the estimate is the 0 vector. Let's check this condition for our unregularized algorithms: gradient descent and Newton's method.
# Fit the unregularized model with both solvers, using the same stopping tolerance.
solver_opts = dict(tol=1e-8, family=Logistic)
newtons_beta = newton(X_i, y, **solver_opts)
grad_beta = gradient_descent(X_i, y, **solver_opts)

# Evaluate the objective gradient at both estimates in a single dask compute pass.
newtons_grad, grad_grad = da.compute(
    *(Logistic.pointwise_gradient(b, X_i, y) for b in (newtons_beta, grad_beta))
)
## check the gradient: its L2 norm should be ~0 at a true optimum
print('Size of gradient')
print('='*30)
print('Newton\'s Method : {0:.2f}'.format(np.linalg.norm(newtons_grad)))
print('Gradient Descent : {0:.2f}'.format(np.linalg.norm(grad_grad)))
## check the gradient componentwise: the largest absolute entry should also be ~0
# header fixed: this block reports the max-abs component, not the norm, so the
# original duplicated 'Size of gradient' header was misleading
print('Max component of gradient')
print('='*30)
print('Newton\'s Method : {0:.2f}'.format(np.max(np.abs(newtons_grad))))
print('Gradient Descent : {0:.2f}'.format(np.max(np.abs(grad_grad))))
As we can see, Newton's Method successfully finds a true optimizer, whereas gradient descent doesn't do as well.
For problems with an intercept, notice that the component of the gradient corresponding to the intercept column is:
$$ \sum_{i=1}^n \left(\sigma(X\beta)_i - y_i\right), $$ which implies that the true solution $\beta^*$ has the property that the average prediction is equal to the average rate of 1's in the training data. This provides an easy high-level test for how well our algorithms are performing; however, this test tends to fail for gradient_descent
:
# check aggregate predictions: at an optimum with an intercept, the sum of the
# predicted probabilities matches the number of observed 1's, so these
# differences should be ~0 for a well-converged estimate
newton_preds = sigmoid(X_i.dot(newtons_beta))
grad_preds = sigmoid(X_i.dot(grad_beta))

print('Difference between aggregate predictions vs. aggregate level of 1\'s')
print('='*75)
for label, p_hat in (("Newton's Method", newton_preds),
                     ('Gradient Descent', grad_preds)):
    print('{} : {:.2f}'.format(label, (p_hat - y).sum().compute()))
We can also compare the objective function directly for each of these estimates; recall that in practice we minimize the negative log-likelihood, so we are looking for smaller values:
# Evaluate the negative log-likelihood at both estimates in one compute pass;
# since we minimize, smaller values are better.
losses = da.compute(
    Logistic.pointwise_loss(newtons_beta, X_i, y),
    Logistic.pointwise_loss(grad_beta, X_i, y),
)
newtons_loss, grad_loss = losses

## check log-likelihood
print('Negative Log-Likelihood')
print('='*30)
print('Newton\'s Method : {0:.4f}'.format(newtons_loss))
print('Gradient Descent : {0:.4f}'.format(grad_loss))
We do see that the function values are surprisingly close, but as the aggregate predictions check shows us, there is a material model difference between the estimates.
Now let us consider problems where we modify the log-likelihood by adding a "regularizer"; in our particular case we are optimizing a modified function where $\lambda \sum_{i=1}^p \left|\beta_i\right| =: \lambda \|\beta\|_1$ has been added to the likelihood function.
As above, we can perform a 0 gradient check to test for optimality, but our regularizer is not differentiable at 0 so we have to be careful at any coefficient values that are 0. For this test, we will also compare against sklearn
.
lamduh = 4.0
We should see two convergence prints, one for admm
and one for proximal_grad
:
# Reference fit with scikit-learn. penalty='l1' requires a solver that supports
# the l1 penalty -- the modern default 'lbfgs' raises for it -- so pin
# solver='liblinear'. sklearn's C is the inverse regularization strength, 1/lambda.
from sklearn.linear_model import LogisticRegression
mod = LogisticRegression(penalty='l1', C=1. / lamduh, solver='liblinear',
                         fit_intercept=False, tol=1e-8).fit(X.compute(), y.compute())
sk_beta = mod.coef_

# dask-glm l1-regularized fits: ADMM and proximal gradient on the same problem
admm_beta = admm(X, y, lamduh=lamduh, max_iter=700,
                 abstol=1e-8, reltol=1e-2, family=Logistic)
prox_beta = proximal_grad(X, y, family=Logistic, regularizer='l1',
                          tol=1e-8, lamduh=lamduh)
# optimality check for the l1-regularized problem (subgradient / KKT conditions)
def check_regularized_grad(beta, lamduh, tol=1e-6):
    """Print whether ``beta`` satisfies the l1 subgradient optimality conditions.

    For a coefficient at exactly 0 the subgradient of ``lamduh * |b|`` is the
    interval [-lamduh, lamduh], so optimality requires
    ``-lamduh <= grad_i <= lamduh``.  For a nonzero coefficient it requires
    ``grad_i + lamduh * sign(b_i) == 0`` up to ``tol``.

    Prints 'Optimality Fail' at the first violated condition, otherwise
    'Optimality Pass!'.

    Note: uses module-level ``Logistic``, ``X`` and ``y`` from this script.
    """
    opt_grad = Logistic.pointwise_gradient(beta, X.compute(), y.compute())
    for idx, b in enumerate(beta):
        if b == 0:
            # gradient must lie inside the subdifferential [-lamduh, lamduh]
            if not (opt_grad[idx] - lamduh <= 0 <= opt_grad[idx] + lamduh):
                print('Optimality Fail')
                break
        elif np.abs(opt_grad[idx] + lamduh * np.sign(b)) >= tol:
            # nonzero coefficient: gradient must cancel the penalty's gradient
            print('Optimality Fail')
            break
    else:
        # for/else: only reached when the loop ran to completion with no break.
        # The original checked `if b == beta[-1]`, which compares *values* and
        # could declare a premature (or repeated) pass when an earlier
        # coefficient happened to equal the last one.
        print('Optimality Pass!')
# tolerance below which a coefficient's gradient condition counts as satisfied
tol = 1e-4

# run the subgradient optimality check on each estimate
for header, estimate in (('scikit-learn', sk_beta[0, :]),
                         ('\nADMM', admm_beta),
                         ('\nProximal Gradient', prox_beta)):
    print(header)
    print('='*20)
    check_regularized_grad(estimate, lamduh=lamduh, tol=tol)

# show the fitted coefficient vectors side by side
print(prox_beta)
print(admm_beta)
print(sk_beta)