#!/usr/bin/env python
# coding: utf-8

# # Smooth Bilevel Programming for Sparse Regularization
#
# This tour is a tutorial reviewing the method developed in the paper:
#
# > _[Smooth Bilevel Programming for Sparse Regularization](https://arxiv.org/abs/2106.01429)_,
# > Clarice Poon, Gabriel Peyré, 2021.
#
# We present a surprisingly simple reformulation of the Lasso as a bilevel program, which can be solved using efficient methods such as L-BFGS. This gives an algorithm which is simple to implement and is in the same ballpark as state-of-the-art approaches when the regularization parameter $\lambda$ is small (in particular, it copes seamlessly with the constrained formulation when $\lambda=0$). For large $\lambda$, when the solution is very sparse, the method is not as good as, for instance, coordinate methods with safe screening/support pruning such as the [Celer](https://github.com/mathurinm/celer) solver, which we warmly recommend.

# In[1]:


import numpy as np
from numpy.linalg import norm
import matplotlib.pyplot as plt
import time
import warnings
import progressbar


# # Lasso problem
#
# We aim at solving the [Lasso](https://en.wikipedia.org/wiki/Lasso_(statistics)) problem
# $$
# \min_x \frac{1}{2\lambda}\| A x - y \|^2 + \|x\|_1.
# $$
# We generate here a synthetic example using a random matrix $A \in \mathbb{R}^{n \times p}$ and $y \in \mathbb{R}^n$.

# In[2]:


n = 10
p = n*2
np.random.seed(0)
A = np.random.randn(n, p)
y = np.random.randn(n)


# The simplest (but arguably not the fastest) algorithm to solve this problem is the iterative soft thresholding (ISTA)
#
# $$
# x_{k+1} = \text{Thresh}_{\tau \lambda}( x_k - \tau A^\top (Ax_k-y) )
# $$
#
# where the step size should satisfy $\tau < 2/\|A\|^2$, and Thresh is the soft thresholding
#
# $$
# \text{Thresh}_\sigma( x ) = \text{sign}(x) \max(|x|-\sigma,0).
# $$

# In[3]:


def Thresh(x, sigma):
    return np.sign(x) * np.maximum(np.abs(x) - sigma, 0)

x = np.linspace(-3, 3, 1000)
plt.plot(x, Thresh(x, 1.5), label='Thresh$_{1.5}(x)$')
plt.legend();


# The critical value of $\lambda$ above which the solution of the Lasso is $0$ is $\|A^\top y\|_\infty$. We select $\lambda$ as a fraction of this value.

# In[4]:


la = .5 * np.max(np.abs(A.T @ y))


# In[5]:


# the Frobenius norm upper-bounds the operator norm, so this (conservative) step size is valid
tau = 1 / np.linalg.norm(A) ** 2
niter = 100
x = np.zeros(p)
x_ista = np.zeros((p, niter))
for it in range(niter):
    x = Thresh(x - tau*A.T @ (A@x - y), tau*la)
    x_ista[:, it] = x
plt.plot(x_ista.T);


# # Non-convex Variational Projection
#
# The following variational formulation is pivotal for the development of our approach
# $$
# \|x\|_1 = \min_{u \odot v = x} \frac{\|u\|_2^2 + \|v\|_2^2 }{2}
# $$
# where $u \odot v = (u_i v_i)_{i=1}^p$.
#
# Thanks to this formula, we can minimize the Lasso problem over $(u,v)$ instead of $x$ (which corresponds to an over-parameterization of the problem). The crux of the method is that instead of doing an alternating minimization over $u$ and $v$ (which is a classical method), we rather consider a bilevel formulation
#
# $$
# \min_{v} f(v)
# \quad
# \text{where}
# \quad
# f(v) = \min_{u} F(u,v) \triangleq \frac{1}{2 \lambda}\| A(u \odot v) - y \|^2
# + \frac{\|u\|_2^2 + \|v\|_2^2 }{2}.
# $$
# The effect of this bilevel formulation is to make the function $f$ better conditioned (and in particular smooth) than the original Lasso functional.
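# As a quick sanity check of the variational formula above (a small illustrative snippet), one can verify numerically that the factorization $u = \text{sign}(x) \odot \sqrt{|x|}$, $v = \sqrt{|x|}$ satisfies $u \odot v = x$ and attains $\frac{\|u\|_2^2+\|v\|_2^2}{2} = \|x\|_1$.


x0 = np.random.randn(6)
u0 = np.sign(x0) * np.sqrt(np.abs(x0))
v0 = np.sqrt(np.abs(x0))
print(f"factorization error |u0*v0 - x0| = {norm(u0*v0 - x0):.2e}")
print(f"(|u0|^2 + |v0|^2)/2 = {(norm(u0)**2 + norm(v0)**2)/2:.4f}")
print(f"|x0|_1              = {norm(x0, 1):.4f}")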
# For a given $v$, the optimal $u$ solving the inner problem $\min_u F(u,v)$ is unique and can be found by solving a linear system
#
# $$
# u^\star(v) = ( \text{diag}(v) A^\top A \, \text{diag}(v) + \lambda \text{Id}_p )^{-1} ( v \odot A^\top y ).
# $$

# In[6]:


def u_opt1(v):
    T = np.diag(v) @ A.T @ A @ np.diag(v) + la * np.eye(p)
    return np.linalg.solve(T, v * (A.T@y))


# Using [Sherman–Morrison](https://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula), this equivalently reads
# $$
# u^\star(v) = v \odot A^\top ( A \, \text{diag}(v^2) A^\top + \lambda \text{Id}_n )^{-1} y.
# $$
# This formula is more efficient when $p>n$ (which is the case here), since it only requires solving an $n \times n$ system.

# In[7]:


def u_opt(v):
    S = A @ np.diag(v**2) @ A.T + la * np.eye(n)
    return v * (A.T @ np.linalg.solve(S, y))


# Compare the two formulas.

# In[8]:


v = np.random.randn(p)
print(f"Should be 0: {norm(u_opt1(v) - u_opt(v)):.2e}")


# According to the [envelope theorem](https://en.wikipedia.org/wiki/Envelope_theorem), the function $f$ is thus differentiable and its gradient reads
# $$
# \nabla f(v) = \nabla_v F(u^\star(v),v)
# =
# \frac{1}{\lambda} u^\star \odot A^\top ( A (u^\star \odot v) - y ) + v.
# $$

# In[9]:


def nabla_f(v):
    u = u_opt(v)
    function = 1/(2*la) * norm(A@(u*v) - y)**2 + (norm(u)**2 + norm(v)**2)/2
    gradient = u * (A.T @ (A@(u*v) - y)) / la + v
    return function, gradient


# Test using finite differences that the formula is correct:
# $$
# \frac{f(v+t \delta) - f(v)}{t} \approx \langle \nabla f(v), \delta \rangle.
# $$

# In[10]:


t = 1e-6
delta = np.random.randn(p)
f, g = nabla_f(v)
f1, g1 = nabla_f(v + t*delta)
d1 = (f1 - f) / t
d2 = np.sum(delta * g)
print("Should be small: " + str((d1 - d2) / d1))


# We can also let scipy do the work for us using `scipy.optimize.check_grad()`:

# In[11]:


from scipy import optimize
optimize.check_grad(
    lambda v: nabla_f(v)[0],
    lambda v: nabla_f(v)[1],
    np.random.randn(p))


# We implement a simple [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent)
# $$
# v_{k+1} = v_k - \tau \nabla f(v_k)
# $$
# and display the evolution of $x_k = u^\star(v_k) \odot v_k$.

# In[12]:


tau = .05
niter = 200
x_pro = np.zeros((p, niter))
v = np.random.randn(p)
for it in range(niter):
    f, g = nabla_f(v)
    v -= tau*g
    x_pro[:, it] = v * u_opt(v)
plt.plot(x_pro.T);


# While the trajectories of ISTA are piecewise smooth, and are not differentiable when crossing 0, the trajectories of this bilevel method can smoothly cross or approach 0.
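# As a rough check (an illustrative snippet; `lasso_obj` is a hypothetical helper, and `x_ista`, `x_pro` are the trajectories computed above), one can compare the Lasso objective reached by the last ISTA iterate and by the last iterate of the bilevel gradient descent on this synthetic problem; the two values should be comparable once both methods are near convergence.


def lasso_obj(x):
    # Lasso objective 1/(2*lambda) ||Ax - y||^2 + ||x||_1
    return 1/(2*la) * norm(A @ x - y)**2 + norm(x, 1)

print(f"Lasso objective, ISTA last iterate:    {lasso_obj(x_ista[:, -1]):.6f}")
print(f"Lasso objective, bilevel last iterate: {lasso_obj(x_pro[:, -1]):.6f}")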
# # Non-convex Pro method: using BFGS
#
# One can show that $f$ is a smooth function. Although it is non-convex, it has no spurious local minima and its saddle points (such as $v=0$) are "ridable" (a.k.a. strict saddles), so that the trajectories of descent methods are not blocked at such saddle points.
#
# We load a dataset from LibSVM using https://github.com/mathurinm/libsvmdata. In some cases, the $A$ matrix is sparse.

# In[13]:


name = 'w8a'
name = 'connect-4'
name = 'mnist'
from libsvmdata import fetch_libsvm
A, y = fetch_libsvm(name)
n, p = A.shape
print(f'Dataset shape: n={n}, p={p}')


# In[14]:


import scipy
if scipy.sparse.issparse(A):
    A = A.toarray()
from sklearn.preprocessing import StandardScaler
A = StandardScaler().fit_transform(A)
y -= y.mean()


# We select $\lambda$ as a fraction of $\|A^\top y\|_\infty$.

# In[15]:


la = .1 * norm(A.T @ y, ord=np.inf)


# We then re-define the functions computing $u^\star(v)$ and $\nabla f$, selecting the most efficient formula to compute $u^\star(v)$.

# In[16]:


def u_opt(v):
    S = A @ np.diag(v**2) @ A.T + la * np.eye(n)
    return v * (A.T @ np.linalg.solve(S, y))

def nabla_f(v):
    u = u_opt(v)
    f = 1/(2*la) * norm(A@(u*v) - y)**2 + (norm(u)**2 + norm(v)**2) / 2
    g = u * (A.T @ (A@(u*v) - y)) / la + v
    return f, g


# Denoting $C \triangleq A^\top A \in \mathbb{R}^{p \times p}$ the correlation matrix and $h \triangleq A^\top y \in \mathbb{R}^p$, since
# $$
# \|Ax-y\|^2 = \langle Cx,x \rangle + \langle y,y \rangle - 2 \langle x,h \rangle,
# $$
# the Lasso problem only depends on $C$ and $h$. When $n>p$, it is faster to implement the solver in these variables.

# In[17]:


C = A.T@A
ATy = A.T@y
y2 = y @ y
if scipy.sparse.issparse(C):
    C = C.toarray()

def u_opt_cov(v):
    T = np.outer(v, v) * C + la * np.eye(p)
    return np.linalg.solve(T, v * ATy)

def nabla_f_cov(v):
    u = u_opt_cov(v)
    x = u * v
    E = (C@x) @ x + y2 - 2 * x @ ATy
    f = 1/(2*la) * E + (norm(u)**2 + norm(v)**2) / 2
    g = u * (C@x - ATy) / la + v
    return f, g


# In[18]:


if n < 200 and p < 200:
    v = np.random.randn(p)
    f1, g1 = nabla_f(v)
    f2, g2 = nabla_f_cov(v)
    print(f'Function difference, should be 0: {(f1-f2):.2e}')
    print(f'Gradient difference, should be 0: {norm(g1-g2):.2e}')


# In[19]:


# select the most efficient formulation depending on the shape of A
if p < n:
    u_opt = u_opt_cov
    nabla_f = nabla_f_cov
    print('p<n, using covariance mode')
else:
    print('p>n, using full mode')


# In order to obtain an efficient numerical scheme, one should use a quasi-Newton method such as [L-BFGS](https://en.wikipedia.org/wiki/Broyden%E2%80%93Fletcher%E2%80%93Goldfarb%E2%80%93Shanno_algorithm). The option `maxcor` controls the size of the memory.

# In[20]:


# callback to store intermediate results
flist = []
tlist = []
t0 = time.time()
def callback(v):
    f, g = nabla_f(v)
    flist.append(f)
    tlist.append(time.time() - t0)
    return f, g

# run L-BFGS
import scipy.optimize
v0 = np.random.randn(p)
opts = {'gtol': 1e-30, 'maxiter': 1000, 'maxcor': 10, 'ftol': 1e-30, 'maxfun': 10000}
result = scipy.optimize.minimize(
    callback, v0, method='L-BFGS-B', jac=True, tol=1e-30, options=opts)

# retrieve optimal solution
v = result.x
x_pro = v * u_opt(v)


# In[22]:


tlist = np.array(tlist)
flist = np.array(flist)
fstar = np.min(flist)
plt.semilogy(tlist, flist - fstar)
plt.xlabel("time (s)")
plt.ylabel(r"$f(v_k) - f^\star$");


# In[23]:


plt.stem(x_pro)
s = np.sum(np.abs(x_pro) > 1e-3)
print(f'Sparsity: {int(s/p*100)} %')


# We compare against [Celer](https://github.com/mathurinm/celer)'s algorithm. Note that this algorithm is very efficient (more efficient than the bilevel method) for large $\lambda$.

# In[24]:


def lasso_func(x):
    return 1/(2*la) * norm(A@x - y)**2 + norm(x, 1)


# In[25]:


import celer
warnings.filterwarnings("ignore")
m = 10
niter_list = np.arange(0, 20, 20 // m)
tlist_celer = np.zeros(m)
flist_celer = np.zeros(m)
for i in progressbar.progressbar(range(m)):
    niter = niter_list[i]
    t0 = time.time()
    x_celer = celer.Lasso(
        alpha=la/n, verbose=False, fit_intercept=False,
        max_iter=niter, tol=1e-20).fit(A, y).coef_.T
    tlist_celer[i] = time.time() - t0
    flist_celer[i] = lasso_func(x_celer)


# In[27]:


fstar = min(np.min(flist_celer), np.min(flist))
tmax = np.inf
# plot the objective gap versus time for both solvers
# (the end of this cell was truncated in the source; this is a likely completion)
plt.semilogy(tlist[tlist < tmax], flist[tlist < tmax] - fstar, label='bilevel (L-BFGS)')
plt.semilogy(tlist_celer[tlist_celer < tmax], flist_celer[tlist_celer < tmax] - fstar, label='celer')
plt.xlabel("time (s)")
plt.ylabel(r"objective $- f^\star$")
plt.legend();
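# As a final comparison (an illustrative snippet reusing the `lasso_func`, `x_pro` and `x_celer` variables computed above), we can report the Lasso objective and the support size of the two solutions; the two solvers should reach comparable objective values.


for label, x_sol in [("bilevel (L-BFGS)", x_pro), ("celer", x_celer)]:
    nnz = np.sum(np.abs(x_sol) > 1e-3)
    print(f"{label}: objective = {lasso_func(x_sol):.4e}, support size = {nnz} / {p}")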