#!/usr/bin/env python
# coding: utf-8

# Simplex Regression with Trend Filtering
# =========================================
#
# This notebook showcases a structured regression method which defines the predictor as a weighted average of several values (e.g. outputs of baseline methods), with a temporal regularization which enforces piecewise-constant weights (often called "trend filtering").

# $\newcommand{\dotp}[2]{\langle #1, #2 \rangle}$
# $\newcommand{\enscond}[2]{\lbrace #1, #2 \rbrace}$
# $\newcommand{\pd}[2]{ \frac{ \partial #1}{\partial #2} }$
# $\newcommand{\umin}[1]{\underset{#1}{\min}\;}$
# $\newcommand{\umax}[1]{\underset{#1}{\max}\;}$
# $\newcommand{\uargmin}[1]{\underset{#1}{argmin}\;}$
# $\newcommand{\norm}[1]{\|#1\|}$
# $\newcommand{\abs}[1]{\left|#1\right|}$
# $\newcommand{\choice}[1]{ \left\{ \begin{array}{l} #1 \end{array} \right. }$
# $\newcommand{\pa}[1]{\left(#1\right)}$
# $\newcommand{\diag}[1]{{diag}\left( #1 \right)}$
# $\newcommand{\qandq}{\quad\text{and}\quad}$
# $\newcommand{\qwhereq}{\quad\text{where}\quad}$
# $\newcommand{\qifq}{ \quad \text{if} \quad }$
# $\newcommand{\qarrq}{ \quad \Longrightarrow \quad }$
# $\newcommand{\ZZ}{\mathbb{Z}}$
# $\newcommand{\CC}{\mathbb{C}}$
# $\newcommand{\RR}{\mathbb{R}}$
# $\newcommand{\EE}{\mathbb{E}}$
# $\newcommand{\Zz}{\mathcal{Z}}$
# $\newcommand{\Ww}{\mathcal{W}}$
# $\newcommand{\Vv}{\mathcal{V}}$
# $\newcommand{\Nn}{\mathcal{N}}$
# $\newcommand{\NN}{\mathcal{N}}$
# $\newcommand{\Hh}{\mathcal{H}}$
# $\newcommand{\Bb}{\mathcal{B}}$
# $\newcommand{\Ee}{\mathcal{E}}$
# $\newcommand{\Cc}{\mathcal{C}}$
# $\newcommand{\Gg}{\mathcal{G}}$
# $\newcommand{\Ss}{\mathcal{S}}$
# $\newcommand{\Pp}{\mathcal{P}}$
# $\newcommand{\Ff}{\mathcal{F}}$
# $\newcommand{\Xx}{\mathcal{X}}$
# $\newcommand{\Mm}{\mathcal{M}}$
# $\newcommand{\Ii}{\mathcal{I}}$
# $\newcommand{\Dd}{\mathcal{D}}$
# $\newcommand{\Ll}{\mathcal{L}}$
# $\newcommand{\Tt}{\mathcal{T}}$
# $\newcommand{\si}{\sigma}$
# $\newcommand{\al}{\alpha}$
# $\newcommand{\la}{\lambda}$
# $\newcommand{\ga}{\gamma}$
# $\newcommand{\Ga}{\Gamma}$
# $\newcommand{\La}{\Lambda}$
# $\newcommand{\Si}{\Sigma}$
# $\newcommand{\be}{\beta}$
# $\newcommand{\de}{\delta}$
# $\newcommand{\De}{\Delta}$
# $\newcommand{\phi}{\varphi}$
# $\newcommand{\th}{\theta}$
# $\newcommand{\om}{\omega}$
# $\newcommand{\Om}{\Omega}$

# You need to install [CVXPY](https://www.cvxpy.org/). _Warning:_ CVXPY seems not to work on Python 3.7; use Python 3.6 instead.

# In[ ]:

import numpy as np
import matplotlib.pyplot as plt
import cvxpy as cp

# Simplex Regression
# =========================================

# In the following, $n$ is the number of "temporal" samples (for instance, angles) and $m$ is the number of methods.

# In[ ]:

n = 360  # number of "temporal" samples (angles)
m = 4    # number of methods

# The set of regression weights is $W \in \RR^{n \times m}$, and the linear regression of $y_i$ at time (or angle) $i$ reads
# $$
#   y_i \approx \sum_{j} M_{i,j} W_{i,j}.
# $$
# Here $M_{i,j}$ is the output of method $j$ at time $i$.
#
# The constraint is that for each time $i$, the weight vector $W_{i,\cdot} = (W_{i,j})_{j=1}^m$ lies in the probability simplex
# $$
#   W_{i,j} \geq 0 \qandq
#   \sum_j W_{i,j}=1.
# $$
# We rewrite this constraint conveniently as $W \geq 0$, $W 1_m = 1_n$.
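# As a sanity check on this matrix form, the small helper below (a minimal sketch added for illustration; `check_simplex` is a hypothetical helper, not part of CVXPY) verifies numerically that each row of a candidate weight matrix lies in the probability simplex.

# In[ ]:

# Illustrative helper (not from the original notebook): check that
# W >= 0 and W 1_m = 1_n hold up to a numerical tolerance.
def check_simplex(W, tol=1e-8):
    nonneg = np.all(W >= -tol)                                    # W_{i,j} >= 0
    rows_sum_to_one = np.allclose(W.sum(axis=1), 1.0, atol=tol)   # W 1_m = 1_n
    return bool(nonneg and rows_sum_to_one)

# After solving one of the problems below, `check_simplex(W.value)` can be used to confirm that the solver returned a feasible point.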
# We use an $\ell^2$ regression loss penalized by the sum of absolute values of the temporal differences of the weights
# $$ \min_{W \geq 0, W 1_m = 1_n} \sum_i \Big( y_i - \sum_{j} M_{i,j} W_{i,j} \Big)^2 + \lambda \sum_{i,j} |W_{i+1,j}-W_{i,j}| $$

# Generate random data $M$ and $y$.

# In[ ]:

M = np.random.randn(n, m)
y = np.random.randn(n)

# Define the optimization variable $W$.

# In[ ]:

W = cp.Variable((n, m))

# Define the constraints (each row of $W$ lies in the simplex).

# In[ ]:

u = np.ones((m, 1))
v = np.ones((n, 1))
U = [0 <= W, cp.matmul(W, u) == v]

# Regularization parameter $\lambda$. Increasing this value makes the resulting weights more and more piecewise constant.

# In[ ]:

Lambda = 10

# Solve the minimization using CVXPY.

# In[ ]:

objective = cp.Minimize(cp.sum((cp.sum(cp.multiply(M, W), axis=1) - y)**2)
                        + Lambda * cp.sum(cp.abs(W[1:, :] - W[:-1, :])))
prob = cp.Problem(objective, U)
result = prob.solve()

# Display the evolution of the weights associated with the first three methods as a function of time.

# In[8]:

Wm = W.value
plt.plot(Wm[:, 0])
plt.plot(Wm[:, 1])
plt.plot(Wm[:, 2]);

# Group-Lasso Regularization
# ============================
#
# It is possible to enforce that the breakpoints of the different weight curves are "synchronized" in time by using a group-lasso regularization
# $$
# \min_{W \geq 0, W 1_m = 1_n}
#   \sum_i \Big( y_i - \sum_{j} M_{i,j} W_{i,j} \Big)^2
#   + \lambda \sum_{i} \sqrt{ \sum_j |W_{i+1,j}-W_{i,j}|^2 }
# $$

# In[ ]:

Lambda = 20
objective = cp.Minimize(cp.sum((cp.sum(cp.multiply(M, W), axis=1) - y)**2)
                        + Lambda * cp.mixed_norm(W[1:, :] - W[:-1, :], 2, 1))
prob = cp.Problem(objective, U)
result = prob.solve()

# In[10]:

Wm = W.value
plt.plot(Wm[:, 0])
plt.plot(Wm[:, 1])
plt.plot(Wm[:, 2]);

# We now show the evolution of the solution for varying $\lambda$.

# In[11]:

Lambda_list = np.array([2, 5, 10, 20])
for i in range(4):
    Lambda = Lambda_list[i]
    objective = cp.Minimize(cp.sum((cp.sum(cp.multiply(M, W), axis=1) - y)**2)
                            + Lambda * cp.mixed_norm(W[1:, :] - W[:-1, :], 2, 1))
    prob = cp.Problem(objective, U)
    result = prob.solve()
    Wm = W.value
    ax1 = plt.subplot(2, 2, i + 1)
    plt.plot(Wm[:, 0])
    plt.plot(Wm[:, 1])
    plt.plot(Wm[:, 2])
    plt.title(r'$\lambda$=' + str(Lambda))
    ax1.set_xticklabels([])
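# To quantify how piecewise constant the weights become, one can count the breakpoints of the last computed solution `Wm`. This is a minimal sketch added for illustration: the threshold $10^{-3}$ is an arbitrary choice, not a value from the original notebook.

# In[ ]:

# Euclidean norm of successive row differences of the weights; a row with
# a large norm corresponds to a breakpoint shared across all methods.
jumps = np.linalg.norm(Wm[1:, :] - Wm[:-1, :], axis=1)
breakpoints = np.nonzero(jumps > 1e-3)[0]  # 1e-3: arbitrary illustrative threshold
print('number of breakpoints:', breakpoints.size)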