#!/usr/bin/env python
# coding: utf-8

# Performance of Sparse Recovery Using L1 Minimization
# ====================================================
#
# *Important:* Please read the [installation page](http://gpeyre.github.io/numerical-tours/installation_python/) for details about how to install the toolboxes.
# $\newcommand{\dotp}[2]{\langle #1, #2 \rangle}$
# $\newcommand{\enscond}[2]{\lbrace #1, #2 \rbrace}$
# $\newcommand{\pd}[2]{ \frac{ \partial #1}{\partial #2} }$
# $\newcommand{\umin}[1]{\underset{#1}{\min}\;}$
# $\newcommand{\umax}[1]{\underset{#1}{\max}\;}$
# $\newcommand{\uargmin}[1]{\underset{#1}{argmin}\;}$
# $\newcommand{\norm}[1]{\|#1\|}$
# $\newcommand{\abs}[1]{\left|#1\right|}$
# $\newcommand{\choice}[1]{ \left\{ \begin{array}{l} #1 \end{array} \right. }$
# $\newcommand{\pa}[1]{\left(#1\right)}$
# $\newcommand{\diag}[1]{{diag}\left( #1 \right)}$
# $\newcommand{\qandq}{\quad\text{and}\quad}$
# $\newcommand{\qwhereq}{\quad\text{where}\quad}$
# $\newcommand{\qifq}{ \quad \text{if} \quad }$
# $\newcommand{\qarrq}{ \quad \Longrightarrow \quad }$
# $\newcommand{\ZZ}{\mathbb{Z}}$
# $\newcommand{\CC}{\mathbb{C}}$
# $\newcommand{\RR}{\mathbb{R}}$
# $\newcommand{\EE}{\mathbb{E}}$
# $\newcommand{\Zz}{\mathcal{Z}}$
# $\newcommand{\Ww}{\mathcal{W}}$
# $\newcommand{\Vv}{\mathcal{V}}$
# $\newcommand{\Nn}{\mathcal{N}}$
# $\newcommand{\NN}{\mathcal{N}}$
# $\newcommand{\Hh}{\mathcal{H}}$
# $\newcommand{\Bb}{\mathcal{B}}$
# $\newcommand{\Ee}{\mathcal{E}}$
# $\newcommand{\Cc}{\mathcal{C}}$
# $\newcommand{\Gg}{\mathcal{G}}$
# $\newcommand{\Ss}{\mathcal{S}}$
# $\newcommand{\Pp}{\mathcal{P}}$
# $\newcommand{\Ff}{\mathcal{F}}$
# $\newcommand{\Xx}{\mathcal{X}}$
# $\newcommand{\Mm}{\mathcal{M}}$
# $\newcommand{\Ii}{\mathcal{I}}$
# $\newcommand{\Dd}{\mathcal{D}}$
# $\newcommand{\Ll}{\mathcal{L}}$
# $\newcommand{\Tt}{\mathcal{T}}$
# $\newcommand{\si}{\sigma}$
# $\newcommand{\al}{\alpha}$
# $\newcommand{\la}{\lambda}$
# $\newcommand{\ga}{\gamma}$
# $\newcommand{\Ga}{\Gamma}$
# $\newcommand{\La}{\Lambda}$
# $\newcommand{\Si}{\Sigma}$
# $\newcommand{\be}{\beta}$
# $\newcommand{\de}{\delta}$
# $\newcommand{\De}{\Delta}$
# $\newcommand{\phi}{\varphi}$
# $\newcommand{\th}{\theta}$
# $\newcommand{\om}{\omega}$
# $\newcommand{\Om}{\Omega}$

# This tour explores theoretical guarantees for the performance of sparse recovery using $\ell^1$ minimization.

# In[1]:

from __future__ import division
import numpy as np
import scipy as scp
import pylab as pyl
import matplotlib.pyplot as plt
from nt_toolbox.general import *
from nt_toolbox.signal import *
import warnings
warnings.filterwarnings('ignore')
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# Sparse $\ell^1$ Recovery
# --------------------------
# We consider the inverse problem of estimating an unknown signal $x_0 \in \RR^N$ from noisy measurements $y=\Phi x_0 + w \in \RR^P$, where $\Phi \in \RR^{P \times N}$ is a measurement matrix with $P \leq N$ and $w$ is some noise.
#
# This tour focuses on recovery using $\ell^1$ minimization
# $$ x^\star \in \uargmin{x \in \RR^N} \frac{1}{2}\norm{y-\Phi x}^2 + \la \norm{x}_1. $$
#
# When there is no noise, we consider the problem $ \Pp(y) $
# $$ x^\star \in \uargmin{\Phi x = y} \norm{x}_1. $$
#
# We are not concerned here with the actual way to solve this convex problem (see the other numerical tours on sparse regularization) but rather with the theoretical analysis of whether $x^\star$ is close to $x_0$.
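# Although solving these programs is not the topic of this tour, the following minimal
# iterative soft-thresholding (ISTA) sketch illustrates how an approximate minimizer of
# the noisy problem could be computed. The function name ista_l1, the step size and the
# iteration count are illustrative choices, not part of the original tour.

def ista_l1(Phi, y, la, niter=200):
    """Sketch of ISTA for min_x 1/2*||y - Phi x||^2 + la*||x||_1 (illustrative only)."""
    soft = lambda u, t: np.sign(u) * np.maximum(np.abs(u) - t, 0)  # soft-thresholding
    tau = 1 / np.linalg.norm(Phi, 2)**2  # step size below 1/||Phi||^2 ensures convergence
    x = np.zeros(Phi.shape[1])
    for _ in range(niter):
        # gradient step on the quadratic term, then proximal (thresholding) step
        x = soft(x + tau * np.dot(Phi.T, y - np.dot(Phi, x)), tau * la)
    return x

# The step size is tied to the spectral norm of Phi; a larger la promotes sparser solutions.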
# More precisely, we consider the following three key properties:
#
# * *Noiseless identifiability*: $x_0$ is the unique solution of $ \Pp(y) $ for $y=\Phi x_0$.
# * *Robustness to small noise*: one has $\norm{x^\star - x_0} = O(\norm{w})$ for $y=\Phi x_0+w$, provided $\norm{w}$ is smaller than a small constant that depends on $x_0$ and $\la$ is well chosen according to $\norm{w}$.
# * *Robustness to bounded noise*: same as above, but $\norm{w}$ can be arbitrarily large.
#
# Note that noise robustness implies identifiability, but the converse is not true in general.

# Coherence Criteria
# ------------------
# The simplest criterion for identifiability is based on the coherence of the matrix $\Phi$ and depends only on the sparsity $\norm{x_0}_0$ of the original signal. This criterion is therefore not very precise and gives pessimistic bounds.
#
# The coherence of a matrix $\Phi = ( \phi_i )_{i=1}^N \in \RR^{P \times N}$ with unit-norm columns $\norm{\phi_i}=1$ is
# $$ \mu(\Phi) = \umax{i \neq j} \abs{\dotp{\phi_i}{\phi_j}}. $$

# Compute the correlation matrix (remove the diagonal of 1's).

# In[2]:

remove_diag = lambda C: C - np.diag(np.diag(C))
Correlation = lambda Phi: remove_diag(abs(np.dot(np.transpose(Phi), Phi)))

# Compute the coherence $\mu(\Phi)$.

# In[3]:

mu = lambda Phi: np.max(Correlation(Phi))

# The condition
# $$ \norm{x_0}_0 < \frac{1}{2}\pa{1 + \frac{1}{\mu(\Phi)}} $$
# implies that $x_0$ is identifiable, and also implies robustness to small and bounded noise.
#
# Equivalently, this condition can be written as $\text{Coh}(\norm{x_0}_0)<1$ where
# $$ \text{Coh}(k) = \frac{k \mu(\Phi)}{ 1 - (k-1)\mu(\Phi) }. $$

# In[4]:

Coh = lambda Phi, k: (k*mu(Phi))/(1 - (k-1)*mu(Phi))

# Generate a matrix with random unit-norm columns in $\RR^P$.

# In[5]:

from numpy import random
normalize = lambda Phi: Phi/np.tile(np.sqrt(np.sum(Phi**2, 0)), (np.shape(Phi)[0], 1))
PhiRand = lambda P, N: normalize(random.randn(P, N))
Phi = PhiRand(250, 1000)

# Compute the coherence and the maximum possible sparsity that ensures recovery using the coherence bound.

# In[6]:

c = mu(Phi)
print("Coherence: %.2f" %c)
print("Sparsity max: %d" %np.floor(1/2*(1 + 1/c)))

# __Exercise 1__
#
# Display how the average coherence of a random matrix decays with the redundancy $\eta = P/N$ of the matrix $\Phi$. Can you derive an empirical law between $P$ and the maximal sparsity?

# In[7]:

run -i nt_solutions/sparsity_6_l1_recovery/exo1

# In[8]:

## Insert your code here.

# Support and Sign-based Criteria
# -------------------------------
# In the following we will consider the support
# $$ I = \text{supp}(x_0) = \enscond{i}{x_0(i) \neq 0} $$
# of the vector $x_0$. The co-support is its complement $I^c$.

# In[9]:

supp   = lambda s: np.where(abs(s) > 1e-5)[0]
cosupp = lambda s: np.where(abs(s) < 1e-5)[0]

# Given some support $ I \subset \{0,\ldots,N-1\} $, we denote by $ \Phi_I = (\phi_m)_{m \in I} \in \RR^{P \times \abs{I}}$ the sub-matrix extracted from $\Phi$ using the columns indexed by $I$.
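# The sign-based criteria introduced below require the sub-matrix $\Phi_I$ to have full rank.
# The following small sanity check is only an illustrative sketch; the support I used here is
# an arbitrary choice, not part of the original tour.

I = np.array([5, 12, 73, 250, 999])   # arbitrary example support (hypothetical choice)
Phi_I = Phi[:, I]                      # columns of Phi indexed by I
print("Phi_I has shape", Phi_I.shape, "and full column rank:", np.linalg.matrix_rank(Phi_I) == len(I))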
# J.J. Fuchs introduced a criterion $\text{F}$ for identifiability that depends on the sign of $x_0$:
#
# J.J. Fuchs. _Recovery of exact sparse representations in the presence of bounded noise._ IEEE Trans. Inform. Theory, 51(10), pp. 3601-3608, 2005.
#
# Under the condition that $\Phi_I$ has full rank, the $\text{F}$ measure of a sign vector $s \in \{+1,0,-1\}^N$ with $\text{supp}(s)=I$ reads
# $$ \text{F}(s) = \norm{ \Psi_I s_I }_\infty \qwhereq \Psi_I = \Phi_{I^c}^* \Phi_I^{+,*} $$
# where $ A^+ = (A^* A)^{-1} A^* $ is the pseudo-inverse of a matrix $A$.
#
# The condition
# $$ \text{F}(\text{sign}(x_0))<1 $$
# implies that $x_0$ is identifiable, and also implies robustness to small noise. It does not, however, imply robustness to bounded noise.

# Compute the $\Psi_I$ matrix.

# In[10]:

from numpy import linalg
PsiI = lambda Phi, I: np.dot(np.transpose(Phi[:, np.setdiff1d(np.arange(0, np.shape(Phi)[1]), I)]),
                             np.transpose(linalg.pinv(Phi[:, I])))

# Compute $\text{F}(s)$.

# In[11]:

F = lambda Phi, s: linalg.norm(np.dot(PsiI(Phi, supp(s)), s[supp(s)]), np.inf)

# The Exact Recovery Criterion (ERC) of a support $I$ was introduced by Tropp in
#
# J. A. Tropp. _Just relax: Convex programming methods for identifying sparse signals._ IEEE Trans. Inform. Theory, vol. 52, num. 3, pp. 1030-1051, Mar. 2006.
#
# Under the condition that $\Phi_I$ has full rank, it reads
# $$ \text{ERC}(I) = \norm{\Psi_{I}}_{\infty,\infty} = \umax{j \in I^c} \norm{ \Phi_I^+ \phi_j }_1, $$
# where $\norm{A}_{\infty,\infty}$ is the $\ell^\infty-\ell^\infty$ operator norm of a matrix $A$.

# In[12]:

erc = lambda Phi, I: linalg.norm(PsiI(Phi, I), np.inf)

# The condition
# $$ \text{ERC}(\text{supp}(x_0))<1 $$
# implies that $x_0$ is identifiable, and also implies robustness to small and bounded noise.
#
# One can prove that the ERC is the maximum of the F criterion over all sign vectors supported in $I$:
# $$ \text{ERC}(I) = \umax{ s, \text{supp}(s) \subset I } \text{F}(s). $$
#
# The weak-ERC is an approximation of the ERC that uses only the correlation matrix:
# $$ \text{w-ERC}(I) = \frac{ \umax{j \in I^c} \sum_{i \in I} \abs{\dotp{\phi_i}{\phi_j}} }{ 1-\umax{j \in I} \sum_{i \neq j \in I} \abs{\dotp{\phi_i}{\phi_j}} }. $$

# In[13]:

g = lambda C, I: np.sum(C[:, I], 1)
werc_g = lambda g, I, J: np.max(g[J])/(1 - np.max(g[I]))
werc = lambda Phi, I: werc_g(g(Correlation(Phi), I), I, np.setdiff1d(np.arange(0, np.shape(Phi)[1]), I))

# One has, if $\text{w-ERC}(I)>0$, for $ I = \text{supp}(s) $,
# $$ \text{F}(s) \leq \text{ERC}(I) \leq \text{w-ERC}(I) \leq \text{Coh}(\abs{I}). $$
#
# This shows in particular that the condition
# $$ \text{w-ERC}(\text{supp}(x_0))<1 $$
# implies identifiability and robustness to small and bounded noise.

# __Exercise 2__
#
# Show that this chain of inequalities holds on a given matrix. What can you conclude about the sharpness of these criteria?

# In[14]:

run -i nt_solutions/sparsity_6_l1_recovery/exo2

# In[15]:

## Insert your code here.
N = 2000
P = N - 10
Phi = PhiRand(P, N)
s = np.zeros(N)
s[:6] = 1
I = supp(s)
k = len(I)
print("N = %d, P = %d, |I| = %d" %(N, P, k))
print("F(s) = %.2f" %F(Phi, s))
print("ERC(I) = %.2f" %erc(Phi, I))
print("w-ERC(I) = %.2f" %werc(Phi, I))
print("Coh(|I|) = %.2f" %Coh(Phi, k))

# __Exercise 3__
#
# For a given matrix $\Phi$ generated using PhiRand, draw, as a function of the sparsity $k$, the probability that a random sign vector $s$ of sparsity $\norm{s}_0=k$ satisfies the conditions $\text{F}(s)<1$, $\text{ERC}(I)<1$ and $\text{w-ERC}(I)<1$, where $I = \text{supp}(s)$.

# In[16]:

run -i nt_solutions/sparsity_6_l1_recovery/exo3

# In[17]:

## Insert your code here.
N = 600
P = N//2
Phi = PhiRand(P, N)
klist = np.round(np.linspace(1, P/7., 20)).astype(int)
ntrials = 60
proba = np.zeros([len(klist), 3])
for i in range(len(klist)):
    k = klist[i]
    for j in range(ntrials):
        s = np.zeros(N)
        I = random.permutation(N)[:k]
        s[I] = np.sign(random.randn(k))
        proba[i, 0] = proba[i, 0] + (F(Phi, s) < 1)
        proba[i, 1] = proba[i, 1] + (erc(Phi, I) < 1)
        proba[i, 2] = proba[i, 2] + (werc(Phi, I) > 0)*(werc(Phi, I) < 1)

plt.figure(figsize=(8, 5))
plt.plot(klist, proba/ntrials, linewidth=2)
plt.xlabel("k")
plt.legend(['F < 1', 'ERC < 1', 'w-ERC < 1'])
plt.title("N = %d, P = %d" %(N, P))
plt.show()
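# A possible post-processing of the previous experiment (not part of the original tour):
# for each criterion, estimate the largest sparsity k at which the condition still holds
# for at least half of the random sign vectors. The 50% threshold is an arbitrary choice.

threshold = 0.5
names = ['F < 1', 'ERC < 1', 'w-ERC < 1']
for c in range(3):
    ok = np.where(proba[:, c]/ntrials >= threshold)[0]
    kmax = klist[ok[-1]] if len(ok) > 0 else 0
    print("%s holds for at least half of the signs up to k = %d" % (names[c], kmax))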
# Restricted Isometry Criteria
# ----------------------------
# The restricted isometry constants $\de_k^1,\de_k^2$ of a matrix $\Phi$ are the smallest $\de^1,\de^2$ that satisfy
# $$ \forall x \in \RR^N, \quad \norm{x}_0 \leq k \qarrq (1-\de^1)\norm{x}^2 \leq \norm{\Phi x}^2 \leq (1+\de^2)\norm{x}^2. $$
#
# E. Candès shows in
#
# E. J. Candès. _The restricted isometry property and its implications for compressed sensing._ Comptes Rendus de l'Académie des Sciences, Paris, Série I, 346, 589-592, 2008.
#
# that if
# $$ \de_{2k} \leq \sqrt{2}-1, $$
# then $\norm{x_0}_0 \leq k$ implies identifiability as well as robustness to small and bounded noise.
#
# The stability constants $\la^1(A), \la^2(A)$ of a matrix $A = \Phi_I$ extracted from $\Phi$ are the smallest $\tilde \la_1,\tilde \la_2$ such that
# $$ \forall \al \in \RR^{\abs{I}}, \quad (1-\tilde\la_1)\norm{\al}^2 \leq \norm{A \al}^2 \leq (1+\tilde \la_2)\norm{\al}^2. $$
#
# These constants $\la^1(A), \la^2(A)$ are easily computed from the largest and smallest eigenvalues of $A^* A \in \RR^{\abs{I} \times \abs{I}}$.

# In[18]:

minmax = lambda v: (1 - np.min(v), np.max(v) - 1)
ric = lambda A: minmax(np.real(linalg.eig(np.dot(np.transpose(A), A))[0]))

# The restricted isometry constants of $\Phi$ are computed as the largest stability constants of extracted matrices:
# $$ \de^\ell_k = \umax{ \abs{I}=k } \la^\ell( \Phi_I ). $$
#
# The eigenvalues of $A^* A$, for $A$ a random Gaussian matrix of size $(P,k)$ with entries of variance $1/P$, are essentially contained in the interval $ [a,b] $ where $a=(1-\sqrt{\be})^2$ and $b=(1+\sqrt{\be})^2$ with $\be = k/P$. More precisely, as $k=\be P$ tends to infinity, the distribution of the eigenvalues tends to the Marchenko-Pastur law
# $$ f_\be(\la) = \frac{1}{2\pi \be \la}\sqrt{ (b-\la)^+ (\la-a)^+ }. $$

# __Exercise 4__
#
# Display, for increasing values of $k$, the histogram of the eigenvalues of $A^* A$ where $A$ is a Gaussian matrix of size $(P,k)$ with entries of variance $1/P$. For this, accumulate the eigenvalues over many realizations of $A$.

# In[19]:

run -i nt_solutions/sparsity_6_l1_recovery/exo4

# In[20]:

## Insert your code here.

# __Exercise 5__
#
# Estimate numerically a lower bound on $\de_k^1,\de_k^2$ by Monte-Carlo sampling of sub-matrices.

# In[21]:

run -i nt_solutions/sparsity_6_l1_recovery/exo5

# In[22]:

## Insert your code here.

# Sparse Spikes Deconvolution
# ---------------------------
# We now consider a convolution dictionary $ \Phi $. Such a dictionary is used together with sparse regularization to perform sparse spike deconvolution.
#
# Second derivative of a Gaussian kernel $g$ with a given variance $\si^2$.

# In[23]:

sigma = 6
g = lambda x: (1 - x**2/sigma**2)*np.exp(-x**2/(2*sigma**2))
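# To visualize the shape of the atoms before building the dictionary, one can plot the
# kernel g on a small interval. The plotting range below is an arbitrary illustrative
# choice, not part of the original tour.

t = np.linspace(-40, 40, 500)
plt.figure(figsize=(8, 5))
plt.plot(t, g(t), linewidth=2)
plt.title(r"Second derivative of a Gaussian, $\sigma = %d$" % sigma)
plt.show()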
# Create a matrix $\Phi$ so that $\Phi x = g \star x$ with periodic boundary conditions.

# In[24]:

P = 1024
[Y, X] = np.meshgrid(np.arange(0, P), np.arange(0, P))
Phi = normalize(g((X - Y + P/2.)%P - P/2.))

# To improve the conditioning of the dictionary, we sub-sample its atoms, so that $ P = \eta N > N $.

# In[25]:

eta = 2
N = P//eta
Phi = Phi[:, ::eta]

# Plot the correlation function associated with the filter. Can you determine the value of the coherence $\mu(\Phi)$?

# In[26]:

c = np.dot(np.transpose(Phi), Phi)
c = np.abs(c[:, np.shape(c)[1]//2])

plt.figure(figsize=(8, 5))
plt.plot(c[len(c)//2 - 50:len(c)//2 + 50], '.-')
plt.show()

# Create a sparse signal $x_0$ with two Diracs of opposite signs separated by a spacing $d$.

# In[27]:

twosparse = lambda d: np.roll(np.concatenate(([1.], np.zeros(d), [-1.], np.zeros(N - d - 2))), int(N/2 - d/2))

# Display $x_0$ and $\Phi x_0$.

# In[28]:

x0 = twosparse(50)

plt.figure(figsize=(10, 7))
plt.subplot(2, 1, 1)
plt.plot(x0, 'r', linewidth=2)
plt.subplot(2, 1, 2)
plt.plot(np.dot(Phi, x0), 'b', linewidth=2)
plt.show()

# __Exercise 6__
#
# Plot the evolution of the criteria F, ERC and Coh as a function of $d$. Do the same plot for other sign patterns for $x_0$. Do the same plot for a Dirac comb with a varying spacing $d$.

# In[29]:

run -i nt_solutions/sparsity_6_l1_recovery/exo6

# In[30]:

## Insert your code here.
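# One possible starting point for the exercise above (illustrative sketch only; the range
# of spacings below is an arbitrary choice): evaluate the three criteria on the two-spike
# signal as the spacing d varies, and compare them to the identifiability threshold 1.

dlist = np.arange(1, 80, 2)
crit = np.zeros((len(dlist), 3))
for i, d in enumerate(dlist):
    x0 = twosparse(int(d))
    I = supp(x0)
    crit[i, 0] = F(Phi, np.sign(x0))
    crit[i, 1] = erc(Phi, I)
    crit[i, 2] = Coh(Phi, len(I))

plt.figure(figsize=(8, 5))
plt.plot(dlist, crit, linewidth=2)
plt.axhline(1, color='k', linestyle='--')   # identifiability threshold
plt.xlabel("d")
plt.legend(['F', 'ERC', 'Coh'])
plt.show()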