#!/usr/bin/env python
# coding: utf-8

# Advanced Topics on Sinkhorn Algorithm
# =====================================
#
# $\newcommand{\dotp}[2]{\langle #1, #2 \rangle}$
# $\newcommand{\enscond}[2]{\lbrace #1, #2 \rbrace}$
# $\newcommand{\pd}[2]{ \frac{ \partial #1}{\partial #2} }$
# $\newcommand{\umin}[1]{\underset{#1}{\min}\;}$
# $\newcommand{\umax}[1]{\underset{#1}{\max}\;}$
# $\newcommand{\uargmin}[1]{\underset{#1}{argmin}\;}$
# $\newcommand{\norm}[1]{\|#1\|}$
# $\newcommand{\abs}[1]{\left|#1\right|}$
# $\newcommand{\choice}[1]{ \left\{ \begin{array}{l} #1 \end{array} \right. }$
# $\newcommand{\pa}[1]{\left(#1\right)}$
# $\newcommand{\diag}[1]{{diag}\left( #1 \right)}$
# $\newcommand{\qandq}{\quad\text{and}\quad}$
# $\newcommand{\qwhereq}{\quad\text{where}\quad}$
# $\newcommand{\qifq}{ \quad \text{if} \quad }$
# $\newcommand{\qarrq}{ \quad \Longrightarrow \quad }$
# $\newcommand{\ZZ}{\mathbb{Z}}$
# $\newcommand{\CC}{\mathbb{C}}$
# $\newcommand{\RR}{\mathbb{R}}$
# $\newcommand{\EE}{\mathbb{E}}$
# $\newcommand{\Zz}{\mathcal{Z}}$
# $\newcommand{\Ww}{\mathcal{W}}$
# $\newcommand{\Vv}{\mathcal{V}}$
# $\newcommand{\Nn}{\mathcal{N}}$
# $\newcommand{\NN}{\mathcal{N}}$
# $\newcommand{\Hh}{\mathcal{H}}$
# $\newcommand{\Bb}{\mathcal{B}}$
# $\newcommand{\Ee}{\mathcal{E}}$
# $\newcommand{\Cc}{\mathcal{C}}$
# $\newcommand{\Gg}{\mathcal{G}}$
# $\newcommand{\Ss}{\mathcal{S}}$
# $\newcommand{\Pp}{\mathcal{P}}$
# $\newcommand{\Ff}{\mathcal{F}}$
# $\newcommand{\Xx}{\mathcal{X}}$
# $\newcommand{\Mm}{\mathcal{M}}$
# $\newcommand{\Ii}{\mathcal{I}}$
# $\newcommand{\Dd}{\mathcal{D}}$
# $\newcommand{\Ll}{\mathcal{L}}$
# $\newcommand{\Tt}{\mathcal{T}}$
# $\newcommand{\si}{\sigma}$
# $\newcommand{\al}{\alpha}$
# $\newcommand{\la}{\lambda}$
# $\newcommand{\ga}{\gamma}$
# $\newcommand{\Ga}{\Gamma}$
# $\newcommand{\La}{\Lambda}$
# $\newcommand{\Si}{\Sigma}$
# $\newcommand{\be}{\beta}$
# $\newcommand{\de}{\delta}$
# $\newcommand{\De}{\Delta}$
# $\newcommand{\phi}{\varphi}$
# $\newcommand{\th}{\theta}$
# $\newcommand{\om}{\omega}$
# $\newcommand{\Om}{\Omega}$
# $\newcommand{\eqdef}{\equiv}$

# This numerical tour explores several extensions of the basic Sinkhorn
# method.

# In[1]:

import numpy as np
import matplotlib.pyplot as plt

# Log-domain Sinkhorn
# --------------------
# For simplicity, we consider uniform distributions on point clouds, so
# that the associated histograms $ (a,b) \in \RR^n \times \RR^m$ are
# constant, equal to $1/n$ and $1/m$.

# In[2]:

n = 100
m = 200
a = np.ones((n,1))/n
b = np.ones((1,m))/m

# Point clouds $x$ and $y$.

# In[3]:

x = np.random.rand(2,n)-.5
theta = 2*np.pi*np.random.rand(1,m)
r = .8 + .2*np.random.rand(1,m)
y = np.vstack((np.cos(theta)*r,np.sin(theta)*r))

# Display of the two clouds.

# In[4]:

plotp = lambda x,col: plt.scatter(x[0,:], x[1,:], s=150, edgecolors="k", c=col, linewidths=2)
plt.figure(figsize=(6,6))
plotp(x, 'b')
plotp(y, 'r')
plt.axis("off");

# Cost matrix $C_{i,j} = \norm{x_i-y_j}^2$.

# In[5]:

def distmat(x,y):
    return np.sum(x**2,0)[:,None] + np.sum(y**2,0)[None,:] - 2*x.transpose().dot(y)

C = distmat(x,y)

# The Sinkhorn algorithm is originally implemented using matrix-vector
# multiplications, which is unstable for small epsilon, as the sketch
# below illustrates.
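# As an illustration, here is a minimal sketch of this standard
# implementation (the helper `sinkhorn_standard` is only illustrative; it
# reuses the histograms `a,b`, the size `m` and the cost `C` defined
# above). It iterates the scalings $u = a/(Kv)$ and $v = b/(K^\top u)$
# with the Gibbs kernel $K = e^{-C/\epsilon}$.

# In[ ]:

def sinkhorn_standard(C,epsilon,niter=500):
    K = np.exp(-C/epsilon) # Gibbs kernel; underflows to 0 for small epsilon
    v = np.ones(m)
    for it in range(niter):
        u = a.ravel() / K.dot(v)             # enforce the row marginal a
        v = b.ravel() / K.transpose().dot(u) # enforce the column marginal b
    return u[:,None] * K * v[None,:]         # coupling P = diag(u) K diag(v)

# once exp(-C/epsilon) underflows, the scalings produce inf/nan and the
# marginals are no longer matched (numpy emits divide-by-zero warnings)
for eps in (.1, .01, .0001):
    P = sinkhorn_standard(C,eps)
    print(eps, np.isfinite(P).all(), np.abs(np.sum(P,0)-b.ravel()).max())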
# Here we consider instead a log-domain implementation, which operates by
# iteratively updating so-called Kantorovitch dual potentials $ (f,g) \in \RR^n \times \RR^m $.
#
# The updates are obtained by a regularized $c$-transform, which reads
# $$ f_i \leftarrow {\min}_\epsilon^b( C_{i,\cdot} - g ) $$
# $$ g_j \leftarrow {\min}_\epsilon^a( C_{\cdot,j} - f ), $$
# where the regularized minimum operator reads
# $$ {\min}_\epsilon^a(h) \eqdef -\epsilon \log \sum_i a_i e^{-h_i/\epsilon}. $$

# In[6]:

def mina_u(H,epsilon): return -epsilon*np.log( np.sum(a * np.exp(-H/epsilon),0) )
def minb_u(H,epsilon): return -epsilon*np.log( np.sum(b * np.exp(-H/epsilon),1) )

# The regularized min operator defined this way is not numerically stable,
# but it can be stabilized using the celebrated log-sum-exp trick, which
# relies on the fact that for any constant $c \in \RR$, one has
# $$ {\min}_\epsilon^a(h+c) = {\min}_\epsilon^a(h) + c, $$
# and stabilization is achieved using $c=\min(h)$.

# In[7]:

def mina(H,epsilon): return mina_u(H-np.min(H,0),epsilon) + np.min(H,0)
def minb(H,epsilon): return minb_u(H-np.min(H,1)[:,None],epsilon) + np.min(H,1)

# Value of $\epsilon$.

# In[8]:

epsilon = .01

# __Exercise 1__
#
# Implement Sinkhorn in log domain.

# In[9]:

def Sinkhorn(C,epsilon,f,niter = 500):
    Err = np.zeros(niter)
    for it in range(niter):
        g = mina(C-f[:,None],epsilon)
        f = minb(C-g[None,:],epsilon)
        # generate the coupling
        P = a * np.exp((f[:,None]+g[None,:]-C)/epsilon) * b
        # check conservation of mass
        Err[it] = np.linalg.norm(np.sum(P,0)-b.ravel(),1)
    return (P,Err)

# run with 0 initialization for the potential f
(P,Err) = Sinkhorn(C,epsilon,np.zeros(n),3000)
plt.plot(np.log10(Err));

# __Exercise 2__
#
# Study the impact of $\epsilon$ on the convergence rate of the algorithm.

# In[10]:

for epsilon in (.1, .05, .01, .001):
    (P,Err) = Sinkhorn(C,epsilon,np.zeros(n),1000)
    plt.plot(np.log10(Err), label=r'$\epsilon=$' + str(epsilon))
plt.legend();
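# Small values of $\epsilon$ are also where the stabilization matters. As a
# quick check (reusing `mina_u`, `mina` and the cost `C` defined above),
# the unstabilized soft-min blows up at a tiny $\epsilon$, while the
# stabilized one stays finite and close to the plain column-wise minimum.

# In[ ]:

eps_tiny = 1e-4
u = mina_u(C,eps_tiny) # underflow in exp yields log(0), hence inf entries
s = mina(C,eps_tiny)   # log-sum-exp stabilized version
print(np.isfinite(u).all(), np.isfinite(s).all()) # typically: False True
print(np.abs(s - np.min(C,0)).max()) # small: the soft-min tends to the min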
# Wasserstein Flow for Matching
# ------------------------------
#
# We aim at performing a "Lagrangian" gradient descent (also called a
# Wasserstein flow) of the Wasserstein distance, in order to perform a
# non-parametric fitting. This corresponds to minimizing the energy function
# $$ \Ee(z) \eqdef W_\epsilon\pa{ \frac{1}{n}\sum_i \de_{z_i}, \frac{1}{m}\sum_i \de_{y_i} }. $$
#
# Here we have denoted the Sinkhorn score as
# $$ W_\epsilon(\al,\be) \eqdef \dotp{P}{C} - \epsilon \text{KL}(P|ab^\top), $$
# where $\al=\frac{1}{n}\sum_i \de_{z_i}$ and
# $\be=\frac{1}{m}\sum_i \de_{y_i}$ are the measures (beware that $C$
# depends on the point positions).

# In[11]:

z = x # initialization

# The gradient of this energy reads
# $$ ( \nabla \Ee(z) )_i = \sum_j P_{i,j}(z_i-y_j) = a_i z_i - \sum_j P_{i,j} y_j, $$
# where $P$ is the optimal coupling. It is better to consider a renormalized
# gradient, which corresponds to using the inner product associated to the
# measure $a$ on the deformation field, in which case
# $$ ( \bar\nabla \Ee(z) )_i = z_i - \bar y_i \qwhereq \bar y_i \eqdef \frac{\sum_j P_{i,j} y_j}{a_i}. $$
# Here $\bar y_i$ is often called the "barycentric projection" associated
# to the coupling matrix $P$.
#
# First run Sinkhorn; beware that you need to recompute the cost matrix at each step.

# In[12]:

epsilon = .01
niter = 300
(P,Err) = Sinkhorn(distmat(z,y), epsilon,np.zeros(n),niter)

# Compute the gradient.

# In[13]:

G = z - ( y.dot(P.transpose()) ) / a.transpose()

# Display the gradient field.

# In[14]:

plt.figure(figsize=(6,6))
plotp(x, 'b')
plotp(y, 'r')
for i in range(n):
    plt.plot([x[0,i], x[0,i]-G[0,i]], [x[1,i], x[1,i]-G[1,i]], 'k')
plt.axis("off");

# Set the descent step size.

# In[15]:

tau = .1

# Update the point cloud.

# In[16]:

z = z - tau * G

# __Exercise 3__
#
# Implement the gradient flow.

# In[17]:

z = x # initialization
tau = .03 # step size for the descent
giter = 20 # number of iterations of the gradient descent
ndisp = np.round( np.linspace(0,giter-1,6) )
kdisp = 0
f = np.zeros(n) # initial potential (Sinkhorn would need to return f for a true warm restart)
for j in range(giter):
    # drawing
    if ndisp[kdisp]==j:
        plt.subplot(2,3,kdisp+1)
        s = j/(giter-1)
        col = np.array([s,0,1-s])[None,:]
        plotp(z, col)
        plt.axis("off")
        kdisp = kdisp+1
    # Sinkhorn
    (P,Err) = Sinkhorn(distmat(z,y), epsilon,f,niter)
    # gradient
    G = z - ( y.dot(P.transpose()) ) / a.transpose()
    z = z - tau * G

# __Exercise 4__
#
# Show the evolution of the fit as $\epsilon$ increases. What do you observe?
# Replace the Sinkhorn score $W_\epsilon(\al,\be)$ by the Sinkhorn divergence
# $W_\epsilon(\al,\be)-W_\epsilon(\al,\al)/2-W_\epsilon(\be,\be)/2$.

# In[18]:

## Insert your code here.
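# As a starting point for this exercise, here is a minimal sketch of the
# Sinkhorn divergence evaluation. The helpers `softmin`, `W_eps` and
# `sinkhorn_divergence` are illustrative additions: `Sinkhorn` above closes
# over the global histograms $a,b$, so the symmetric terms
# $W_\epsilon(\al,\al)$ and $W_\epsilon(\be,\be)$ require a version with
# explicit weights.

# In[ ]:

def softmin(H,w,epsilon,axis):
    # log-sum-exp stabilized soft-min of H against the weights w
    M = np.min(H,axis,keepdims=True)
    return -epsilon*np.log( np.sum(w*np.exp(-(H-M)/epsilon),axis) ) + M.squeeze(axis)

def W_eps(X,Y,epsilon,niter=300):
    # Sinkhorn score <P,C> - epsilon*KL(P|ab^T) between uniform clouds X,Y
    nx, ny = X.shape[1], Y.shape[1]
    ax, by = np.ones((nx,1))/nx, np.ones((1,ny))/ny
    Cxy = distmat(X,Y)
    f = np.zeros(nx)
    for it in range(niter):
        g = softmin(Cxy-f[:,None],ax,epsilon,0)
        f = softmin(Cxy-g[None,:],by,epsilon,1)
    P = ax * np.exp((f[:,None]+g[None,:]-Cxy)/epsilon) * by
    # at the fixed point, epsilon*log(P/(a b^T)) = f+g-C entrywise,
    # which gives the KL term without evaluating any logarithm
    return np.sum(P*Cxy) - np.sum(P*(f[:,None]+g[None,:]-Cxy))

def sinkhorn_divergence(X,Y,epsilon):
    return W_eps(X,Y,epsilon) - W_eps(X,X,epsilon)/2 - W_eps(Y,Y,epsilon)/2

# the divergence is designed to remove the entropic bias: tracking
# sinkhorn_divergence(z,y,epsilon) along the flow avoids the shrinkage
# of the fitted cloud observed for large epsilon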
# Generative Model Fitting
# ------------------------
#
# The Wasserstein flow is a non-parametric idealization which does not
# correspond to any practical application. We consider here a simple toy
# example of density fitting, where the goal is to find a parameter $\theta$
# to fit a deformed point cloud of the form $ (g_\theta(z_i))_i $ using a
# Sinkhorn cost. This is often called a generative model in the machine
# learning literature, and corresponds to the problem of shape
# registration in imaging.
#
# The matching is achieved by solving
# $$ \min_\th \Ff(\th) \eqdef \Ee(G_\th(z)) = W_\epsilon\pa{ \frac{1}{n}\sum_i \de_{g_\th(z_i)}, \frac{1}{m}\sum_i \de_{y_i} }, $$
# where the function $G_\th(z)=( g_\th(z_i) )_i$ operates independently on
# each point.
#
# The gradient reads
# $$ \nabla \Ff(\th) = \sum_i \partial g_\th(z_i)^*[ \nabla \Ee(G_\th(z))_i ], $$
# where $\partial g_\th(z_i)^*$ is the adjoint of the Jacobian of
# $g_\th$.
#
# We consider here a simple model of affine transformation, where
# $\th=(A,h) \in \RR^{d \times d} \times \RR^d $
# and $g_\th(z_i)=Az_i+h$.
#
# Denoting $ v_i = \nabla \Ee(G_\th(z))_i $ the gradient of the Sinkhorn
# loss (which is computed as in the previous section), the gradient with
# respect to the parameters reads
# $$ \nabla_A \Ff(\th) = \sum_i v_i z_i^\top
# \qandq \nabla_h \Ff(\th) = \sum_i v_i. $$
#
# Generate the data.

# In[19]:

z = np.random.randn(2,n)*.2
y = np.random.randn(2,m)*.2
y[0,:] = y[0,:]*.05 + 1

# Initialize the parameters.

# In[20]:

A = np.eye(2)
h = np.zeros(2)

# Display the clouds.

# In[21]:

plotp(A.dot(z)+h[:,None], 'b')
plotp(y, 'r')
plt.xlim(-.7,1.1)
plt.ylim(-.7,.7);

# Run Sinkhorn.

# In[22]:

x = A.dot(z)+h[:,None]
f = np.zeros(n)
(P,Err) = Sinkhorn(distmat(x,y), epsilon,f,niter)

# Compute the gradient with respect to the positions.

# In[23]:

v = a.transpose() * x - y.dot(P.transpose())

# Compute the gradient with respect to the parameters.

# In[24]:

nabla_A = v.dot(z.transpose())
nabla_h = np.sum(v,1)

# __Exercise 5__
#
# Implement the gradient descent.

# In[25]:

A = np.eye(2)
h = np.zeros(2)
# step sizes for the descent
tau_A = .8
tau_h = .1
# number of iterations of the gradient descent
giter = 40
ndisp = np.round( np.linspace(0,giter-1,6) )
kdisp = 0
f = np.zeros(n)
for j in range(giter):
    x = A.dot(z)+h[:,None]
    # drawing
    if ndisp[kdisp]==j:
        plt.subplot(2,3,kdisp+1)
        plotp(y, 'r')
        plotp(x, 'b')
        plt.xlim(-.7,1.3)
        plt.ylim(-.7,.7)
        kdisp = kdisp+1
    # Sinkhorn and gradients
    (P,Err) = Sinkhorn(distmat(x,y), epsilon,f,niter)
    v = a.transpose() * x - y.dot(P.transpose())
    nabla_A = v.dot(z.transpose())
    nabla_h = np.sum(v,1)
    # gradient step on the parameters
    A = A - tau_A * nabla_A
    h = h - tau_h * nabla_h

# __Exercise 6__
#
# Test using a more complicated deformation (for instance, a square being
# deformed by a random $A$).

# In[ ]:
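# A possible test configuration (the deformation $A_0,h_0$ below is an
# arbitrary illustrative choice): sample both clouds on a square, deform
# the target by a random affine map, then rerun the descent above.

# In[ ]:

z = np.random.rand(2,n) - .5                      # source: uniform square
A0 = np.eye(2) + .5*np.random.randn(2,2)          # hidden random deformation
h0 = np.array([1.,.3])                            # hidden translation
y = A0.dot(np.random.rand(2,m) - .5) + h0[:,None] # target: deformed square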