#!/usr/bin/env python
# coding: utf-8

# In[1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# # Variational Inference
#
# Inference and learning often involve intractable integrals.
#
# Prime examples include __Bayesian inference__
#
# $$p(y | \mathbf{x}, \mathcal{D})=\int p(y, \mathbf{w} | \mathbf{x}, \mathcal{D}) \mathrm{d} \mathbf{w}=\int p(y | \mathbf{x}, \mathbf{w}) p(\mathbf{w} | \mathcal{D}) \mathrm{d} \mathbf{w}$$
#
# and the marginalisation of unobserved (latent) variables
#
# $$L(\boldsymbol{\theta})=p(\mathcal{D} ; \boldsymbol{\theta})=\int_{\mathbf{u}} p(\mathbf{u}, \mathcal{D} ; \boldsymbol{\theta}) \mathrm{d} \mathbf{u}$$
#
# There are two main ways to approximate such integrals:
# - Monte Carlo estimates obtained through sampling
# - the variational approach
#
# This notebook focuses on the second approach.
#
# ## Kullback-Leibler divergence
#
# The KL divergence is a fundamental concept in variational inference and, consequently, in variational autoencoders.
# The KL divergence between two distributions $p$ and $q$ is
#
# $$\mathrm{KL}(p \| q)=\int p(\mathbf{x}) \,\log \left(\frac{p(\mathbf{x})}{q(\mathbf{x})} \right)\mathrm{d} \mathbf{x}=\mathbb{E}_{p(\mathbf{x})}\left[\log \frac{p(\mathbf{x})}{q(\mathbf{x})}\right]$$
#
# Properties of the KL divergence:
# - $\mathrm{KL}(p \| q)=0$ iff $p = q$
# - $\mathrm{KL}(p \| q)\neq \mathrm{KL}(q \| p)$ in general (it is not symmetric)
# - $\mathrm{KL}(p \| q)\geq 0$ (always non-negative)
#
# ## Variational principle
#
# Given a joint distribution $p(\mathbf{x}, \mathbf{y})$, the variational principle states that we can __formulate inference tasks__, such as marginalisation $p(\mathbf{x})=\int p(\mathbf{x}, \mathbf{y}) \mathrm{d} \mathbf{y}$ and conditioning $p(\mathbf{y} | \mathbf{x})$, __as optimisation problems__.
#
# Specifically, maximising the variational free energy
#
# $$\mathcal{F}(\mathbf{x}, q)=\mathbb{E}_{q(\mathbf{y})}\left[\log \frac{p(\mathbf{x}, \mathbf{y})}{q(\mathbf{y})}\right]$$
#
# over the distribution $q(\mathbf{y})$ yields
# - $\log p(\mathbf{x})=\max _{q(\mathbf{y})} \mathcal{F}(\mathbf{x}, q)$
# - $p(\mathbf{y} | \mathbf{x})=\operatorname{argmax}_{q(\mathbf{y})} \mathcal{F}(\mathbf{x}, q)$
#
# By factorising the joint distribution as $p(\mathbf{x}, \mathbf{y}) = p(\mathbf{y} | \mathbf{x})\, p(\mathbf{x})$ inside the free energy, we find that
#
# $$\log p\left(\mathbf{x}\right)=\mathrm{KL}\left(q(\mathbf{y}) \| p\left(\mathbf{y} | \mathbf{x}\right)\right)+\mathcal{F}\left(\mathbf{x}, q\right)=\mathrm{const}$$
#
# so maximising the variational free energy is equivalent to minimising the KL divergence $\mathrm{KL}(q \| p)$.
#
# Since the KL divergence is always non-negative, $\mathcal{F}$ is also called the Evidence Lower Bound (ELBO), because it provides a lower bound on the log marginal likelihood:
#
# $$\log p\left(\mathbf{x}\right)\geq\mathcal{F}\left(\mathbf{x}, q\right)$$
#
# In variational inference the distribution $q$ appearing in the ELBO is parametrised as $q(\mathbf{y}; \boldsymbol{\theta})$, and the parameters $\boldsymbol{\theta}$ are optimised to push the ELBO as high as possible.
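# To make these properties concrete, the small check below compares the closed-form KL divergence between two univariate Gaussians with a Monte Carlo estimate of $\mathbb{E}_{p(x)}\left[\log p(x) - \log q(x)\right]$, and verifies that swapping the arguments changes the value. It is only a minimal numerical illustration (it assumes `scipy.stats` is available for the log-densities).

# In[ ]:

from scipy.stats import norm


def kl_gaussians(mu1, s1, mu2, s2):
    """Closed-form KL( N(mu1, s1^2) || N(mu2, s2^2) )."""
    return np.log(s2 / s1) + (s1 ** 2 + (mu1 - mu2) ** 2) / (2 * s2 ** 2) - 0.5


mu_p, s_p = 0.0, 1.0
mu_q, s_q = 1.0, 2.0

# Monte Carlo estimate: average log-density ratio under samples from p
samples = np.random.normal(mu_p, s_p, 100_000)
kl_mc = np.mean(norm.logpdf(samples, mu_p, s_p) - norm.logpdf(samples, mu_q, s_q))

print("closed form KL(p || q):", kl_gaussians(mu_p, s_p, mu_q, s_q))
print("Monte Carlo KL(p || q):", kl_mc)
print("closed form KL(q || p):", kl_gaussians(mu_q, s_q, mu_p, s_p))  # differs from KL(p || q)
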
# ## Example implementation: variational inference in 1D

# In[2]:

# Generate the data: a mixture of K Laplace distributions
SEED = 2314
N = 200  # number of data points
K = 4    # number of components in the mixture

np.random.seed(SEED)
source_distr = np.random.randint(0, K, N)  # component assignment of each point
data = np.zeros(N)

fig, ax = plt.subplots(figsize=(14, 10))
for i in range(K):
    mean = 20 * np.random.random()   # random location of component i
    scale = 3 * np.random.random()   # random scale of component i
    idxs = np.where(source_distr == i)[0]
    laplacian_data = np.random.laplace(mean, scale, len(idxs))
    data[idxs] = laplacian_data
    # Plot the empirical distribution of each component
    # (histplot replaces the deprecated distplot; line_kws styles the KDE line)
    sns.histplot(laplacian_data, bins="auto", kde=True, stat="density",
                 line_kws={"linewidth": 4}, ax=ax)
    sns.rugplot(laplacian_data, ax=ax)


# In[3]:

# Fitting a gaussian model


# In[ ]:
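# A minimal way to start is to fit a single Gaussian $q(x)=\mathcal{N}(\mu, \sigma^2)$ by maximum likelihood: the sample mean and standard deviation maximise the likelihood, which is equivalent to minimising the forward KL divergence $\mathrm{KL}(\hat{p} \| q)$ between the empirical distribution $\hat{p}$ and the Gaussian family. The sketch below is one possible way to fill in the empty cell above; it overlays the fitted density on the data.

# Maximum-likelihood fit of a single Gaussian to the mixture data.
# Equivalent to minimising KL(p_hat || q) over the Gaussian family, so the
# fit spreads its mass over all modes at once (mass-covering behaviour).
mu_hat = data.mean()
sigma_hat = data.std()  # np.std with ddof=0 is the maximum-likelihood estimate

fig, ax = plt.subplots(figsize=(14, 10))
sns.histplot(data, bins="auto", stat="density", ax=ax)
xs = np.linspace(data.min(), data.max(), 500)
ax.plot(xs, norm.pdf(xs, mu_hat, sigma_hat), linewidth=4, label="Gaussian fit")
ax.legend()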