#!/usr/bin/env python
# coding: utf-8

# # A Brief Guide to Using Hybrid Monte Carlo for Gaussian Process Models
#
# ### Zhenwen Dai and Mu Niu, November 2014

# In[1]:

import numpy as np
import GPy
get_ipython().run_line_magic('matplotlib', 'inline')
#%config InlineBackend.figure_format = 'svg'
from pylab import *

# ## Example 1: HMC Inference for GP Regression

# Let's first generate some synthetic data.

# In[2]:

# Let's make some synthetic data
x = np.linspace(0., 2*np.pi, 100)[:, None]
y = -cos(x) + np.random.randn(*x.shape)*0.3 + 1
_ = plot(x, y, '.')

# Let us make a GP regression model and place some general prior distributions on the model parameters.

# In[3]:

# Make a GP regression model
m = GPy.models.GPRegression(x, y)
# Give some general prior distributions to the model parameters
m.kern.lengthscale.set_prior(GPy.priors.Gamma.from_EV(1., 10.))
m.kern.variance.set_prior(GPy.priors.Gamma.from_EV(1., 10.))
m.likelihood.variance.set_prior(GPy.priors.Gamma.from_EV(1., 10.))
_ = m.plot()

# ### Sampling the posterior distribution of model parameters
# Draw 1000 samples from the GP model:

# In[4]:

hmc = GPy.inference.mcmc.HMC(m, stepsize=5e-2)
s = hmc.sample(num_samples=1000)  # Burn-in
s = hmc.sample(num_samples=1000)

# Plot the samples:

# In[5]:

plot(s)

# Plot the posterior marginal distributions of the model parameters:

# In[6]:

labels = ['kern variance', 'kern lengthscale', 'noise variance']
samples = s[300:]  # cut out the burn-in period
from scipy import stats
xmin = samples.min()
xmax = samples.max()
xs = np.linspace(xmin, xmax, 100)
for i in range(samples.shape[1]):
    kernel = stats.gaussian_kde(samples[:, i])
    plot(xs, kernel(xs), label=labels[i])
_ = legend()

# Plot the model parameters (lengthscale, variance and noise variance) against each other:

# In[7]:

fig = figure(figsize=(14, 4))
ax = fig.add_subplot(131)
_ = ax.plot(samples[:, 0], samples[:, 1], '.')
ax.set_xlabel(labels[0]); ax.set_ylabel(labels[1])
ax = fig.add_subplot(132)
_ = ax.plot(samples[:, 1], samples[:, 2], '.')
ax.set_xlabel(labels[1]); ax.set_ylabel(labels[2])
ax = fig.add_subplot(133)
_ = ax.plot(samples[:, 0], samples[:, 2], '.')
ax.set_xlabel(labels[0]); ax.set_ylabel(labels[2])

# By setting the model parameters to the posterior mean, we can visualize the model fit:

# In[8]:

# Set the model parameters to the posterior mean
m.kern.variance[:] = samples[:, 0].mean()
m.kern.lengthscale[:] = samples[:, 1].mean()
m.likelihood.variance[:] = samples[:, 2].mean()
print(m)
_ = m.plot()

# ### Sample the posterior distribution of X given some new Y
# Given some new observations, inferring the posterior distribution of the corresponding inputs is difficult, because that distribution can be multi-modal.
# Assume we have a new observation $y=1.5$ and try to infer the distribution of its input.

# In[9]:

y_new = np.array([1.5])[:, None]

# Generate the inference model for the new observation. `x_new` is the MAP estimate obtained by optimizing the log likelihood. As plotted with a red dot, the MAP estimate corresponds to only one of the modes.

# In[10]:

x_new, mi = m.infer_newX(y_new)
print(mi)
m.plot()
plot(x_new, y_new, 'or')

# Draw 10,000 samples from the inference model:

# In[11]:

hmc_new = GPy.inference.mcmc.HMC(mi, stepsize=2e-1)
s_new = hmc_new.sample(num_samples=10000, hmc_iters=10)

# Plot the samples:

# In[12]:

_ = plot(s_new[:, :])

# Plot the marginal distribution of the inferred inputs. The two modes of the input are clearly visible in the sampled posterior distribution.
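# The location of the two modes can also be anticipated from the synthetic data-generating function at the start of this example (an added sanity check, not part of the original notebook): the noise-free curve is $y = 1 - \cos(x)$, so $y = 1.5$ corresponds to $\cos(x) = -0.5$, which has two solutions in $[0, 2\pi]$, namely $x = 2\pi/3 \approx 2.09$ and $x = 4\pi/3 \approx 4.19$.

# In[ ]:

# Added sanity-check sketch: inputs consistent with the noise-free curve y = 1 - cos(x)
# when y = 1.5; these should match the two modes of the sampled posterior plotted below.
expected_modes = np.array([np.arccos(-0.5), 2*np.pi - np.arccos(-0.5)])
print(expected_modes)  # approximately [2.094, 4.189]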
# In[13]:

from scipy import stats
samples_new = s_new[:]
xmin = samples_new.min()
xmax = samples_new.max()
xs = np.linspace(xmin, xmax, 100)
for i in range(samples_new.shape[1]):
    kernel = stats.gaussian_kde(samples_new[:, i])
    plot(xs, kernel(xs))

# ## Example 2: HMC for the lengthscale and variance with the Olympic marathon data
# We set priors for the lengthscale and variance of the kernel. The prior means are chosen close to the values found by optimising the GP, and the prior variances are set large. In the case below, we place Gamma priors on the lengthscale and variance with $E(\mathrm{lengthscale}) = 120$, $\mathrm{Var}(\mathrm{lengthscale}) = 2000$, $E(\mathrm{variance}) = 25$ and $\mathrm{Var}(\mathrm{variance}) = 150$.

# In[14]:

m = GPy.examples.regression.olympic_marathon_men()
# Set priors for the lengthscale and variance
m.kern.variance.set_prior(GPy.priors.Gamma.from_EV(25., 150.))
m.kern.lengthscale.set_prior(GPy.priors.Gamma.from_EV(120., 2000.))
print(m)

# ### Run HMC
# We plot the full length of the HMC run. The first 5000 samples can be treated as the burn-in stage and ignored.

# In[15]:

# Initialise HMC
hmc = GPy.inference.mcmc.HMC(m, stepsize=2e-1)
# Run HMC to sample the parameters
t = hmc.sample(num_samples=20000, hmc_iters=20)
#hmc = GPy.inference.optimization.HMC(m, stepsize=5e-1)
#t = hmc.sample(m_iters=50000, hmc_iters=20)
_ = plot(t)

# In[16]:

print(t.mean(axis=0))
print(t.std(axis=0))
_ = hist(t[:, :2], 50)

# ## Seaborn
# Using Seaborn to plot the distributions over the hyperparameters:

# In[17]:

import seaborn as sns, pandas as pd
plt.rcParams['text.usetex'] = False

# In[18]:

df = pd.DataFrame(t, columns=m.parameter_names_flat())
ax = sns.kdeplot(df['rbf.variance'], color="b", shade=True, shade_lowest=False)
ax = sns.kdeplot(df['rbf.lengthscale'], color="r", shade=True, shade_lowest=False)

# In[19]:

sns.set(style="white", color_codes=True)
_ = sns.jointplot(data=df, x='rbf.variance', y='rbf.lengthscale', kind="hex",
                  marginal_kws=dict(kde=True, hist=True, kde_kws=dict(shade=False)),
                  stat_func=None)

# In[20]:

df
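# As a possible follow-up (a sketch, not part of the original notebook), the sampled hyperparameters can be summarised numerically, and the kernel parameters can be set to their posterior means to visualise the corresponding fit, as was done in Example 1.

# In[ ]:

# Added sketch: numerical summary of the posterior samples, then the model fit at the
# posterior mean of the kernel hyperparameters (the noise variance is left unchanged).
print(df.describe())
m.kern.variance[:] = df['rbf.variance'].mean()
m.kern.lengthscale[:] = df['rbf.lengthscale'].mean()
_ = m.plot()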