## Bayesian Linear Regression¶

Bishop, C.M. (2006), Pattern Recognition and Machine Learning, Springer Science+Business Media, New York. https://www.microsoft.com/en-us/research/people/cmbishop/

Example taken from section 3.3.

In [1]:
# %load std_ipython_import.txt
import pandas as pd
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pymc3 as pmc

pd.set_option('display.notebook_repr_html', False)

%matplotlib inline
%config InlineBackend.figure_formats = {'retina',}
plt.style.use('seaborn-white')


### Generate data¶

• Consider single input variable $x$ and output variable $t$ and model

### $$y(x,\textbf{w}) = w_0 + w_1x$$¶

• Select each $x_n$ from the uniform distribution on $(-1, 1)$.
• Create data using the function $f(x,a) = a_0 + a_1x \,$ where $a_0 = -0.3$ and $a_1 = 0.5$
• Add Gaussian noise with standard deviation 0.2 to obtain target values $t_n$.
In [2]:
# Constants
size = 20
a = [-0.3, 0.5]
sd = .2
alpha = 2.
beta = 1./(sd**2)

# grid for plotting
g_min = -1.5
g_max = 1.5
step = 100
xx, yy = np.meshgrid(np.linspace(g_min,g_max,step), np.linspace(g_min,g_max,step))
grid = np.c_[xx.ravel(), yy.ravel()]

x_ = np.linspace(g_min,g_max,step).reshape(-1,1)

In [11]:
# Sample the inputs x_n uniformly on [-1, 1] and build noisy targets t_n
x = stats.uniform.rvs(-1, scale=2, size=size)
y_true = a[0] + a[1] * x  # noise-free f(x, a) = a_0 + a_1 * x

t_n =  y_true + np.random.normal(scale=sd, size=size)  # add N(0, sd^2) noise

# plot the generated data
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(12,5))

# Left plot - histogram on x (how the uniform inputs are spread)
ax1.hist(x, histtype='stepfilled', alpha=0.5, color='b')
ax1.set(xlabel='$x$', ylabel='value')

# Right plot - noisy targets against the true regression line
ax2.scatter(x, t_n, s=30, marker='o', facecolors='None', edgecolors='b', label='generated data')
ax2.plot(x, y_true, c='g', label='true regression line')
ax2.set(xlabel='$x$', ylabel='$y$', xlim=(-1,1), ylim=(-1,1), aspect='equal')
ax2.xaxis.set_ticks([-1,0,1])
ax2.yaxis.set_ticks([-1,0,1])
ax2.legend(loc=2);


### $$p(w\,|\,\alpha)= \mathcal{N}(w\,|\,0,\alpha^{-1}I)$$¶

In [12]:
def calc_init_prior(mean=(0., 0.), alpha=2.):
    """Return the initial Gaussian prior over the weights w = (w0, w1).

    p(w | alpha) = N(w | mean, alpha^-1 * I)

    Parameters
    ----------
    mean : sequence of two floats
        Prior mean (default: the zero vector).
    alpha : float
        Prior precision; the covariance is alpha^-1 * I.

    Returns
    -------
    A frozen ``scipy.stats.multivariate_normal`` distribution.
    """
    # Tuple default replaces the original mutable list default ([0, 0]),
    # which is shared across calls — a classic Python pitfall.
    return stats.multivariate_normal(mean=mean, cov=(alpha ** -1) * np.identity(2))


### $$p(t\,|\,X_i,w, \beta) = \mathcal{N}(t\,|\,w^{T}\phi(X_i), \beta^{-1})$$¶

In [13]:
def calc_datapoint_likelihood(datapoint=1):
    """Likelihood of one observation evaluated over the (w0, w1) grid.

    For every weight vector (w0, w1) on the module-level `grid`, evaluates
    p(t | x, w, beta) = N(t | w0 + w1*x, beta^-1), where beta^-1 is the
    noise VARIANCE.

    Parameters
    ----------
    datapoint : int
        1-based index into the observed data (module-level ``x`` / ``t_n``).

    Returns
    -------
    list of floats, one likelihood value per grid point.
    """
    xi = x[datapoint - 1]
    ti = t_n[datapoint - 1]
    # Bug fix: scipy's `scale` parameter is the standard deviation.  beta is
    # the noise precision (1/sigma^2), so the std dev is beta**-0.5; the
    # original passed beta**-1 (the variance, 0.04) instead of sigma (0.2),
    # making the plotted likelihood far too narrow.
    return [stats.norm.pdf(w0 + w1 * xi, loc=ti, scale=beta ** -0.5)
            for w0, w1 in grid]


### $$S_{N}^{-1} = \alpha\,I + \beta\, \Phi^T\Phi$$¶

In [14]:
def calc_posterior(t, X, prior_precision=None, noise_precision=None):
    """Analytic Bayesian linear-regression posterior over the weights.

    S_N^-1 = alpha * I + beta * X^T X
    m_N    = beta * S_N X^T t          (Bishop PRML eqs. 3.53-3.54)

    Parameters
    ----------
    t : array, shape (n,)
        Observed target values.
    X : array, shape (n, 2)
        Design matrix with rows [1, x_i].
    prior_precision : float or None
        alpha; falls back to the module-level ``alpha`` when None, so
        existing two-argument calls behave exactly as before.
    noise_precision : float or None
        beta; falls back to the module-level ``beta`` when None.

    Returns
    -------
    A frozen ``scipy.stats.multivariate_normal``: the posterior p(w | t, X).
    """
    a_prec = alpha if prior_precision is None else prior_precision
    b_prec = beta if noise_precision is None else noise_precision

    # Conjugate Gaussian update: posterior covariance S_N, then mean m_N.
    Sn = np.linalg.inv(a_prec * np.identity(2) + b_prec * X.T.dot(X))
    mn = b_prec * Sn.dot(X.T).dot(t)

    return stats.multivariate_normal(mn.ravel(), cov=Sn)

In [15]:
def plot_posterior_update(i=1):
    """Visualise the sequential Bayesian update after observing i datapoints.

    Left: likelihood of datapoint i over the (w0, w1) grid.
    Middle: posterior after observing datapoints 1..i.
    Right: 6 regression lines sampled from that posterior, with the data.

    Reads the module-level x, t_n, x_, grid, xx, yy, step, g_min, g_max, a
    and the helpers calc_datapoint_likelihood / calc_posterior.
    """

    Z = calc_datapoint_likelihood(i)

    # Draw 6 samples from the posterior and calculate y
    X = np.c_[np.ones(i), x[:i]]  # design matrix: rows [1, x_k] for k < i
    t = t_n[:i]
    post = calc_posterior(t, X)
    w = post.rvs(6)               # 6 weight samples, shape (6, 2)
    y = w[:,0] + w[:,1]*x_        # one regression line per sampled weight

    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15,5))

    # Left plot: likelihood surface of datapoint i over weight space
    ax1.contourf(xx, yy, np.asarray(Z).reshape(step,step), cmap=plt.cm.jet)
    ax1.set(title='likelihood of datapoint {}'.format(i), xlabel='$w_0$', ylabel='$w_1$')
    ax1.scatter(*a, s=80, c='w', marker='+', lw=2)  # white cross = true (a0, a1)

    # Middle plot: posterior over the weights after i observations
    ax2.contourf(xx, yy, post.pdf(grid).reshape(step,step), cmap=plt.cm.jet)
    ax2.scatter(*a, s=80, c='w', marker='+', lw=2)  # white cross = true (a0, a1)
    ax2.set(title='Posterior after observing {} datapoints'.format(i), xlabel='$w_0$', ylabel='$w_1$')

    # Right plot: sampled regression lines plus the observed data
    ax3.plot(np.tile(x_,(1,6)), y, c='r' , alpha=0.5, linewidth=3, zorder=0)
    ax3.scatter(x[:i], t_n[:i],s=30, marker='o', linewidth=1, facecolors='None', edgecolors='darkblue',
                label='observed data points', zorder=99)
    ax3.set(title='data space', xlabel='$x$', ylabel='$y$')
    ax3.legend(loc=2);

    # Shared axis cosmetics for all three panels
    for ax in fig.axes:
        ax.set(xlim=(g_min,g_max), ylim=(g_min,g_max), aspect='equal')
        ax.xaxis.set_ticks([-1,0,1])
        ax.yaxis.set_ticks([-1,0,1])

    return(fig)


Plot the initial prior distribution and the model in data space using 6 samples from the prior.

In [16]:
# Values for contourplot: evaluate the initial prior over the weight grid
p_prior = calc_init_prior()
Z = p_prior.pdf(grid)

# Draw 6 samples from the prior and calculate y (fixed seed for repeatability)
w = p_prior.rvs(6, random_state=3)
y = w[:,0] + w[:,1]*x_  # one regression line per sampled weight vector

# Plots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12,5))

# Left plot: prior density over (w0, w1) with the 6 samples marked
c = ax1.contourf(xx, yy, Z.reshape(step,step), cmap=plt.cm.jet)
ax1.scatter(w[:,0], w[:,1], c='w', s=30, zorder=99)
ax1.set(title='prior(initial)', xlabel='$w_0$', ylabel='$w_1$')

# Right plot: the regression lines those samples induce in data space
ax2.plot(np.tile(x_,(1,6)), y, c='r', linewidth=3)
ax2.set(title='data space', xlabel='$x$', ylabel='$y$')

# Shared axis cosmetics
for ax in fig.axes:
    ax.set(xlim=(g_min,g_max), ylim=(g_min,g_max), aspect='equal')
    ax.xaxis.set_ticks([-1,0,1])
    ax.yaxis.set_ticks([-1,0,1])


Now we can plot what happens when we observe more and more data points (sequentially).

• Likelihood for a datapoint $X_i$
• Posterior distribution after observing the datapoints $X_1$ to $X_i$
• Six versions of the model $y(x,\textbf{w}) = w_0 + w_1x$ after taking 6 samples from the posterior distribution of $w$

Note: the white cross in the left and middle plots indicates the parameter values used to generate the data.

In [17]:
[plot_posterior_update(i) for i in [1,2,5,20]];


### MCMC sampling¶

Let's see what results we get when we use sampling instead of calculating the posterior analytically.

In [23]:
# Fit the same linear model by MCMC with PyMC3 (formula-style GLM: y ~ x)
data = pd.DataFrame({'x':x, 'y':t_n})

with pmc.Model() as model:

    # NOTE(review): pmc.glm.glm(...) is the pre-3.1 PyMC3 API; later
    # releases renamed this to pmc.glm.GLM.from_formula — confirm against
    # the installed pymc3 version.
    pmc.glm.glm('y ~ x', data, )

    # Draw 10000 samples; sampler assignment (NUTS) is automatic
    trace = pmc.sample(10000, progressbar=True)

Applied log-transform to sd and added transformed sd_log_ to model.
Assigned NUTS to Intercept
Assigned NUTS to x
Assigned NUTS to sd_log_
[-----------------100%-----------------] 10000 of 10000 complete in 7.0 sec
In [26]:
# Build a Gaussian summary of the sampled posterior over (w0, w1).
# NOTE(review): this centres the Gaussian on the LAST trace point and scales
# the trace covariance by alpha**-1 — presumably a quick visual approximation;
# the trace mean (and unscaled covariance) would be the standard choice. Confirm.
cov = pmc.trace_cov(trace, vars=['Intercept', 'x'])

rv = stats.multivariate_normal(mean=[trace.point(-1)['Intercept'], trace.point(-1)['x']], cov=alpha**-1*cov)
Z = rv.pdf(grid)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,8))

# Left plot: sampled posterior over the weights
ax1.contourf(xx, yy, Z.reshape(100,100), cmap=plt.cm.jet)
ax1.scatter(-.3, .5, s=80, c='w', marker='+', lw=2)  # white cross = true (a0, a1)
ax1.set(title='posterior after sampling', xlabel='$w_0$', ylabel='$w_1$')

# Right plot: data with posterior-predictive regression lines from the trace
ax2.scatter(data.iloc[:,0], data.iloc[:,1], s=30, marker='o', facecolors='None', edgecolors='b',
            label='data points')
pmc.glm.plot_posterior_predictive(trace, samples=20, eval=np.asarray([g_min,g_max]), alpha=0.8, c='r',
                                  label='Posterior predictive regression lines')
ax2.plot(x, y_true, c='g', label='true regression line', lw=1)
ax2.set(title='data space', xlabel='$x$', ylabel='$y$', xlim=(-1,1), ylim=(-1,1))
ax2.legend(loc=2)

# Shared axis cosmetics
for ax in fig.axes:
    ax.set(xlim=(g_min,g_max), ylim=(g_min,g_max), aspect='equal')
    ax.xaxis.set_ticks([-1,0,1])
    ax.yaxis.set_ticks([-1,0,1])