Bayesian Linear Regression

Bishop, C.M. (2006), Pattern Recognition and Machine Learning, Springer Science+Business Media, New York.

Example taken from section 3.3.

In [1]:
# %load std_ipython_import.txt
import pandas as pd
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pymc3 as pmc

pd.set_option('display.notebook_repr_html', False)

%matplotlib inline
%config InlineBackend.figure_formats = {'retina',}'seaborn-white')

Generate data

  • Consider single input variable $x$ and output variable $t$ and model

$$y(x,\textbf{w}) = w_0 + w_1x$$

  • Select $x_n$ from the uniform distribution over the interval $(-1, 1)$.
  • Create data using the function $f(x,a) = a_0 + a_1x \,$ where $a_0 = -0.3$ and $a_1 = 0.5$
  • Add Gaussian noise with standard deviation 0.2 to obtain target values $t_n$.
In [2]:
# Constants of the experiment (Bishop section 3.3)
size = 20                 # number of generated datapoints
a = [-0.3, 0.5]           # true parameters a0, a1 of f(x, a) = a0 + a1*x
sd = .2                   # noise standard deviation
alpha = 2.                # prior precision
beta = 1. / (sd ** 2)     # noise precision (1 / variance)

# Plotting grid over (w0, w1) parameter space
g_min, g_max, step = -1.5, 1.5, 100
_axis = np.linspace(g_min, g_max, step)
xx, yy = np.meshgrid(_axis, _axis)
grid = np.column_stack((xx.ravel(), yy.ravel()))

# Column vector of x values for drawing regression lines in data space
x_ = _axis.reshape(-1, 1)
In [11]:
# Reproducibility: seed the global RNG used by both stats.uniform.rvs
# and np.random.normal below, so Restart & Run All gives the same data.
np.random.seed(0)

x = stats.uniform.rvs(-1, scale=2, size=size)   # x_n ~ U(-1, 1)
y_true = a[0] + a[1] * x                        # noise-free f(x, a)

# Target values: add Gaussian noise with standard deviation sd
t_n = y_true + np.random.normal(scale=sd, size=size)

# plot the generated data
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Left plot - histogram of the sampled inputs x
ax1.hist(x, histtype='stepfilled', alpha=0.5, color='b')
ax1.set(xlabel='$x$', ylabel='count')   # histogram height is a count, not a "value"

# Right plot - generated targets vs. the true regression line
ax2.scatter(x, t_n, s=30, marker='o', facecolors='None', edgecolors='b', label='generated data')
ax2.plot(x, y_true, c='g', label='true regression line')
ax2.set(xlabel='$x$', ylabel='$y$', xlim=(-1, 1), ylim=(-1, 1), aspect='equal')
ax2.legend(loc='upper left');

Isotropic Gaussian prior

$$p(w\,|\,\alpha)= \mathcal{N}(w\,|\,0,\alpha^{-1}I)$$

In [12]:
def calc_init_prior(mean=(0, 0), alpha=2.):
    """Isotropic Gaussian prior p(w | alpha) = N(w | mean, alpha^-1 * I).

    Parameters
    ----------
    mean : sequence of length 2, prior mean over (w0, w1).
    alpha : float, prior precision; the covariance is alpha^-1 * I.

    Returns
    -------
    A frozen scipy multivariate normal distribution.
    """
    # Bug fix: the original built `rv` but never returned it.
    return stats.multivariate_normal(mean=mean, cov=(alpha ** -1) * np.identity(2))

Likelihood of a single datapoint as a function of $w$ and $\beta$

$$p(t\,|\,X_i,w, \beta) = \mathcal{N}(t\,|\,w^{T}\phi(X_i), \beta^{-1})$$

In [13]:
def calc_datapoint_likelihood(datapoint=1):
    """Likelihood p(t_i | x_i, w, beta) of a single datapoint, evaluated
    for every (w0, w1) on the plotting grid.

    Parameters
    ----------
    datapoint : int, 1-based index of the datapoint (uses module-level x, t_n).

    Returns
    -------
    list of likelihood values, one per grid point (len == step*step).
    """
    Z = []
    # for every w0 and w1 on the grid, calculate the likelihood for datapoint i
    for w0, w1 in grid:
        # N(t | w0 + w1*x, beta^-1): beta^-1 is the *variance*, so scipy's
        # `scale` (a standard deviation) must be beta**-0.5 (== sd), not beta**-1.
        Z.append(stats.norm.pdf(w0 + w1 * x[datapoint - 1],
                                loc=t_n[datapoint - 1],
                                scale=beta ** -0.5))
    return Z

Gaussian posterior

$$p(w\,|\,t) = \mathcal{N}(w\,|\,m_{N}, S_N)$$

$$m_N = \beta\, S_N\Phi^Tt$$

$$S_{N}^{-1} = \alpha\,I + \beta\, \Phi^T\Phi$$

In [14]:
def calc_posterior(t, X, alpha_prior=None, beta_noise=None):
    """Gaussian posterior p(w | t) = N(w | m_N, S_N) for the linear model.

    Implements Bishop eqs. 3.53/3.54:
        S_N^{-1} = alpha*I + beta * Phi^T Phi
        m_N      = beta * S_N Phi^T t

    Parameters
    ----------
    t : array of targets, shape (n,).
    X : design matrix Phi with a column of ones and a column of x, shape (n, 2).
    alpha_prior, beta_noise : optional overrides; fall back to the
        module-level constants `alpha` and `beta` when None.

    Returns
    -------
    A frozen scipy multivariate normal over (w0, w1).
    """
    a_ = alpha if alpha_prior is None else alpha_prior
    b_ = beta if beta_noise is None else beta_noise
    Sn = np.linalg.inv(a_ * np.identity(2) + b_ * X.T.dot(X))
    mn = b_ * Sn.dot(X.T).dot(t)
    return stats.multivariate_normal(np.ravel(mn), cov=Sn)
In [15]:
def plot_posterior_update(i=1):
    """Visualise the Bayesian update after observing the first i datapoints.

    Left:   likelihood of datapoint i over the (w0, w1) grid.
    Middle: posterior p(w | t_1..t_i).
    Right:  6 regression lines sampled from that posterior, with the data.
    """
    Z = calc_datapoint_likelihood(i)

    # Design matrix Phi = [1, x] for the first i points and their targets
    X = np.c_[np.ones(i), x[:i]]
    t = t_n[:i]
    post = calc_posterior(t, X)

    # Draw 6 samples from the posterior and calculate y = w0 + w1*x
    w = post.rvs(6)
    y = w[:, 0] + w[:, 1] * x_

    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

    # Left plot (contourf call reconstructed from a truncated export line)
    ax1.contourf(xx, yy, np.asarray(Z).reshape(step, step))
    ax1.set(title='likelihood of datapoint {}'.format(i), xlabel='$w_0$', ylabel='$w_1$')
    # white cross marks the true generating parameters a = (a0, a1)
    ax1.scatter(*a, s=80, c='w', marker='+', lw=2)

    # Middle plot
    ax2.contourf(xx, yy, post.pdf(grid).reshape(step, step))
    ax2.scatter(*a, s=80, c='w', marker='+', lw=2)
    ax2.set(title='Posterior after observing {} datapoints'.format(i), xlabel='$w_0$', ylabel='$w_1$')

    # Right plot - sampled regression lines in data space
    ax3.plot(np.tile(x_, (1, 6)), y, c='r', alpha=0.5, linewidth=3, zorder=0)
    ax3.scatter(x[:i], t_n[:i], s=30, marker='o', linewidth=1, facecolors='None', edgecolors='darkblue',
                label='observed data points', zorder=99)
    ax3.set(title='data space', xlabel='$x$', ylabel='$y$')

    for ax in fig.axes:
        ax.set(xlim=(g_min, g_max), ylim=(g_min, g_max), aspect='equal')

Plot the initial prior distribution and the model in data space using 6 samples from the prior.

In [16]:
# Values for contour plot of the initial prior over (w0, w1)
p_prior = calc_init_prior()
Z = p_prior.pdf(grid)

# Draw 6 samples from the prior and calculate y = w0 + w1*x
w = p_prior.rvs(6, random_state=3)
y = w[:, 0] + w[:, 1] * x_

# Plots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Left plot - prior density with the sampled w's overlaid
# (contourf call reconstructed from a truncated export line; the unused
# `c =` assignment was dropped)
ax1.contourf(xx, yy, Z.reshape(step, step))
ax1.scatter(w[:, 0], w[:, 1], c='w', s=30, zorder=99)
ax1.set(title='prior(initial)', xlabel='$w_0$', ylabel='$w_1$')

# Right plot - the 6 sampled models in data space
ax2.plot(np.tile(x_, (1, 6)), y, c='r', linewidth=3)
ax2.set(title='data space', xlabel='$x$', ylabel='$y$')

for ax in fig.axes:
    ax.set(xlim=(g_min, g_max), ylim=(g_min, g_max), aspect='equal')

Now we can plot what happens when we observe more and more data points (sequentially).

  • Likelihood for a datapoint $X_i$
  • Posterior distribution after observing the datapoints $X_1$ to $X_i$
  • Six versions of the model $y(x,\textbf{w}) = w_0 + w_1x$ after taking 6 samples from the posterior distribution of $w$

Note: the white cross in the left and middle plot indicate the parameter values used to generate the data.

In [17]:
[plot_posterior_update(i) for i in [1,2,5,20]];

MCMC sampling

Let's see what results we get when we use sampling instead of calculating the posterior analytically.

In [23]:
data = pd.DataFrame({'x': x, 'y': t_n})

with pmc.Model() as model:
    # Build a linear model y ~ x (Intercept, slope and noise sd variables);
    # the export left a dangling ", )" which is removed here.
    pmc.glm.glm('y ~ x', data)
    trace = pmc.sample(10000, progressbar=True)
Applied log-transform to sd and added transformed sd_log_ to model.
Assigned NUTS to Intercept
Assigned NUTS to x
Assigned NUTS to sd_log_
 [-----------------100%-----------------] 10000 of 10000 complete in 7.0 sec
In [26]:
cov = pmc.trace_cov(trace, vars=['Intercept', 'x'])

rv = stats.multivariate_normal(mean=[trace.point(-1)['Intercept'], trace.point(-1)['x']], cov=alpha**-1*cov)
Z = rv.pdf(grid)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,8))

# Left plot
ax1.contourf(xx, yy, Z.reshape(100,100),
ax1.scatter(-.3, .5, s=80, c='w', marker='+', lw=2)
ax1.set(title='posterior after sampling', xlabel='$w_0$', ylabel='$w_1$')

# Right plot
ax2.scatter(data.iloc[:,0], data.iloc[:,1], s=30, marker='o', facecolors='None', edgecolors='b',
            label='data points')
pmc.glm.plot_posterior_predictive(trace, samples=20, eval=np.asarray([g_min,g_max]), alpha=0.8, c='r',
                                  label='Posterior predictive regression lines')
ax2.plot(x, y_true, c='g', label='true regression line', lw=1)
ax2.set(title='data space', xlabel='$x$', ylabel='$y$', xlim=(-1,1), ylim=(-1,1))

for ax in fig.axes:
    ax.set(xlim=(g_min,g_max), ylim=(g_min,g_max), aspect='equal')