%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pymc3 as pm
import seaborn as sns
import theano.tensor as tt
plt.style.use('seaborn-darkgrid')
plt.rc('font', size=12)
%config Inline.figure_formats = ['retina']
rat_tumor = np.loadtxt('data/rat_tumor_data.txt', skiprows=3)
# If you can use bash
!cat data/rat_tumor_data.txt | head
As you can see, the first column of rat_tumor
is $y$ and the second column is $N$.
print('The shape of rat_tumor is {}'.format(rat_tumor.shape))
We construct the model.
with pm.Model() as model_1:
alpha = pm.Uniform('alpha', lower=0, upper=10)
beta = pm.Uniform('beta', lower=0, upper=25)
theta = pm.Beta('theta', alpha, beta, shape=rat_tumor.shape[0])
y = pm.Binomial('y', n=rat_tumor[:, 1], p=theta, observed=rat_tumor[:, 0])
model_1.check_test_point()
pm.model_to_graphviz(model_1)
with model_1:
trace_1 = pm.sample(draws=5_000, tune=10_000)
pm.traceplot(trace_1, varnames=['alpha', 'beta']);
pm.summary(trace_1, varnames=['alpha', 'beta'])
The posterior plot of alpha
and beta
.
pm.plot_posterior(trace_1, varnames=['alpha', 'beta']);
As you can see, I have used a uniform distribution for alpha
and beta
. You could say that it is a regular prior, not good, not bad, but at the end disappointing. So let's try to improve the priors.
The following is taken from here. According to the book, the prior for $\alpha$ and $\beta$ is
$$p(\alpha, \beta) \propto (\alpha + \beta) ^{-5/2}$$In order to use this prior, you have to define the logarithm of $p(\alpha, \beta)$ and use it in pm.Potential
, look here.
def logp_ab(value):
"""
Prior density for this problem
"""
return tt.log(tt.pow(tt.sum(value), -5/2))
with pm.Model() as model_10:
# Uninformative prior for alpha and beta
ab = pm.HalfFlat('ab', shape=2, testval=np.asarray([1., 1.]))
pm.Potential('p(a, b)', logp_ab(ab))
# Our alpha and beta
alpha = pm.Deterministic('alpha', ab[0])
beta = pm.Deterministic('beta', ab[1])
theta = pm.Beta('theta', alpha=ab[0], beta=ab[1], shape=rat_tumor.shape[0])
p = pm.Binomial('y', p=theta, observed=rat_tumor[:,0], n=rat_tumor[:,1])
with model_10:
trace_10 = pm.sample(draws=5_000, tune=5_000)
pm.traceplot(trace_10, varnames=['alpha', 'beta']);
pm.summary(trace_10, varnames=['alpha', 'beta'])
pm.plot_posterior(trace_10, varnames=['alpha', 'beta']);
Better, don't you think? And they are closer to the value $(\alpha, \beta) = (2.4, 14.0)$ showed in the book.
The next plots are a contour plot and a scatter plot.
begi = 4_000
alpha_trunc = trace_10['beta'][begi: begi + 1000]
beta_trunc = trace_10['alpha'][begi: begi + 1000]
fig, ax = plt.subplots(figsize=(8, 7))
sns.kdeplot(np.log(alpha_trunc / beta_trunc), np.log(alpha_trunc + beta_trunc),
cmap=plt.cm.viridis, n_levels=11, ax=ax)
ax.set_xlim(1.5, 2.1)
ax.set_ylim(1.7, 3.75)
ax.set_xlabel('log(alpha / beta)')
ax.set_ylabel('log(alpha + beta)');
g = sns.jointplot(np.log(alpha_trunc / beta_trunc), np.log(alpha_trunc + beta_trunc),
alpha=0.8, color='C5', kind='hex', height=8)
g.set_axis_labels('log(alpha / beta)', 'log(alpha + beta)');
Now, we draw 1000 samples for theta
from the posterior distribution.
post_theta = pm.sample_ppc(trace_10, samples=1000, model=model_10, vars=[theta], progressbar=False)
Posterior medians and 95% highest posterior density of rat tumor rates, $\theta_j$ (plotted vs. observed tumor rates $y_j / n_j$).
median_theta = []
for i in range(post_theta['theta'].shape[1]):
median_theta.append(np.median(post_theta['theta'][:, i]))
error_up = []
error_down = []
for i in range(post_theta['theta'].shape[1]):
a, b = pm.hpd(post_theta['theta'][:, i], alpha=0.05)
error_up.append(b)
error_down.append(a)
plt.figure(figsize=(10,8))
plt.errorbar(rat_tumor[:, 0] / rat_tumor[:, 1], median_theta, fmt='o',
yerr=[error_down, error_up], ecolor='C5', markerfacecolor='k', mec='k',
errorevery=1)
plt.plot(np.linspace(0, .4, 10), np.linspace(0, .4, 10),
color='k')
plt.xlabel('Observed rate, y(i) / N(i)', fontsize=14)
plt.ylabel('95% hightest posterior density for each theta', fontsize=14);
Data of the eight schools model.
yy = np.array([28., 8., -3., 7., -1., 1., 18., 12.])
sigma = np.array([15., 10., 16., 11., 9., 11., 10., 18.])
with pm.Model() as model_2:
mu = pm.Uniform('mu', lower=0, upper=30)
tau = pm.Uniform('tau', lower=0, upper=30)
theta = pm.Normal('theta', mu=mu, sd=tau, shape=yy.shape[0])
obs = pm.Normal('obs', mu=theta, sd=sigma, observed=yy)
pm.model_to_graphviz(model_2)
with model_2:
trace_2 = pm.sample(draws=15_000, tune=30_000, nuts_kwargs={'target_accept':0.99})
pm.traceplot(trace_2, varnames=['mu', 'tau']);
pm.summary(trace_2)
The posterior plot of tau
pm.plot_posterior(trace_2, varnames=['tau']);
We draw 1000 samples of tau
and theta
from the posterior distribution.
with model_2:
post_trace_sample = pm.sample_ppc(trace_2, samples=1_000, vars=[tau, theta])
def partition_mean(ll, part):
"""
It takes two arguments, a list of pairs of points, (x, y), and the
length of the partition. It returns two lists, the first is a
list of means for each partiton on x and the second is a list of means
for each partition on y.
Returns:
mean_x::list
mean of each partition on x
mean_obs::list
mean of each partition on y
"""
mean_x = []
mean_obs = []
for i in range(0, len(ll), part):
ll_temp = ll[i: i + part]
x_temp = []
y_temp = []
for j in range(len(ll_temp)):
x_temp.append(ll_temp[j][0])
y_temp.append(ll_temp[j][1])
mean_x.append(np.mean(x_temp))
mean_obs.append(np.mean(y_temp))
return mean_x, mean_obs
def partition_std(ll, part):
"""
It takes two arguments, a list of pairs of points, (x, y), and the
length of the partition. It returns two lists, the first is a
list of means for each partiton on x and the second is a list of
standard deviation for each partition on y.
Returns:
mean_x::list
mean of each partition on x
mean_obs::list
standard deviation of each partition on y
"""
mean_x = []
mean_obs = []
for i in range(0, len(ll), part):
ll_temp = ll[i: i + part]
x_temp = []
y_temp = []
for j in range(len(ll_temp)):
x_temp.append(ll_temp[j][0])
y_temp.append(ll_temp[j][1])
mean_x.append(np.mean(x_temp))
mean_obs.append(np.std(y_temp, ddof=1))
return mean_x, mean_obs
This plot is for the conditional posterior means of treatment effects, $E(\theta_j \mid \tau, y)$, as functions of the between-school standard deviation $\tau$, for the educational testing example.
plt.figure(figsize=(13, 7))
for i, z in zip(range(post_trace_sample['theta'].shape[1]),
['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']):
tau_post = post_trace_sample['tau']
obs_post = post_trace_sample['theta'][:, i]
# It is necessary to sort the list of pairs of points
ll = sorted(list(zip(tau_post, obs_post)))
x, y = partition_mean(ll, 100)
plt.plot(x, y, label=z)
plt.xlabel('tau', fontsize=14)
plt.ylabel('Estimated treatment effects', fontsize=14)
plt.legend();
This plot is for the conditional posterior standard deviations of treatment effects, $\textrm{sd}\,(\theta_j \mid \tau, y)$, as functions of the between-school standard deviation $\tau$, for the educational testing example.
plt.figure(figsize=(13, 7))
for i, z in zip(range(post_trace_sample['theta'].shape[1]),
['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']):
tau_post = post_trace_sample['tau']
obs_post = post_trace_sample['theta'][:, i]
# It is necessary to sort the list of pairs of points
ll = sorted(list(zip(tau_post, obs_post)))
x, y = partition_std(ll, 100)
plt.plot(x, y, label=z)
plt.xlabel('tau', fontsize=14)
plt.ylabel('Posterior standard deviations', fontsize=14)
plt.legend();
We draw 200 samples of obs
.
with model_2:
post_obs = pm.sample_ppc(trace_2, samples=200, vars=[obs])
Reproducing the Table 5.3
print('{:^1} {:^50}'.format('School', 'Posterior quantiles'))
print('{:^7} {:^10} {:^10} {:^10} {:^10} {:^10}'.format(' ', '2.5%', '25%', 'median', '75%', '97.5%'))
print('-' * 70)
for i, z in zip(range(post_obs['obs'].shape[1]),
['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']):
a1 = np.percentile(post_obs['obs'][:, i], 2.5)
a2 = np.percentile(post_obs['obs'][:, i], 25)
a3 = np.percentile(post_obs['obs'][:, i], 50)
a4 = np.percentile(post_obs['obs'][:, i], 75)
a5 = np.percentile(post_obs['obs'][:, i], 97.5)
print('{:^7} {:^10.0f} {:^10.0f} {:^10.0f} {:^10.0f} {:^10.0f}'.format(z, a1, a2, a3, a4, a5))
plt.figure(figsize=(10, 6))
_, _, _ = plt.hist(post_trace_sample['theta'][:, 0], color='C5', bins=25, edgecolor='w')
plt.xlabel('Effect in school A', fontsize=14);
max_theta = []
for i in range(200):
with model_2:
post_theta = pm.sample_ppc(trace_2, samples=200, vars=[theta], progressbar=False)
max_theta.append(np.max(post_theta['theta'][:, :]))
plt.figure(figsize=(10, 6))
_, _, _ = plt.hist(max_theta, color='C5', bins=25, edgecolor='w')
plt.xlabel('Largest effect', fontsize=14);
np.mean(np.array(max_theta) >= 28.4)
Ok, so the results are not quite good, I mean, there are a lot of discrepancies and the differences are notorious when you compare the two histograms previously showed with the Figure 5.8 (even more if you compare it with Table 5.3). Yes, the question is: Why does this happen? Is the algorithm behind pymc3
the reason of all this? I would say that priors are really bad and the geometry behind the model is nasty. More info here: https://docs.pymc.io/notebooks/Diagnosing_biased_Inference_with_Divergences.html
Changing the prior of tau_sq
to inv-gamma(1, 1)
.
with pm.Model() as model_3:
mu = pm.Uniform('mu', lower=0, upper=30)
tau_sq = pm.InverseGamma('tau_sq', 1, 1)
theta = pm.Normal('theta', mu=mu, sd=tau_sq, shape=yy.shape[0])
obs = pm.Normal('obs', mu=theta, sd=sigma, observed=yy)
model_3.check_test_point()
with model_3:
trace_3 = pm.sample(draws=5_000, tune=15_000)
pm.traceplot(trace_3, varnames=['mu', 'tau_sq']);
pm.plot_posterior(trace_3, varnames=['tau_sq']);
Although this plot is similar to the Figure 5.9, they are not the same.
Changing the prior of tau_sq
to inv-gamma(0.001, 0.001)
.