In [1]:
import pymc3 as pm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
%qtconsole --colors=linux
plt.style.use('ggplot')

from matplotlib import gridspec
from theano import tensor as tt
from scipy import stats

Chapter 5 - Some examples of data analysis

5.1 Pearson correlation

The Pearson product-moment correlation coefficient, usually denoted r, is a widely used measure of the relationship between two variables. It ranges from −1, indicating a perfect negative linear relationship, to +1, indicating a perfect positive relationship. A value of 0 indicates that there is no linear relationship. Usually the correlation r is reported as a single point estimate, perhaps together with a frequentist significance test.
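
For reference, the conventional sample estimate for n paired observations is $$ r = \frac{\sum_{i=1}^{n}(x_{i1}-\bar{x}_{1})(x_{i2}-\bar{x}_{2})}{\sqrt{\sum_{i=1}^{n}(x_{i1}-\bar{x}_{1})^{2}}\,\sqrt{\sum_{i=1}^{n}(x_{i2}-\bar{x}_{2})^{2}}} $$ where $x_{i1}$ and $x_{i2}$ are the two measurements for the ith observation.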

But, rather than just having a single number to measure the correlation, it would be nice to have a posterior distribution for r, saying how likely each possible level of correlation was. There are frequentist confidence interval methods that try to do this, as well as various analytic Bayesian results based on asymptotic approximations (e.g., Donner & Wells, 1986).

The generative model, which infers the means, standard deviations, and the correlation r jointly: $$ \mu_{1},\mu_{2} \sim \text{Gaussian}(0, .001) $$ $$ \sigma_{1},\sigma_{2} \sim \text{InvSqrtGamma} (.001, .001) $$ $$ r \sim \text{Uniform} (-1, 1) $$
$$ x_{i} \sim \text{MvGaussian} ((\mu_{1},\mu_{2}), \begin{bmatrix}\sigma_{1}^2 & r\sigma_{1}\sigma_{2}\\r\sigma_{1}\sigma_{2} & \sigma_{2}^2\end{bmatrix}^{-1}) $$

The observed data take the form xi = (xi1, xi2) for the ith observation, and, following the theory behind the correlation coefficient, are modeled as draws from a multivariate Gaussian distribution. The parameters of this distribution are the means μ = (μ1,μ2) and standard deviations σ = (σ1,σ2) of the two variables, and the correlation coefficient r that links them.
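
Note that the inverted matrix in the likelihood follows the WinBUGS/JAGS convention of parameterising a multivariate Gaussian by its precision matrix (the inverse of the covariance). The PyMC3 code below instead builds the covariance matrix itself, $$ \Sigma = \begin{bmatrix}\sigma_{1}^2 & r\sigma_{1}\sigma_{2}\\r\sigma_{1}\sigma_{2} & \sigma_{2}^2\end{bmatrix} $$ and passes it through the cov argument of MvNormal, which specifies the same distribution.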

In [2]:
# The datasets:
y = np.array([.8, 102, 1,98, .5,100, 0.9,105, .7,103, 
               0.4,110, 1.2,99, 1.4,87, 0.6,113, 1.1,89, 1.3,93]).reshape((11, 2))

# y = np.array([.8,102, 1,98, .5,100, 0.9,105, .7,103, 
#                0.4,110, 1.2,99, 1.4,87, 0.6,113, 1.1,89, 1.3,93,
#                .8,102, 1,98, .5,100, 0.9,105, .7,103, 
#                0.4,110, 1.2,99, 1.4,87, 0.6,113, 1.1,89, 1.3,93]).reshape((22, 2))

with pm.Model() as model:
    # r∼Uniform(−1,1)
    r = pm.Uniform('r', lower=-1, upper=1)
    
    # μ1,μ2∼Gaussian(0,.001)
    mu = pm.Normal('mu', mu=0, tau=.001, shape=2)
    
    # σ1,σ2∼InvSqrtGamma(.001,.001)
    lambda1 = pm.Gamma('lambda1', alpha=.001, beta=.001)
    lambda2 = pm.Gamma('lambda2', alpha=.001, beta=.001)
    sigma1 = pm.Deterministic('sigma1', 1/tt.sqrt(lambda1))
    sigma2 = pm.Deterministic('sigma2', 1/tt.sqrt(lambda2))

    cov = pm.Deterministic('cov', tt.stacklists([[sigma1**2, r*sigma1*sigma2],
                                                 [r*sigma1*sigma2, sigma2**2]]))

    # xi∼MvGaussian((μ1,μ2),[σ1^2,rσ1σ2;rσ1σ2,σ2^2]^−1)
    yd = pm.MvNormal('yd', mu=mu, cov=cov, observed=y, shape=2)

    trace2 = pm.sample(3000, njobs=2)

pm.traceplot(trace2[50:], varnames=['r', 'mu', 'sigma1', 'sigma2']);
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
100%|██████████| 3500/3500.0 [00:37<00:00, 93.48it/s] 
In [3]:
from matplotlib import gridspec
from scipy.stats import gaussian_kde

r = trace2['r'][50:]
freqr = np.corrcoef(y[:, 0], y[:, 1])
print('The posterior mean of r is: ', np.mean(r, axis=0))
print('The Pearson correlation r is: ', freqr[0, 1])

fig = plt.figure(figsize=(12, 4)) 
gs = gridspec.GridSpec(1, 2, width_ratios=[1, 1]) 
ax0 = plt.subplot(gs[0])
my_pdf = gaussian_kde(r)
x = np.linspace(-1, 1, 200)
ax0.plot(x, my_pdf(x), 'r') # distribution function
ax0.hist(r, bins=100, normed=1, alpha=.3)
ax0.plot([freqr[0, 1], freqr[0, 1]], [0, 3.5], 'k', lw=2.5)
plt.xlabel('Pearson correlation')
plt.ylabel('Posterior Density')

ax1 = plt.subplot(gs[1])
ax1.scatter(y[:, 0], y[:, 1]);
The posterior mean of r is:  -0.695576221086
The Pearson correlation r is:  -0.810967075636
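
The posterior spread matters as much as its mean; a 95% credible interval for r can be read straight off the samples. A minimal sketch, assuming trace2 from the sampling cell above:

In [ ]:
r_samples = trace2['r'][50:]
# Central 95% credible interval from the posterior samples
print('95% credible interval for r: ', np.percentile(r_samples, [2.5, 97.5]))
# pm.hpd(r_samples) gives the highest-posterior-density interval instead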

5.2 Pearson correlation with uncertainty

$$ \mu_{1},\mu_{2} \sim \text{Gaussian}(0, .001) $$ $$ \sigma_{1},\sigma_{2} \sim \text{InvSqrtGamma} (.001, .001) $$ $$ r \sim \text{Uniform} (-1, 1) $$


$$ y_{i} \sim \text{MvGaussian} ((\mu_{1},\mu_{2}), \begin{bmatrix}\sigma_{1}^2 & r\sigma_{1}\sigma_{2}\\r\sigma_{1}\sigma_{2} & \sigma_{2}^2\end{bmatrix}^{-1}) $$ $$ x_{ij} \sim \text{Gaussian}(y_{ij},\lambda_{j}^e) $$
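
Here the y_i are latent "true" values and the observed x_{ij} are noisy measurements of them, with known error precisions λ_j^e. In the code below the error is specified on the standard-deviation scale, $$ \sigma^{e} = (0.03, 1), \qquad \lambda^{e}_{j} = 1/(\sigma^{e}_{j})^{2}, $$ and passed to the Normal observation step through its sd argument.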

In [4]:
# The datasets:
y = np.array([.8, 102, 1,98, .5,100, 0.9,105, .7,103, 
               0.4,110, 1.2,99, 1.4,87, 0.6,113, 1.1,89, 1.3,93]).reshape((11, 2))
# y = np.array([.8,102, 1,98, .5,100, 0.9,105, .7,103, 
#                0.4,110, 1.2,99, 1.4,87, 0.6,113, 1.1,89, 1.3,93,
#                .8,102, 1,98, .5,100, 0.9,105, .7,103, 
#                0.4,110, 1.2,99, 1.4,87, 0.6,113, 1.1,89, 1.3,93]).reshape((22, 2))

n1, n2 = np.shape(y)
# Known measurement-error standard deviations for the two variables
sigmaerror = np.array([.03, 1])
# One row of error sds per observation, matching the shape of y
se = np.tile(sigmaerror, (n1, 1))

with pm.Model() as model2:
    # r∼Uniform(−1,1)
    r = pm.Uniform('r', lower=-1, upper=1)
    
    # μ1,μ2∼Gaussian(0,.001)
    mu = pm.Normal('mu', mu=0, tau=.001, shape=n2)
    
    # σ1,σ2∼InvSqrtGamma(.001,.001)
    lambda1 = pm.Gamma('lambda1', alpha=.001, beta=.001)
    lambda2 = pm.Gamma('lambda2', alpha=.001, beta=.001)
    sigma1 = pm.Deterministic('sigma1', 1/tt.sqrt(lambda1))
    sigma2 = pm.Deterministic('sigma2', 1/tt.sqrt(lambda2))

    cov = pm.Deterministic('cov', tt.stacklists([[sigma1**2, r*sigma1*sigma2],
                                                 [r*sigma1*sigma2, sigma2**2]]))
    
    # xi∼MvGaussian((μ1,μ2),[σ1^2,rσ1σ2;rσ1σ2,σ2^2]^−1)
    yd = pm.MvNormal('yd', mu=mu, cov=cov, shape=(n1, n2))
    
    # xij∼Gaussian(yij, λj^e), with the error given as known standard deviations
    xi = pm.Normal('xi', mu=yd, sd=se, observed=y)
    
    trace2 = pm.sample(1000, njobs=2)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
100%|██████████| 1500/1500.0 [00:32<00:00, 46.12it/s]
In [5]:
pm.traceplot(trace2, varnames=['r', 'mu', 'sigma1', 'sigma2']);

r = trace2['r']
freqr = np.corrcoef(y[:, 0], y[:, 1])
print('The posterior mean of r is: ', np.mean(r, axis=0))
print('The Pearson correlation r is: ', freqr[0, 1])

fig = plt.figure(figsize=(12, 4)) 
gs = gridspec.GridSpec(1, 2, width_ratios=[1, 1]) 
ax0 = plt.subplot(gs[0])
my_pdf = gaussian_kde(r)
x = np.linspace(-1, 1, 200)
ax0.plot(x, my_pdf(x), 'r') # distribution function
ax0.hist(r, bins=100, normed=1, alpha=.3)
ax0.plot([freqr[0, 1], freqr[0, 1]], [0, 3.5], 'k', lw=2.5)
plt.xlabel('Pearson correlation')
plt.ylabel('Posterior Density')

ax1 = plt.subplot(gs[1])
ax1.scatter(y[:, 0], y[:, 1]);
The posterior mean of r is:  -0.70135557984
The Pearson correlation r is:  -0.810967075636

5.3 The kappa coefficient of agreement

The kappa coefficient measures how well one decision-making method agrees with another. For this problem, when both decision-making methods make n independent assessments, the data y take the form of four counts: a observations where both methods decide “one,” b observations where the objective method decides “one” but the surrogate method decides “zero,” c observations where the objective method decides “zero” but the surrogate method decides “one,” and d observations where both methods decide “zero,” with n = a + b + c + d.

$$ \kappa = (\xi-\psi)/(1-\psi) $$ $$ \xi = \alpha\beta + (1-\alpha) \gamma $$ $$ \psi = (\pi_{a}+\pi_{b})(\pi_{a}+\pi_{c})+(\pi_{b}+\pi_{d})(\pi_{c}+\pi_{d}) $$ $$ \alpha,\beta,\gamma \sim \text{Beta} (1, 1) $$


$$ \pi_{a} = \alpha\beta $$ $$ \pi_{b} = \alpha(1-\beta) $$
$$ \pi_{c} = (1-\alpha)(1-\gamma) $$
$$ \pi_{d} = (1-\alpha)\gamma $$
$$ y \sim \text{Multinomial} ([\pi_{a},\pi_{b},\pi_{c},\pi_{d}],n) $$
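
As a point of comparison for the influenza data used below (a = 14, b = 4, c = 5, d = 210, n = 233), Cohen's chance-corrected point estimate works out to $$ p_{0} = \frac{14+210}{233} \approx 0.961, \quad p_{e} = \frac{(14+4)(14+5)+(4+210)(5+210)}{233^{2}} \approx 0.854, \quad \kappa = \frac{p_{0}-p_{e}}{1-p_{e}} \approx 0.74, $$ which is the quantity computed as kappa_Cohen at the end of the sampling cell.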

In [6]:
# CHOOSE a data set:
# Influenza 
y = np.array([14, 4, 5, 210])
# Hearing Loss 
# y = np.array([20, 7, 103, 417])
# Rare Disease
# y = np.array([0, 0, 13, 157])

with pm.Model() as model1:
    # prior
    alpha = pm.Beta('alpha', alpha=1, beta=1)
    beta = pm.Beta('beta', alpha=1, beta=1)
    gamma = pm.Beta('gamma', alpha=1, beta=1)
    
    pi1 = alpha*beta
    pi2 = alpha*(1-beta)
    pi3 = (1-alpha)*(1-gamma)
    pi4 = (1-alpha)*gamma
    
    # Derived Measures   
    # Rate Surrogate Method Agrees With the Objective Method
    xi = alpha*beta+(1-alpha)*gamma
    
    yd = pm.Multinomial('yd', n=y.sum(), p=[pi1, pi2, pi3, pi4], observed=y)

    # Rate of Chance Agreement
    psi = pm.Deterministic('psi', (pi1+pi2)*(pi1+pi3)+(pi2+pi4)*(pi3+pi4))
    
    # Chance-Corrected Agreement
    kappa = pm.Deterministic('kappa', (xi-psi)/(1-psi))
    
    trace = pm.sample(1000, njobs=2)

pm.traceplot(trace, varnames=['kappa', 'psi']);

# Compare to Cohen's point estimate
n = y.sum()
p0 = (y[0]+y[3])/n
pe = (((y[0]+y[1]) * (y[0]+y[2])) + ((y[1]+y[3]) * (y[2]+y[3]))) / n**2
kappa_Cohen = (p0-pe) / (1-pe)
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/theano/tensor/basic.py:2105: UserWarning: theano.tensor.round() changed its default from `half_away_from_zero` to `half_to_even` to have the same default as NumPy. Use the Theano flag `warn.round=False` to disable this warning.
  "theano.tensor.round() changed its default from"
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
100%|██████████| 1500/1500.0 [00:01<00:00, 807.39it/s]
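
To make the comparison explicit, the point estimate can be printed next to the posterior summaries. A minimal sketch, assuming trace and kappa_Cohen from the cell above:

In [ ]:
print('Cohen kappa point estimate: ', kappa_Cohen)
print('Posterior mean of kappa:    ', trace['kappa'].mean())
print('Posterior mean of psi:      ', trace['psi'].mean())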