from collections import OrderedDict
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import scipy as sp
from theano import shared
%config InlineBackend.figure_format = 'retina'
az.style.use('arviz-darkgrid')
trolley_df = pd.read_csv('Data/Trolley.csv', sep=';')
trolley_df.head()
case | response | order | id | age | male | edu | action | intention | contact | story | action2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | cfaqu | 4 | 2 | 96;434 | 14 | 0 | Middle School | 0 | 0 | 1 | aqu | 1 |
1 | cfbur | 3 | 31 | 96;434 | 14 | 0 | Middle School | 0 | 0 | 1 | bur | 1 |
2 | cfrub | 4 | 16 | 96;434 | 14 | 0 | Middle School | 0 | 0 | 1 | rub | 1 |
3 | cibox | 3 | 32 | 96;434 | 14 | 0 | Middle School | 0 | 1 | 1 | box | 1 |
4 | cibur | 3 | 4 | 96;434 | 14 | 0 | Middle School | 0 | 1 | 1 | bur | 1 |
ax = (trolley_df.response
.value_counts()
.sort_index()
.plot(kind='bar'))
ax.set_xlabel("response", fontsize=14);
ax.set_ylabel("Frequency", fontsize=14);
ax = (trolley_df.response
.value_counts()
.sort_index()
.cumsum()
.div(trolley_df.shape[0])
.plot(marker='o'))
ax.set_xlim(0.9, 7.1);
ax.set_xlabel("response", fontsize=14)
ax.set_ylabel("cumulative proportion", fontsize=14);
resp_lco = (trolley_df.response
.value_counts()
.sort_index()
.cumsum()
.iloc[:-1]
.div(trolley_df.shape[0])
.apply(lambda p: np.log(p / (1. - p))))
ax = resp_lco.plot(marker='o')
ax.set_xlim(0.9, 7);
ax.set_xlabel("response", fontsize=14)
ax.set_ylabel("log-cumulative-odds", fontsize=14);
with pm.Model() as m11_1:
a = pm.Normal(
'a', 0., 10.,
transform=pm.distributions.transforms.ordered,
shape=6, testval=np.arange(6) - 2.5)
resp_obs = pm.OrderedLogistic(
'resp_obs', 0., a,
observed=trolley_df.response.values - 1
)
/home/osvaldo/anaconda3/lib/python3.7/site-packages/theano/tensor/subtensor.py:2197: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result. rval = inputs[0].__getitem__(inputs[1:]) /home/osvaldo/anaconda3/lib/python3.7/site-packages/theano/tensor/subtensor.py:2197: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result. rval = inputs[0].__getitem__(inputs[1:])
with m11_1:
map_11_1 = pm.find_MAP()
/home/osvaldo/proyectos/00_PyMC3/pymc3/pymc3/tuning/starting.py:61: UserWarning: find_MAP should not be used to initialize the NUTS sampler, simply call pymc3.sample() and it will automatically initialize NUTS in a better way. warnings.warn('find_MAP should not be used to initialize the NUTS sampler, simply call pymc3.sample() and it will automatically initialize NUTS in a better way.')
HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))
/home/osvaldo/anaconda3/lib/python3.7/site-packages/theano/tensor/subtensor.py:2197: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result. rval = inputs[0].__getitem__(inputs[1:])
map_11_1['a']
array([-1.9160707 , -1.26658298, -0.71862013, 0.24778795, 0.88986631, 1.76937289])
daf
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-11-978c12f15f3b> in <module> ----> 1 daf NameError: name 'daf' is not defined
sp.special.expit(map_11_1['a'])
with m11_1:
trace_11_1 = pm.sample(1000, tune=1000)
az.summary(trace_11_1, var_names=['a'], credible_interval=.89, rount_to=2)
def ordered_logistic_proba(a):
pa = sp.special.expit(a)
p_cum = np.concatenate(([0.], pa, [1.]))
return p_cum[1:] - p_cum[:-1]
ordered_logistic_proba(trace_11_1['a'].mean(axis=0))
(ordered_logistic_proba(trace_11_1['a'].mean(axis=0)) \
* (1 + np.arange(7))).sum()
ordered_logistic_proba(trace_11_1['a'].mean(axis=0) - 0.5)
(ordered_logistic_proba(trace_11_1['a'].mean(axis=0) - 0.5) \
* (1 + np.arange(7))).sum()
action = shared(trolley_df.action.values)
intention = shared(trolley_df.intention.values)
contact = shared(trolley_df.contact.values)
with pm.Model() as m11_2:
a = pm.Normal(
'a', 0., 10.,
transform=pm.distributions.transforms.ordered,
shape=6,
testval=trace_11_1['a'].mean(axis=0)
)
bA = pm.Normal('bA', 0., 10.)
bI = pm.Normal('bI', 0., 10.)
bC = pm.Normal('bC', 0., 10.)
phi = bA * action + bI * intention + bC * contact
resp_obs = pm.OrderedLogistic(
'resp_obs', phi, a,
observed=trolley_df.response.values - 1
)
with m11_2:
map_11_2 = pm.find_MAP()
with pm.Model() as m11_3:
a = pm.Normal(
'a', 0., 10.,
transform=pm.distributions.transforms.ordered,
shape=6,
testval=trace_11_1['a'].mean(axis=0)
)
bA = pm.Normal('bA', 0., 10.)
bI = pm.Normal('bI', 0., 10.)
bC = pm.Normal('bC', 0., 10.)
bAI = pm.Normal('bAI', 0., 10.)
bCI = pm.Normal('bCI', 0., 10.)
phi = bA * action + bI * intention + bC * contact \
+ bAI * action * intention \
+ bCI * contact * intention
resp_obs = pm.OrderedLogistic(
'resp_obs', phi, a,
observed=trolley_df.response - 1
)
with m11_3:
map_11_3 = pm.find_MAP()
def get_coefs(map_est):
coefs = OrderedDict()
for i, ai in enumerate(map_est['a']):
coefs[f'a_{i}'] = ai
coefs['bA'] = map_est.get('bA', np.nan)
coefs['bI'] = map_est.get('bI', np.nan)
coefs['bC'] = map_est.get('bC', np.nan)
coefs['bAI'] = map_est.get('bAI', np.nan)
coefs['bCI'] = map_est.get('bCI', np.nan)
return coefs
(pd.DataFrame.from_dict(
OrderedDict([
('m11_1', get_coefs(map_11_1)),
('m11_2', get_coefs(map_11_2)),
('m11_3', get_coefs(map_11_3))
]))
.astype(np.float64)
.round(2))
with m11_2:
trace_11_2 = pm.sample(1000, tune=1000)
with m11_3:
trace_11_3 = pm.sample(1000, tune=1000)
comp_df = pm.compare({m11_1:trace_11_1,
m11_2:trace_11_2,
m11_3:trace_11_3})
comp_df.loc[:,'model'] = pd.Series(['m11.1', 'm11.2', 'm11.3'])
comp_df = comp_df.set_index('model')
comp_df
pp_df = pd.DataFrame(np.array([[0, 0, 0],
[0, 0, 1],
[1, 0, 0],
[1, 0, 1],
[0, 1, 0],
[0, 1, 1]]),
columns=['action', 'contact', 'intention'])
pp_df
action.set_value(pp_df.action.values)
contact.set_value(pp_df.contact.values)
intention.set_value(pp_df.intention.values)
with m11_3:
pp_trace_11_3 = pm.sample_ppc(trace_11_3, samples=1500)
PP_COLS = [f'pp_{i}' for i, _ in enumerate(pp_trace_11_3['resp_obs'])]
pp_df = pd.concat((pp_df,
pd.DataFrame(pp_trace_11_3['resp_obs'].T, columns=PP_COLS)),
axis=1)
pp_cum_df = (pd.melt(
pp_df,
id_vars=['action', 'contact', 'intention'],
value_vars=PP_COLS, value_name='resp'
)
.groupby(['action', 'contact', 'intention', 'resp'])
.size()
.div(1500)
.rename('proba')
.reset_index()
.pivot_table(
index=['action', 'contact', 'intention'],
values='proba',
columns='resp'
)
.cumsum(axis=1)
.iloc[:, :-1])
pp_cum_df
for (plot_action, plot_contact), plot_df in pp_cum_df.groupby(level=['action', 'contact']):
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot([0, 1], plot_df, c='C0');
ax.plot([0, 1], [0, 0], '--', c='C0');
ax.plot([0, 1], [1, 1], '--', c='C0');
ax.set_xlim(0, 1);
ax.set_xlabel("intention");
ax.set_ylim(-0.05, 1.05);
ax.set_ylabel("probability");
ax.set_title(
"action = {action}, contact = {contact}".format(
action=plot_action, contact=plot_contact
)
);
# define parameters
PROB_DRINK = 0.2 # 20% of days
RATE_WORK = 1. # average 1 manuscript per day
# sample one year of production
N = 365
drink = np.random.binomial(1, PROB_DRINK, size=N)
y = (1 - drink) * np.random.poisson(RATE_WORK, size=N)
drink_zeros = drink.sum()
work_zeros = (y == 0).sum() - drink_zeros
bins = np.arange(y.max() + 1) - 0.5
plt.hist(y, bins=bins);
plt.bar(0., drink_zeros, width=1., bottom=work_zeros, color='C1', alpha=.5);
plt.xticks(bins + 0.5);
plt.xlabel("manuscripts completed");
plt.ylabel("Frequency");
with pm.Model() as m11_4:
ap = pm.Normal('ap', 0., 1.)
p = pm.math.sigmoid(ap)
al = pm.Normal('al', 0., 10.)
lambda_ = pm.math.exp(al)
y_obs = pm.ZeroInflatedPoisson('y_obs', 1. - p, lambda_, observed=y)
with m11_4:
map_11_4 = pm.find_MAP()
map_11_4
sp.special.expit(map_11_4['ap']) # probability drink
np.exp(map_11_4['al']) # rate finish manuscripts, when not drinking
def dzip(x, p, lambda_, log=True):
like = p**(x == 0) + (1 - p) * sp.stats.poisson.pmf(x, lambda_)
return np.log(like) if log else like
PBAR = 0.5
THETA = 5.
a = PBAR * THETA
b = (1 - PBAR) * THETA
p = np.linspace(0, 1, 100)
plt.plot(p, sp.stats.beta.pdf(p, a, b));
plt.xlim(0, 1);
plt.xlabel("probability");
plt.ylabel("Density");
admit_df = pd.read_csv('Data/UCBadmit.csv', sep=';')
admit_df.head()
with pm.Model() as m11_5:
a = pm.Normal('a', 0., 2.)
pbar = pm.Deterministic('pbar', pm.math.sigmoid(a))
theta = pm.Exponential('theta', 1.)
admit_obs = pm.BetaBinomial(
'admit_obs',
pbar * theta, (1. - pbar) * theta,
admit_df.applications.values,
observed=admit_df.admit.values
)
with m11_5:
trace_11_5 = pm.sample(1000, tune=1000)
pm.summary(trace_11_5, alpha=.11).round(2)
np.percentile(trace_11_5['pbar'], [2.5, 50., 97.5])
pbar_hat = trace_11_5['pbar'].mean()
theta_hat = trace_11_5['theta'].mean()
p_plot = np.linspace(0, 1, 100)
plt.plot(
p_plot,
sp.stats.beta.pdf(p_plot, pbar_hat * theta_hat, (1. - pbar_hat) * theta_hat)
);
plt.plot(
p_plot,
sp.stats.beta.pdf(
p_plot[:, np.newaxis],
trace_11_5['pbar'][:100] * trace_11_5['theta'][:100],
(1. - trace_11_5['pbar'][:100]) * trace_11_5['theta'][:100]
),
c='C0', alpha=0.1
);
plt.xlim(0., 1.);
plt.xlabel("probability admit");
plt.ylim(0., 3.);
plt.ylabel("Density");
with m11_5:
pp_trace_11_5 = pm.sample_ppc(trace_11_5)
x_case = np.arange(admit_df.shape[0])
plt.scatter(
x_case,
pp_trace_11_5['admit_obs'].mean(axis=0) \
/ admit_df.applications.values
);
plt.scatter(x_case, admit_df.admit / admit_df.applications);
high = np.percentile(pp_trace_11_5['admit_obs'], 95, axis=0) \
/ admit_df.applications.values
plt.scatter(x_case, high, marker='x', c='k');
low = np.percentile(pp_trace_11_5['admit_obs'], 5, axis=0) \
/ admit_df.applications.values
plt.scatter(x_case, low, marker='x', c='k');
mu = 3.
theta = 1.
x = np.linspace(0, 10, 100)
plt.plot(x, sp.stats.gamma.pdf(x, mu / theta, scale=theta));
import platform
import sys
import IPython
import matplotlib
import scipy
print("This notebook was createad on a computer {} running {} and using:\nPython {}\nIPython {}\nPyMC3 {}\nNumPy {}\nPandas {}\nSciPy {}\nMatplotlib {}\n".format(platform.machine(), ' '.join(platform.linux_distribution()[:2]), sys.version[:5], IPython.__version__, pm.__version__, np.__version__, pd.__version__, scipy.__version__, matplotlib.__version__))