## Chapter 20 - Metric Predicted Variable with Multiple Nominal Predictors¶

In [2]:
# %load std_ipython_import.txt
import pandas as pd
import numpy as np
import numpy.ma as ma
import matplotlib.pyplot as plt
import seaborn as sns
import pymc3 as pm
import theano.tensor as tt

from scipy.stats import norm
from IPython.display import Image

%matplotlib inline
plt.style.use('seaborn-white')

color = '#87ceeb'

In [3]:
# Calculate Gamma shape and rate from mode and sd.
def gammaShRaFromModeSD(mode, sd):
    """Convert a Gamma distribution's mode and standard deviation into
    its (shape, rate) parameterization (Kruschke, DBDA 2nd ed., ch. 9)."""
    variance = sd**2
    rate = (mode + np.sqrt(mode**2 + 4.0 * variance)) / (2.0 * variance)
    shape = 1.0 + mode * rate
    return shape, rate

In [4]:
def plot_mustache(var, sd, j, width=.75, ax=None):
    """Plot sideways normal-density 'mustaches' (posterior predictive
    densities) anchored at x-position j.

    var   : array of posterior means (one mustache per sampled index)
    sd    : array of posterior SDs, same length as var
    j     : x-axis position (category index) the mustaches hang from
    width : maximum horizontal extent of each mustache
    ax    : matplotlib Axes; falls back to the current axes when None
            (the original crashed with AttributeError if ax was omitted)
    """
    if ax is None:
        ax = plt.gca()

    # Thin the trace: draw only ~10 evenly spaced mustaches.
    # Clamp the step to >= 1 so short traces (len(var) < 10) do not
    # produce a zero step, which np.arange rejects.
    step = max(1, int(len(var) * .1))
    for i in np.arange(0, len(var), step):
        rv = norm(loc=var[i], scale=sd[i])
        yrange = np.linspace(rv.ppf(0.01), rv.ppf(0.99), 100)
        xrange = rv.pdf(yrange)

        # When the SD of a group is large compared to others, the top of its
        # mustache is relatively low and does not plot well together with
        # low-SD groups. Scale xrange so the 'height' of all mustaches is `width`.
        xrange_scaled = xrange * (width / xrange.max())

        # Using the negative value to flip the mustache in the right direction.
        ax.plot(-xrange_scaled + j, yrange, color=color, alpha=.6, zorder=99)


### 20.2 - Hierarchical Bayesian Approach¶

In [6]:
# Load the salary data; Org and Pos are read directly as categoricals.
df = pd.read_csv('data/Salary.csv', usecols=[0,3,5], dtype={'Org': 'category', 'Pos': 'category'})

# Reorder the Pos categories (ascending rank) and rename them to readable labels.
# Assignment form instead of `inplace=True`: the inplace keyword of the
# categorical accessor methods is deprecated and removed in modern pandas.
df['Pos'] = df['Pos'].cat.reorder_categories(['FT3', 'FT2', 'FT1', 'NDW', 'DST'], ordered=True)
df['Pos'] = df['Pos'].cat.rename_categories(['Assis', 'Assoc', 'Full', 'Endow', 'Disting'])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1080 entries, 0 to 1079
Data columns (total 3 columns):
Org       1080 non-null category
Pos       1080 non-null category
Salary    1080 non-null int64
dtypes: category(2), int64(1)
memory usage: 13.8 KB

In [7]:
df.groupby('Pos').apply(lambda x: x.head(2))

Out[7]:
Org Pos Salary
Pos
Assis 4 LGED Assis 63796
6 INFO Assis 98814
Assoc 0 PL Assoc 72395
1 MUTH Assoc 61017
Full 7 CRIN Full 107745
9 PSY Full 173302
Endow 5 MGMT Endow 219600
8 CRIN Endow 114275
Disting 29 SPEA Disting 285000
128 MUHI Disting 114189

#### Model (Kruschke, 2015)¶

In [6]:
Image('images/fig20_2.png')

Out[6]:

Reparameterization of hierarchical models generally results in much more efficient and faster sampling.
See http://twiecki.github.io/blog/2017/02/08/bayesian-hierchical-non-centered/ for background on the non-centered parameterization used below.

In [83]:
# Assemble the data for the two-factor hierarchical model.
y = df.Salary
yMean = y.mean()
ySD = y.std()

# Factor 1: position (Pos), as integer category codes.
x1 = df.Pos.cat.codes.values
Nx1Lvl = len(df.Pos.cat.categories)

# Factor 2: organization/department (Org).
x2 = df.Org.cat.codes.values
Nx2Lvl = len(df.Org.cat.categories)

# Gamma (shape, rate) prior for the deflection SDs, from mode ySD/2 and sd 2*ySD.
agammaShRa = gammaShRaFromModeSD(ySD/2 , 2*ySD)

with pm.Model() as model1:

    # Baseline: non-centered ("tilde") reparameterization of
    # a0 ~ Normal(yMean, ySD*5), for more efficient NUTS sampling.
    #a0 = pm.Normal('a0', yMean, tau=1/(ySD*5)**2)
    a0_tilde = pm.Normal('a0_tilde', mu=0, sd=1)
    a0 = pm.Deterministic('a0', yMean + ySD*5*a0_tilde)

    # Factor-1 (Pos) deflections with a shared hierarchical SD, non-centered.
    a1SD = pm.Gamma('a1SD', agammaShRa[0], agammaShRa[1])
    #a1 = pm.Normal('a1', 0.0, tau=1/a1SD**2, shape=Nx1Lvl)
    a1_tilde = pm.Normal('a1_tilde', mu=0, sd=1, shape=Nx1Lvl)
    a1 = pm.Deterministic('a1', 0.0 + a1SD*a1_tilde)

    # Factor-2 (Org) deflections.
    a2SD = pm.Gamma('a2SD', agammaShRa[0], agammaShRa[1])
    #a2 = pm.Normal('a2', 0.0, tau=1/a2SD**2, shape=Nx2Lvl)
    a2_tilde = pm.Normal('a2_tilde', mu=0, sd=1, shape=Nx2Lvl)
    a2 = pm.Deterministic('a2', 0.0 + a2SD*a2_tilde)

    # Interaction (Pos x Org) deflections.
    a1a2SD = pm.Gamma('a1a2SD', agammaShRa[0], agammaShRa[1])
    #a1a2 = pm.Normal('a1a2', 0.0, 1/a1a2SD**2, shape=(Nx1Lvl, Nx2Lvl))
    a1a2_tilde = pm.Normal('a1a2_tilde', mu=0, sd=1, shape=(Nx1Lvl, Nx2Lvl))
    a1a2 = pm.Deterministic('a1a2', 0.0 + a1a2SD*a1a2_tilde)

    # Cell mean per observation; single homogeneous noise SD for the whole data.
    mu = a0 + a1[x1] + a2[x2] +a1a2[x1, x2]
    ySigma = pm.Uniform('ySigma', ySD/100, ySD*10)

    like = pm.Normal('like', mu, sd=ySigma, observed=y)

In [85]:
# Draw posterior samples; target_accept=.95 takes smaller NUTS steps,
# which reduces divergences in this hierarchical model.
n_samples = 10000
with model1:
    trace1 = pm.sample(n_samples, nuts_kwargs={'target_accept':.95})

Auto-assigning NUTS sampler...
Average Loss = 12,355:  15%|█▍        | 29604/200000 [00:11<01:06, 2554.59it/s]
Convergence archived at 29700
Interrupted at 29,700 [14%]: Average Loss = 13,120
100%|██████████| 10500/10500 [02:44<00:00, 63.86it/s]

In [86]:
# Trace plots for the baseline, deflection, and noise parameters of model1.
pm.traceplot(trace1, varnames=['a0', 'a1', 'a1SD', 'a2', 'a2SD', 'a1a2', 'a1a2SD', 'ySigma']);

In [122]:
# Transforming the trace data to sum-to-zero values:
# convert the unconstrained a-deflections into the b-parameterization of the
# book, where the b1, b2 and b1b2 deflections each sum to zero.
m = np.zeros((Nx1Lvl,Nx2Lvl, len(trace1)))
b1b2 = m.copy()

# Per-cell predicted mean for every posterior sample: m[j1, j2, sample].
for (j1,j2) in np.ndindex(Nx1Lvl,Nx2Lvl):
    m[j1,j2] =  (trace1['a0'] +
                 trace1['a1'][:,j1] +
                 trace1['a2'][:,j2] +
                 trace1['a1a2'][:,j1,j2])

# Grand mean and main-effect deflections (sum to zero by construction).
b0 = np.mean(m, axis=(0,1))
b1 = np.mean(m, axis=1) - b0
b2 = np.mean(m, axis=0) - b0

# Interaction deflections: what remains after grand mean plus main effects.
for (j1,j2) in np.ndindex(Nx1Lvl,Nx2Lvl):
    b1b2[j1,j2] =  (m[j1,j2] - (b0 + b1[j1] + b2[j2]))

# Below are the values corresponding to the first column of table 20.2
# NOTE(review): Org codes 20, 48, 12, 7 presumably select the departments shown
# in the book's table (cf. the [7,12,48,20] <-> BFIN/CHEM/PSY/ENG mapping used
# for Figure 20.3 below) — confirm against df.Org.cat.categories.
print('b0: {}'.format(np.round(np.mean(b0))))
print('b1: {}'.format(np.round(np.mean(b1, axis=1))))
print('b2: {}'.format(np.round(np.mean(b2, axis=1))[[20,48,12,7]]))
print('b1b2: {}'.format(np.round(np.mean(b1b2, axis=2))[0,48]))
print('b1b2: {}'.format(np.round(np.mean(b1b2, axis=2))[2,48]))
print('b1b2: {}'.format(np.round(np.mean(b1b2, axis=2))[0,12]))
print('b1b2: {}'.format(np.round(np.mean(b1b2, axis=2))[2,12]))
print('ySigma: {}'.format(np.round(np.mean(trace1['ySigma']))))

b0: 127106.0
b1: [-46398. -33094.  -3145.  26997.  55640.]
b2: [ -19362.    6687.   19206.  109196.]
b1b2: -3264.0
b1b2: -15098.0
b1b2: -12821.0
b1b2: 12978.0
ySigma: 17990.0


#### Figure 20.3¶

In [88]:
# Discard early samples before plotting posterior predictions.
burnin = 200
scale = trace1['ySigma']

# Figure 20.8 in the book shows the Salary data for four of the sixty departments.
# Let's create the subset for plotting.
subset_org = ['BFIN', 'CHEM', 'PSY', 'ENG']
subset_df = df[df.Org.isin(subset_org)]

fg = sns.FacetGrid(subset_df, col='Org', col_order=subset_org,
                   col_wrap=2, size=3.5, aspect=1.3, despine=False, sharex=False)
fg.map(sns.swarmplot, 'Pos', 'Salary', data=subset_df, color='r')
fg.fig.suptitle('Data with Posterior Prediction', y=1.05, fontsize=16)
for ax in fg.axes:
    ax.set_xlim(xmin=-1)
    ax.set_ylim((0,350000))

# Overlay a posterior-predictive 'mustache' for each (Pos, Org) cell.
# NOTE(review): Org category codes [7,12,48,20] are paired positionally with
# the BFIN/CHEM/PSY/ENG facets above — confirm against df.Org.cat.categories.
for i, org_idx in enumerate([7,12,48,20]):
    for pos_idx in np.arange(5):
        plot_mustache(b0[burnin:]+
                      b1[pos_idx, burnin:]+
                      b2[org_idx,burnin:]+
                      b1b2[pos_idx,org_idx,burnin:], scale[burnin:], pos_idx, ax=fg.axes.flatten()[i])


### 20.2.3 - Main effect contrasts¶

#### Figure 20.4¶

In [89]:
# Main-effect contrasts (Figure 20.4): posterior differences of b-deflections.
# The Org category codes (12, 48, 7, 20) index CHEM, PSY, BFIN, ENG respectively
# per the titles below — NOTE(review): confirm against df.Org.cat.categories.
contrasts = [b1[1,:]-b1[0,:],
             b2[12,:]-b2[48,:],
             b2[7,:]-np.mean([b2[48,:], b2[12,:], b2[20,:]], axis=0)]

titles = ['Assoc\n vs\n Assis',
          'CHEM\n vs\n PSY',
          'BFIN\n vs\n PSY.CHEM.ENG']

fig, axes = plt.subplots(1,3, figsize=(20,4))

# One posterior plot per contrast, with 0 marked as the reference value.
for c, t, ax in zip(contrasts, titles, axes):
    pm.plot_posterior(c, point_estimate='mode', round_to=0, ref_val=0, color=color, ax=ax)
    ax.set_title(t)
    ax.set_xlabel('Difference')


### 20.2.4 - Interaction contrasts and simple effects¶

#### Figure 20.5¶

In [17]:
# The posteriors for the main contrasts can be calculated directly from the parameters.
# The posterior for the interaction contrast (rightmost plot) however, is calculated as follows.

# Full vs Assis: +1/-1 selector over the Pos levels.
# (plain `int` instead of `np.int`: that alias is removed in NumPy >= 1.24)
P = np.zeros(Nx1Lvl, dtype=int)
P[df.Pos.cat.categories == 'Full'] = 1
P[df.Pos.cat.categories == 'Assis'] = -1

# CHEM vs PSY: +1/-1 selector over the Org levels.
O = np.zeros(Nx2Lvl, dtype=int)
O[df.Org.cat.categories == 'CHEM'] = 1
O[df.Org.cat.categories == 'PSY'] = -1

# The signs in the above vectors can be flipped, the end result (posterior) will be the same.
# Using the outer product of these two vectors, we get the matrix we need to multiply
# with the trace values of b1b2.
ic_factors = np.outer(P,O)
pd.DataFrame(ic_factors, index=df.Pos.cat.categories, columns=df.Org.cat.categories)

Out[17]:
ACTG AFRO AMST ANTH APHS AST BEPP BFIN BI BLAN ... RPAD SLIS SLS SOC SPAN SPEA SPHS STAT TELC THTR
Assis 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
Assoc 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
Full 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
Endow 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
Disting 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 60 columns

In [101]:
# Simple effects (Full - Assis within one department) read directly from the
# per-cell posterior means m; the interaction contrast (third entry) is the
# tensor dot of the +/-1 factor matrix with the b1b2 deflections.
contrasts = [m[(df.Pos.cat.categories == 'Full').argmax(),
               (df.Org.cat.categories == 'PSY').argmax(),:] -
             m[(df.Pos.cat.categories == 'Assis').argmax(),
               (df.Org.cat.categories == 'PSY').argmax(),:],

             m[(df.Pos.cat.categories == 'Full').argmax(),
               (df.Org.cat.categories == 'CHEM').argmax(),:] -
             m[(df.Pos.cat.categories == 'Assis').argmax(),
               (df.Org.cat.categories == 'CHEM').argmax(),:],
             np.tensordot(ic_factors, b1b2)]

titles = ['Full - Assis @ PSY',
          'Full - Assis @ CHEM',
          'Full.v.Assis\n(x)\nCHEM.v.PSY']

xlabels = ['Difference in Salary']*2 + ['Difference of Differences']

fig, axes = plt.subplots(1,3, figsize=(20,4))

# One posterior plot per contrast, with 0 marked as the reference value.
for c, t, l, ax in zip(contrasts, titles, xlabels, axes):
    pm.plot_posterior(c, point_estimate='mode', round_to=0, ref_val=0, color=color, ax=ax)
    ax.set_title(t)
    ax.set_xlabel(l)


### 20.4 - Heterogeneous Variances and Robustness against Outliers¶

#### Model (Kruschke, 2015)¶

In [8]:
Image('images/fig20_7.png')

Out[8]:
In [19]:
# Same data assembly as model1.
y = df.Salary

yMean = y.mean()
ySD = y.std()
agammaShRa = gammaShRaFromModeSD(ySD/2 , 2*ySD )

x1 = df.Pos.cat.codes.values
Nx1Lvl = len(df.Pos.cat.categories)

x2 = df.Org.cat.codes.values
Nx2Lvl = len(df.Org.cat.categories)

# Empirical SD of Salary within each (Org, Pos) cell; median and spread of
# those SDs seed a data-informed Gamma hyperprior for the per-cell noise SDs.
cellSDs = df.groupby(['Org','Pos']).std().dropna()
medianCellSD = cellSDs.median().squeeze()
sdCellSD = cellSDs.std().squeeze()
sgammaShRa = gammaShRaFromModeSD(medianCellSD, 2*sdCellSD)

with pm.Model() as model2:

    # Baseline, non-centered reparameterization as in model1.
    #a0 = pm.Normal('a0', yMean, tau=1/(ySD*5)**2)
    a0_tilde = pm.Normal('a0_tilde', mu=0, sd=1)
    a0 = pm.Deterministic('a0', yMean + ySD*5*a0_tilde)

    # Factor-1 (Pos) deflections.
    a1SD = pm.Gamma('a1SD', agammaShRa[0], agammaShRa[1])
    #a1 = pm.Normal('a1', 0.0, tau=1/a1SD**2, shape=Nx1Lvl)
    a1_tilde = pm.Normal('a1_tilde', mu=0, sd=1, shape=Nx1Lvl)
    a1 = pm.Deterministic('a1', 0.0 + a1SD*a1_tilde)

    # Factor-2 (Org) deflections.
    a2SD = pm.Gamma('a2SD', agammaShRa[0], agammaShRa[1])
    #a2 = pm.Normal('a2', 0.0, tau=1/a2SD**2, shape=Nx2Lvl)
    a2_tilde = pm.Normal('a2_tilde', mu=0, sd=1, shape=Nx2Lvl)
    a2 = pm.Deterministic('a2', 0.0 + a2SD*a2_tilde)

    # Interaction (Pos x Org) deflections.
    a1a2SD = pm.Gamma('a1a2SD', agammaShRa[0], agammaShRa[1])
    #a1a2 = pm.Normal('a1a2', 0.0, 1/a1a2SD**2, shape=(Nx1Lvl, Nx2Lvl))
    a1a2_tilde = pm.Normal('a1a2_tilde', mu=0, sd=1, shape=(Nx1Lvl, Nx2Lvl))
    a1a2 = pm.Deterministic('a1a2', 0.0 + a1a2SD*a1a2_tilde)

    mu = a0 + a1[x1] + a2[x2] + a1a2[x1, x2]

    # Heterogeneous variances: each (Pos, Org) cell gets its own sigma, drawn
    # from a Gamma whose mode and SD are themselves hyperparameters. The
    # mode/SD -> (shape, rate) conversion repeats gammaShRaFromModeSD's
    # algebra using theano tensor ops.
    sigmaMode = pm.Gamma('sigmaMode', sgammaShRa[0], sgammaShRa[1])
    sigmaSD = pm.Gamma('sigmaSD', sgammaShRa[0], sgammaShRa[1])
    sigmaRa = pm.Deterministic('sigmaRa', (sigmaMode + tt.sqrt(sigmaMode**2 + 4*sigmaSD**2)) / (2*sigmaSD**2))
    sigmaSh = pm.Deterministic('sigmaSh', (sigmaMode * sigmaRa))
    sigma = pm.Gamma('sigma', sigmaSh, sigmaRa, shape=(Nx1Lvl, Nx2Lvl))
    # Floor the cell SDs to avoid numerically degenerate (near-zero) values.
    ySigma = pm.Deterministic('ySigma', tt.maximum(sigma, medianCellSD/1000))

    # Heavy-tailed Student-t likelihood for robustness against outliers.
    nu = pm.Exponential('nu', 1/30.)

    like = pm.StudentT('like', nu, mu, lam=1/(ySigma[x1, x2])**2, observed=y)

In [21]:
# Fewer draws than model1: the robust, heterogeneous-variance model is
# slower per sample.
n_samples = 1000
with model2:
    trace2 = pm.sample(n_samples, nuts_kwargs={'target_accept':.95})

Auto-assigning NUTS sampler...

# Trace plots for model2, including the per-cell sigma and the tail parameter nu.
pm.traceplot(trace2, varnames=['a0', 'a1', 'a1SD', 'a2', 'a2SD', 'a1a2', 'a1a2SD', 'sigma', 'nu']);