## Chapter 18 - Metric Predicted Variable with Multiple Metric Predictors¶

In [1]:
# %load ../../standard_import.txt
import pandas as pd
import numpy as np
import pymc3 as pm
import arviz as az
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import gridspec
from IPython.display import Image

%matplotlib inline
plt.style.use('seaborn-white')

color = '#87ceeb'
f_dict = {'size':16}


### 18.1 - Multiple Linear Regression¶

#### Data¶

In [2]:
df = pd.read_csv('data/Guber1999data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 8 columns):
#   Column     Non-Null Count  Dtype
---  ------     --------------  -----
0   State      50 non-null     object
1   Spend      50 non-null     float64
2   StuTeaRat  50 non-null     float64
3   Salary     50 non-null     float64
4   PrcntTake  50 non-null     int64
5   SATV       50 non-null     int64
6   SATM       50 non-null     int64
7   SATT       50 non-null     int64
dtypes: float64(3), int64(4), object(1)
memory usage: 3.2+ KB

In [3]:
df.head()

Out[3]:
State Spend StuTeaRat Salary PrcntTake SATV SATM SATT
0 Alabama 4.405 17.2 31.144 8 491 538 1029
1 Alaska 8.963 17.6 47.951 47 445 489 934
2 Arizona 4.778 19.3 32.175 27 448 496 944
3 Arkansas 4.459 17.1 28.934 6 482 523 1005
4 California 4.992 24.0 41.078 45 417 485 902
In [4]:
X = df[['Spend', 'PrcntTake']]
y = df['SATT']

meanx = X.mean().to_numpy()
scalex = X.std().to_numpy()
zX = ((X-meanx)/scalex).to_numpy()

meany = y.mean()
scaley = y.std()
zy = ((y-meany)/scaley).to_numpy()


#### Model (Kruschke, 2015)¶

In [5]:
Image('images/fig18_4.png', width=400)

Out[5]:
In [6]:
with pm.Model() as model:

zbeta0 = pm.Normal('zbeta0', mu=0, sd=2)
#zbeta1 = pm.Normal('zbetaj1', mu=0, sd=2)
#zbeta2 = pm.Normal('zbetaj2', mu=0, sd=2)
#zmu =  zbeta0 + (zbeta1 * X[:,0]) + (zbeta2 * X[:,1])
zbetaj = pm.Normal('zbetaj', mu=0, sd=2, shape=(2))
zmu =  zbeta0 + pm.math.dot(zbetaj, zX.T)

nu = pm.Exponential('nu', 1/29.)
zsigma = pm.Uniform('zsigma', 10**-5, 10)

likelihood = pm.StudentT('likelihood', nu=nu, mu=zmu, lam=1/zsigma**2, observed=zy)

In [7]:
with model:
trace = pm.sample(10000, target_accept=0.9)

/tmp/ipykernel_15723/3508249430.py:2: FutureWarning: In v4.0, pm.sample will return an arviz.InferenceData object instead of a MultiTrace by default. You can pass return_inferencedata=True or return_inferencedata=False to be safe and silence this warning.
trace = pm.sample(10000, target_accept=0.9)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [zsigma, nu, zbetaj, zbeta0]

100.00% [22000/22000 00:17<00:00 Sampling 2 chains, 0 divergences]
Sampling 2 chains for 1_000 tune and 10_000 draw iterations (2_000 + 20_000 draws total) took 19 seconds.

In [8]:
with model:
az.plot_trace(trace);


#### Figure 18.5¶

In [9]:
# Transform parameters back to original scale
beta0 = trace['zbeta0']*scaley + meany - np.sum(trace['zbetaj']*meanx/scalex, axis=1)*scaley
betaj = (trace['zbetaj']/scalex)*scaley
scale = (trace['zsigma']*scaley)

intercept = beta0
spend = betaj[:,0]
prcnttake =  betaj[:,1]
#normality = np.log10(trace['nu'])
normality = trace['nu']

fig, ([ax1, ax2, ax3], [ax4, ax5, ax6]) = plt.subplots(2,3, figsize=(12,6))

for ax, estimate, title, xlabel in zip(fig.axes,
[intercept, spend, prcnttake, scale, normality],
['Intercept', 'Spend', 'PrcntTake', 'Scale', 'Normality'],
[r'$\beta_0$', r'$\beta_1$', r'$\beta_2$', r'$\sigma$', r'$\nu$']):
az.plot_posterior(estimate, point_estimate='mode', ax=ax, color=color)
ax.set_title(title, fontdict=f_dict)
ax.set_xlabel(xlabel, fontdict=f_dict)

fig.tight_layout()
ax6.set_visible(False)


Below we create the scatterplots of figure 18.5 using pairplot() in seaborn and then tweak the lower triangle.

In [10]:
# DataFrame with the columns in correct order
pair_plt = pd.DataFrame({'Intercept':intercept,
'Spend':spend,
'PrcntTake': prcnttake,
'Scale':scale,
'Normality': normality},
columns=['Intercept', 'Spend', 'PrcntTake', 'Scale', 'Normality'])

# Correlation coefficients
corr = np.round(np.corrcoef(pair_plt, rowvar=0), decimals=3)
# Indexes of the lower triangle, below the diagonal
lower_idx = np.tril(corr, -1).nonzero()

# The seaborn pairplot
pgrid = sns.pairplot(pair_plt, plot_kws={'edgecolor':color, 'facecolor':'none'})

# Replace the plots on the diagonal with the parameter names
for i, ax in enumerate(pgrid.diag_axes):
ax.clear()
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)
ax.text(.5,.5, pair_plt.columns[i], transform=ax.transAxes, fontdict={'size':16, 'weight':'bold'}, ha='center')

# Replace the lower triangle with the correlation coefficients
for i, ax in enumerate(pgrid.axes[lower_idx]):
#ax.clear()
ax.collections[0].remove()
ax.xaxis.set_visible(False)
#ax.yaxis.set_visible(False)
ax.text(.5,.5, corr[lower_idx][i], transform=ax.transAxes, fontdict=f_dict, ha='center')


### 18.1.4 - Redundant predictors¶

In [11]:
X2 = X.assign(PropNotTake = (100-df['PrcntTake']) / 100)

meanx2 = X2.mean().to_numpy()
scalex2 = X2.std().to_numpy()
zX2 = ((X2-meanx2)/scalex2).to_numpy()

In [12]:
sns.pairplot(X2, plot_kws={'edgecolor':color, 'facecolor':'none'})

Out[12]:
<seaborn.axisgrid.PairGrid at 0x7f512c352b50>
In [13]:
with pm.Model() as model2:

zbeta0 = pm.Normal('zbeta0', mu=0, sd=2)
zbetaj = pm.Normal('zbetaj', mu=0, sd=2, shape=(3))
zmu =  zbeta0 + pm.math.dot(zbetaj, zX2.T)

nu = pm.Exponential('nu', 1/29.)
zsigma = pm.Uniform('zsigma', 10**-5, 10)

likelihood = pm.StudentT('likelihood', nu=nu, mu=zmu, lam=1/zsigma**2, observed=zy)

In [14]:
with model2:
trace2 = pm.sample(5000, target_accept=0.9)

/tmp/ipykernel_15723/2323202616.py:2: FutureWarning: In v4.0, pm.sample will return an arviz.InferenceData object instead of a MultiTrace by default. You can pass return_inferencedata=True or return_inferencedata=False to be safe and silence this warning.
trace2 = pm.sample(5000, target_accept=0.9)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [zsigma, nu, zbetaj, zbeta0]

100.00% [12000/12000 01:13<00:00 Sampling 2 chains, 0 divergences]
Sampling 2 chains for 1_000 tune and 5_000 draw iterations (2_000 + 10_000 draws total) took 74 seconds.

In [15]:
with model2:
az.plot_trace(trace2);


Note: The trace/histogram above for zbetaj may look like one of the chains got stuck (i.e., all its values are piled up just above zero). However, paying attention to the plotting, you can actually tell that it isn't a chain that is piled up above zero, it's is one of the parameters in the array of parameters stored in zbetaj. Specifically, it is the first parameter, which is the coefficient associated with the Spend variable.

#### Figure 18.6¶

In [16]:
# Transform parameters back to original scale
beta0 = trace2['zbeta0']*scaley + meany - np.sum(trace2['zbetaj']*meanx2/scalex2, axis=1)*scaley
betaj = (trace2['zbetaj']/scalex2)*scaley
scale = (trace2['zsigma']*scaley)

intercept = beta0
spend = betaj[:,0]
prcnttake =  betaj[:,1]
propnottake =  betaj[:,2]
#normality = np.log10(trace2['nu'])
normality = trace2['nu']

fig, ([ax1, ax2, ax3], [ax4, ax5, ax6]) = plt.subplots(2,3, figsize=(12,6))

for ax, estimate, title, xlabel in zip(fig.axes,
[intercept, spend, prcnttake, propnottake, scale, normality],
['Intercept', 'Spend', 'PrcntTake', 'PropNotTake', 'Scale', 'Normality'],
[r'$\beta_0$', r'$\beta_1$', r'$\beta_2$', r'$\beta_3$', r'$\sigma$', r'log10($\nu$)']):
pm.plot_posterior(estimate, point_estimate='mode', ax=ax, color=color)
ax.set_title(title, fontdict=f_dict)
ax.set_xlabel(xlabel, fontdict=f_dict)

plt.tight_layout()