#!/usr/bin/env python
# coding: utf-8
# # Analysis for SSFA project: NewEplsar model
# ## Table of contents
# 1. [Used packages](#imports)
# 1. [Global settings](#settings)
# 1. [Load data](#load)
# 1. [Explore data](#exp)
# 1. [Model specification](#model)
# 1. [Inference](#inference)
# 1. [NewEplsar](#NewEplsar)
# 1. [Summary](#summary)
# ## Used packages
# In[1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pickle
import arviz as az
import pymc3 as pm
# In[2]:
from matplotlib.colors import to_rgb
# In[3]:
import scipy.stats as stats
# In[4]:
from IPython.display import display
# In[5]:
import matplotlib as mpl
# In[6]:
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
# In[7]:
import plotting_lib
# ## Global settings
# #### Output
# In[8]:
writeOut = True
outPathPlots = "../plots/statistical_model_neweplsar/"
outPathData = "../derived_data/statistical_model_neweplsar/"
# #### Plotting
# In[9]:
widthMM = 190
widthInch = widthMM / 25.4
ratio = 0.66666
heigthInch = ratio*widthInch
SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 12
plt.rc('font', size=SMALL_SIZE) # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE) # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE) # fontsize of the figure title
sns.set_style("ticks")
dpi = 300
# In[10]:
sizes = [SMALL_SIZE,MEDIUM_SIZE,BIGGER_SIZE]
# #### Computing
# In[11]:
numSamples = 500
numCores = 10
numTune = 1000
numPredSamples = 2000
random_seed=3651456135
# ## Load data
# In[12]:
datafile = "../derived_data/preprocessing/preprocessed.dat"
# In[13]:
with open(datafile, "rb") as f:
x1,x2,_,df,dataZ,dictMeanStd,dictTreatment,dictSoftware = pickle.load(f)
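# A quick consistency check (a small sketch added here, not part of the original pipeline): the factor arrays and data frames loaded above should align row-wise, since x2[newIndex] is used to index rows later on.
# In[ ]:
assert len(x1) == len(x2) == len(dataZ) == len(df)
print(df.shape, dataZ.shape, np.unique(x1), np.unique(x2))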
# Check that the data were loaded correctly:
# In[14]:
display(pd.DataFrame.from_dict({'x1':x1,'x2':x2}))
# x1 indicates the software used, x2 indicates the treatment applied.
# In[15]:
for surfaceParam,(mean,std) in dictMeanStd.items():
print("Surface parameter {} has mean {} and standard deviation {}".format(surfaceParam,mean,std))
# In[16]:
for k,v in dictTreatment.items():
print("Number {} encodes treatment {}".format(k,v))
# In[17]:
for k,v in dictSoftware.items():
print("Number {} encodes software {}".format(k,v))
# In[18]:
display(dataZ)
# In[19]:
display(df)
# ## Exploration
# In[20]:
dfNewAvail = df[~df.NewEplsar.isna()].copy()
# We look at the overall relationship between the two epLsar variants, epLsar and NewEplsar, on ConfoMap.
# In[21]:
yrange = [0.015,0.022]
xrange = [ -0.0005 , 0.0085]
# In[22]:
ax = sns.scatterplot(data=dfNewAvail,x='epLsar',y='NewEplsar');
ax.set_xlim(xrange);
ax.set_ylim(yrange);
# Could be linear, but there is also a lot of noise.
#
# Maybe different treatments have different behavior?
# In[23]:
ax = sns.scatterplot(data=dfNewAvail,x='epLsar',y='NewEplsar',hue="Treatment");
ax.set_xlim(xrange);
ax.set_ylim(yrange);
# Too crowded; let's look at each dataset separately.
# In[24]:
ax = sns.scatterplot(data=dfNewAvail[dfNewAvail.Dataset == "Sheeps"],x='epLsar',y='NewEplsar',hue="Treatment");
ax.set_xlim(xrange);
ax.set_ylim(yrange);
# In[25]:
ax = sns.scatterplot(data=dfNewAvail[dfNewAvail.Dataset == "GuineaPigs"],x='epLsar',y='NewEplsar',hue="Treatment");
ax.set_xlim(xrange);
ax.set_ylim(yrange);
# In[26]:
ax = sns.scatterplot(data=dfNewAvail[dfNewAvail.Dataset == "Lithics"],x='epLsar',y='NewEplsar',hue="Treatment");
ax.set_xlim(xrange);
ax.set_ylim(yrange);
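# The three per-dataset scatter plots above can also be drawn as one faceted figure; a minimal sketch using seaborn's relplot with the column names used above.
# In[ ]:
g = sns.relplot(data=dfNewAvail, x='epLsar', y='NewEplsar', hue='Treatment',
                col='Dataset', kind='scatter', height=heigthInch, aspect=1.0)
g.set(xlim=xrange, ylim=yrange);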
# ### Standardization to z-scores
# In[27]:
dfNewAvail.NewEplsar
# In[28]:
mu = dfNewAvail.NewEplsar.mean()
mu
# In[29]:
sig = dfNewAvail.NewEplsar.std()
sig
# In[30]:
dictMeanStd['NewEplsar'] = (mu,sig)
# In[31]:
newZ = (dfNewAvail.NewEplsar.values-mu) / sig
# In[32]:
newIndex = df[~df.NewEplsar.isna()].index.values
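# Sanity check (sketch): the z-scored values should have mean ~0 and unit standard deviation, and newIndex must have the same length as newZ.
# In[ ]:
print(np.round(newZ.mean(), 6), np.round(newZ.std(ddof=1), 6), len(newIndex) == len(newZ))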
# ## Model specification
# In[33]:
class Model_NewEplsar(pm.Model):
"""
Compute params of priors and hyperpriors.
"""
def getParams(self,x2,y):
# get lengths
Nx2Lvl = np.unique(x2).size
dims = (Nx2Lvl)
### get standard deviations
# convert to pandas dataframe to use their logic
df = pd.DataFrame.from_dict({'x2':x2,'y':y})
s2 = df.groupby('x2').std()['y'].max()
stdSingle = s2
return (dims, stdSingle)
    def printParams(self,x2,y):
        dims, stdSingle = self.getParams(x2,y)
        Nx2Lvl = dims
        s2 = stdSingle
        print("The number of levels of the x2 variable is {}".format(dims))
        print("The standard deviation used for the beta prior is {}".format(stdSingle))
def __init__(self,name,x2,y,model=None):
# call super's init first, passing model and name
super().__init__(name, model)
# get parameter of hyperpriors
dims, stdSingle = self.getParams(x2,y)
Nx2Lvl = dims
s2 = stdSingle
### hyperpriors ###
# observation hyperpriors
lamY = 1/30.
muGamma = 0.5
sigmaGamma = 2.
# prediction hyperpriors
sigma0 = pm.HalfNormal('sigma0',sd=1)
sigma2 = pm.HalfNormal('sigma2',sd=s2, shape=Nx2Lvl)
mu_b0 = pm.Normal('mu_b0', mu=0., sd=1)
mu_b2 = pm.Normal('mu_b2', mu=0., sd=1, shape=Nx2Lvl)
beta2 = 1.5*(np.sqrt(6)*sigma2)/(np.pi)
### priors ###
# observation priors
nuY = pm.Exponential('nuY',lam=lamY)
sigmaY = pm.Gamma('sigmaY',mu=muGamma, sigma=sigmaGamma)
# prediction priors
b0_dist = pm.Normal('b0_dist', mu=0, sd=1)
b0 = pm.Deterministic("b0", mu_b0 + b0_dist * sigma0)
b2_beta = pm.HalfNormal('b2_beta', sd=beta2, shape=Nx2Lvl)
b2_dist = pm.Gumbel('b2_dist', mu=0, beta=1)
b2 = pm.Deterministic("b2", mu_b2 + b2_beta * b2_dist)
#### prediction ###
mu = pm.Deterministic('mu',b0 + b2[x2])
### observation ###
y = pm.StudentT('y',nu = nuY, mu=mu, sd=sigmaY, observed=y)
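# The model above is a single-factor (treatment-only) analogue of the two-factor model used for the other surface parameters: on the z-scored scale the linear predictor is mu = b0 + b2[treatment], with a Student-t likelihood and non-centered priors. A minimal sketch (hypothetical helper, not used below) of how a posterior draw maps back to the original NewEplsar scale:
# In[ ]:
def backtransformDraw(b0_draw, b2_draw, treatmentCode, meanStd=dictMeanStd['NewEplsar']):
    # model mean for one treatment level, converted from z-scores back to NewEplsar units
    mu, sig = meanStd
    return mu + sig * (b0_draw + b2_draw[treatmentCode])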
# ## Inference
# ### NewEplsar
# In[34]:
with pm.Model() as model:
new_epLsarModel = Model_NewEplsar('NewEplsar',x2[newIndex],newZ)
# #### Verify model settings
# In[35]:
new_epLsarModel.printParams(x2[newIndex],newZ)
# In[36]:
try:
    graph_new_epLsar = pm.model_to_graphviz(new_epLsarModel)
except Exception:
    graph_new_epLsar = "Could not make graph"
graph_new_epLsar
# #### Check prior choice
# In[37]:
with new_epLsarModel as model:
prior_pred_new_epLsar = pm.sample_prior_predictive(samples=numPredSamples,random_seed=random_seed)
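# A quick look at the prior predictive draws on the original NewEplsar scale (sketch; 'NewEplsar_y' is assumed to be the key of the observed variable as prefixed by the named sub-model above).
# In[ ]:
muNE, sigNE = dictMeanStd['NewEplsar']
priorY = muNE + sigNE * prior_pred_new_epLsar['NewEplsar_y']
print(priorY.min(), priorY.max())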
# In[38]:
plotting_lib.plotPriorPredictive(widthInch,heigthInch,dpi,writeOut,outPathPlots,dfNewAvail.reset_index(),dictMeanStd,prior_pred_new_epLsar,newZ,'NewEplsar')
# The prior choice is as intended: broad over the data range.
# #### Sampling
# In[39]:
with new_epLsarModel as model:
trace_new_epLsar = pm.sample(numSamples,cores=numCores,tune=numTune,max_treedepth=20, init='auto',target_accept=0.99,random_seed=random_seed)
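# Before interpreting the posterior, it is worth counting divergent transitions; a minimal sketch using the 'diverging' sampler statistic exposed by PyMC3.
# In[ ]:
print("Number of divergent transitions:", trace_new_epLsar['diverging'].sum())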
# In[40]:
with new_epLsarModel as model:
if writeOut:
with open(outPathData + 'model_{}.pkl'.format('NewEplsar'), 'wb') as buff:
pickle.dump({'model':new_epLsarModel, 'trace': trace_new_epLsar}, buff)
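# If writeOut is enabled, the pickle written above can be loaded again later to reuse the model and trace without re-sampling; a minimal sketch mirroring the dump.
# In[ ]:
with open(outPathData + 'model_{}.pkl'.format('NewEplsar'), 'rb') as buff:
    loaded = pickle.load(buff)
reloadedModel, reloadedTrace = loaded['model'], loaded['trace']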
# #### Check sampling
# In[41]:
with new_epLsarModel as model:
dataTrace_new_epLsar = az.from_pymc3(trace=trace_new_epLsar)
# In[42]:
pm.summary(dataTrace_new_epLsar,hdi_prob=0.95).round(2)
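# A compact numerical convergence check to complement the table above (sketch; 'r_hat' and 'ess_bulk' are the column names used by recent ArviZ versions).
# In[ ]:
summaryDF = az.summary(dataTrace_new_epLsar, hdi_prob=0.95)
print("max r_hat:", summaryDF['r_hat'].max(), "min bulk ESS:", summaryDF['ess_bulk'].min())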
# In[44]:
az.plot_forest(dataTrace_new_epLsar,var_names=['b0','b2'],filter_vars='like',figsize=(widthInch,5*heigthInch),hdi_prob=0.95,ess=True,r_hat=True);
if writeOut:
plt.savefig(outPathPlots + "posterior_forest_{}.pdf".format('NewEplsar'),dpi=dpi)
# In[45]:
with new_epLsarModel as model:
plotting_lib.plotTracesB(widthInch,heigthInch,dpi,writeOut,outPathPlots,trace_new_epLsar,'NewEplsar')
# In[46]:
with new_epLsarModel as model:
plotting_lib.pm.energyplot(trace_new_epLsar)
# #### Posterior predictive distribution
# In[47]:
with new_epLsarModel as model:
posterior_pred_new_epLsar = pm.sample_posterior_predictive(trace_new_epLsar,samples=numPredSamples,random_seed=random_seed)
# In[48]:
plotting_lib.plotPriorPosteriorPredictive(widthInch,heigthInch,dpi,writeOut,outPathPlots,dfNewAvail.reset_index(),dictMeanStd,prior_pred_new_epLsar,posterior_pred_new_epLsar,newZ,'NewEplsar')
# ### Posterior check
# In[49]:
with new_epLsarModel as model:
pm_data_new_epLsar = az.from_pymc3(trace=trace_new_epLsar,prior=prior_pred_new_epLsar,posterior_predictive=posterior_pred_new_epLsar)
# In[50]:
plotting_lib.plotPosterior(widthInch,heigthInch,dpi,writeOut,outPathPlots,dictMeanStd,pm_data_new_epLsar,'NewEplsar')
# ### Compare treatment differences with other epLsar values
# In[51]:
b1P_Old = np.load("../derived_data/statistical_model_two_factors/epLsar_oldb1.npy")
b2P_Old = np.load("../derived_data/statistical_model_two_factors/epLsar_oldb2.npy")
M12P_Old = np.load("../derived_data/statistical_model_two_factors/epLsar_oldM12.npy")
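# Shape check (sketch): as they are indexed in the plotting function below, b2P_Old is expected to have shape (draws, treatment) and M12P_Old (draws, software, treatment); b1P_Old is loaded for completeness.
# In[ ]:
print(b1P_Old.shape, b2P_Old.shape, M12P_Old.shape)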
# In[52]:
from collections import defaultdict
# In[53]:
def plotTreatmentPosterior(widthInch,heigthInch,dpi,sizes,writeOut,path,dictMeanStd,dictTreatment,dictSoftware,trace,yname,x1,x3):
SMALL_SIZE,MEDIUM_SIZE,BIGGER_SIZE = sizes
mu_Val,sig_Val = dictMeanStd[yname]
# get posterior samples
b2P = sig_Val*trace['{}_b2'.format(yname)]
# prepare color dict for treatments
# use groups of 4 colors, as in tab20c
colorIndex = dict({5:0,6:1,7:2,0:4,1:5,4:6,10:7,2:8,3:9,8:10,9:11})
# prepare dataset dict for treatments
dictDataset = dict({5:0,6:0,7:0,0:1,1:1,4:1,10:1,2:2,3:2,8:2,9:2})
# === inverse dict ====
inv_dictDataset = defaultdict(list)
    # invert the mapping: dataset index -> list of treatment codes
    for key, val in dictDataset.items():
        inv_dictDataset[val].append(key)
# ===
# get number of datasets
numDatasets = len(np.unique(list(dictDataset.values())))
# get number of treatments per dataset
dictDataset2NumberTreats = dict()
for numDataset in range(numDatasets):
n = len(inv_dictDataset[numDataset])
dictDataset2NumberTreats[numDataset] = n
# Get maximum of treatments per dataset
tmax = np.max(list(dictDataset2NumberTreats.values()))
# compute maximal number of pairs
maxpair = int(tmax*(tmax-1)/2)
    fig = plt.figure(figsize=(numDatasets*widthInch, maxpair*heigthInch), dpi=dpi)
# store list for hdi
hdiList = []
for indexDataset in np.arange(numDatasets):
# counter for row
rowCounter = 0
# first treatment
for treatmentNum_i,lvl2_i in enumerate(inv_dictDataset[indexDataset]):
# second treatment
for treatmentNum_j,lvl2_j in enumerate(inv_dictDataset[indexDataset]):
if treatmentNum_i > treatmentNum_j:
# set subplot
curr_ax = plt.subplot2grid((maxpair, numDatasets), (rowCounter,indexDataset))
# compute difference between treatments for each software
diffS0 = sig_Val*((M12P_Old[:,0,lvl2_i]+b2P_Old[:,lvl2_i]) -(M12P_Old[:,0,lvl2_j]+b2P_Old[:,lvl2_j]))
diffS1 = sig_Val*((M12P_Old[:,1,lvl2_i]+b2P_Old[:,lvl2_i]) -(M12P_Old[:,1,lvl2_j]+b2P_Old[:,lvl2_j]))
#plot posterior
sns.kdeplot(diffS1,ax=curr_ax,label="epLsar on {}".format(dictSoftware[1]),color='gray',alpha=0.3,ls='--');
sns.kdeplot(diffS0,ax=curr_ax,label="epLsar on {}".format(dictSoftware[0]),color='gray',alpha=0.3,ls='dotted');
sns.kdeplot(b2P[:,lvl2_i]-b2P[:,lvl2_j],ax=curr_ax,label="NewEplsar on {}".format(dictSoftware[0]),color='C0',alpha=0.3,ls='dotted');
# plot reference value zero
curr_ax.axvline(x=0,color="C1")
# get hdi
hdi_new = az.hdi(az.convert_to_inference_data(b2P[:,lvl2_i]-b2P[:,lvl2_j]),hdi_prob=0.95)['x'].values
hdiS0 = az.hdi(az.convert_to_inference_data(diffS0),hdi_prob=0.95)['x'].values
hdiS1 = az.hdi(az.convert_to_inference_data(diffS1),hdi_prob=0.95)['x'].values
isSignificant = lambda x: (x[0] > 0.0) or (x[1] < 0.0)
# store hdi
hdiList.append([dictTreatment[lvl2_i],dictTreatment[lvl2_j],
hdi_new[0],hdi_new[1],isSignificant(hdi_new),
hdiS0[0],hdiS0[1],isSignificant(hdiS0),
hdiS1[0],hdiS1[1],isSignificant(hdiS1)
])
# set title
nameFirst = dictTreatment[lvl2_i]
nameSecond = dictTreatment[lvl2_j]
title = "{} vs. {}".format(nameFirst,nameSecond)
if isSignificant(hdi_new):
title += ": Significant on NewEplsar"
curr_ax.set_title(title)
# add legend
curr_ax.legend()
# set x label
curr_ax.set_xlabel('Delta')
# remove y label decoration
curr_ax.tick_params(left=False)
curr_ax.set(yticklabels=[])
# increment counter
rowCounter += 1
#plt.suptitle('Estimated differences between treatments on {}'.format(yname))
plt.tight_layout()
if writeOut:
plt.savefig(path + "treatment_pairs_{}.pdf".format(yname),dpi=dpi)
plt.show()
# convert hdi to df
df = pd.DataFrame(hdiList,columns=["Treatment_i","Treatment_j",
"hdi_NewEplsar_2.5%","hdi_NewEplsar_97.5%","isSignificant_NewEplsar","hdi_{}_2.5%".format(dictSoftware[0]),"hdi_{}_97.5%".format(dictSoftware[0]),"isSignificant_on_{}".format(dictSoftware[0]),
"hdi_{}_2.5%".format(dictSoftware[1]),"hdi_{}_97.5%".format(dictSoftware[1]),"isSignificant_on_{}".format(dictSoftware[1])])
return df
# In[54]:
dfHDI = plotTreatmentPosterior(widthInch,heigthInch,dpi,sizes,writeOut,outPathPlots,dictMeanStd,dictTreatment,dictSoftware,trace_new_epLsar,'NewEplsar',x1[newIndex],x2[newIndex])
# In[55]:
dfHDI
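# Optional filter (sketch): show only the contrasts whose 95% HDI for NewEplsar excludes zero, using the column produced by the function above.
# In[ ]:
dfHDI[dfHDI.isSignificant_NewEplsar]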
# In[56]:
if writeOut:
dfHDI.to_csv(outPathData+ 'hdi_{}.csv'.format('NewEplsar'))
# ## Summary
# Show where NewEplsar yields different results than epLsar:
# In[65]:
df_summary = dfHDI[(dfHDI.isSignificant_NewEplsar != dfHDI.isSignificant_on_ConfoMap) |
                   (dfHDI.isSignificant_NewEplsar != dfHDI.isSignificant_on_Toothfrax)][
    ["Treatment_i","Treatment_j","isSignificant_NewEplsar","isSignificant_on_Toothfrax","isSignificant_on_ConfoMap",
     "hdi_NewEplsar_2.5%","hdi_NewEplsar_97.5%","hdi_ConfoMap_2.5%","hdi_ConfoMap_97.5%",
     "hdi_Toothfrax_2.5%","hdi_Toothfrax_97.5%"]]
df_summary
# In[66]:
if writeOut:
df_summary.to_csv(outPathData+ 'summary.csv')
# ### Write out
# In[67]:
get_ipython().system('jupyter nbconvert --to html Statistical_Model_NewEplsar.ipynb')
# In[68]:
get_ipython().system('jupyter nbconvert --to markdown Statistical_Model_NewEplsar.ipynb')
# In[ ]: