#!/usr/bin/env python
# coding: utf-8

# # Analysis for SSFA project: NewEplsar model

# ## Table of contents
# 1. [Used packages](#imports)
# 1. [Global settings](#settings)
# 1. [Load data](#load)
# 1. [Explore data](#exp)
# 1. [Model specification](#model)
# 1. [Inference](#inference)
# 1. [NewEplsar](#NewEplsar)
# 1. [Summary](#summary)

# ## Used packages

# In[1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pickle
import arviz as az
import pymc3 as pm

# In[2]:

from matplotlib.colors import to_rgb

# In[3]:

import scipy.stats as stats

# In[4]:

from IPython.display import display

# In[5]:

import matplotlib as mpl

# In[6]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# In[7]:

import plotting_lib

# ## Global settings

# #### Output

# In[8]:

writeOut = True
outPathPlots = "../plots/statistical_model_neweplsar/"
outPathData = "../derived_data/statistical_model_neweplsar/"

# #### Plotting

# In[9]:

widthMM = 190
widthInch = widthMM / 25.4
ratio = 0.66666
heigthInch = ratio * widthInch

SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 12

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

sns.set_style("ticks")

dpi = 300

# In[10]:

sizes = [SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE]

# #### Computing

# In[11]:

numSamples = 500
numCores = 10
numTune = 1000
numPredSamples = 2000
random_seed = 3651456135

# ## Load data

# In[12]:

datafile = "../derived_data/preprocessing/preprocessed.dat"

# In[13]:

with open(datafile, "rb") as f:
    x1, x2, _, df, dataZ, dictMeanStd, dictTreatment, dictSoftware = pickle.load(f)

# Show that everything is correct:

# In[14]:

display(pd.DataFrame.from_dict({'x1': x1, 'x2': x2}))

# x1 indicates the software used, x2 indicates the treatment applied.

# In[15]:

for surfaceParam, (mean, std) in dictMeanStd.items():
    print("Surface parameter {} has mean {} and standard deviation {}".format(surfaceParam, mean, std))

# In[16]:

for k, v in dictTreatment.items():
    print("Number {} encodes treatment {}".format(k, v))

# In[17]:

for k, v in dictSoftware.items():
    print("Number {} encodes software {}".format(k, v))

# In[18]:

display(dataZ)

# In[19]:

display(df)

# ## Exploration

# In[20]:

dfNewAvail = df[~df.NewEplsar.isna()].copy()

# We look at the overall relationship between both epLsar variants on ConfoMap.

# In[21]:

yrange = [0.015, 0.022]
xrange = [-0.0005, 0.0085]

# In[22]:

ax = sns.scatterplot(data=dfNewAvail, x='epLsar', y='NewEplsar')
ax.set_xlim(xrange)
ax.set_ylim(yrange)

# The relationship could be linear, but there is also a lot of noise.
#
# Maybe different treatments have different behavior?
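# Before splitting the plots by treatment, a quick numerical look at the strength of the
# (possibly linear) relationship per treatment. This is an illustrative sketch only, not part
# of the original analysis; groups with fewer than three surfaces are skipped.

# In[ ]:

for treatment, group in dfNewAvail.groupby("Treatment"):
    if len(group) > 2:
        # Pearson correlation between the two epLsar variants within this treatment
        r, p = stats.pearsonr(group["epLsar"], group["NewEplsar"])
        print("Treatment {}: Pearson r = {:.2f} (p = {:.3f}, n = {})".format(treatment, r, p, len(group)))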
# In[23]:

ax = sns.scatterplot(data=dfNewAvail, x='epLsar', y='NewEplsar', hue="Treatment")
ax.set_xlim(xrange)
ax.set_ylim(yrange)

# Too crowded; let's look at it per dataset.

# In[24]:

ax = sns.scatterplot(data=dfNewAvail[dfNewAvail.Dataset == "Sheeps"], x='epLsar', y='NewEplsar', hue="Treatment")
ax.set_xlim(xrange)
ax.set_ylim(yrange)

# In[25]:

ax = sns.scatterplot(data=dfNewAvail[dfNewAvail.Dataset == "GuineaPigs"], x='epLsar', y='NewEplsar', hue="Treatment")
ax.set_xlim(xrange)
ax.set_ylim(yrange)

# In[26]:

ax = sns.scatterplot(data=dfNewAvail[dfNewAvail.Dataset == "Lithics"], x='epLsar', y='NewEplsar', hue="Treatment")
ax.set_xlim(xrange)
ax.set_ylim(yrange)

# ### Standardization in z scores

# In[27]:

dfNewAvail.NewEplsar

# In[28]:

mu = dfNewAvail.NewEplsar.mean()
mu

# In[29]:

sig = dfNewAvail.NewEplsar.std()
sig

# In[30]:

dictMeanStd['NewEplsar'] = (mu, sig)

# In[31]:

newZ = (dfNewAvail.NewEplsar.values - mu) / sig

# In[32]:

newIndex = df[~df.NewEplsar.isna()].index.values

# ## Model specification

# In[33]:

class Model_NewEplsar(pm.Model):
    """
    Compute params of priors and hyperpriors.
    """

    def getParams(self, x2, y):
        # get lengths
        Nx2Lvl = np.unique(x2).size

        dims = (Nx2Lvl)

        ### get standard deviations ###

        # convert to pandas dataframe to use its grouping logic
        df = pd.DataFrame.from_dict({'x2': x2, 'y': y})

        s2 = df.groupby('x2').std()['y'].max()

        stdSingle = s2

        return (dims, stdSingle)

    def printParams(self, x2, y):
        dims, stdSingle = self.getParams(x2, y)
        Nx2Lvl = dims
        s2 = stdSingle

        print("The number of levels of the x variable is {}".format(dims))
        print("The standard deviation used for the beta prior is {}".format(stdSingle))

    def __init__(self, name, x2, y, model=None):
        # call super's init first, passing model and name
        super().__init__(name, model)

        # get parameters of hyperpriors
        dims, stdSingle = self.getParams(x2, y)
        Nx2Lvl = dims
        s2 = stdSingle

        ### hyperpriors ###

        # observation hyperpriors
        lamY = 1 / 30.
        muGamma = 0.5
        sigmaGamma = 2.

        # prediction hyperpriors
        sigma0 = pm.HalfNormal('sigma0', sd=1)
        sigma2 = pm.HalfNormal('sigma2', sd=s2, shape=Nx2Lvl)

        mu_b0 = pm.Normal('mu_b0', mu=0., sd=1)
        mu_b2 = pm.Normal('mu_b2', mu=0., sd=1, shape=Nx2Lvl)

        beta2 = 1.5 * (np.sqrt(6) * sigma2) / (np.pi)

        ### priors ###

        # observation priors
        nuY = pm.Exponential('nuY', lam=lamY)
        sigmaY = pm.Gamma('sigmaY', mu=muGamma, sigma=sigmaGamma)

        # prediction priors
        b0_dist = pm.Normal('b0_dist', mu=0, sd=1)
        b0 = pm.Deterministic("b0", mu_b0 + b0_dist * sigma0)

        b2_beta = pm.HalfNormal('b2_beta', sd=beta2, shape=Nx2Lvl)
        b2_dist = pm.Gumbel('b2_dist', mu=0, beta=1)
        b2 = pm.Deterministic("b2", mu_b2 + b2_beta * b2_dist)

        ### prediction ###
        mu = pm.Deterministic('mu', b0 + b2[x2])

        ### observation ###
        y = pm.StudentT('y', nu=nuY, mu=mu, sd=sigmaY, observed=y)

# ## Inference

# ### NewEplsar

# In[34]:

with pm.Model() as model:
    new_epLsarModel = Model_NewEplsar('NewEplsar', x2[newIndex], newZ)

# #### Verify model settings

# In[35]:

new_epLsarModel.printParams(x2[newIndex], newZ)

# In[36]:

try:
    graph_new_epLsar = pm.model_to_graphviz(new_epLsarModel)
except:
    graph_new_epLsar = "Could not make graph"
graph_new_epLsar

# #### Check prior choice

# In[37]:

with new_epLsarModel as model:
    prior_pred_new_epLsar = pm.sample_prior_predictive(samples=numPredSamples, random_seed=random_seed)

# In[38]:

plotting_lib.plotPriorPredictive(widthInch, heigthInch, dpi, writeOut, outPathPlots, dfNewAvail.reset_index(), dictMeanStd, prior_pred_new_epLsar, newZ, 'NewEplsar')

# Prior choice is as intended: broad over the data range.
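# As a rough numerical complement to the plot above (illustrative sketch, not part of the
# original analysis), we can check that the prior predictive distribution is broad compared to
# the observed, standardized data. This assumes the prior predictive dictionary stores the
# observed variable under the name-prefixed key 'NewEplsar_y' of the named model defined above.

# In[ ]:

priorY = np.asarray(prior_pred_new_epLsar['NewEplsar_y'])
print("Observed newZ range: [{:.2f}, {:.2f}]".format(newZ.min(), newZ.max()))
print("Prior predictive 1%-99% interval: [{:.2f}, {:.2f}]".format(np.percentile(priorY, 1), np.percentile(priorY, 99)))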
# #### Sampling

# In[39]:

with new_epLsarModel as model:
    trace_new_epLsar = pm.sample(numSamples, cores=numCores, tune=numTune, max_treedepth=20, init='auto', target_accept=0.99, random_seed=random_seed)

# In[40]:

with new_epLsarModel as model:
    if writeOut:
        with open(outPathData + 'model_{}.pkl'.format('NewEplsar'), 'wb') as buff:
            pickle.dump({'model': new_epLsarModel, 'trace': trace_new_epLsar}, buff)

# #### Check sampling

# In[41]:

with new_epLsarModel as model:
    dataTrace_new_epLsar = az.from_pymc3(trace=trace_new_epLsar)

# In[42]:

pm.summary(dataTrace_new_epLsar, hdi_prob=0.95).round(2)

# In[44]:

az.plot_forest(dataTrace_new_epLsar, var_names=['b0', 'b2'], filter_vars='like', figsize=(widthInch, 5 * heigthInch), hdi_prob=0.95, ess=True, r_hat=True)
if writeOut:
    plt.savefig(outPathPlots + "posterior_forest_{}.pdf".format('NewEplsar'), dpi=dpi)

# In[45]:

with new_epLsarModel as model:
    plotting_lib.plotTracesB(widthInch, heigthInch, dpi, writeOut, outPathPlots, trace_new_epLsar, 'NewEplsar')

# In[46]:

with new_epLsarModel as model:
    plotting_lib.pm.energyplot(trace_new_epLsar)

# #### Posterior predictive distribution

# In[47]:

with new_epLsarModel as model:
    posterior_pred_new_epLsar = pm.sample_posterior_predictive(trace_new_epLsar, samples=numPredSamples, random_seed=random_seed)

# In[48]:

plotting_lib.plotPriorPosteriorPredictive(widthInch, heigthInch, dpi, writeOut, outPathPlots, dfNewAvail.reset_index(), dictMeanStd, prior_pred_new_epLsar, posterior_pred_new_epLsar, newZ, 'NewEplsar')

# ### Posterior check

# In[49]:

with new_epLsarModel as model:
    pm_data_new_epLsar = az.from_pymc3(trace=trace_new_epLsar, prior=prior_pred_new_epLsar, posterior_predictive=posterior_pred_new_epLsar)

# In[50]:

plotting_lib.plotPosterior(widthInch, heigthInch, dpi, writeOut, outPathPlots, dictMeanStd, pm_data_new_epLsar, 'NewEplsar')

# ### Compare treatment differences with other epLsar values

# In[51]:

b1P_Old = np.load("../derived_data/statistical_model_two_factors/epLsar_oldb1.npy")
b2P_Old = np.load("../derived_data/statistical_model_two_factors/epLsar_oldb2.npy")
M12P_Old = np.load("../derived_data/statistical_model_two_factors/epLsar_oldM12.npy")

# In[52]:

from collections import defaultdict
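# Before looping over all treatment pairs in the function below, here is a minimal sketch of the
# core computation for a single pair (illustrative only, not part of the original analysis):
# back-transform the posterior b2 samples to the original NewEplsar scale and compute the 95% HDI
# of their difference. The treatment levels 0 and 1 are placeholders; the name-prefixed key
# 'NewEplsar_b2' follows the named model defined above.

# In[ ]:

muNew, sigNew = dictMeanStd['NewEplsar']
# posterior samples of the treatment effects, back-transformed to the original scale
b2P_example = sigNew * trace_new_epLsar['NewEplsar_b2']
# difference between two (placeholder) treatment levels and its 95% HDI
diff_example = b2P_example[:, 0] - b2P_example[:, 1]
hdi_example = az.hdi(az.convert_to_inference_data(diff_example), hdi_prob=0.95)['x'].values
print("95% HDI of the difference between treatment levels 0 and 1:", hdi_example)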
# In[53]:

def plotTreatmentPosterior(widthInch, heigthInch, dpi, sizes, writeOut, path, dictMeanStd, dictTreatment, dictSoftware, trace, yname, x1, x3):
    SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = sizes
    mu_Val, sig_Val = dictMeanStd[yname]

    # get posterior samples
    b2P = sig_Val * trace['{}_b2'.format(yname)]

    # prepare color dict for treatments
    # use groups of 4 colors, as in tab20c
    colorIndex = dict({5: 0, 6: 1, 7: 2, 0: 4, 1: 5, 4: 6, 10: 7, 2: 8, 3: 9, 8: 10, 9: 11})

    # prepare dataset dict for treatments
    dictDataset = dict({5: 0, 6: 0, 7: 0, 0: 1, 1: 1, 4: 1, 10: 1, 2: 2, 3: 2, 8: 2, 9: 2})

    # invert the dict: dataset -> list of treatment levels
    inv_dictDataset = defaultdict(list)
    for key, val in dictDataset.items():
        inv_dictDataset[val].append(key)

    # get number of datasets
    numDatasets = len(np.unique(list(dictDataset.values())))

    # get number of treatments per dataset
    dictDataset2NumberTreats = dict()
    for numDataset in range(numDatasets):
        n = len(inv_dictDataset[numDataset])
        dictDataset2NumberTreats[numDataset] = n

    # get maximum number of treatments per dataset
    tmax = np.max(list(dictDataset2NumberTreats.values()))

    # compute maximal number of pairs
    maxpair = int(tmax * (tmax - 1) / 2)

    fig = plt.subplots(squeeze=False, figsize=(numDatasets * widthInch, maxpair * heigthInch), dpi=dpi)

    # store list for hdi
    hdiList = []

    for indexDataset in np.arange(numDatasets):

        # counter for row
        rowCounter = 0

        # first treatment
        for treatmentNum_i, lvl2_i in enumerate(inv_dictDataset[indexDataset]):

            # second treatment
            for treatmentNum_j, lvl2_j in enumerate(inv_dictDataset[indexDataset]):

                if treatmentNum_i > treatmentNum_j:

                    # set subplot
                    curr_ax = plt.subplot2grid((maxpair, numDatasets), (rowCounter, indexDataset))

                    # compute difference between treatments for each software
                    diffS0 = sig_Val * ((M12P_Old[:, 0, lvl2_i] + b2P_Old[:, lvl2_i]) - (M12P_Old[:, 0, lvl2_j] + b2P_Old[:, lvl2_j]))
                    diffS1 = sig_Val * ((M12P_Old[:, 1, lvl2_i] + b2P_Old[:, lvl2_i]) - (M12P_Old[:, 1, lvl2_j] + b2P_Old[:, lvl2_j]))

                    # plot posteriors
                    sns.kdeplot(diffS1, ax=curr_ax, label="epLsar on {}".format(dictSoftware[1]), color='gray', alpha=0.3, ls='--')
                    sns.kdeplot(diffS0, ax=curr_ax, label="epLsar on {}".format(dictSoftware[0]), color='gray', alpha=0.3, ls='dotted')
                    sns.kdeplot(b2P[:, lvl2_i] - b2P[:, lvl2_j], ax=curr_ax, label="NewEplsar on {}".format(dictSoftware[0]), color='C0', alpha=0.3, ls='dotted')

                    # plot reference value zero
                    curr_ax.axvline(x=0, color="C1")

                    # get hdi
                    hdi_new = az.hdi(az.convert_to_inference_data(b2P[:, lvl2_i] - b2P[:, lvl2_j]), hdi_prob=0.95)['x'].values
                    hdiS0 = az.hdi(az.convert_to_inference_data(diffS0), hdi_prob=0.95)['x'].values
                    hdiS1 = az.hdi(az.convert_to_inference_data(diffS1), hdi_prob=0.95)['x'].values

                    # a difference counts as significant if its HDI excludes zero
                    isSignificant = lambda x: (x[0] > 0.0) or (x[1] < 0.0)

                    # store hdi
                    hdiList.append([dictTreatment[lvl2_i], dictTreatment[lvl2_j],
                                    hdi_new[0], hdi_new[1], isSignificant(hdi_new),
                                    hdiS0[0], hdiS0[1], isSignificant(hdiS0),
                                    hdiS1[0], hdiS1[1], isSignificant(hdiS1)])

                    # set title
                    nameFirst = dictTreatment[lvl2_i]
                    nameSecond = dictTreatment[lvl2_j]
                    title = "{} vs. {}".format(nameFirst, nameSecond)
                    if isSignificant(hdi_new):
                        title += ": Significant on NewEplsar"
                    curr_ax.set_title(title)

                    # add legend
                    curr_ax.legend()

                    # set x label
                    curr_ax.set_xlabel('Delta')

                    # remove y label decoration
                    curr_ax.tick_params(left=False)
                    curr_ax.set(yticklabels=[])

                    # increment counter
                    rowCounter += 1

    #plt.suptitle('Estimated differences between treatments on {}'.format(yname))

    plt.tight_layout()

    if writeOut:
        plt.savefig(path + "treatment_pairs_{}.pdf".format(yname), dpi=dpi)

    plt.show()

    # convert hdi list to dataframe
    df = pd.DataFrame(hdiList, columns=["Treatment_i", "Treatment_j",
                                        "hdi_NewEplsar_2.5%", "hdi_NewEplsar_97.5%", "isSignificant_NewEplsar",
                                        "hdi_{}_2.5%".format(dictSoftware[0]), "hdi_{}_97.5%".format(dictSoftware[0]), "isSignificant_on_{}".format(dictSoftware[0]),
                                        "hdi_{}_2.5%".format(dictSoftware[1]), "hdi_{}_97.5%".format(dictSoftware[1]), "isSignificant_on_{}".format(dictSoftware[1])])
    return df

# In[54]:

dfHDI = plotTreatmentPosterior(widthInch, heigthInch, dpi, sizes, writeOut, outPathPlots, dictMeanStd, dictTreatment, dictSoftware, trace_new_epLsar, 'NewEplsar', x1[newIndex], x2[newIndex])

# In[55]:

dfHDI

# In[56]:

if writeOut:
    dfHDI.to_csv(outPathData + 'hdi_{}.csv'.format('NewEplsar'))
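# A quick illustrative cross-check (not part of the original analysis): for how many treatment
# pairs does the significance decision based on NewEplsar agree with the decisions based on the
# two epLsar analyses? The column names follow the dataframe built by the function above.

# In[ ]:

for softwareName in dictSoftware.values():
    col = "isSignificant_on_{}".format(softwareName)
    nAgree = (dfHDI["isSignificant_NewEplsar"] == dfHDI[col]).sum()
    print("NewEplsar agrees with epLsar on {} for {} of {} pairs".format(softwareName, nAgree, len(dfHDI)))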
# ## Summary

# Show where NewEplsar yields different results than epLsar:

# In[65]:

df_summary = dfHDI[(dfHDI.isSignificant_NewEplsar != dfHDI.isSignificant_on_ConfoMap) | (dfHDI.isSignificant_NewEplsar != dfHDI.isSignificant_on_Toothfrax)][["Treatment_i", "Treatment_j", "isSignificant_NewEplsar", "isSignificant_on_Toothfrax", "isSignificant_on_ConfoMap", "hdi_NewEplsar_2.5%", "hdi_NewEplsar_97.5%", "hdi_ConfoMap_2.5%", "hdi_ConfoMap_97.5%", "hdi_Toothfrax_2.5%", "hdi_Toothfrax_97.5%"]]
df_summary

# In[66]:

if writeOut:
    df_summary.to_csv(outPathData + 'summary.csv')

# ### Write out

# In[67]:

get_ipython().system('jupyter nbconvert --to html Statistical_Model_NewEplsar.ipynb')

# In[68]:

get_ipython().system('jupyter nbconvert --to markdown Statistical_Model_NewEplsar.ipynb')

# In[ ]: