#!/usr/bin/env python
# coding: utf-8

# # Descriptive statistics
#
# - get overall info about participants
# - assess normality of response times (RTs)

# In[6]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pingouin as pg
import os

alpha = 0.05

# dt    : raw trial-level data, one row per trial
# agg   : per-condition aggregated data (contains RT and RTnorm columns)
# aggwo : same aggregation, but with outlier RTs replaced beforehand
dt = pd.read_csv(os.path.join("..", "data", "data.csv"), index_col=0)
agg = pd.read_csv(os.path.join("..", "data", "data_aggregated.csv"), index_col=0)
aggwo = pd.read_csv(os.path.join("..", "data", "data_outliers-replaced_aggregated.csv"), index_col=0)

# BUG FIX: the original referenced an undefined name `d` here and below;
# the loaded frame is `dt`.
print("Loading:", len(dt), "lines", len(dt["ParticipantID"].unique()), "participants")

# ## Overall info about participants

# In[7]:

# Trial counts used to convert row counts into participant counts.
TPP = 72  # there are 72 trials in a session/participant
TPT = 36  # there are 36 trials in a part/test
TPL = 20  # there are 20 trials in a lexical test

print("Number of participants:")
display(pd.DataFrame(dt["StudyID"].value_counts() / TPP))
print()
print("Fluent vs. non-fluent:")
display(pd.DataFrame(dt.groupby("StudyID")["Fluent"].value_counts() / TPP))
print()
print()
print("Different kinds of designers:")
display(pd.DataFrame(dt.groupby("StudyID")["Training"].value_counts() / TPP))
print()
print()
print("Different kinds of designers and which font was first:")
for sid in [1, 2]:
    print("Study #%s" % sid)
    # Cross-tabulate designer status vs. which font was seen first
    # (the font of the first lexical test).
    dtt = pd.DataFrame(columns=["Designer", "Non-designer", "total"],
                       index=["sansforgetica", "arial", "total"])
    dtt["Designer"] = dt[(dt["StudyID"] == sid)
                         & (dt["TestID"] == 1)
                         & (dt["Type"] == "lexical")
                         & (dt["Training"] != "Non-designer")]["Font"].value_counts()
    dtt["Non-designer"] = dt[(dt["StudyID"] == sid)
                             & (dt["TestID"] == 1)
                             & (dt["Type"] == "lexical")
                             & (dt["Training"] == "Non-designer")]["Font"].value_counts()
    dtt /= TPL  # rows -> participants
    dtt["total"] = dtt.T.sum()
    dtt.loc["total"] = dtt.sum()
    display(dtt)
print()
print()
print("JoM for categories of training:")
print()
display(pd.DataFrame(dt.groupby(["StudyID", "Training"])["JoM"].mean()))
print()
print()
print("JoL for categories of training:")
print()
# BUG FIX: original used the undefined name `d`; the trial-level frame is `dt`.
display(pd.DataFrame(dt.groupby(["StudyID", "Training"])["JoL"].value_counts() / TPT))

# # Assess normality of RTs
#
# The distributions of RTs are not normal, but close enough.

# In[9]:

# assess normality of RTs: 2 rows (task types) x 4 columns
# (raw, normalized, outliers-replaced, outliers-replaced+normalized)
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
plt.subplots_adjust(wspace=0, hspace=0.2)
for i, ttype in enumerate(["lexical", "recognition"]):
    rts = agg[agg["Type"] == ttype]["RT"]
    rts.plot.hist(bins=50, ax=axes[i][0], sharey=True, title="%s" % ttype)
    rts = agg[agg["Type"] == ttype]["RTnorm"]
    rts.plot.hist(bins=50, ax=axes[i][1], sharey=True, title="%s (normalized)" % ttype)
    rts = aggwo[aggwo["Type"] == ttype]["RT"]
    rts.plot.hist(bins=50, ax=axes[i][2], sharey=True, title="%s (outliers replaced)" % ttype)
    rts = aggwo[aggwo["Type"] == ttype]["RTnorm"]
    rts.plot.hist(bins=50, ax=axes[i][3], sharey=True, title="%s (outliers replaced, normalized)" % ttype)

# test for normality
# null hypothesis: RTs come from a normal distribution
for ttype in ["lexical", "recognition"]:
    for col in ["RTnorm"]:
        print("Normality test for %ss in %s task" % (col, ttype))
        # BUG FIX: the original tested the whole column twice instead of
        # filtering by task type, despite the per-task message printed above.
        display(pg.normality(agg[agg["Type"] == ttype][col]))
        print("Normality test for %ss in %s task (outliers replaced)" % (col, ttype))
        display(pg.normality(aggwo[aggwo["Type"] == ttype][col]))

# In[10]:

# Q-Q plots
for ttype in ["lexical", "recognition"]:
    for col in ["RTnorm"]:
        print("Q-Q plot for %s task" % ttype)
        # BUG FIX: filter by task type (and use `col`) so the plot matches
        # the printed caption; original plotted the full column twice.
        pg.qqplot(aggwo[aggwo["Type"] == ttype][col], dist="norm")

# In[ ]: