#!/usr/bin/env python
# coding: utf-8

# # Descriptive statistics
#
# - get overall info about participants
# - assess normality of response times (RTs)

# In[6]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pingouin as pg
import os

alpha = 0.05

# dt    : raw trial-level data, one row per trial
# agg   : per-condition aggregated data (contains RT and RTnorm columns)
# aggwo : same aggregation, but with outlier RTs replaced beforehand
dt = pd.read_csv(os.path.join("..", "data", "data.csv"), index_col=0)
agg = pd.read_csv(os.path.join("..", "data", "data_aggregated.csv"), index_col=0)
aggwo = pd.read_csv(os.path.join("..", "data", "data_outliers-replaced_aggregated.csv"), index_col=0)

# BUG FIX: the original referenced an undefined name `d` here and below;
# the loaded frame is `dt`.
print("Loading:", len(dt), "lines", len(dt["ParticipantID"].unique()), "participants")

# ## Overall info about participants

# In[7]:

# Trial counts used to convert row counts into participant counts.
TPP = 72  # there are 72 trials in a session/participant
TPT = 36  # there are 36 trials in a part/test
TPL = 20  # there are 20 trials in a lexical test

print("Number of participants:")
display(pd.DataFrame(dt["StudyID"].value_counts() / TPP))
print()
print("Fluent vs. non-fluent:")
display(pd.DataFrame(dt.groupby("StudyID")["Fluent"].value_counts() / TPP))
print()
print()
print("Different kinds of designers:")
display(pd.DataFrame(dt.groupby("StudyID")["Training"].value_counts() / TPP))
print()
print()
print("Different kinds of designers and which font was first:")
for sid in [1, 2]:
    print("Study #%s" % sid)
    # Cross-tabulate designer status vs. which font was seen first
    # (the font of the first lexical test).
    dtt = pd.DataFrame(columns=["Designer", "Non-designer", "total"],
                       index=["sansforgetica", "arial", "total"])
    dtt["Designer"] = dt[(dt["StudyID"] == sid)
                         & (dt["TestID"] == 1)
                         & (dt["Type"] == "lexical")
                         & (dt["Training"] != "Non-designer")]["Font"].value_counts()
    dtt["Non-designer"] = dt[(dt["StudyID"] == sid)
                             & (dt["TestID"] == 1)
                             & (dt["Type"] == "lexical")
                             & (dt["Training"] == "Non-designer")]["Font"].value_counts()
    dtt /= TPL  # rows -> participants
    dtt["total"] = dtt.T.sum()
    dtt.loc["total"] = dtt.sum()
    display(dtt)
print()
print()
print("JoM for categories of training:")
print()
display(pd.DataFrame(dt.groupby(["StudyID", "Training"])["JoM"].mean()))
print()
print()
print("JoL for categories of training:")
print()
# BUG FIX: original used the undefined name `d`; the trial-level frame is `dt`.
display(pd.DataFrame(dt.groupby(["StudyID", "Training"])["JoL"].value_counts() / TPT))

# # Assess normality of RTs
#
# The distributions of RTs are not normal, but close enough.

# In[9]:

# assess normality of RTs: 2 rows (task types) x 4 columns
# (raw, normalized, outliers-replaced, outliers-replaced+normalized)
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
plt.subplots_adjust(wspace=0, hspace=0.2)
for i, ttype in enumerate(["lexical", "recognition"]):
    rts = agg[agg["Type"] == ttype]["RT"]
    rts.plot.hist(bins=50, ax=axes[i][0], sharey=True, title="%s" % ttype)
    rts = agg[agg["Type"] == ttype]["RTnorm"]
    rts.plot.hist(bins=50, ax=axes[i][1], sharey=True, title="%s (normalized)" % ttype)
    rts = aggwo[aggwo["Type"] == ttype]["RT"]
    rts.plot.hist(bins=50, ax=axes[i][2], sharey=True, title="%s (outliers replaced)" % ttype)
    rts = aggwo[aggwo["Type"] == ttype]["RTnorm"]
    rts.plot.hist(bins=50, ax=axes[i][3], sharey=True, title="%s (outliers replaced, normalized)" % ttype)

# test for normality
# null hypothesis: RTs come from a normal distribution
for ttype in ["lexical", "recognition"]:
    for col in ["RTnorm"]:
        print("Normality test for %ss in %s task" % (col, ttype))
        # BUG FIX: the original tested the whole column twice instead of
        # filtering by task type, despite the per-task message printed above.
        display(pg.normality(agg[agg["Type"] == ttype][col]))
        print("Normality test for %ss in %s task (outliers replaced)" % (col, ttype))
        display(pg.normality(aggwo[aggwo["Type"] == ttype][col]))

# In[10]:

# Q-Q plots
for ttype in ["lexical", "recognition"]:
    for col in ["RTnorm"]:
        print("Q-Q plot for %s task" % ttype)
        # BUG FIX: filter by task type (and use `col`) so the plot matches
        # the printed caption; original plotted the full column twice.
        pg.qqplot(aggwo[aggwo["Type"] == ttype][col], dist="norm")

# In[ ]: