#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt


# In[2]:


# Escalate pandas' chained-assignment warning to an exception so that writes
# to a temporary copy fail loudly instead of being silently lost.
pd.set_option('mode.chained_assignment', 'raise')
# Allow up to 2000 columns to be rendered when a frame is displayed.
# Optional row limit: pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 2000)

# Show the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# To go back to last-node-only output:
# InteractiveShell.ast_node_interactivity = "last"


# In[3]:


# Load the timings table from CSV. Alternative inputs mentioned by the author:
# testdata, synth_100.
df_timings = pd.read_csv(filepath_or_buffer='../data/synth_100.csv')
# (n_rows, n_columns) of the loaded frame.
df_timings.shape


# In[4]:


# Peek at the first five rows to check that the columns were parsed as expected.
df_timings.head(n=5)


# Aborted respondents

# In[5]:


# Summary statistics of the `lastpage` column.
df_timings.loc[:, 'lastpage'].describe()


# In[6]:


# Largest `lastpage` value observed in the data.
df_timings.loc[:, 'lastpage'].max()


# In[7]:


# Derive `abortpage`: the page a respondent stopped at, or NaN if they did not abort.
# The final page is inferred as the largest observed `lastpage`, so this assumes
# that at least one respondent reached the end of the questionnaire.
final_page = df_timings['lastpage'].max()
# Respondents whose `lastpage` equals the final page are blanked out (NaN).
# Rows with a missing `lastpage` stay NaN as well and therefore are not counted
# as aborted by the cells below.
df_timings['abortpage'] = df_timings['lastpage'].mask(
    df_timings['lastpage'] == final_page, np.nan
)
# Preview only the two relevant columns instead of rendering the whole frame.
df_timings[['lastpage', 'abortpage']].head()


# Fraction of aborted respondents

# In[8]:


# Share of rows with a non-missing `abortpage`, i.e. respondents who aborted.
df_timings['abortpage'].count() / len(df_timings)


# Fraction of participants who have not aborted at the respective question

# In[9]:


# Complementary ECDF of `lastpage`: for each page, the share of respondents
# whose last page lies beyond it.
fig, ax = plt.subplots()
# sns.ecdfplot returns the Axes it drew on, not a Figure, so its result is not
# bound to `fig`; the Figure handle from plt.subplots() stays valid.
sns.ecdfplot(data=df_timings, x="lastpage", complementary=True, ax=ax)
# Start the x-axis at 0 and end it at the largest observed page.
ax.set_xlim(0, df_timings['lastpage'].max())
# Label through the explicit Axes rather than the implicit pyplot state.
ax.set_xlabel("Question page")
ax.set_ylabel("Participants share")
plt.show()


# Number of aborted responses per question (note binning if used)
# 
# This histogram can be overlaid with the time spent per question to see correlations.

# In[10]:


# Count of aborts per page.
fig, ax = plt.subplots()
# `discrete=True` gives every integer page its own unit-width bar centered on
# the page number. With `binwidth=1` the last bin is closed on both sides, so
# the two highest abort pages would be merged into a single bar.
sns.histplot(data=df_timings, x="abortpage", discrete=True, ax=ax)
ax.set_xlabel("Question page")
ax.set_ylabel("No. of aborts at page")
plt.show()


# In[ ]: