#!/usr/bin/env python
# coding: utf-8

# Scoresheet handwriting quiz: plots and regressions
# ==========================================
#
# _Chuan-Zheng Lee_
#
# This notebook accompanies [this Medium post summarizing some results](https://medium.com/@czlee/no-you-cant-read-a-debate-scoresheet-2b7c2239d091) from [this quiz](https://docs.google.com/forms/d/e/1FAIpQLSeohVm7cjg7zavMiYmI-RqpiWEuTQBNwpcIAKTHrCE2De-k5w/viewform?usp=sf_link) asking respondents to decipher some numbers that were on a scoresheet at the 2018 United States Universities Debating Championships. There's also [this Google spreadsheet](https://docs.google.com/spreadsheets/d/1TUgPou6ZvyGr_xdj2Wpj5BZuJOs7buNwk_D5rcYkthU/edit?usp=sharing) with some similar statistics in a different format.
#
# If you want to run this notebook, you'll need the dataset "quizresults.csv", which isn't in this repository. To find it, go to [the Google spreadsheet](https://docs.google.com/spreadsheets/d/1TUgPou6ZvyGr_xdj2Wpj5BZuJOs7buNwk_D5rcYkthU/edit?usp=sharing), open the "Export to quizresults.csv" sheet, download it as a CSV file (_File_ > _Download as…_ > _Comma-separated values (.csv, current sheet)_), and save it as "quizresults.csv" in the same directory as this notebook.
#
# Many thanks to Emma Pierson for giving me a primer on how to do basic statistical regression using `statsmodels` in Python.

# In[1]:


import pandas as pd
import statsmodels.api as sm
import statsmodels.stats.api as sms
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnchoredOffsetbox, OffsetImage
from patsy import Treatment
from PIL import Image as PILImage
from IPython.display import display


# In[2]:


df = pd.read_csv('quizresults.csv')
pd.set_option('precision', 3)
mpl.rc('font', family='DejaVu Serif', size=14)

questions = list('abcdefghij')
answers = [4, 5, 9, 5, 9, 6, 9, 5, 5, 4]
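# _A hypothetical sanity check, not part of the original notebook:_ the rest of the analysis assumes the exported CSV has a `score` column, per-question `*_answer` and `*_correct` columns, `country` and `region` columns, and the five background flags (`none`, `current`, `retiree`, `judge`, `tabber`). A quick check along these lines can catch a wrongly exported sheet early.

# In[ ]:


# Hypothetical check (not in the original notebook): verify the columns the
# rest of the notebook relies on are present in the exported CSV.
expected = (['score', 'country', 'region', 'none', 'current', 'retiree', 'judge', 'tabber']
            + ['%s_answer' % q for q in questions]
            + ['%s_correct' % q for q in questions])
missing = [col for col in expected if col not in df.columns]
print("missing columns:", missing or "none")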
# ## Overall score and answer key

# In[3]:


df['score'].count()


# In[4]:


df['score'].mean()


# In[5]:


counts = df['score'].value_counts().sort_index()
x = range(0, 11)
counts = [counts.get(i, 0) for i in x]

fig = plt.figure(figsize=(8,4))
plt.bar(x, counts, tick_label=x, width=0.94)
plt.xlabel('score')
plt.ylabel('number of respondents')
fig.savefig('overall-hist.png')
counts


# In[6]:


correct_columns = ['%s_correct' % q for q in questions]
means = df[correct_columns].mean()

plt.figure(figsize=(8,6))
means.plot(kind='barh', xlim=(0,1), width=0.8, color='C2')
plt.xticks([i*0.1 for i in range(0,11,2)], ['%d%%' % (x*10) for x in range(0,11,2)])
ax = plt.gca()
ax.invert_yaxis()
ax.set_yticklabels(['(%s)' % q for q in questions])
for i, m in enumerate(means):
    ax.text(m+0.02, i, "%.0f%%" % (m*100), va='center')
plt.title("Percentage answering question correctly");


# ## Answer pie charts

# In[7]:


SUBPLOT_WIDTH = 5.75


# In[8]:


def answer_pie(ax, df, question):
    """Plots a pie chart of responses in the given dataframe `df`, for the given
    `question`, and superimposes the question image in the lower right corner."""
    counts = df['%s_answer' % question].value_counts().sort_index()
    x = range(0, 11)
    counts = [counts.get(i, 0) for i in x]
    patches, texts, autotexts = ax.pie(counts, labels=x, startangle=90, counterclock=False,
                                       autopct='%.0f%%', pctdistance=0.7)
    ax.set_title('answers to (%s)' % question)
    ax.axis('square')

    # remove labels for negligible slices
    for text, autotext in zip(texts, autotexts):
        if autotext.get_text()[0] == '0':
            autotext.set_text('')
            text.set_text('')

    # mark the correct answer
    correct = answers[questions.index(question)]
    texts[correct].set_text("%d (√)" % correct)

    # draw the question image
    im = PILImage.open('images/%s.jpg' % question)
    im.thumbnail((80,80), PILImage.LANCZOS)
    imbox = OffsetImage(im)
    oab = AnchoredOffsetbox('lower right', child=imbox, pad=0, frameon=False)
    ax.add_artist(oab)


# In[9]:


# Contentious questions
fig, axs = plt.subplots(2, 2, figsize=(2*SUBPLOT_WIDTH, 2*SUBPLOT_WIDTH))
for ax, q in zip(axs.flatten(), 'achj'):
    answer_pie(ax, df, q)
fig.savefig('contentious.png', bbox_inches='tight')


# In[10]:


# All questions
fig, axs = plt.subplots(5, 2, figsize=(2*SUBPLOT_WIDTH, 5*SUBPLOT_WIDTH))
for ax, q in zip(axs.flatten(), questions):
    answer_pie(ax, df, q)


# ## Responses to particular questions by region

# In[11]:


def answer_pie_for_query(ax, query, question, title):
    """Plots a pie chart for responses to the given `question`, among those responses
    satisfying the given `query`, and adds the given `title` to the plot title."""
    sub_df = df[query]
    answer_pie(ax, sub_df, question)
    n = sub_df.count()[0]
    ax.set_title("answers to (%s)\n%s (n = %d)" % (question, title, n))


# ### Image (c), for continental Europe

# In[12]:


fig, axs = plt.subplots(2, 2, figsize=(2*SUBPLOT_WIDTH, 2*SUBPLOT_WIDTH))
regions = ['US & Canada', 'Western Europe', 'Southeast Asia', 'Eastern Europe']
for ax, region in zip(axs.flatten(), regions):
    answer_pie_for_query(ax, df.region == region, 'c', region)


# It's a bit more succinct just to compare "continental Europe" to "everyone else".

# In[13]:


df['europe'] = df.region.isin(['Western Europe', 'Eastern Europe'])
fig, axs = plt.subplots(1, 2, figsize=(2*SUBPLOT_WIDTH, SUBPLOT_WIDTH), squeeze=True)
answer_pie_for_query(axs[0], df.europe, 'c', 'Continental Europe')
answer_pie_for_query(axs[1], ~df.europe, 'c', 'All other regions')
fig.savefig('c-europe.png', bbox_inches='tight')


# I didn't cite significance measures in the post, because for this one it's pretty obvious. But for what it's worth, under a logistic regression on whether the answer was correct, we're looking at a $p$-value of less than $10^{-44}$.

# In[14]:


model = sm.Logit.from_formula('c_correct ~ europe', data=df).fit()
model.summary()
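# _A hypothetical follow-up, not part of the original analysis:_ the raw logit coefficient is hard to read directly. Exponentiating the coefficients gives the odds ratio for continental European respondents answering (c) correctly, and `predict()` gives the fitted proportion correct in each group.

# In[ ]:


# Hypothetical aside (not in the original notebook): translate the logit
# coefficient from the model above into more readable quantities.
import numpy as np

# exponentiated coefficients; the europe term is the odds ratio for Europe vs. elsewhere
print(np.exp(model.params))
# fitted probability of a correct answer, for Europe and for everyone else
print(model.predict(pd.DataFrame({'europe': [True, False]})))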
# ### Image (h), for East Asia

# In[15]:


df['east_asia'] = df.region == 'East Asia'
fig, axs = plt.subplots(1, 2, figsize=(2*SUBPLOT_WIDTH, SUBPLOT_WIDTH), squeeze=True)
answer_pie_for_query(axs[0], df.east_asia, 'h', 'East Asia')
answer_pie_for_query(axs[1], ~df.east_asia, 'h', 'All other regions')
fig.savefig('h-eastasia.png', bbox_inches='tight')


# In[16]:


model = sm.Logit.from_formula('h_correct ~ east_asia', data=df).fit()
model.summary()


# ### Image (a), for US & Canada and Oceania

# In[17]:


fig, axs = plt.subplots(1, 2, figsize=(2*SUBPLOT_WIDTH, SUBPLOT_WIDTH), squeeze=True)
answer_pie_for_query(axs[0], df.region == 'US & Canada', 'a', 'US & Canada')
answer_pie_for_query(axs[1], df.region == 'Oceania', 'a', 'Oceania')


# In[18]:


df_uscanada_oceania = df[df.region.isin(['US & Canada', 'Oceania'])]
model = sm.Logit.from_formula('a_correct ~ region', data=df_uscanada_oceania).fit()
model.summary()


# ### Image (d), for Germany and the Netherlands

# In[19]:


fig, axs = plt.subplots(1, 2, figsize=(2*SUBPLOT_WIDTH, SUBPLOT_WIDTH), squeeze=True)
answer_pie_for_query(axs[0], df.country.isin(['Germany', 'Netherlands']), 'd', 'Germany and the Netherlands')
answer_pie_for_query(axs[1], ~df.country.isin(['Germany', 'Netherlands']), 'd', 'All other countries')


# In[20]:


model = sm.Logit.from_formula('d_correct ~ (country == "Germany") + (country == "Netherlands")', data=df).fit()
model.summary()


# ## Average score by country or region

# In[21]:


def categories_chart(ax, data, color=None):
    """Plots a horizontal bar chart using the given `data`, which should be a list of
    `(category, mean, (lower, upper), nobs)` tuples. `color` is passed to matplotlib."""
    data.sort(key=lambda x: x[1], reverse=True)
    categories, means, confints, ns = zip(*data)
    errors = [[], []]
    for mean, (lower, upper) in zip(means, confints):
        errors[0].append(mean - lower)
        errors[1].append(upper - mean)
    ax.barh(categories, means, xerr=errors, color=color)
    ax.set_xlim((0, 6))
    ax.set_axisbelow(True)
    ax.xaxis.grid()
    ax.invert_yaxis()
    for i, (mean, n) in enumerate(zip(means, ns)):
        ax.text(mean-1, i, "%.2f" % mean, va='center', ha='right', color='white')
        ax.text(5.9, i, str(int(n)), va='center', ha='right')


# In[22]:


data = []
for country in df.country.unique():
    if df[df.country == country]['score'].count() < 15:
        continue
    stats = sms.DescrStatsW(df[df.country == country]['score'])
    if country == 'Hong Kong SAR China':
        country = 'Hong Kong'  # save space
    data.append((country, stats.mean, stats.tconfint_mean(), stats.nobs))

fig = plt.figure(figsize=(6,12))
ax = fig.gca()
categories_chart(ax, data, 'C4')
ax.text(5.9, -1, 'n', style='italic', va='center', ha='right')
fig.savefig('country.png', bbox_inches='tight')


# In[23]:


countries = df.country.value_counts()[df.country.value_counts() >= 15].index.tolist()
treat_us = Treatment(reference="United States")  # make United States the base category
model = sm.OLS.from_formula('score ~ C(country, treat_us)', data=df[df.country.isin(countries)]).fit()
model.summary()


# In[24]:


# Checking that all these countries show only statistically insignificant differences from the United States
countries = ['Philippines', 'South Africa', 'United States', 'Bangladesh', 'Mexico',
             'Indonesia', 'Australia', 'India', 'New Zealand', 'Malaysia']
model = sm.OLS.from_formula('score ~ C(country, treat_us)', data=df[df.country.isin(countries)]).fit()
model.summary()
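# _A hypothetical aside, not part of the original analysis:_ since the United States is the base category in the regression above, its overall F-test jointly asks whether any of the listed countries differs from the US, which is a more direct way to summarize the same check.

# In[ ]:


# Hypothetical aside (not in the original notebook): the regression's overall
# F-test jointly tests whether any country coefficient differs from zero,
# i.e. whether any of these countries differs from the United States on average.
print("joint F-test p-value: %.3g" % model.f_pvalue)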
# In[25]:


data = []
for region in df.region.unique():
    if not isinstance(region, str):
        continue
    stats = sms.DescrStatsW(df[df.region == region]['score'])
    data.append((region, stats.mean, stats.tconfint_mean(), stats.nobs))

fig = plt.figure(figsize=(6,7))
categories_chart(fig.gca(), data, 'C5')
fig.savefig('region.png', bbox_inches='tight')


# In[26]:


treat_usca = Treatment(reference="US & Canada")  # make US & Canada the base category
model = sm.OLS.from_formula('score ~ C(region, treat_usca)', data=df).fit()
model.summary()


# In[27]:


# Checking that all these regions show only statistically insignificant differences from the US & Canada
regions = ['US & Canada', 'Latin America', 'Oceania', 'South Asia', 'Southeast Asia', 'Africa']
model = sm.OLS.from_formula('score ~ region', data=df[df.region.isin(regions)]).fit()
model.summary()


# ## Score by background
#
# This section compares the average scores of those who said they had particular backgrounds in debating.
#
# Note that the column names don't mean exactly what they say; they're just short labels, and the actual criteria are stricter than the labels imply. The descriptors respondents were asked to apply are as follows, emphasis added:
#
# - `none` means "I've never been involved in competitive debate _at the university level_."
# - `current` means "I debate or judge in university-level debating tournaments (in any style/format)."
# - `retiree` means "I used to debate or judge in _university-level_ debating tournaments (in any style/format), but have retired from both."
# - `judge` means "I've _broken_ as an adjudicator at a _national or international British Parliamentary title tournament_, or Australs or UADC."
# - `tabber` means "I've tabbed a _British Parliamentary_ tournament _with at least 50 teams_."
#
# Note that the background categories aren't mutually exclusive, _e.g._, someone could be both a (breaking) `judge` and a `tabber`.

# In[28]:


data = []
for field in ['none', 'current', 'retiree', 'judge', 'tabber']:
    if df[df[field]]['score'].count() < 15:
        continue
    stats = sms.DescrStatsW(df[df[field]]['score'])
    data.append((field, stats.mean, stats.tconfint_mean(), stats.nobs))

fig = plt.figure(figsize=(6,4))
categories_chart(fig.gca(), data, 'C6')


# In[29]:


model = sm.OLS.from_formula('score ~ none + current + retiree + judge + tabber', data=df).fit()
model.summary()


# There's maybe a case to be made that judges and retirees did worse by a statistically significant margin, but it's not really convincing.

# In[30]:


model = sm.OLS.from_formula('score ~ judge + retiree', data=df).fit()
model.summary()


# In[31]:


model = sm.OLS.from_formula('score ~ tabber', data=df).fit()
model.summary()


# ## Questions by background
#
# This code runs a logistic regression on whether answers to each question were correct, against respondents' answers to the background question.
#
# As with the section just above, the column names don't mean exactly what they say; they're just short labels. Full descriptions are above.
#
# If the log-likelihood ratio p-value is greater than 0.05, then there's no real difference between the categories, so the loop just prints the p-value and moves on. If it's less than 0.05, it prints the whole model summary for perusal. This applies to two questions: (a) and (c). On further examination, there are potentially significant coefficients (by P>|z|) for tabbers in question (a) (a bit worse), and for both non-debaters and breaking judges in (c) (again slightly worse). But because we're testing 50 hypotheses, I wouldn't read very much into a couple of apparently-significant differences.

# In[32]:


for q in questions:
    model = sm.Logit.from_formula('%s_correct ~ none + current + retiree + judge + tabber' % q,
                                  data=df).fit(disp=False)
    p = model.llr_pvalue
    print("Question (%s): LLR p-value = %.4f" % (q, p))
    if p <= 0.05:
        display(model.summary())
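# _A hypothetical follow-up, not part of the original analysis:_ one crude way to act on the multiple-comparisons caveat above is to pool the per-coefficient p-values across all ten questions and apply a Bonferroni correction before treating anything as significant.

# In[ ]:


# Hypothetical aside (not in the original notebook): Bonferroni-correct the
# background coefficients' p-values, pooled across all ten questions.
from statsmodels.stats.multitest import multipletests

pvalues = []
for q in questions:
    m = sm.Logit.from_formula('%s_correct ~ none + current + retiree + judge + tabber' % q,
                              data=df).fit(disp=False)
    pvalues.extend(m.pvalues.drop('Intercept'))

reject, corrected, _, _ = multipletests(pvalues, alpha=0.05, method='bonferroni')
print("coefficients still significant after correction:", int(reject.sum()))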
# ## Are countries better at deciphering themselves?
# Not really, no.
#
# `author_countries` are the countries of the authors of the respective images, so image (a) was written by a Canadian, (b) by an American, and so on.

# In[33]:


countries = ['United States', 'Canada', 'Mexico']
author_countries = ['Canada', 'United States', 'United States', 'United States', 'Canada',
                    'Canada', 'United States', 'United States', 'United States', 'Mexico']
questions = 'abcdefghij'

for q, author_country in zip(questions, author_countries):
    model = sm.Logit.from_formula('%s_correct ~ C(country, Treatment(reference="%s"))' % (q, author_country),
                                  data=df[df.country.isin(countries)]).fit(disp=False)
    p = model.llr_pvalue
    print("Question (%s) (from %13s) - LLR p-value = %.4f" % (q, author_country, p))
    if p <= 0.05:
        display(model.summary())