#!/usr/bin/env python
# coding: utf-8

# Scoresheet handwriting quiz: plots and regressions
# ==========================================
#
# _Chuan-Zheng Lee_
#
# This notebook accompanies [this Medium post summarizing some results](https://medium.com/@czlee/no-you-cant-read-a-debate-scoresheet-2b7c2239d091) from [this quiz](https://docs.google.com/forms/d/e/1FAIpQLSeohVm7cjg7zavMiYmI-RqpiWEuTQBNwpcIAKTHrCE2De-k5w/viewform?usp=sf_link) asking respondents to decipher some numbers that were on a scoresheet at the 2018 United States Universities Debating Championships. There's also [this Google spreadsheet](https://docs.google.com/spreadsheets/d/1TUgPou6ZvyGr_xdj2Wpj5BZuJOs7buNwk_D5rcYkthU/edit?usp=sharing) with some similar statistics in a different format.
#
# If you want to run this notebook, you'll need the dataset "quizresults.csv", which isn't in this repository. To find it, go to [the Google spreadsheet](https://docs.google.com/spreadsheets/d/1TUgPou6ZvyGr_xdj2Wpj5BZuJOs7buNwk_D5rcYkthU/edit?usp=sharing), open the "Export to quizresults.csv" sheet, download it as a CSV file (_File_ > _Download as…_ > _Comma-separated values (.csv, current sheet)_), and save it as "quizresults.csv" in the same directory as this notebook.
#
# Many thanks to Emma Pierson for giving me a primer on how to do basic statistical regression using `statsmodels` in Python.

# In[1]:


import pandas as pd
import statsmodels.api as sm
import statsmodels.stats.api as sms
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnchoredOffsetbox, OffsetImage
from patsy import Treatment
from PIL import Image as PILImage
from IPython.display import display


# In[2]:


df = pd.read_csv('quizresults.csv')
pd.set_option('precision', 3)
mpl.rc('font', family='DejaVu Serif', size=14)

questions = list('abcdefghij')
answers = [4, 5, 9, 5, 9, 6, 9, 5, 5, 4]
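# _A hypothetical sanity check, not part of the original notebook:_ the rest of the analysis assumes the exported CSV has a `score` column, per-question `*_answer` and `*_correct` columns, `country` and `region` columns, and the five background flags (`none`, `current`, `retiree`, `judge`, `tabber`). A quick check along these lines can catch a wrongly exported sheet early.

# In[ ]:


# Hypothetical check (not in the original notebook): verify the columns the
# rest of the notebook relies on are present in the exported CSV.
expected = (['score', 'country', 'region', 'none', 'current', 'retiree', 'judge', 'tabber']
            + ['%s_answer' % q for q in questions]
            + ['%s_correct' % q for q in questions])
missing = [col for col in expected if col not in df.columns]
print("missing columns:", missing or "none")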
# ## Overall score and answer key

# In[3]:


df['score'].count()


# In[4]:


df['score'].mean()


# In[5]:


counts = df['score'].value_counts().sort_index()
x = range(0, 11)
counts = [counts.get(i, 0) for i in x]

fig = plt.figure(figsize=(8,4))
plt.bar(x, counts, tick_label=x, width=0.94)
plt.xlabel('score')
plt.ylabel('number of respondents')
fig.savefig('overall-hist.png')
counts


# In[6]:


correct_columns = ['%s_correct' % q for q in questions]
means = df[correct_columns].mean()

plt.figure(figsize=(8,6))
means.plot(kind='barh', xlim=(0,1), width=0.8, color='C2')
plt.xticks([i*0.1 for i in range(0,11,2)], ['%d%%' % (x*10) for x in range(0,11,2)])
ax = plt.gca()
ax.invert_yaxis()
ax.set_yticklabels(['(%s)' % q for q in questions])
for i, m in enumerate(means):
    ax.text(m+0.02, i, "%.0f%%" % (m*100), va='center')
plt.title("Percentage answering question correctly");


# ## Answer pie charts

# In[7]:


SUBPLOT_WIDTH = 5.75


# In[8]:


def answer_pie(ax, df, question):
    """Plots a pie chart of responses in the given dataframe `df`, for the given
    `question`, and superimposes the question image in the lower right corner."""
    counts = df['%s_answer' % question].value_counts().sort_index()
    x = range(0, 11)
    counts = [counts.get(i, 0) for i in x]
    patches, texts, autotexts = ax.pie(counts, labels=x, startangle=90, counterclock=False,
                                       autopct='%.0f%%', pctdistance=0.7)
    ax.set_title('answers to (%s)' % question)
    ax.axis('square')

    # remove labels for negligible slices
    for text, autotext in zip(texts, autotexts):
        if autotext.get_text()[0] == '0':
            autotext.set_text('')
            text.set_text('')

    # mark the correct answer
    correct = answers[questions.index(question)]
    texts[correct].set_text("%d (√)" % correct)

    # draw the question image
    im = PILImage.open('images/%s.jpg' % question)
    im.thumbnail((80,80), PILImage.LANCZOS)
    imbox = OffsetImage(im)
    oab = AnchoredOffsetbox('lower right', child=imbox, pad=0, frameon=False)
    ax.add_artist(oab)


# In[9]:


# Contentious questions
fig, axs = plt.subplots(2, 2, figsize=(2*SUBPLOT_WIDTH, 2*SUBPLOT_WIDTH))
for ax, q in zip(axs.flatten(), 'achj'):
    answer_pie(ax, df, q)
fig.savefig('contentious.png', bbox_inches='tight')


# In[10]:


# All questions
fig, axs = plt.subplots(5, 2, figsize=(2*SUBPLOT_WIDTH, 5*SUBPLOT_WIDTH))
for ax, q in zip(axs.flatten(), questions):
    answer_pie(ax, df, q)


# ## Responses to particular questions by region

# In[11]:


def answer_pie_for_query(ax, query, question, title):
    """Plots a pie chart for responses to the given `question`, among those responses
    satisfying the given `query`, and adds the given `title` to the plot title."""
    sub_df = df[query]
    answer_pie(ax, sub_df, question)
    n = sub_df.count()[0]
    ax.set_title("answers to (%s)\n%s (n = %d)" % (question, title, n))


# ### Image (c), for continental Europe

# In[12]:


fig, axs = plt.subplots(2, 2, figsize=(2*SUBPLOT_WIDTH, 2*SUBPLOT_WIDTH))
regions = ['US & Canada', 'Western Europe', 'Southeast Asia', 'Eastern Europe']
for ax, region in zip(axs.flatten(), regions):
    answer_pie_for_query(ax, df.region == region, 'c', region)


# It's a bit more succinct just to compare "continental Europe" to "everyone else".

# In[13]:


df['europe'] = df.region.isin(['Western Europe', 'Eastern Europe'])
fig, axs = plt.subplots(1, 2, figsize=(2*SUBPLOT_WIDTH, SUBPLOT_WIDTH), squeeze=True)
answer_pie_for_query(axs[0], df.europe, 'c', 'Continental Europe')
answer_pie_for_query(axs[1], ~df.europe, 'c', 'All other regions')
fig.savefig('c-europe.png', bbox_inches='tight')


# I didn't cite significance measures in the post, because for this one it's pretty obvious. But for what it's worth, under a logistic regression on whether the answer was correct, we're looking at a $p$-value of less than $10^{-44}$.

# In[14]:


model = sm.Logit.from_formula('c_correct ~ europe', data=df).fit()
model.summary()
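# _A hypothetical follow-up, not part of the original analysis:_ the raw logit coefficient is hard to read directly. Exponentiating the coefficients gives the odds ratio for continental European respondents answering (c) correctly, and `predict()` gives the fitted proportion correct in each group.

# In[ ]:


# Hypothetical aside (not in the original notebook): translate the logit
# coefficient from the model above into more readable quantities.
import numpy as np

# exponentiated coefficients; the europe term is the odds ratio for Europe vs. elsewhere
print(np.exp(model.params))
# fitted probability of a correct answer, for Europe and for everyone else
print(model.predict(pd.DataFrame({'europe': [True, False]})))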
# ### Image (h), for East Asia

# In[15]:


df['east_asia'] = df.region == 'East Asia'
fig, axs = plt.subplots(1, 2, figsize=(2*SUBPLOT_WIDTH, SUBPLOT_WIDTH), squeeze=True)
answer_pie_for_query(axs[0], df.east_asia, 'h', 'East Asia')
answer_pie_for_query(axs[1], ~df.east_asia, 'h', 'All other regions')
fig.savefig('h-eastasia.png', bbox_inches='tight')


# In[16]:


model = sm.Logit.from_formula('h_correct ~ east_asia', data=df).fit()
model.summary()


# ### Image (a), for US & Canada and Oceania

# In[17]:


fig, axs = plt.subplots(1, 2, figsize=(2*SUBPLOT_WIDTH, SUBPLOT_WIDTH), squeeze=True)
answer_pie_for_query(axs[0], df.region == 'US & Canada', 'a', 'US & Canada')
answer_pie_for_query(axs[1], df.region == 'Oceania', 'a', 'Oceania')


# In[18]:


df_uscanada_oceania = df[df.region.isin(['US & Canada', 'Oceania'])]
model = sm.Logit.from_formula('a_correct ~ region', data=df_uscanada_oceania).fit()
model.summary()


# ### Image (d), for Germany and the Netherlands

# In[19]:


fig, axs = plt.subplots(1, 2, figsize=(2*SUBPLOT_WIDTH, SUBPLOT_WIDTH), squeeze=True)
answer_pie_for_query(axs[0], df.country.isin(['Germany', 'Netherlands']), 'd', 'Germany and the Netherlands')
answer_pie_for_query(axs[1], ~df.country.isin(['Germany', 'Netherlands']), 'd', 'All other countries')


# In[20]:


model = sm.Logit.from_formula('d_correct ~ (country == "Germany") + (country == "Netherlands")', data=df).fit()
model.summary()


# ## Average score by country or region

# In[21]:


def categories_chart(ax, data, color=None):
    """Plots a horizontal bar chart using the given `data`, which should be a list of
    `(category, mean, (lower, upper), nobs)` tuples. `color` is passed to matplotlib."""
    data.sort(key=lambda x: x[1], reverse=True)
    categories, means, confints, ns = zip(*data)
    errors = [[], []]
    for mean, (lower, upper) in zip(means, confints):
        errors[0].append(mean - lower)
        errors[1].append(upper - mean)
    ax.barh(categories, means, xerr=errors, color=color)
    ax.set_xlim((0, 6))
    ax.set_axisbelow(True)
    ax.xaxis.grid()
    ax.invert_yaxis()
    for i, (mean, n) in enumerate(zip(means, ns)):
        ax.text(mean-1, i, "%.2f" % mean, va='center', ha='right', color='white')
        ax.text(5.9, i, str(int(n)), va='center', ha='right')


# In[22]:


data = []
for country in df.country.unique():
    if df[df.country == country]['score'].count() < 15:
        continue
    stats = sms.DescrStatsW(df[df.country == country]['score'])
    if country == 'Hong Kong SAR China':
        country = 'Hong Kong'  # save space
    data.append((country, stats.mean, stats.tconfint_mean(), stats.nobs))

fig = plt.figure(figsize=(6,12))
ax = fig.gca()
categories_chart(ax, data, 'C4')
ax.text(5.9, -1, 'n', style='italic', va='center', ha='right')
fig.savefig('country.png', bbox_inches='tight')


# In[23]:


countries = df.country.value_counts()[df.country.value_counts() >= 15].index.tolist()
treat_us = Treatment(reference="United States")  # make United States the base category
model = sm.OLS.from_formula('score ~ C(country, treat_us)', data=df[df.country.isin(countries)]).fit()
model.summary()


# In[24]:


# Checking that all these countries show only statistically insignificant differences from the United States
countries = ['Philippines', 'South Africa', 'United States', 'Bangladesh', 'Mexico',
             'Indonesia', 'Australia', 'India', 'New Zealand', 'Malaysia']
model = sm.OLS.from_formula('score ~ C(country, treat_us)', data=df[df.country.isin(countries)]).fit()
model.summary()
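# _A hypothetical aside, not part of the original analysis:_ since the United States is the base category in the regression above, its overall F-test jointly asks whether any of the listed countries differs from the US, which is a more direct way to summarize the same check.

# In[ ]:


# Hypothetical aside (not in the original notebook): the regression's overall
# F-test jointly tests whether any country coefficient differs from zero,
# i.e. whether any of these countries differs from the United States on average.
print("joint F-test p-value: %.3g" % model.f_pvalue)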
# In[25]:


data = []
for region in df.region.unique():
    if not isinstance(region, str):
        continue
    stats = sms.DescrStatsW(df[df.region == region]['score'])
    data.append((region, stats.mean, stats.tconfint_mean(), stats.nobs))

fig = plt.figure(figsize=(6,7))
categories_chart(fig.gca(), data, 'C5')
fig.savefig('region.png', bbox_inches='tight')


# In[26]:


treat_usca = Treatment(reference="US & Canada")  # make US & Canada the base category
model = sm.OLS.from_formula('score ~ C(region, treat_usca)', data=df).fit()
model.summary()


# In[27]:


# Checking that all these regions show only statistically insignificant differences from the US & Canada
regions = ['US & Canada', 'Latin America', 'Oceania', 'South Asia', 'Southeast Asia', 'Africa']
model = sm.OLS.from_formula('score ~ region', data=df[df.region.isin(regions)]).fit()
model.summary()


# ## Score by background
#
# This section compares the average scores of those who said they had particular backgrounds in debating.
#
# Note that the column names don't mean exactly what they say; they're just short labels, and the actual criteria are stricter than the labels imply. The descriptors respondents were asked to apply are as follows, emphasis added:
#
# - `none` means "I've never been involved in competitive debate _at the university level_."
# - `current` means "I debate or judge in university-level debating tournaments (in any style/format)."
# - `retiree` means "I used to debate or judge in _university-level_ debating tournaments (in any style/format), but have retired from both."
# - `judge` means "I've _broken_ as an adjudicator at a _national or international British Parliamentary title tournament_, or Australs or UADC."
# - `tabber` means "I've tabbed a _British Parliamentary_ tournament _with at least 50 teams_."
#
# Note that the background categories aren't mutually exclusive, _e.g._, someone could be both a (breaking) `judge` and a `tabber`.

# In[28]:


data = []
for field in ['none', 'current', 'retiree', 'judge', 'tabber']:
    if df[df[field]]['score'].count() < 15:
        continue
    stats = sms.DescrStatsW(df[df[field]]['score'])
    data.append((field, stats.mean, stats.tconfint_mean(), stats.nobs))

fig = plt.figure(figsize=(6,4))
categories_chart(fig.gca(), data, 'C6')


# In[29]:


model = sm.OLS.from_formula('score ~ none + current + retiree + judge + tabber', data=df).fit()
model.summary()


# There's maybe a case to be made that judges and retirees did worse by a statistically significant margin, but it's not really convincing.

# In[30]:


model = sm.OLS.from_formula('score ~ judge + retiree', data=df).fit()
model.summary()


# In[31]:


model = sm.OLS.from_formula('score ~ tabber', data=df).fit()
model.summary()


# ## Questions by background
#
# This code runs a logistic regression on whether answers to each question were correct, against respondents' answers to the background question.
#
# As with the section just above, the column names don't mean exactly what they say; they're just short labels. Full descriptions are above.
#
# If the log-likelihood ratio p-value is greater than 0.05, then there's no real difference between the categories, so the loop just prints the p-value and moves on. If it's less than 0.05, it prints the whole model summary for perusal. This applies to two questions: (a) and (c). On further examination, there are potentially significant coefficients (by P>|z|) for tabbers in question (a) (a bit worse), and for both non-debaters and breaking judges in (c) (again slightly worse). But because we're testing 50 hypotheses, I wouldn't read very much into a couple of apparently-significant differences.

# In[32]:


for q in questions:
    model = sm.Logit.from_formula('%s_correct ~ none + current + retiree + judge + tabber' % q,
                                  data=df).fit(disp=False)
    p = model.llr_pvalue
    print("Question (%s): LLR p-value = %.4f" % (q, p))
    if p <= 0.05:
        display(model.summary())
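# _A hypothetical follow-up, not part of the original analysis:_ one crude way to act on the multiple-comparisons caveat above is to pool the per-coefficient p-values across all ten questions and apply a Bonferroni correction before treating anything as significant.

# In[ ]:


# Hypothetical aside (not in the original notebook): Bonferroni-correct the
# background coefficients' p-values, pooled across all ten questions.
from statsmodels.stats.multitest import multipletests

pvalues = []
for q in questions:
    m = sm.Logit.from_formula('%s_correct ~ none + current + retiree + judge + tabber' % q,
                              data=df).fit(disp=False)
    pvalues.extend(m.pvalues.drop('Intercept'))

reject, corrected, _, _ = multipletests(pvalues, alpha=0.05, method='bonferroni')
print("coefficients still significant after correction:", int(reject.sum()))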
# ## Are countries better at deciphering themselves?
# Not really, no.
#
# `author_countries` are the countries of the authors of the respective images, so image (a) was written by a Canadian, (b) by an American, and so on.

# In[33]:


countries = ['United States', 'Canada', 'Mexico']
author_countries = ['Canada', 'United States', 'United States', 'United States', 'Canada',
                    'Canada', 'United States', 'United States', 'United States', 'Mexico']
questions = 'abcdefghij'

for q, author_country in zip(questions, author_countries):
    model = sm.Logit.from_formula('%s_correct ~ C(country, Treatment(reference="%s"))' % (q, author_country),
                                  data=df[df.country.isin(countries)]).fit(disp=False)
    p = model.llr_pvalue
    print("Question (%s) (from %13s) - LLR p-value = %.4f" % (q, author_country, p))
    if p <= 0.05:
        display(model.summary())