#!/usr/bin/env python
# coding: utf-8
# # Gmail Experiment: Data Analysis
# Author: Leon Yin and Surya Mattu
# Links: [GitHub](https://github.com/the-markup/investigation-wheres-my-email/blob/master/notebooks/2-analysis.ipynb) | [nbviewer](https://nbviewer.jupyter.org/github/the-markup/investigation-wheres-my-email/blob/master/notebooks/2-analysis.ipynb) | [Story](https://themarkup.org/google-the-giant/2020/02/26/wheres-my-email)
#
# This notebook shows how we aggregate the data we preprocessed in the [last notebook](https://github.com/the-markup/investigation-wheres-my-email/blob/master/notebooks/1-data-preprocessing.ipynb) ([see in nbviewer](https://github.com/the-markup/investigation-wheres-my-email/blob/master/notebooks/1-data-preprocessing.ipynb)). The tables in this notebook are featured in our [experiment](https://themarkup.org/google-the-giant/2020/02/26/show-your-work-wheres-my-email) and [story](https://themarkup.org/google-the-giant/2020/02/26/wheres-my-email).
# In[1]:
get_ipython().run_line_magic('matplotlib', 'inline')
import os
import numpy as np
import pandas as pd
# In[2]:
# inputs
fn_input = '../data/intermediates/email_metadata_merged_with_newsletters.csv.gz'
fn_newsletter_categories = '../data/input/newsletter_categories.csv'
# outputs
appedix_dir = '../data/output/appendix'
table_dir = '../data/output/tables'
for d in [appedix_dir, table_dir]:
os.makedirs(d, exist_ok=True)
# In[3]:
# read the dataset we preprocessed in the last notebook.
df = pd.read_csv(fn_input)
len(df)
# Recall our dataset looks like this:
# In[4]:
df.iloc[-1].to_dict()
# In[5]:
categories = df.Category.unique()
categories
# ## Appendix 1: Taking inventory
# How many emails did we get from each email sender? Which newsletters didn't send us anything?
# In[6]:
data = []
for eID, _df in df.groupby('Entity_ID'):
first_email = _df.Date.min().split()[0]
last_email = _df.Date.max().split()[0]
n_email = _df.markup_id.nunique()
cat = _df.Category.iloc[0]
name = _df.Name.iloc[0]
row = {
'Name' : name,
'Entity_ID' : eID,
'Emails received' : n_email,
'Date of first email' : first_email,
'Date of last email' : last_email,
'Newsletter Category' : cat
}
data.append(row)
emails_received = pd.DataFrame(data)
emails_received.head(2)
# In[7]:
# newsletters we signed up for and which alias we used.
newsletter_categories = pd.read_csv(fn_newsletter_categories)
newsletter_categories.head(2)
# In[8]:
A1_output_cols = [
'Name', 'Website',
'Emails received',
'Date of first email',
'Date of last email',
]
# this prevents duplicate rows
cols_to_use = emails_received.columns.difference(
newsletter_categories.columns
)
no_shows = []
for cat in categories:
fn_out = os.path.join(
appedix_dir, f"A1-{cat.replace(' ', '-')}.csv"
)
newsletter_categories_cat = newsletter_categories[
newsletter_categories.Category == cat].set_index('Name')
emails_received_cat = emails_received[
emails_received['Newsletter Category'] == cat].set_index('Name')
# here we combine all emails we opt-d in for, with all recieved
email_tracker = newsletter_categories_cat.merge(emails_received_cat[cols_to_use],
on= ['Name'],
how='left') \
.sort_values(by= 'Name') \
.reset_index()
# For email senders with no match, set filler values.
email_tracker['Emails received'].fillna(0, inplace=True)
for col in ['Date of first email', 'Date of last email']:
email_tracker[col].fillna('-', inplace=True)
# let's record who we didn't get!
no_shows.extend(
email_tracker[email_tracker['Emails received'] == 0][[
'Name', 'Category', 'Entity_ID'
]].to_dict(orient='records')
)
# save this for the appendix
email_tracker[A1_output_cols].to_csv(fn_out, index=False)
del fn_out
# print some summary stats
n_signed_up = len(newsletter_categories_cat)
n_received = len(emails_received_cat)
print(f"{cat}:\nwe signed up for {n_signed_up}\n"
f"got emails from {n_received}\n")
# Let's get the stats for all senders:
# In[9]:
n_no_shows = len(no_shows)
n_received = df.Entity_ID.nunique()
n_subscribed = newsletter_categories.Entity_ID.nunique()
print(f"Out of the {n_subscribed} email senders we opted into "
f"we received {n_received} of them and didn't recieve {n_no_shows}.")
# In[10]:
# Who didn't send us emails?
fn_out = os.path.join(appedix_dir, 'Amisc_email_senders_who_didnt_send.csv')
pd.DataFrame(no_shows).sort_values('Name').to_csv(fn_out, index=False)
# ### Table 1: How Gmail classified our emails
# In[11]:
fn_out = os.path.join(table_dir, 'T1-overview.csv')
data = []
for cat, _df in df.groupby('Category'):
spam = []
promos = []
primary = []
for user, __df in _df.groupby('To_Email'):
n_emails = __df.markup_id.nunique()
n_spam = __df[__df['Spam'] == 1].markup_id.nunique()/ n_emails
n_promo = __df[(__df['Category Promotions'] == 1) &
(__df['Spam'] != 1)].markup_id.nunique()/ n_emails
n_primary = __df[__df['Primary'] == 1].markup_id.nunique() / n_emails
spam.append(n_spam)
promos.append(n_promo)
primary.append(n_primary)
# this is to get the variance
spam = np.std(spam)
promos = np.std(promos)
primary = np.std(primary)
n_emails = _df.markup_id.nunique()
n_spam = _df[_df['Spam'] == 1].markup_id.nunique() / n_emails
n_promo = _df[(_df['Category Promotions'] == 1) &
(_df['Spam'] != 1)].markup_id.nunique()/ n_emails
n_primary = _df[_df['Primary'] == 1].markup_id.nunique()/ n_emails
row = {
'Category' : cat,
'Primary' : n_primary,
'Promotions' : n_promo,
'Spam' : n_spam,
'Total Emails' : n_emails,
'STD Spam' : spam,
'STD Promotions' :promos,
'STD Primary' : primary
}
data.append(row)
# Average for all
spam = []
promos = []
primary = []
for user, __df in df.groupby('To_Email'):
n_emails = __df.markup_id.nunique()
n_spam = __df[__df['Spam'] == 1].markup_id.nunique()/ n_emails
n_promo = __df[(__df['Category Promotions'] == 1) &
(__df['Spam'] != 1)].markup_id.nunique()/ n_emails
n_primary = __df[__df['Primary'] == 1].markup_id.nunique() / n_emails
spam.append(n_spam)
promos.append(n_promo)
primary.append(n_primary)
spam = np.std(spam)
promos = np.std(promos)
primary = np.std(primary)
n_emails = df.markup_id.nunique()
n_spam = df[df['Spam'] == 1].markup_id.nunique() / n_emails
n_promo = df[(df['Category Promotions'] == 1) &
(df['Spam'] != 1)].markup_id.nunique()/ n_emails
n_primary = df[df['Primary'] == 1].markup_id.nunique()/ n_emails
row = {
'Category' : 'All emails',
'Primary' : n_primary,
'Promotions' : n_promo,
'Spam' : n_spam,
'Total Emails' : n_emails,
'STD Spam' : spam,
'STD Promotions' :promos,
'STD Primary' : primary
}
data.append(row)
overview = pd.DataFrame(data)
overview.to_csv(fn_out, index=False)
del fn_out
overview
# ### Table 2: Gmail classification statistics for each email sender
# In[12]:
cat2stats = {}
data_all = []
for cat in categories:
stats = []
for eID, messages in df[df.Category == cat].groupby('Entity_ID'):
name = messages.iloc[0].Name
n_cat = messages.markup_id.nunique()
n_pro = messages[(messages['Category Promotions'] == 1) &
(messages['Spam'] != 1) ].markup_id.nunique()
n_u = messages[messages['Category Updates'] == 1].markup_id.nunique()
n_s = messages[messages['Spam'] == 1].markup_id.nunique()
n_pri = messages[messages['Primary'] == 1].markup_id.nunique()
start_date = messages.Date.min()
end_date = messages.Date.max()
website = messages.Website.iloc[0]
row = {
'Entity_ID' : eID,
'Name' : name,
'Website' : website,
'n_promos' : n_pro,
'n_updates' : n_u,
'n_spam' : n_s,
'n_primary' : n_pri,
'start_date' : start_date,
'end_date' : end_date,
'n_messages' : len(messages)
}
stats.append(row)
# assign all sender stats to the category
cat2stats[cat] = stats
# add all sender stats to the list of everything
data_all.extend(stats)
# In[13]:
# names of columns we're using
metrics = [ 'n_primary', 'n_promos', 'n_spam']
ratios = [col.replace('n_', 'perc_') for col in metrics]
# columns we want for our output table
t2_cols = ['Name'] + ratios + ['n_messages']
# rename these columns for the appendix
col2col = {
'Name' : 'Name',
'perc_primary' : 'Primary',
'perc_promos' : 'Promotions',
'perc_spam' : 'Spam',
'n_messages' : 'Total Emails'
}
# In[14]:
def get_inbox_ratios(df):
"""Gets some summary stats per sender and sorts by primary inbox"""
df[ratios] = df[metrics].div(df['n_messages'],
axis=0).round(4)
output = df[t2_cols]
output.columns = [col2col.get(c, c) for c in output.columns]
output = output.set_index('Name').sort_values(
by=['Primary', 'Total Emails'], ascending=False
).reset_index()
return output
# How is Gmail classifying emails from each of these presidential candidates?
# In[15]:
fn_out = os.path.join(table_dir, 'T2a-presidental-candidates.csv')
cat = 'Presidential candidate'
print(cat)
df_cat = pd.DataFrame(cat2stats[cat])
output = get_inbox_ratios(df_cat)
# remove Trump rallies and add Trump with zero emails.
output = output[output.Name != 'Trump rallies'].append(
pd.DataFrame([{
'Name' : 'Donald J. Trump',
'Primary' : 0.0,
'Promotions' : 0.0,
'Spam' : 0.0,
'Total Emails' : 0
}])).reset_index(drop=True)
# output = publ?ication_ready(output)
output.to_csv(fn_out, index=False)
del fn_out
output
# We can also get this for the coalition of advocacy groups plus Change.org.
# In[16]:
coalition_plus_change = [
'SumOfUs',
'Democracy for America',
'CREDO Action',
'Change.org'
]
# In[17]:
fn_out = os.path.join(table_dir, 'T2b-coalition-groups.csv')
cat = 'Advocacy organization or think tank'
df_cat = pd.DataFrame(cat2stats[cat])
df_cat = df_cat[df_cat.Name.isin(coalition_plus_change)]
output = get_inbox_ratios(df_cat)
output.to_csv(fn_out, index=False)
del fn_out
output
# ### Appendix 2
# The same stats as Table 2a and 2b, but for all the categories.
# In[18]:
for cat in categories:
fn_out = os.path.join(appedix_dir, f"A2-{cat.replace(' ', '-')}.csv")
df_cat = pd.DataFrame(cat2stats[cat])
output = get_inbox_ratios(df_cat)
output.to_csv(fn_out, index=False)
del fn_out
output.head(2)
# ## Misc Stats
# ### What is the percentage of email senders that never got an email in primary?
# When we created the `cat2stats` dictionary, we also added each row to `data_all`.
# In[19]:
email_sender_stats = pd.DataFrame(data_all)
len(email_sender_stats)
# Let's only look at senders who sent us at least two emails.
# In[20]:
senders = email_sender_stats[email_sender_stats.n_messages >= 2]
len(senders)
# What percentage these senders never got to the primary inbox?
# In[21]:
len(senders[senders.n_primary == 0]) / len(senders)
# ### What percentage of Spam emails are also labelled as promotions?
# In[22]:
len(df[(df['Category Promotions'] == 1) & (df['Spam'] == 1)]) / len(df[df['Spam'] == 1])
# ### How many emails _would_ have been in Updates if Updates was active?
# In[23]:
len(df[(df['Category Updates'] == 1) & (df['Spam'] != 1)])