#!/usr/bin/env python # coding: utf-8 # # Gmail Experiment: Data Analysis # Author: Leon Yin and Surya Mattu
# Links: [GitHub](https://github.com/the-markup/investigation-wheres-my-email/blob/master/notebooks/2-analysis.ipynb) | [nbviewer](https://nbviewer.jupyter.org/github/the-markup/investigation-wheres-my-email/blob/master/notebooks/2-analysis.ipynb) | [Story](https://themarkup.org/google-the-giant/2020/02/26/wheres-my-email)
# # This notebook shows how we aggregate the data we preprocessed in the [last notebook](https://github.com/the-markup/investigation-wheres-my-email/blob/master/notebooks/1-data-preprocessing.ipynb) ([see in nbviewer](https://github.com/the-markup/investigation-wheres-my-email/blob/master/notebooks/1-data-preprocessing.ipynb)). The tables in this notebook are featured in our [experiment](https://themarkup.org/google-the-giant/2020/02/26/show-your-work-wheres-my-email) and [story](https://themarkup.org/google-the-giant/2020/02/26/wheres-my-email). # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import os import numpy as np import pandas as pd # In[2]: # inputs fn_input = '../data/intermediates/email_metadata_merged_with_newsletters.csv.gz' fn_newsletter_categories = '../data/input/newsletter_categories.csv' # outputs appedix_dir = '../data/output/appendix' table_dir = '../data/output/tables' for d in [appedix_dir, table_dir]: os.makedirs(d, exist_ok=True) # In[3]: # read the dataset we preprocessed in the last notebook. df = pd.read_csv(fn_input) len(df) # Recall our dataset looks like this: # In[4]: df.iloc[-1].to_dict() # In[5]: categories = df.Category.unique() categories # ## Appendix 1: Taking inventory # How many emails did we get from each email sender? Which newsletters didn't send us anything? # In[6]: data = [] for eID, _df in df.groupby('Entity_ID'): first_email = _df.Date.min().split()[0] last_email = _df.Date.max().split()[0] n_email = _df.markup_id.nunique() cat = _df.Category.iloc[0] name = _df.Name.iloc[0] row = { 'Name' : name, 'Entity_ID' : eID, 'Emails received' : n_email, 'Date of first email' : first_email, 'Date of last email' : last_email, 'Newsletter Category' : cat } data.append(row) emails_received = pd.DataFrame(data) emails_received.head(2) # In[7]: # newsletters we signed up for and which alias we used. newsletter_categories = pd.read_csv(fn_newsletter_categories) newsletter_categories.head(2) # In[8]: A1_output_cols = [ 'Name', 'Website', 'Emails received', 'Date of first email', 'Date of last email', ] # this prevents duplicate rows cols_to_use = emails_received.columns.difference( newsletter_categories.columns ) no_shows = [] for cat in categories: fn_out = os.path.join( appedix_dir, f"A1-{cat.replace(' ', '-')}.csv" ) newsletter_categories_cat = newsletter_categories[ newsletter_categories.Category == cat].set_index('Name') emails_received_cat = emails_received[ emails_received['Newsletter Category'] == cat].set_index('Name') # here we combine all emails we opt-d in for, with all recieved email_tracker = newsletter_categories_cat.merge(emails_received_cat[cols_to_use], on= ['Name'], how='left') \ .sort_values(by= 'Name') \ .reset_index() # For email senders with no match, set filler values. email_tracker['Emails received'].fillna(0, inplace=True) for col in ['Date of first email', 'Date of last email']: email_tracker[col].fillna('-', inplace=True) # let's record who we didn't get! no_shows.extend( email_tracker[email_tracker['Emails received'] == 0][[ 'Name', 'Category', 'Entity_ID' ]].to_dict(orient='records') ) # save this for the appendix email_tracker[A1_output_cols].to_csv(fn_out, index=False) del fn_out # print some summary stats n_signed_up = len(newsletter_categories_cat) n_received = len(emails_received_cat) print(f"{cat}:\nwe signed up for {n_signed_up}\n" f"got emails from {n_received}\n") # Let's get the stats for all senders: # In[9]: n_no_shows = len(no_shows) n_received = df.Entity_ID.nunique() n_subscribed = newsletter_categories.Entity_ID.nunique() print(f"Out of the {n_subscribed} email senders we opted into " f"we received {n_received} of them and didn't recieve {n_no_shows}.") # In[10]: # Who didn't send us emails? fn_out = os.path.join(appedix_dir, 'Amisc_email_senders_who_didnt_send.csv') pd.DataFrame(no_shows).sort_values('Name').to_csv(fn_out, index=False) # ### Table 1: How Gmail classified our emails # In[11]: fn_out = os.path.join(table_dir, 'T1-overview.csv') data = [] for cat, _df in df.groupby('Category'): spam = [] promos = [] primary = [] for user, __df in _df.groupby('To_Email'): n_emails = __df.markup_id.nunique() n_spam = __df[__df['Spam'] == 1].markup_id.nunique()/ n_emails n_promo = __df[(__df['Category Promotions'] == 1) & (__df['Spam'] != 1)].markup_id.nunique()/ n_emails n_primary = __df[__df['Primary'] == 1].markup_id.nunique() / n_emails spam.append(n_spam) promos.append(n_promo) primary.append(n_primary) # this is to get the variance spam = np.std(spam) promos = np.std(promos) primary = np.std(primary) n_emails = _df.markup_id.nunique() n_spam = _df[_df['Spam'] == 1].markup_id.nunique() / n_emails n_promo = _df[(_df['Category Promotions'] == 1) & (_df['Spam'] != 1)].markup_id.nunique()/ n_emails n_primary = _df[_df['Primary'] == 1].markup_id.nunique()/ n_emails row = { 'Category' : cat, 'Primary' : n_primary, 'Promotions' : n_promo, 'Spam' : n_spam, 'Total Emails' : n_emails, 'STD Spam' : spam, 'STD Promotions' :promos, 'STD Primary' : primary } data.append(row) # Average for all spam = [] promos = [] primary = [] for user, __df in df.groupby('To_Email'): n_emails = __df.markup_id.nunique() n_spam = __df[__df['Spam'] == 1].markup_id.nunique()/ n_emails n_promo = __df[(__df['Category Promotions'] == 1) & (__df['Spam'] != 1)].markup_id.nunique()/ n_emails n_primary = __df[__df['Primary'] == 1].markup_id.nunique() / n_emails spam.append(n_spam) promos.append(n_promo) primary.append(n_primary) spam = np.std(spam) promos = np.std(promos) primary = np.std(primary) n_emails = df.markup_id.nunique() n_spam = df[df['Spam'] == 1].markup_id.nunique() / n_emails n_promo = df[(df['Category Promotions'] == 1) & (df['Spam'] != 1)].markup_id.nunique()/ n_emails n_primary = df[df['Primary'] == 1].markup_id.nunique()/ n_emails row = { 'Category' : 'All emails', 'Primary' : n_primary, 'Promotions' : n_promo, 'Spam' : n_spam, 'Total Emails' : n_emails, 'STD Spam' : spam, 'STD Promotions' :promos, 'STD Primary' : primary } data.append(row) overview = pd.DataFrame(data) overview.to_csv(fn_out, index=False) del fn_out overview # ### Table 2: Gmail classification statistics for each email sender # In[12]: cat2stats = {} data_all = [] for cat in categories: stats = [] for eID, messages in df[df.Category == cat].groupby('Entity_ID'): name = messages.iloc[0].Name n_cat = messages.markup_id.nunique() n_pro = messages[(messages['Category Promotions'] == 1) & (messages['Spam'] != 1) ].markup_id.nunique() n_u = messages[messages['Category Updates'] == 1].markup_id.nunique() n_s = messages[messages['Spam'] == 1].markup_id.nunique() n_pri = messages[messages['Primary'] == 1].markup_id.nunique() start_date = messages.Date.min() end_date = messages.Date.max() website = messages.Website.iloc[0] row = { 'Entity_ID' : eID, 'Name' : name, 'Website' : website, 'n_promos' : n_pro, 'n_updates' : n_u, 'n_spam' : n_s, 'n_primary' : n_pri, 'start_date' : start_date, 'end_date' : end_date, 'n_messages' : len(messages) } stats.append(row) # assign all sender stats to the category cat2stats[cat] = stats # add all sender stats to the list of everything data_all.extend(stats) # In[13]: # names of columns we're using metrics = [ 'n_primary', 'n_promos', 'n_spam'] ratios = [col.replace('n_', 'perc_') for col in metrics] # columns we want for our output table t2_cols = ['Name'] + ratios + ['n_messages'] # rename these columns for the appendix col2col = { 'Name' : 'Name', 'perc_primary' : 'Primary', 'perc_promos' : 'Promotions', 'perc_spam' : 'Spam', 'n_messages' : 'Total Emails' } # In[14]: def get_inbox_ratios(df): """Gets some summary stats per sender and sorts by primary inbox""" df[ratios] = df[metrics].div(df['n_messages'], axis=0).round(4) output = df[t2_cols] output.columns = [col2col.get(c, c) for c in output.columns] output = output.set_index('Name').sort_values( by=['Primary', 'Total Emails'], ascending=False ).reset_index() return output # How is Gmail classifying emails from each of these presidential candidates? # In[15]: fn_out = os.path.join(table_dir, 'T2a-presidental-candidates.csv') cat = 'Presidential candidate' print(cat) df_cat = pd.DataFrame(cat2stats[cat]) output = get_inbox_ratios(df_cat) # remove Trump rallies and add Trump with zero emails. output = output[output.Name != 'Trump rallies'].append( pd.DataFrame([{ 'Name' : 'Donald J. Trump', 'Primary' : 0.0, 'Promotions' : 0.0, 'Spam' : 0.0, 'Total Emails' : 0 }])).reset_index(drop=True) # output = publ?ication_ready(output) output.to_csv(fn_out, index=False) del fn_out output # We can also get this for the coalition of advocacy groups plus Change.org. # In[16]: coalition_plus_change = [ 'SumOfUs', 'Democracy for America', 'CREDO Action', 'Change.org' ] # In[17]: fn_out = os.path.join(table_dir, 'T2b-coalition-groups.csv') cat = 'Advocacy organization or think tank' df_cat = pd.DataFrame(cat2stats[cat]) df_cat = df_cat[df_cat.Name.isin(coalition_plus_change)] output = get_inbox_ratios(df_cat) output.to_csv(fn_out, index=False) del fn_out output # ### Appendix 2 # The same stats as Table 2a and 2b, but for all the categories. # In[18]: for cat in categories: fn_out = os.path.join(appedix_dir, f"A2-{cat.replace(' ', '-')}.csv") df_cat = pd.DataFrame(cat2stats[cat]) output = get_inbox_ratios(df_cat) output.to_csv(fn_out, index=False) del fn_out output.head(2) # ## Misc Stats # ### What is the percentage of email senders that never got an email in primary? # When we created the `cat2stats` dictionary, we also added each row to `data_all`. # In[19]: email_sender_stats = pd.DataFrame(data_all) len(email_sender_stats) # Let's only look at senders who sent us at least two emails. # In[20]: senders = email_sender_stats[email_sender_stats.n_messages >= 2] len(senders) # What percentage these senders never got to the primary inbox? # In[21]: len(senders[senders.n_primary == 0]) / len(senders) # ### What percentage of Spam emails are also labelled as promotions? # In[22]: len(df[(df['Category Promotions'] == 1) & (df['Spam'] == 1)]) / len(df[df['Spam'] == 1]) # ### How many emails _would_ have been in Updates if Updates was active? # In[23]: len(df[(df['Category Updates'] == 1) & (df['Spam'] != 1)])