Author: Leon Yin and Surya Mattu
Links: GitHub | nbviewer | Story
This notebook describes the data preprocessing steps for our Gmail promotions experiment. Documentation for this dataset can be found on GitHub here; read more about the context of why and how we collected this dataset in our paper.
import os
import sys
import glob
import gzip
import mailbox
from tqdm import tqdm
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
sys.path.append('..')
from utils.preprocessing import (
find_email,
find_name,
find_tld_from_email,
is_primary,
get_email_id,
get_text,
)
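The helpers imported above live in utils/preprocessing.py in the repo (is_primary is shown in full further down). As a rough illustration only, and not the repo's actual implementation, a header-parsing helper like find_email can be built on the standard library:
from email.utils import parseaddr

def find_email_sketch(header_value):
    """Hypothetical stand-in for find_email: pull the bare address
    out of a 'From'/'To' header value."""
    _, address = parseaddr(header_value or '')
    return address

find_email_sketch('Joe Walsh <joe@joewalsh.org>')  # 'joe@joewalsh.org'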
gzip_directory = '../data/input/google_takeout_mboxes_gzipped/'
fn_newsletter_categories = '../data/input/newsletter_categories.csv'
# outputs
mbox_directory = '../data/input/google_takeout_mboxes/'
fn_output = '../data/intermediates/email_metadata_merged_with_newsletters.csv.gz'
# make a directory for the uncompressed mbox files.
os.makedirs(mbox_directory, exist_ok=True)
# Let's identify the gzipped mbox files
mboxes_gzipped = glob.glob(os.path.join(gzip_directory, '*.mbox.gz'))
# un-gzip each, and write them to a new destination
for fn in tqdm(mboxes_gzipped):
fn_out = fn.replace(gzip_directory,
mbox_directory).replace('.gz', '')
# un-gzip if does not exist
if os.path.exists(fn_out):
continue
# write to a non-gzipped file.
with gzip.open(fn, 'rt') as f_:
with open(fn_out, 'w') as f:
for line in f_:
f.write(line)
100%|██████████| 6/6 [00:00<00:00, 14952.96it/s]
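The line-by-line copy above works fine for these files; an equivalent standard-library alternative (a sketch, not what we ran) streams the decompressed bytes in one call with shutil:
import shutil
# same effect as the loop above: copy decompressed bytes in chunks
with gzip.open(fn, 'rb') as f_in, open(fn_out, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)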
# Here let's identify the input files we're working with
mboxes = glob.glob(os.path.join(mbox_directory, '*.mbox'))
len(mboxes)
6
# what we'll be keeping from each mbox record
keep_from_mbox = [
'Date',
'X-Gmail-Labels',
'X-GM-THRID',
'To',
'From',
'Delivered-To',
'Received',
"Subject"
]
message_list = []
for fn in mboxes:
mb = mailbox.mbox(fn)
for _message in tqdm(mb.itervalues()):
# turn the _messages into a dictionary
email_metadata = dict(_message.items())
# filter out K-V pairs we're not interested in
email_metadata = {
k : v for k, v in email_metadata.items()
if k in keep_from_mbox
}
# parse text fields, and append to email metadata
text_meta = get_text(_message)
email_metadata = {**email_metadata, **text_meta}
message_list.append(email_metadata)
2929it [00:24, 120.42it/s]
1934it [00:16, 119.86it/s]
2473it [00:20, 119.36it/s]
448it [00:03, 127.72it/s]
3702it [00:31, 117.17it/s]
3387it [00:27, 121.04it/s]
len(message_list)
14873
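get_text (imported at the top) extracts the readable content from each message; the actual implementation lives in utils/preprocessing.py and also records script and style fields from HTML parts. A minimal, hypothetical text-only sketch would walk the MIME parts like this:
def get_text_sketch(message):
    """Hypothetical stand-in for get_text: collect decoded text/plain parts."""
    parts = []
    for part in message.walk():
        if part.get_content_type() == 'text/plain':
            payload = part.get_payload(decode=True)
            if payload:
                charset = part.get_content_charset() or 'utf-8'
                parts.append(payload.decode(charset, errors='replace'))
    return {'text': '\n'.join(parts)}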
# Load the dataframe from the message list and convert cols to datetime
messages = pd.DataFrame(message_list)
messages.Date = pd.to_datetime(messages.Date, utc=True)
messages.loc[:, 'markup_id'] = messages.apply(get_email_id, axis=1)
messages.drop_duplicates('markup_id', inplace=True)
# this is the start of our experiment
messages = messages[messages['Date'] > '2019-10-16']
messages.reset_index(drop=True, inplace=True)
len(messages)
5417
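get_email_id produces the markup_id we de-duplicate on above. We don't reproduce the repo's implementation here; a stable per-message ID of this kind is typically a hash over a few headers, roughly like this hypothetical sketch (the actual fields and hashing may differ):
import hashlib

def get_email_id_sketch(row):
    # hypothetical: hash a few headers into a stable ID
    key = f"{row.get('From')}|{row.get('To')}|{row.get('Date')}|{row.get('Subject')}"
    return hashlib.md5(key.encode('utf-8')).hexdigest()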
# convert the labels to binary columns and merge back in
vect = MultiLabelBinarizer()
X = vect.fit_transform(messages["X-Gmail-Labels"].str.split(','))
gmail_category_matrix = pd.DataFrame(X, columns=vect.classes_)
messages = messages.join(gmail_category_matrix)
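To make the transformation concrete, here's what MultiLabelBinarizer does with a couple of toy label strings shaped like X-Gmail-Labels:
toy = pd.Series(["Spam,Category Promotions,Unread", "Inbox,Opened"])
mlb = MultiLabelBinarizer()
mlb.fit_transform(toy.str.split(','))
# classes_: ['Category Promotions', 'Inbox', 'Opened', 'Spam', 'Unread']
# array([[1, 0, 0, 1, 1],
#        [0, 1, 1, 0, 0]])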
We determined which emails are in the Primary inbox by looking at the X-Gmail-Labels columns in gmail_category_matrix.
??is_primary
Signature: is_primary(row: dict)
Source:
def is_primary(row : dict):
    """
    Determines if a row is in the Primary inbox.
    These are emails that are in the Inbox,
    but aren't in Spam, Promotions, or Trash.
    """
    try:
        if (
            not row.get('Spam') and
            not row.get('Category Promotions') and
            not row.get('Trash') and
            row['Inbox']
        ):
            return 1
    except Exception as e:
        print(e)
        print(row)
    return 0
File: ~/code/production-grade/promotions/utils/preprocessing.py
Type: function
messages.loc[:, "Primary"] = messages.apply(is_primary, axis=1)
# Clean up email headers
messages = messages[~messages['From'].isnull()]
messages.loc[:, "From_Email"] = messages['From'].apply(find_email)
messages.loc[:, "From_Domain"] = messages["From_Email"].apply(find_tld_from_email)
messages.loc[:, "From_Name"] = messages["From"].apply(find_name)
messages.loc[:, "To_Email"] = messages['To'].apply(find_email).str.lower()
# presidential candidates added late, on 2/4; their messages have no email aliases, so we assign them manually.
messages.loc[messages['From_Domain'].str.contains('bloomberg'),
'To_Email'] = 'ajcorrigan88+mbloomberg@gmail.com'
messages.loc[messages['From_Domain'].str.contains('steyer'),
'To_Email'] = 'ajcorrigan88+tsteyer@gmail.com'
# newsletters we signed up for and which alias we used.
newsletter_categories = pd.read_csv(fn_newsletter_categories)
len(newsletter_categories)
231
# breakdown of entities we signed up for
newsletter_categories.Category.value_counts()
Advocacy organization or think tank    73
House battleground campaign            71
House battleground official            70
Presidential candidate                 17
Name: Category, dtype: int64
df = messages.merge(newsletter_categories, how='left',
left_on='To_Email', right_on='Email')
df.iloc[0]
X-GM-THRID              1654372657559145454
X-Gmail-Labels          Spam,Category Promotions,Unread
Delivered-To            ajcorrigan88+jwalsh@gmail.com
Received                from nationbuilder.com (unknown [52.21.208.178...
Date                    2019-12-30 19:05:11+00:00
From                    Joe Walsh <joe@joewalsh.org>
Subject                 What's he hiding?
To                      ajcorrigan88+jwalsh@gmail.com
text                    Hi Friend,\nIt was hard to miss Donald Trump’s...
script                  []
style                   None
markup_id               d07816a54b6b0aa65faf4bcf0c142f2e
Archived                0
Category Personal       0
Category Promotions     1
Category Updates        0
Important               0
Inbox                   0
Opened                  0
Spam                    1
Unread                  1
Primary                 0
From_Email              joe@joewalsh.org
From_Domain             joewalsh.org
From_Name               Joe Walsh
To_Email                ajcorrigan88+jwalsh@gmail.com
Name                    Joe Walsh
Website                 https://www.joewalsh.org
Email                   ajcorrigan88+jwalsh@gmail.com
Category                Presidential candidate
Entity_ID               3ee34b65e61fb39b1c6bf106224eb388
Name: 0, dtype: object
# create boolean index to filter each of these categories
official = newsletter_categories['Category'] == 'House battleground official'
campaign = newsletter_categories['Category'] == 'House battleground campaign'
# to update the Website column
name2official_site = dict(zip(
newsletter_categories[official].Name,
newsletter_categories[official].Website
))
name2campaign_site = dict(zip(
newsletter_categories[campaign].Name,
newsletter_categories[campaign].Website
))
# to update the Entity_ID column
name2official_entity_id = dict(zip(
newsletter_categories[official].Name,
newsletter_categories[official].Entity_ID
))
name2campaign_entity_id = dict(zip(
newsletter_categories[campaign].Name,
newsletter_categories[campaign].Entity_ID
))
del official, campaign
Here's what one of these dictionaries looks like:
name2campaign_site['Ann Wagner']
'http://annwagner.com/'
name2campaign_entity_id['Ann Wagner']
'fb98f8fa9305ff53ef495e5777d9837f'
# what subset of the data are we re-assigning?
df.loc[(df['From_Domain'].str.contains('.gov')) &
(df['Category'].str.contains('House battleground')),
# what column is being re-assigned?
'Category'] = 'House battleground official'
df.loc[(~df['From_Domain'].str.contains('.gov')) &
(df['Category'].str.contains('House battleground')),
'Category'] = 'House battleground campaign'
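One small caveat: str.contains interprets its pattern as a regular expression by default, so the '.' in '.gov' matches any character. That doesn't change anything with these senders, but a stricter literal match would be:
# literal substring match; the '.' is no longer a regex wildcard
is_gov = df['From_Domain'].str.contains('.gov', regex=False)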
# create boolean index to filter each of these categories
campaign = df.Category == 'House battleground campaign'
official = df.Category == 'House battleground official'
# update Website column
df.loc[campaign,
'Website'] = df.loc[campaign].Name.replace(name2campaign_site)
df.loc[official,
'Website'] = df.loc[official].Name.replace(name2official_site)
# update Entity_ID
df.loc[campaign,
'Entity_ID'] = df.loc[campaign].Name.replace(name2campaign_entity_id)
df.loc[official,
'Entity_ID'] = df.loc[official].Name.replace(name2official_entity_id)
del campaign, official
df = df.drop_duplicates(subset='markup_id')
Let's do a quick spot check for one of the House members:
# check against the original data
email_alias = 'ajcorrigan88+awagneroffice@gmail.com'
messages[messages.To_Email == email_alias].From_Email.value_counts()
info@annwagner.com          32
mo02awima@mail.house.gov    17
listserv@ls1.house.gov       1
Name: From_Email, dtype: int64
# Which sender category did we reassign?
df[df.Name == 'Ann Wagner'].Category.value_counts()
House battleground campaign    32
House battleground official    18
Name: Category, dtype: int64
# Check the entity ID also
df[df.Name == 'Ann Wagner'].Entity_ID.value_counts()
fb98f8fa9305ff53ef495e5777d9837f    32
0c5cea595602af6e7b7705e77d393d59    18
Name: Entity_ID, dtype: int64
eID = 'fb98f8fa9305ff53ef495e5777d9837f'
newsletter_categories[newsletter_categories.Entity_ID == eID]
|    | Name       | Website               | Email                                | Category                    | Entity_ID                        |
|----|------------|-----------------------|--------------------------------------|-----------------------------|----------------------------------|
| 30 | Ann Wagner | http://annwagner.com/ | ajcorrigan88+awagneroffice@gmail.com | House battleground campaign | fb98f8fa9305ff53ef495e5777d9837f |
# how many emails from Gmail?
len(df[df.From_Domain == 'google.com'])
44
# filter out emails from senders who didn't send us an email with an alias
len(df[(df.Name.isnull()) & (df.From_Domain != 'google.com')])
235
# emails from senders we're removing
df[df.Name.isnull()].From_Domain.value_counts()
frc.org                   72
google.com                44
govdelivery.com           35
frcaction.org             31
congressnewsletter.net    23
crp.org                   22
brianfitzpatrick.com      13
fotf.com                  12
virginiazoo.org           10
teapartypatriots.org       4
aarp.org                   4
house.gov                  3
patriotsforperry.com       1
girlscouts.org             1
nrahq.org                  1
vn.va                      1
saymine.com                1
youtube.com                1
Name: From_Domain, dtype: int64
df = df[~df.Name.isnull()]
len(df)
5134
# remove these columns
drop_cols = [
'Email',
'script',
'style',
'Received',
'Delivered-To',
'To', 'From',
'Unread',
'Opened',
'Archived',
'Important',
]
df = df[[c for c in df.columns if c not in drop_cols]]
df.iloc[0]
X-GM-THRID              1654372657559145454
X-Gmail-Labels          Spam,Category Promotions,Unread
Date                    2019-12-30 19:05:11+00:00
Subject                 What's he hiding?
text                    Hi Friend,\nIt was hard to miss Donald Trump’s...
markup_id               d07816a54b6b0aa65faf4bcf0c142f2e
Category Personal       0
Category Promotions     1
Category Updates        0
Inbox                   0
Spam                    1
Primary                 0
From_Email              joe@joewalsh.org
From_Domain             joewalsh.org
From_Name               Joe Walsh
To_Email                ajcorrigan88+jwalsh@gmail.com
Name                    Joe Walsh
Website                 https://www.joewalsh.org
Category                Presidential candidate
Entity_ID               3ee34b65e61fb39b1c6bf106224eb388
Name: 0, dtype: object
# Let's save the dataset as a CSV.
df.to_csv(fn_output, index=False, compression='gzip')
len(df)
5134
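As an optional sanity check, the gzipped CSV can be read straight back (pandas infers the compression from the .gz suffix):
# re-read the file we just wrote and confirm the row count matches
check = pd.read_csv(fn_output, parse_dates=['Date'])
assert len(check) == len(df)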
def summary(messages):
n_all = len(messages)
    n_ppro = len(messages[messages['Category Promotions'] == 1])
    n_pro = len(messages[(messages['Category Promotions'] == 1) &
                         (messages['Spam'] != 1)])
n_u = len(messages[messages['Category Updates'] == 1])
n_s = len(messages[messages['Spam'] == 1])
n_pri = len(messages[messages['Primary'] == 1])
first_email = messages.Date.min()
last_email = messages.Date.max()
senders = messages['From_Email'].nunique()
alias = messages['To_Email'].nunique()
print(f"There are a total of {n_all} emails\n"
f"Sent between {first_email.strftime('%Y-%m-%d')} and {last_email.strftime('%Y-%m-%d %H:%M')}\n"
f"From {senders} email addresses to {alias} alias\n\n"
"Categories:\n"
f"{n_pri} emails or {n_pri / n_all * 100:.2f}% are in the primary inbox\n"
f"{n_pro} emails or {n_pro / n_all * 100:.2f}% are in the promotions tab\n"
f"{n_ppro} emails or {n_ppro / n_all * 100:.2f}% are promotions (includes spam)\n"
f"{n_u} emails or {n_u / n_all * 100:.2f}% are in the updates tab (includes spam)\n"
f"{n_s} emails or {n_s / n_all * 100:.2f}% are in spam")
summary(df)
There are a total of 5134 emails
Sent between 2019-10-16 and 2020-02-12 19:17
From 229 email addresses to 136 aliases

Categories:
546 emails or 10.63% are in the primary inbox
2551 emails or 49.69% are in the promotions tab
3961 emails or 77.15% are promotions (includes spam)
1157 emails or 22.54% are in the updates tab (includes spam)
2037 emails or 39.68% are in spam