Using Scattertext to visualize Emoji usage by gender and heritage on Twitter

DDSEA17: Understanding Cultures and Perspectives through Text and Emoji Visualization

@jasonkessler

https://github.com/JasonKessler/scattertext

Cite as: Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.

Link to preprint: https://arxiv.org/abs/1703.00565

@article{kessler2017scattertext, author = {Kessler, Jason S.}, title = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ}, booktitle = {ACL System Demonstrations}, year = {2017}, }

Data is from http://followthehashtag.com/datasets/

In [1]:
%matplotlib inline
import io
import json
import os
import urllib.request
from zipfile import ZipFile

import pandas as pd
import numpy as np
import agefromname
import nltk

import imp
import scattertext as st
from scattertext import tweet_tokenzier_factory
from scattertext.termranking import OncePerDocFrequencyRanker

from IPython.display import IFrame
from IPython.core.display import display, HTML
# Inject CSS that stretches the notebook's `.container` element to 98% of the
# page width.
display(HTML("<style>.container { width:98% !important; }</style>"))
In [2]:
# Load the tweet table from the local gzip cache when it exists; otherwise
# download the Followthehashtag archive, derive lower-cased `first_name` /
# `last_name` columns from 'User Name', and write the cache for next time.
def _name_token(full_name, position, min_tokens):
    """Return the lower-cased whitespace token of *full_name* at *position*.

    Non-string values (e.g. NaN) and names with fewer than *min_tokens*
    tokens are returned unchanged.
    """
    if not isinstance(full_name, str):
        return full_name
    tokens = full_name.split()
    if len(tokens) < min_tokens:
        return full_name
    return tokens[position].lower()


try:
    df = pd.read_csv('usa_tweets.csv.gz')
except FileNotFoundError:
    # Only a missing cache triggers the download; any other read failure
    # (corrupt file, parser error, ...) is allowed to surface.
    with urllib.request.urlopen(
            'http://followthehashtag.com/content/uploads/USA-Geolocated-tweets-free-dataset-Followthehashtag.zip'
    ) as response:
        archive_bytes = response.read()
    with ZipFile(io.BytesIO(archive_bytes)) as zf:
        # Read the sheet fully into memory so no zip member handle is left open.
        sheet_bytes = zf.read('dashboard_x_usa_x_filter_nativeretweets.xlsx')
    df = pd.read_excel(io.BytesIO(sheet_bytes))
    df['first_name'] = df['User Name'].apply(_name_token, args=(0, 1))
    # A last name is only extracted when the user name has at least two tokens.
    df['last_name'] = df['User Name'].apply(_name_token, args=(-1, 2))
    df.to_csv('usa_tweets.csv.gz', index=False, compression='gzip')
In [4]:
# Show the derived name columns next to the raw account fields (first 3 rows).
name_preview_cols = ['first_name', 'last_name', 'User Name', 'Nickname', 'Tweet content']
df[name_preview_cols].head(3)
Out[4]:
first_name last_name User Name Nickname Tweet content
0 bill schulhoff Bill Schulhoff BillSchulhoff Wind 3.2 mph NNE. Barometer 30.20 in, Rising s...
1 daniele polis Daniele Polis danipolis Pausa pro café antes de embarcar no próximo vô...
2 kasey jacobs Kasey Jacobs KJacobs27 Good. Morning. #morning #Saturday #diner #VT #...
In [5]:
# Table indexed by first name with `hi`, `lo` and `prob` columns (see the
# output below); `prob` is later thresholded to assign a gender label.
name_model = agefromname.AgeFromName()
male_prob = name_model.get_all_name_male_prob()
male_prob.head(3)
Out[5]:
hi lo prob
first_name
aaban 1.000000 9.574095e-01 1.0
aabha 0.121295 -1.387779e-17 0.0
aabid 1.000000 5.628005e-01 1.0
In [6]:
# Attach the per-first-name male probability to every tweet (inner join, so
# tweets whose first name is not in `male_prob` are dropped).
df_aug = df.merge(male_prob, left_on='first_name', right_index=True)

# Label a tweet 'm' / 'f' only when the probability is decisive; everything
# in between (or missing) becomes '?'.
p_male = df_aug['prob']
df_aug['gender'] = np.select([p_male > 0.9, p_male < 0.1], ['m', 'f'], default='?')

# Keep only the confidently labelled tweets and persist them.
df_mf = df_aug[df_aug['gender'] != '?']
df_mf.to_csv('emoji_data.csv', index=False)
In [7]:
# Reload the gender-labelled tweets from the CSV file 'emoji_data.csv'.
df_mf = pd.read_csv('emoji_data.csv')
In [8]:
# Show the assigned gender next to the account fields (first 6 rows).
gender_preview_cols = ['gender', 'first_name', 'User Name', 'Nickname', 'Tweet content']
df_mf[gender_preview_cols].head(6)
Out[8]:
gender first_name User Name Nickname Tweet content
0 m bill Bill Schulhoff BillSchulhoff Wind 3.2 mph NNE. Barometer 30.20 in, Rising s...
1 m bill Bill S Kenney BillSKenney Planning the new focuslabllc website with the ...
2 m bill Bill Pendley BILLPENDLEY #bibleverseoftheday @ Bill The Mortgage Guy ...
3 m bill Bill Culver rilla6969 Start Wars Dark Side Challenge race number one...
4 m bill Bill Esparza streetgourmetla Spinach fusilli by @chef_timothy. A pre #mexic...
5 m bill Bill Meadows BillMeadows305 https://t.co/N8E5aTvIIN

Gender breakdown of Twitter Users

According to Pew, the gender breakdown of American adults on Twitter was about even.

However, among users with a gender-identifying first name, about 56% appear to be male

In [9]:
# Count accounts rather than tweets: collapse to one row per
# (Nickname, gender) pair before tallying, so prolific users count once.
user_genders = df_mf[['Nickname', 'gender']].drop_duplicates()
gender_counts = user_genders['gender'].value_counts()

# Absolute counts, then the same counts as a share of all labelled accounts.
print(gender_counts)
print(gender_counts / gender_counts.sum())
m    28159
f    21844
Name: gender, dtype: int64
m    0.563146
f    0.436854
Name: gender, dtype: float64
In [10]:
# Peek at the raw text of the first five tweets.
df_mf['Tweet content'].head(5)
Out[10]:
0    Wind 3.2 mph NNE. Barometer 30.20 in, Rising s...
1    Planning the new focuslabllc website with the ...
2    #bibleverseoftheday @ Bill The Mortgage Guy   ...
3    Start Wars Dark Side Challenge race number one...
4    Spinach fusilli by @chef_timothy. A pre #mexic...
Name: Tweet content, dtype: object
In [11]:
# Wrap NLTK's TweetTokenizer with Scattertext's tokenizer factory and run it
# over every tweet; the parsed result is stored in the `parse` column that the
# corpus builders below read via `parsed_col='parse'`.
nlp = tweet_tokenzier_factory(nltk.tokenize.TweetTokenizer())
df_mf['parse'] = [nlp(tweet_text) for tweet_text in df_mf['Tweet content']]
In [12]:
# Build a Scattertext corpus from the pre-parsed tweets, categorised by
# gender, using the emoji-only feature extractor.
emoji_feats = st.FeatsFromSpacyDocOnlyEmoji()
corpus_factory = st.CorpusFromParsedDocuments(
    df_mf,
    category_col='gender',
    parsed_col='parse',
    feats_from_spacy_doc=emoji_feats,
)
corpus = corpus_factory.build()
In [13]:
# Per-tweet caption: "User Name (@Nickname) Date".
metadata = (df_mf['User Name']
            + ' (@' + df_mf['Nickname'] + ') '
            + df_mf['Date'].astype(str))

# Female-vs-male emoji explorer built from the gender corpus.
html = st.produce_scattertext_explorer(
    corpus,
    category='f',
    category_name='Female',
    not_category_name='Male',
    use_full_doc=True,
    term_ranker=OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=metadata,
    width_in_pixels=1000
)

file_name = 'output/emoji_gender_scattertext.html'
# Create the output directory on first run and close the file deterministically
# so the IFrame below never loads a partially flushed page.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
Out[13]:
In [14]:
# Same female-vs-male comparison rendered with the "fightin' words" explorer.
# `corpus` and `metadata` are not defined in this cell; they must already be
# in scope.
html = st.produce_fightin_words_explorer(corpus,
                                         category='f',
                                         category_name='Female',
                                         not_category_name='Male',
                                         term_ranker=OncePerDocFrequencyRanker,
                                         metadata=metadata)
file_name = 'output/emoji_gender_lorp.html'
# Create the output directory on first run and close the file deterministically
# so the IFrame below never loads a partially flushed page.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
Out[14]:

Use US Census data to find last names that are associated with a particular heritage

From https://www.census.gov/data/developers/data-sets/surnames.2010.html

In [15]:
from urllib.request import urlopen

# Fetch the 2010 Census surname table (ranks 1-200000) as JSON rows.
url ='https://api.census.gov/data/2010/surname?get=NAME,COUNT,CUM_PROP100K,PCT2PRACE,PCTAIAN,PCTAPI,PCTBLACK,PCTHISPANIC,PCTWHITE,PROP100K&RANK=1:200000'
with urlopen(url) as f:
    rows = json.loads(f.read().decode('utf-8'))

# The first row holds the column names. '(S)' placeholder cells are treated as
# 0 so that every column can be cast to float; surnames are lower-cased so
# they can be joined against the lower-cased `last_name` column of the tweets.
name_df = (pd.DataFrame(rows[1:], columns=rows[0])
           .set_index('NAME')
           .replace('(S)', 0)
           .astype(float)
           .rename(index=str.lower))

# A surname is assigned a heritage only when a single PCT* group accounts for
# more than `min_heritage_pct` percent of its bearers.
min_heritage_pct = 85
pct_cols = [col for col in name_df.columns if col.startswith('PCT')]
# Missing shares count as 0 so max/idxmax are well defined for every row.
pct_shares = name_df[pct_cols].fillna(0.0)
top_share = pct_shares.max(axis=1)
top_group = pct_shares.idxmax(axis=1).str[len('PCT'):]

# Groups without a label here (e.g. '2PRACE') end up as NaN, as do surnames
# that do not clear the threshold.
heritage_labels = {'AIAN': 'Native American',
                   'API': 'Asian',
                   'BLACK': 'African American',
                   'HISPANIC': 'Hispanic',
                   'WHITE': 'White'}
name_df['heritage'] = top_group.where(top_share > min_heritage_pct).map(heritage_labels)
In [16]:
# Five most common surnames whose bearers are more than 85% Hispanic.
mostly_hispanic = name_df.loc[name_df['PCTHISPANIC'] > 85]
mostly_hispanic.sort_values('COUNT', ascending=False).head(5)
Out[16]:
COUNT CUM_PROP100K PCT2PRACE PCTAIAN PCTAPI PCTBLACK PCTHISPANIC PCTWHITE PROP100K RANK heritage
NAME
garcia 1166120.0 3400.12 0.26 0.47 1.41 0.45 92.03 5.38 395.32 6.0 Hispanic
rodriguez 1094924.0 4543.50 0.18 0.18 0.57 0.54 93.77 4.75 371.19 9.0 Hispanic
martinez 1060159.0 4902.90 0.22 0.51 0.60 0.49 92.91 5.28 359.40 10.0 Hispanic
hernandez 1043281.0 5256.58 0.16 0.19 0.60 0.36 94.89 3.79 353.68 11.0 Hispanic
lopez 874523.0 5553.05 0.25 0.38 1.02 0.57 92.92 4.86 296.47 12.0 Hispanic
In [71]:
# Five most common surnames whose bearers are more than 85% Black.
mostly_black = name_df.loc[name_df['PCTBLACK'] > 85]
mostly_black.sort_values('COUNT', ascending=False).head(5)
Out[71]:
COUNT CUM_PROP100K PCT2PRACE PCTAIAN PCTAPI PCTBLACK PCTHISPANIC PCTWHITE PROP100K RANK heritage
NAME
washington 177386.0 20370.63 3.78 0.68 0.30 87.53 2.54 5.17 60.14 145.0 African American
pierre 33913.0 41272.38 2.23 0.92 0.31 86.74 2.75 7.05 11.50 1026.0 African American
smalls 12435.0 53820.35 2.76 0.28 0.23 90.49 2.46 3.78 4.22 2888.0 African American
jeanbaptiste 7915.0 59139.71 2.50 0.13 0.21 94.04 2.15 0.97 2.68 4483.0 African American
diallo 7502.0 59784.56 1.55 0.09 0.12 95.64 0.76 1.84 2.54 4730.0 African American
In [18]:
# Total number of people (COUNT) per heritage label, smallest first; surnames
# without a label are grouped under 'N/A'.
labelled_or_na = name_df.fillna('N/A')
labelled_or_na.groupby('heritage')['COUNT'].sum().sort_values()
Out[18]:
heritage
Native American         98523.0
African American      1343997.0
Asian                 7336493.0
Hispanic             36248570.0
White                92111160.0
N/A                 128528485.0
Name: COUNT, dtype: float64

Self-identified race in the US (via: https://en.wikipedia.org/wiki/Race_and_ethnicity_in_the_United_States)

  • White alone: 72.4%
  • Hispanic and Latino Americans (of any race): 16.3%
  • Black or African American: 12.6%
  • Asian: 4.8%
  • Native American and Alaska Natives: 0.9%
  • Native Hawaiians and Other Pacific Islanders: 0.2%
  • Two or more races: 2.9%
  • Some other race: 6.2%
In [72]:
# Share of people per heritage label among surnames that received a label
# (rows containing NaN are dropped first). The per-heritage totals are computed
# once and reused for both the numerator and the denominator.
heritage_counts = name_df.dropna().groupby('heritage')['COUNT'].sum().sort_values()
heritage_counts / heritage_counts.sum()
Out[72]:
heritage
Native American     0.000718
African American    0.009800
Asian               0.053497
Hispanic            0.264320
White               0.671664
Name: COUNT, dtype: float64

Caution!!

  • Hispanics are overrepresented (26.4% of Americans w/ heritage-identifiable names vs 16.3% of population)
  • African Americans are far underrepresented (1.0% of Americans w/ heritage-identifiable names vs 12.6% of population)
  • Whites and Asians have shares similar to their shares of the population
  • Native Americans are very difficult to identify
In [19]:
# Keep only tweets whose lower-cased last name matches a surname that has a
# heritage label (inner join on the surname index).
heritage_lookup = name_df[['heritage']].dropna()
df_mf_heritage = df_mf.merge(heritage_lookup,
                             how='inner',
                             left_on='last_name',
                             right_index=True)
In [25]:
# Number of rows in the joined frame for each heritage label.
df_mf_heritage['heritage'].value_counts()
Out[25]:
White               22176
Hispanic             9188
Asian                1249
African American      166
Native American         8
Name: heritage, dtype: int64
In [32]:
# For each group of interest add a two-valued column 'Is-<group>' holding
# either '<group>' or 'Not <group>'; these serve as category columns below.
for heritage_group in ('White', 'Hispanic', 'Asian'):
    df_mf_heritage['Is-' + heritage_group] = np.where(
        df_mf_heritage['heritage'] == heritage_group,
        heritage_group,
        'Not ' + heritage_group,
    )

Emojis white people like

In [74]:
# Per-tweet caption: "User Name (@Nickname) Date".
metadata = (df_mf_heritage['User Name']
            + ' (@' + df_mf_heritage['Nickname'] + ') '
            + df_mf_heritage['Date'].astype(str))

# Emoji-only corpus split into 'White' vs 'Not White' via the Is-White column.
html = st.produce_scattertext_explorer(
    corpus=st.CorpusFromParsedDocuments(
        df_mf_heritage,
        parsed_col='parse',
        category_col='Is-White',
        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
    ).build(),
    category='White',
    category_name='White',
    not_category_name='Not-White',
    use_full_doc=True,
    term_ranker=OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=metadata,
    width_in_pixels=1000,
    max_docs_per_category=1000
)

file_name = 'output/emoji_white_v_all.html'
# Create the output directory on first run and close the file deterministically
# so the IFrame below never loads a partially flushed page.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
Out[74]:

Emojis Hispanic people like

In [75]:
# Per-tweet caption: "User Name (@Nickname) Date".
metadata = (df_mf_heritage['User Name']
            + ' (@' + df_mf_heritage['Nickname'] + ') '
            + df_mf_heritage['Date'].astype(str))

# Emoji-only corpus split into 'Hispanic' vs 'Not Hispanic' via the
# Is-Hispanic column.
html = st.produce_scattertext_explorer(
    corpus=st.CorpusFromParsedDocuments(
        df_mf_heritage,
        parsed_col='parse',
        category_col='Is-Hispanic',
        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
    ).build(),
    category='Hispanic',
    category_name='Hispanic',
    not_category_name='Not-Hispanic',
    use_full_doc=True,
    term_ranker=OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=metadata,
    width_in_pixels=1000,
    max_docs_per_category=1000
)

file_name = 'output/emoji_hispanic_v_all.html'
# Create the output directory on first run and close the file deterministically
# so the IFrame below never loads a partially flushed page.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
Out[75]:
In [ ]: