Using Scattertext to visualize Emoji usage by nationality on Twitter

DDSEA17: Understanding Cultures and Perspectives through Text and Emoji Visualization

@jasonkessler

https://github.com/JasonKessler/scattertext

Cite as: Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.

Link to preprint: https://arxiv.org/abs/1703.00565

@inproceedings{kessler2017scattertext, author = {Kessler, Jason S.}, title = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ}, booktitle = {ACL System Demonstrations}, year = {2017}, }

Data is from http://followthehashtag.com/datasets/

In [1]:
%matplotlib inline
import io, json
from zipfile import ZipFile
import urllib.request

import pandas as pd
import numpy as np
import agefromname
import nltk
import spacy

import scattertext as st
from scattertext import tweet_tokenzier_factory
from scattertext.termranking import OncePerDocFrequencyRanker

from IPython.display import IFrame
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
In [2]:
# spacy.en.English() is the spaCy 1.x entry point and was removed in v2+.
# Prefer spacy.load(), falling back to the legacy constructor for old installs.
try:
    nlp = spacy.load('en_core_web_sm')
except (OSError, IOError, AttributeError):
    nlp = spacy.en.English()
In [3]:
def load_us_tweets():
    """Load geolocated US tweets, downloading and caching them on first use.

    Returns a DataFrame with the raw followthehashtag columns plus a
    'parse' column holding the spaCy parse of each tweet's text.
    """
    try:
        # Fast path: a previous run already cached the spreadsheet as CSV.
        df = pd.read_csv('usa_tweets.csv.gz')
    except OSError:
        # Cache miss: download the zipped Excel export. A bare except here
        # would also swallow KeyboardInterrupt and real bugs.
        with ZipFile(io.BytesIO(urllib.request.urlopen(
                'http://followthehashtag.com/content/uploads/USA-Geolocated-tweets-free-dataset-Followthehashtag.zip'
        ).read())) as zf:
            df = pd.read_excel(zf.open('dashboard_x_usa_x_filter_nativeretweets.xlsx'))
        # Cache before adding 'parse': spaCy Docs don't survive a CSV round-trip.
        df.to_csv('usa_tweets.csv.gz', index=False, compression='gzip')
    df['parse'] = df['Tweet content'].apply(nlp)
    return df
In [4]:
def load_jp_tweets():
    """Load geolocated Tokyo tweets, downloading and caching them on first use.

    Returns a DataFrame with the raw followthehashtag columns plus a
    'parse' column holding the spaCy parse of each tweet's text.

    Fixes the original cell, which wrote the CSV cache *after* adding the
    'parse' column and outside the except: the cache ended up containing
    stringified spaCy Docs and was rewritten on every call. The cache is now
    written only on a miss, before parsing, matching load_us_tweets().
    """
    try:
        df = pd.read_csv('jp_tweets.csv.gz')
    except OSError:
        with ZipFile(io.BytesIO(urllib.request.urlopen(
                'http://followthehashtag.com/content/uploads/Tokyo-Geolocated-tweets-free-dataset-Followthehashtag.zip'
        ).read())) as zf:
            df = pd.read_excel(zf.open('export_dashboard_geocode_35_6894875_139_69170639999993_52km_2016_04_21_20_09_45.xlsx'), sheetname='Stream')
        df.to_csv('jp_tweets.csv.gz', index=False, compression='gzip')
    df['parse'] = df['Tweet content'].apply(nlp)
    return df
In [5]:
def load_uk_tweets():
    """Load geolocated UK tweets, downloading and caching them on first use.

    Returns a DataFrame with the raw followthehashtag columns plus a
    'parse' column holding the spaCy parse of each tweet's text.

    Fixes the original cell, which wrote the CSV cache *after* adding the
    'parse' column and outside the except: the cache ended up containing
    stringified spaCy Docs and was rewritten on every call. The cache is now
    written only on a miss, before parsing, matching load_us_tweets().
    """
    try:
        df = pd.read_csv('uk_tweets.csv.gz')
    except OSError:
        with ZipFile(io.BytesIO(urllib.request.urlopen(
                'http://followthehashtag.com/content/uploads/UK-Geolocated-tweets-free-dataset-Followthehashtag.zip'
        ).read())) as zf:
            df = pd.read_excel(zf.open('export_dashboard_x_uk_x_filter_nativeretweets_geocode_55_378051_3_43597299999999_750km_2016_04_21_10_32_03.xlsx'), sheetname='Stream')
        df.to_csv('uk_tweets.csv.gz', index=False, compression='gzip')
    df['parse'] = df['Tweet content'].apply(nlp)
    return df
In [8]:
# Stack the three country datasets into a single frame (index values repeat).
df = pd.concat([load_jp_tweets(), load_us_tweets(), load_uk_tweets()])
In [9]:
# Top five countries by tweet count; the radius-based exports evidently also
# capture neighboring countries (MX, IE show up below).
df['Country'].value_counts().iloc[:5]
Out[9]:
JP    182331
US    178609
GB    155695
MX     20293
IE     10580
Name: Country, dtype: int64

Keep only tweets whose language matches their origin: non-English tweets from Japan, and English tweets from the US and UK

In [11]:
# Keep tweets whose language matches the expected one for their country:
# non-English tweets from Japan, English tweets from the US and the UK.
tweet_lang = df['Tweet language (ISO 639-1)']
jp_non_english = (df['Country'] == 'JP') & (tweet_lang != 'en')
us_english = (df['Country'] == 'US') & (tweet_lang == 'en')
uk_english = (df['Country'] == 'GB') & (tweet_lang == 'en')
df_clean = df[jp_non_english | us_english | uk_english]
In [15]:
def twitter_metadata(df):
    """Build one display string per tweet: "<user name> (@<nickname>) <date>".

    Used as the per-document metadata shown in the Scattertext explorer.
    """
    handle = ' (@' + df['Nickname'] + ') '
    return df['User Name'] + handle + df['Date'].astype(str)
In [49]:
# Compare emoji usage between Japanese and American tweets.
# Compute the US/JP subset once instead of filtering df_clean twice.
us_jp_tweets = df_clean[df_clean.Country.isin(['US', 'JP'])]
html = st.produce_scattertext_explorer(
    corpus=st.CorpusFromParsedDocuments(
        us_jp_tweets,
        parsed_col='parse',
        category_col='Country',
        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
    ).build(),
    category='JP',
    category_name='Japan',
    not_category_name='US',
    use_full_doc=True,
    # Count each emoji at most once per tweet (imported directly above).
    term_ranker=OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=twitter_metadata(us_jp_tweets),
    width_in_pixels=1000
)
file_name = 'output/emoji_japan-v-us.html'
# Use a context manager so the handle is closed before IFrame reads the file.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
Out[49]:
In [19]:
# Compare emoji usage between British and American tweets.
# Compute the US/GB subset once instead of filtering df_clean twice.
us_uk_tweets = df_clean[df_clean.Country.isin(['US', 'GB'])]
html = st.produce_scattertext_explorer(
    corpus=st.CorpusFromParsedDocuments(
        us_uk_tweets,
        parsed_col='parse',
        category_col='Country',
        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
    ).build(),
    category='GB',
    category_name='UK',
    not_category_name='US',
    use_full_doc=True,
    # Count each emoji at most once per tweet (imported directly above).
    term_ranker=OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=twitter_metadata(us_uk_tweets),
    width_in_pixels=1000
)
file_name = 'output/emoji_uk-v-us.html'
# Use a context manager so the handle is closed before IFrame reads the file,
# and reuse file_name rather than repeating the path literal.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
Out[19]: