https://github.com/JasonKessler/scattertext
Cite as: Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.
Link to preprint: https://arxiv.org/abs/1703.00565
@inproceedings{kessler2017scattertext, author = {Kessler, Jason S.}, title = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ}, booktitle = {ACL System Demonstrations}, year = {2017}, }
Data is from http://followthehashtag.com/datasets/
%matplotlib inline
import io, json
from zipfile import ZipFile
import urllib.request
import pandas as pd
import numpy as np
import agefromname
import nltk
import spacy
import scattertext as st
from scattertext import tweet_tokenzier_factory
from scattertext.termranking import OncePerDocFrequencyRanker
from IPython.display import IFrame
from IPython.core.display import display, HTML
# Widen the Jupyter notebook container so the scattertext plots fit on screen.
display(HTML("<style>.container { width:98% !important; }</style>"))
# NOTE(review): spacy.en.English() is the spaCy 1.x loading API; spaCy 2+
# removed the spacy.en module in favor of spacy.load(...) -- confirm the
# pinned spaCy version before upgrading.
nlp = spacy.en.English()
def load_us_tweets():
    """Load the geolocated US tweet dataset as a DataFrame with a spaCy parse.

    Tries the local gzipped-CSV cache first; on a cache miss, downloads the
    zipped Excel export from followthehashtag.com, reads the sheet, and
    writes the cache.  The ``parse`` column of spaCy Docs is (re)computed on
    every call, so the cache never holds stringified Doc objects.

    Returns:
        pandas.DataFrame with the original tweet columns plus ``parse``.
    """
    try:
        df = pd.read_csv('usa_tweets.csv.gz')
    except FileNotFoundError:
        # Only a missing cache file should trigger the (slow) download path;
        # the original bare `except:` also hid download/parse failures.
        with ZipFile(io.BytesIO(urllib.request.urlopen(
                'http://followthehashtag.com/content/uploads/USA-Geolocated-tweets-free-dataset-Followthehashtag.zip'
        ).read())) as zf:
            df = pd.read_excel(zf.open('dashboard_x_usa_x_filter_nativeretweets.xlsx'))
        df.to_csv('usa_tweets.csv.gz', index=False, compression='gzip')
    df['parse'] = df['Tweet content'].apply(nlp)
    return df
def load_jp_tweets():
    """Load the geolocated Tokyo tweet dataset with a spaCy ``parse`` column.

    Tries the local gzipped-CSV cache first; on a cache miss, downloads the
    zipped Excel export from followthehashtag.com and writes the cache.

    Fixes vs. the original: the bare ``except:`` is narrowed to
    ``FileNotFoundError``, and the cache is written *before* the ``parse``
    column is added -- the original serialized spaCy Doc objects into the
    CSV as their string repr, so the cache-hit path returned a useless
    ``parse`` column.  ``parse`` is now recomputed on every call.

    Returns:
        pandas.DataFrame with the original tweet columns plus ``parse``.
    """
    try:
        df = pd.read_csv('jp_tweets.csv.gz')
    except FileNotFoundError:
        with ZipFile(io.BytesIO(urllib.request.urlopen(
                'http://followthehashtag.com/content/uploads/Tokyo-Geolocated-tweets-free-dataset-Followthehashtag.zip'
        ).read())) as zf:
            # NOTE(review): `sheetname` was renamed `sheet_name` in pandas
            # 0.21; kept as-is to match the pandas version this notebook targets.
            df = pd.read_excel(zf.open('export_dashboard_geocode_35_6894875_139_69170639999993_52km_2016_04_21_20_09_45.xlsx'), sheetname='Stream')
        # Cache BEFORE adding the parse column (spaCy Docs don't round-trip
        # through CSV).
        df.to_csv('jp_tweets.csv.gz', index=False, compression='gzip')
    df['parse'] = df['Tweet content'].apply(nlp)
    return df
def load_uk_tweets():
    """Load the geolocated UK tweet dataset with a spaCy ``parse`` column.

    Tries the local gzipped-CSV cache first; on a cache miss, downloads the
    zipped Excel export from followthehashtag.com and writes the cache.

    Fixes vs. the original: the bare ``except:`` is narrowed to
    ``FileNotFoundError``, and the cache is written *before* the ``parse``
    column is added -- the original serialized spaCy Doc objects into the
    CSV as their string repr, so the cache-hit path returned a useless
    ``parse`` column.  ``parse`` is now recomputed on every call.

    Returns:
        pandas.DataFrame with the original tweet columns plus ``parse``.
    """
    try:
        df = pd.read_csv('uk_tweets.csv.gz')
    except FileNotFoundError:
        with ZipFile(io.BytesIO(urllib.request.urlopen(
                'http://followthehashtag.com/content/uploads/UK-Geolocated-tweets-free-dataset-Followthehashtag.zip'
        ).read())) as zf:
            # NOTE(review): `sheetname` was renamed `sheet_name` in pandas
            # 0.21; kept as-is to match the pandas version this notebook targets.
            df = pd.read_excel(zf.open('export_dashboard_x_uk_x_filter_nativeretweets_geocode_55_378051_3_43597299999999_750km_2016_04_21_10_32_03.xlsx'), sheetname='Stream')
        # Cache BEFORE adding the parse column (spaCy Docs don't round-trip
        # through CSV).
        df.to_csv('uk_tweets.csv.gz', index=False, compression='gzip')
    df['parse'] = df['Tweet content'].apply(nlp)
    return df
# Combine the three country datasets into a single DataFrame.
df = pd.concat([load_jp_tweets(), load_us_tweets(), load_uk_tweets()])
# Inspect the five most common tweet origin countries.
df['Country'].value_counts().iloc[:5]
# Output (pasted notebook cell output; left as a comment so the file parses):
#   JP 182331  US 178609  GB 155695  MX 20293  IE 10580
#   Name: Country, dtype: int64
# Keep only tweets whose language matches expectations for their country:
# non-English tweets from Japan, and English-language tweets from the US/UK.
tweet_lang = df['Tweet language (ISO 639-1)']
japan_non_english = (df['Country'] == 'JP') & (tweet_lang != 'en')
english_us_or_uk = df['Country'].isin(['US', 'GB']) & (tweet_lang == 'en')
df_clean = df[japan_non_english | english_us_or_uk]
def twitter_metadata(df):
    """Return a Series of per-tweet display labels: "User Name (@nickname) date"."""
    display_name = df['User Name']
    handle_part = ' (@' + df['Nickname'] + ') '
    date_part = df['Date'].astype(str)
    return display_name + handle_part + date_part
# Japan vs. US: scattertext plot of emoji usage, using only emoji features
# and ranking terms by once-per-document frequency.
html = st.produce_scattertext_explorer(
    corpus=st.CorpusFromParsedDocuments(
        df_clean[df_clean.Country.isin(['US', 'JP'])],
        parsed_col='parse',
        category_col='Country',
        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
    ).build(),
    category='JP',
    category_name='Japan',
    not_category_name='US',
    use_full_doc=True,
    term_ranker=st.OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=twitter_metadata(df_clean[df_clean.Country.isin(['US', 'JP'])]),
    width_in_pixels=1000
)
file_name = 'output/emoji_japan-v-us.html'
# Context manager so the handle is closed and buffers are flushed before the
# IFrame below tries to read the file (the original leaked the handle).
with open(file_name, 'wb') as out:
    out.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
# UK vs. US: scattertext plot of emoji usage, same configuration as the
# Japan-vs-US plot above but comparing GB against US tweets.
html = st.produce_scattertext_explorer(
    corpus=st.CorpusFromParsedDocuments(
        df_clean[df_clean.Country.isin(['US', 'GB'])],
        parsed_col='parse',
        category_col='Country',
        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
    ).build(),
    category='GB',
    category_name='UK',
    not_category_name='US',
    use_full_doc=True,
    term_ranker=st.OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=twitter_metadata(df_clean[df_clean.Country.isin(['US', 'GB'])]),
    width_in_pixels=1000
)
file_name = 'output/emoji_uk-v-us.html'
# Context manager so the handle is closed and buffers are flushed before the
# IFrame below tries to read the file (the original leaked the handle).
with open(file_name, 'wb') as out:
    out.write(html.encode('utf-8'))
# Reuse file_name instead of repeating the path literal (the original
# hard-coded it here, risking divergence from the write above).
IFrame(src=file_name, width=1200, height=700)