Using Scattertext to Visualize Emoji Usage by Language (e.g., English or Spanish) on Twitter

DDSEA17: Understanding Cultures and Perspectives through Text and Emoji Visualization

@jasonkessler

https://github.com/JasonKessler/scattertext

Cite as: Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.

Link to preprint: https://arxiv.org/abs/1703.00565

@article{kessler2017scattertext, author = {Kessler, Jason S.}, title = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ}, booktitle = {ACL System Demonstrations}, year = {2017}, }

Data is from http://followthehashtag.com/datasets/

In [9]:
import io
import os
import urllib.request
from zipfile import ZipFile

import numpy as np
import pandas as pd
import scattertext as st
import spacy
from IPython.core.display import display, HTML
from IPython.display import IFrame
# Inject a CSS rule that forces elements with class "container" to 98% width,
# so the wide embedded output below has more horizontal room.
display(HTML("<style>.container { width:98% !important; }</style>"))
In [2]:
# English spaCy pipeline; it is applied to every tweet to build the 'parse' column.
# NOTE(review): `spacy.en.English()` looks like a version-specific spaCy entry
# point -- confirm it exists in the spaCy release this notebook is pinned to.
nlp = spacy.en.English()
In [ ]:
def _name_token(user_name, position, min_tokens):
    """Return the lower-cased whitespace token of ``user_name`` at ``position``.

    Values that are not strings (e.g. missing names), or that contain fewer
    than ``min_tokens`` whitespace-separated tokens, are returned unchanged.
    """
    if not isinstance(user_name, str):
        return user_name
    tokens = user_name.split()
    if len(tokens) < min_tokens:
        return user_name
    return tokens[position].lower()


# Load the tweets from the local gzip cache when it is available; otherwise
# download the spreadsheet, derive the name columns, and write the cache.
try:
    df = pd.read_csv('usa_tweets.csv.gz')
except OSError:
    # Only a missing/unreadable cache file triggers the download. A bare
    # `except:` would also swallow KeyboardInterrupt and real parsing bugs.
    with urllib.request.urlopen(
            'http://followthehashtag.com/content/uploads/USA-Geolocated-tweets-free-dataset-Followthehashtag.zip'
    ) as response:
        archive_bytes = response.read()
    with ZipFile(io.BytesIO(archive_bytes)) as zf:
        with zf.open('dashboard_x_usa_x_filter_nativeretweets.xlsx') as sheet:
            df = pd.read_excel(sheet)
    # First token of the user name (needs >= 1 token) and last token
    # (needs >= 2 tokens, so single-word names are left untouched).
    df['first_name'] = df['User Name'].apply(lambda name: _name_token(name, 0, 1))
    df['last_name'] = df['User Name'].apply(lambda name: _name_token(name, -1, 2))
    # Write the cache once, after the derived columns exist, so a cached load
    # always has the same columns as a fresh download.
    df.to_csv('usa_tweets.csv.gz', index=False, compression='gzip')
# Parse every tweet with the spaCy pipeline; the corpus builders read this column.
df['parse'] = df['Tweet content'].apply(nlp)
In [24]:
# Number of tweets per value of the 'Tweet language (ISO 639-1)' column.
df['Tweet language (ISO 639-1)'].value_counts()
Out[24]:
en    172206
es     27062
fr      1695
it       959
pt       737
de       636
da       442
nl       282
ru       277
sv       271
fi       125
tr        91
hu        37
Name: Tweet language (ISO 639-1), dtype: int64
In [26]:
# Two-way category labels derived from the tweet language code: each column
# holds the language code itself for matching tweets and a "non-" label for
# everything else (including missing codes).
lang_codes = df['Tweet language (ISO 639-1)']
df['english'] = np.where(lang_codes == 'en', 'en', 'non-en')
df['spanish'] = np.where(lang_codes == 'es', 'es', 'non-es')
In [17]:
# Build the corpus in this cell so that `corpus` is defined on a fresh kernel
# (it was not assigned anywhere earlier), then list its category labels.
# Categories come from the 'english' column, i.e. 'en' vs. 'non-en'.
corpus = st.CorpusFromParsedDocuments(
    df,
    parsed_col='parse',
    category_col='english',
    feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
).build()
corpus.get_categories()
Out[17]:
['en', 'non-en']
In [23]:
# Per-tweet label passed to the explorer: "<User Name> (@<Nickname>) <Date>".
metadata = (df['User Name']
            + ' (@' + df['Nickname'] + ') '
            + df['Date'].astype(str))

# Build the explorer HTML contrasting tweets labelled 'en' with all others,
# using the corpus built from the 'english' category column.
html = st.produce_scattertext_explorer(
    corpus=st.CorpusFromParsedDocuments(
        df,
        parsed_col='parse',
        category_col='english',
        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
    ).build(),
    category='en',
    category_name='English',
    not_category_name='Non-English',
    use_full_doc=True,
    term_ranker=st.OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=metadata,
    width_in_pixels=1000
)
file_name = 'output/emoji_english-v-non.html'
# Create the output directory if needed so the write works on a fresh checkout.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
# Write through a context manager so the file handle is closed deterministically.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
Out[23]:
In [22]:
# Per-tweet label passed to the explorer: "<User Name> (@<Nickname>) <Date>".
metadata = (df['User Name']
            + ' (@' + df['Nickname'] + ') '
            + df['Date'].astype(str))

# Build the explorer HTML contrasting tweets labelled 'es' with all others,
# using the corpus built from the 'spanish' category column.
html = st.produce_scattertext_explorer(
    corpus=st.CorpusFromParsedDocuments(
        df,
        parsed_col='parse',
        category_col='spanish',
        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
    ).build(),
    category='es',
    category_name='Spanish',
    not_category_name='Non-Spanish',
    use_full_doc=True,
    term_ranker=st.OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=metadata,
    width_in_pixels=1000
)
file_name = 'output/emoji_spanish-v-non.html'
# Create the output directory if needed so the write works on a fresh checkout.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
# Write through a context manager so the file handle is closed deterministically.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
Out[22]:
In [ ]: