Using Scattertext to Visualize Emoji Usage by Language (e.g., English or Spanish) on Twitter

DDSEA17: Understanding Cultures and Perspectives through Text and Emoji Visualization

@jasonkessler

https://github.com/JasonKessler/scattertext

Cite as: Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.

Link to preprint: https://arxiv.org/abs/1703.00565

@article{kessler2017scattertext, author = {Kessler, Jason S.}, title = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ}, booktitle = {ACL System Demonstrations}, year = {2017}, }

Data is from http://followthehashtag.com/datasets/

In [9]:

import io
import os
import urllib.request
from zipfile import ZipFile

import numpy as np
import pandas as pd
import scattertext as st
import spacy

from IPython.display import IFrame
from IPython.core.display import display, HTML
# Inject a CSS rule that forces elements with class "container" to 98% width
# (presumably the notebook page container, so the wide embedded plots fit --
# TODO confirm against the notebook front end in use).
display(HTML("<style>.container { width:98% !important; }</style>"))

In [2]:

# Construct the English spaCy pipeline; it is applied to every tweet's text
# when the 'parse' column is built.
# NOTE(review): `spacy.en.English()` looks like the spaCy 1.x module layout;
# later releases load models through `spacy.load(...)` -- confirm against the
# installed spaCy version.
nlp = spacy.en.English()

In [ ]:

# Load the geolocated-tweet data set into `df`.
# A local gzip-compressed CSV acts as a cache; the original spreadsheet is
# downloaded and converted only when that cache cannot be opened.
try:
    df = pd.read_csv('usa_tweets.csv.gz')
except OSError:
    # Catch only "cache missing/unreadable" errors. A bare `except:` would
    # also swallow KeyboardInterrupt and unrelated bugs and then silently
    # start a large download.
    with urllib.request.urlopen(
            'http://followthehashtag.com/content/uploads/USA-Geolocated-tweets-free-dataset-Followthehashtag.zip'
    ) as response:
        archive_bytes = response.read()
    with ZipFile(io.BytesIO(archive_bytes)) as zf:
        # Read the member fully into memory so pandas receives a seekable
        # buffer and no handle into the archive is left open.
        df = pd.read_excel(io.BytesIO(
            zf.read('dashboard_x_usa_x_filter_nativeretweets.xlsx')))
    # Lower-cased first / last whitespace-separated token of the display name.
    # Non-string values pass through unchanged, and a single-token name is
    # left as-is in 'last_name'.
    df['first_name'] = df['User Name'].apply(
        lambda x: x.split()[0].lower() if isinstance(x, str) and len(x.split()) > 0 else x)
    df['last_name'] = df['User Name'].apply(
        lambda x: x.split()[-1].lower() if isinstance(x, str) and len(x.split()) > 1 else x)
    # Write the cache once, after the derived name columns exist.
    df.to_csv('usa_tweets.csv.gz', index=False, compression='gzip')
# Run the spaCy pipeline over every tweet; the parsed documents feed the
# corpus construction in later cells.
df['parse'] = df['Tweet content'].apply(nlp)

In [24]:

# Count tweets per language code to see which languages the data contains
# before choosing the categories to contrast.
df['Tweet language (ISO 639-1)'].value_counts()

Out[24]:

en    172206
es     27062
fr      1695
it       959
pt       737
de       636
da       442
nl       282
ru       277
sv       271
fi       125
tr        91
hu        37
Name: Tweet language (ISO 639-1), dtype: int64

In [26]:

# Two binary category columns, one per comparison: each tweet is tagged with
# the language code of interest or with its 'non-' complement.
language_codes = df['Tweet language (ISO 639-1)']
df['english'] = language_codes.map(
    lambda code: 'en' if code == 'en' else 'non-en')
df['spanish'] = language_codes.map(
    lambda code: 'es' if code == 'es' else 'non-es')

In [17]:

# Build the emoji-only corpus for the English / non-English split here, so
# this cell does not rely on a `corpus` variable left over from an earlier
# kernel session (it is not defined anywhere else in the notebook).
corpus = st.CorpusFromParsedDocuments(
    df,
    parsed_col='parse',
    category_col='english',
    feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
).build()
# Sanity check: list the category labels the corpus was built with.
corpus.get_categories()

Out[17]:

['en', 'non-en']

In [23]:

# Per-tweet label passed to the explorer as `metadata`:
# "<User Name> (@<Nickname>) <Date>".
metadata = (df['User Name']
            + ' (@' + df['Nickname'] + ') '
            + df['Date'].astype(str))

# Emoji-only Scattertext plot contrasting English tweets with all others.
html = st.produce_scattertext_explorer(
    corpus=st.CorpusFromParsedDocuments(
        df,
        parsed_col='parse',
        category_col='english',
        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
    ).build(),
    category='en',
    category_name='English',
    not_category_name='Non-English',
    use_full_doc=True,
    term_ranker=st.OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=metadata,
    width_in_pixels=1000
)
file_name = 'output/emoji_english-v-non.html'
# Create the output directory if needed so a fresh checkout can run this cell.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
# The context manager flushes and closes the file before the IFrame loads it;
# the previous bare open(...).write(...) never closed the handle.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)

Out[23]:

In [22]:

# Per-tweet label passed to the explorer as `metadata`:
# "<User Name> (@<Nickname>) <Date>".
metadata = (df['User Name']
            + ' (@' + df['Nickname'] + ') '
            + df['Date'].astype(str))

# Emoji-only Scattertext plot contrasting Spanish tweets with all others.
html = st.produce_scattertext_explorer(
    corpus=st.CorpusFromParsedDocuments(
        df,
        parsed_col='parse',
        category_col='spanish',
        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
    ).build(),
    category='es',
    category_name='Spanish',
    not_category_name='Non-Spanish',
    use_full_doc=True,
    term_ranker=st.OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=metadata,
    width_in_pixels=1000
)
file_name = 'output/emoji_spanish-v-non.html'
# Create the output directory if needed so a fresh checkout can run this cell.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
# The context manager flushes and closes the file before the IFrame loads it;
# the previous bare open(...).write(...) never closed the handle.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)

Out[22]:

In [ ]: