https://github.com/JasonKessler/scattertext
Cite as: Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.
Link to preprint: https://arxiv.org/abs/1703.00565
@article{kessler2017scattertext, author = {Kessler, Jason S.}, title = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ}, booktitle = {ACL System Demonstrations}, year = {2017}, }
Data is from http://followthehashtag.com/datasets/
import io
import os
import urllib.request
from zipfile import ZipFile

import numpy as np
import pandas as pd
import scattertext as st
import spacy
from IPython.core.display import display, HTML
from IPython.display import IFrame
# Inject a CSS override so the notebook's main container spans (almost) the
# full browser width.
notebook_css = "<style>.container { width:98% !important; }</style>"
display(HTML(notebook_css))

# Shared spaCy English pipeline; it is applied to every tweet further down.
# NOTE(review): `spacy.en.English()` looks like the spaCy 1.x entry point;
# newer releases presumably require `spacy.load(...)` -- confirm against the
# installed spaCy version.
nlp = spacy.en.English()
# Load the tweet dataset, preferring the local gzip-compressed CSV cache and
# falling back to downloading the original spreadsheet archive.
try:
    df = pd.read_csv('usa_tweets.csv.gz')
except OSError:
    # The cache is missing or unreadable (OSError covers FileNotFoundError and
    # gzip read failures): download the zip archive, read the spreadsheet it
    # contains, then write the cache so later runs skip the download.
    dataset_url = (
        'http://followthehashtag.com/content/uploads/'
        'USA-Geolocated-tweets-free-dataset-Followthehashtag.zip'
    )
    # Context managers make sure the HTTP response, the archive, and the
    # archive member are all closed even if parsing fails.
    with urllib.request.urlopen(dataset_url) as response:
        archive_bytes = response.read()
    with ZipFile(io.BytesIO(archive_bytes)) as zf:
        with zf.open('dashboard_x_usa_x_filter_nativeretweets.xlsx') as sheet:
            df = pd.read_excel(sheet)
    df.to_csv('usa_tweets.csv.gz', index=False, compression='gzip')
def _name_token(name, index, min_tokens):
    """Return the lower-cased whitespace-separated token of ``name`` at ``index``.

    ``name`` is returned unchanged when it is not a string (for example a
    missing value) or when it has fewer than ``min_tokens`` tokens.
    """
    if not isinstance(name, str):
        return name
    tokens = name.split()
    if len(tokens) < min_tokens:
        return name
    return tokens[index].lower()


# First token of the display name, lower-cased (needs at least one token).
df['first_name'] = df['User Name'].apply(lambda name: _name_token(name, 0, 1))
# Last token of the display name, lower-cased.  A single-token name is kept
# verbatim (not lower-cased) because it has no separate last token.
df['last_name'] = df['User Name'].apply(lambda name: _name_token(name, -1, 2))
# Rewrite the cache so it also carries the two derived name columns.
df.to_csv('usa_tweets.csv.gz', index=False, compression='gzip')
# Run the shared spaCy pipeline over every tweet's text and keep the result
# of each call in the 'parse' column.
tweet_text = df['Tweet content']
df['parse'] = tweet_text.apply(nlp)

# Count how many tweets carry each ISO 639-1 language code.
language_codes = df['Tweet language (ISO 639-1)']
language_codes.value_counts()
en    172206
es     27062
fr      1695
it       959
pt       737
de       636
da       442
nl       282
ru       277
sv       271
fi       125
tr        91
hu        37
Name: Tweet language (ISO 639-1), dtype: int64
# Derive one two-valued label column per language of interest: a row gets the
# language code when its tweet language equals that code, and the matching
# 'non-..' label otherwise (missing language values compare unequal).
language_codes = df['Tweet language (ISO 639-1)']
for label_column, code, other_label in (
    ('english', 'en', 'non-en'),
    ('spanish', 'es', 'non-es'),
):
    is_language = language_codes.eq(code)
    df[label_column] = is_language.map({True: code, False: other_label})
# `corpus` is not bound to any name before this point (the explorer calls
# below only pass a corpus as a keyword argument), so build it here from the
# parsed tweets and the 'english' label column before inspecting it.
corpus = st.CorpusFromParsedDocuments(
    df,
    parsed_col='parse',
    category_col='english',
    feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji(),
).build()
# List the category labels the corpus was built with.
corpus.get_categories()
['en', 'non-en']
# Per-document label passed to the explorer: "<user name> (@<nickname>) <date>".
metadata = (
    df['User Name']
    + ' (@' + df['Nickname'] + ') '
    + df['Date'].astype(str)
)

# Render the scattertext explorer contrasting tweets labelled 'en' with all
# other tweets, using a corpus built from the parsed tweets and the 'english'
# label column.
# NOTE(review): FeatsFromSpacyDocOnlyEmoji presumably limits the plotted terms
# to emoji -- confirm against the scattertext documentation.
html = st.produce_scattertext_explorer(
    corpus=st.CorpusFromParsedDocuments(
        df,
        parsed_col='parse',
        category_col='english',
        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji(),
    ).build(),
    category='en',
    category_name='English',
    not_category_name='Non-English',
    use_full_doc=True,
    term_ranker=st.OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=metadata,
    width_in_pixels=1000,
)

file_name = 'output/emoji_english-v-non.html'
# Make sure the target directory exists; open() would otherwise fail.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
# Write the page as UTF-8 bytes; the context manager closes the handle so the
# file is fully flushed before the IFrame loads it.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
# Embed the generated page in the notebook.
IFrame(src=file_name, width=1200, height=700)
# Per-document label passed to the explorer: "<user name> (@<nickname>) <date>".
metadata = (
    df['User Name']
    + ' (@' + df['Nickname'] + ') '
    + df['Date'].astype(str)
)

# Render the scattertext explorer contrasting tweets labelled 'es' with all
# other tweets, using a corpus built from the parsed tweets and the 'spanish'
# label column.
# NOTE(review): FeatsFromSpacyDocOnlyEmoji presumably limits the plotted terms
# to emoji -- confirm against the scattertext documentation.
html = st.produce_scattertext_explorer(
    corpus=st.CorpusFromParsedDocuments(
        df,
        parsed_col='parse',
        category_col='spanish',
        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji(),
    ).build(),
    category='es',
    category_name='Spanish',
    not_category_name='Non-Spanish',
    use_full_doc=True,
    term_ranker=st.OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=metadata,
    width_in_pixels=1000,
)

file_name = 'output/emoji_spanish-v-non.html'
# Make sure the target directory exists; open() would otherwise fail.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
# Write the page as UTF-8 bytes; the context manager closes the handle so the
# file is fully flushed before the IFrame loads it.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
# Embed the generated page in the notebook.
IFrame(src=file_name, width=1200, height=700)