https://github.com/JasonKessler/scattertext
Cite as: Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.
Link to preprint: https://arxiv.org/abs/1703.00565
@inproceedings{kessler2017scattertext, author = {Kessler, Jason S.}, title = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ}, booktitle = {ACL System Demonstrations}, year = {2017}, }
Data is from http://followthehashtag.com/datasets/
%matplotlib inline
import io, json
from zipfile import ZipFile
import urllib.request
import pandas as pd
import numpy as np
import agefromname
import nltk
import spacy
import scattertext as st
from scattertext import tweet_tokenzier_factory
from scattertext.termranking import OncePerDocFrequencyRanker
from IPython.display import IFrame
from IPython.core.display import display, HTML
# Widen the Jupyter notebook container so the scattertext plots fit on screen.
display(HTML("<style>.container { width:98% !important; }</style>"))
# NOTE(review): spacy.en.English() is the spaCy 1.x loading API; spaCy 2+
# removed the spacy.en module in favor of spacy.load(...) -- confirm the
# pinned spaCy version before upgrading.
nlp = spacy.en.English()
def load_us_tweets():
    """Load the geolocated US tweet dataset as a DataFrame with a spaCy parse.

    Tries the local gzipped-CSV cache first; on a cache miss, downloads the
    zipped Excel export from followthehashtag.com, reads the sheet, and
    writes the cache.  The ``parse`` column of spaCy Docs is (re)computed on
    every call, so the cache never holds stringified Doc objects.

    Returns:
        pandas.DataFrame with the original tweet columns plus ``parse``.
    """
    try:
        df = pd.read_csv('usa_tweets.csv.gz')
    except FileNotFoundError:
        # Only a missing cache file should trigger the (slow) download path;
        # the original bare `except:` also hid download/parse failures.
        with ZipFile(io.BytesIO(urllib.request.urlopen(
                'http://followthehashtag.com/content/uploads/USA-Geolocated-tweets-free-dataset-Followthehashtag.zip'
        ).read())) as zf:
            df = pd.read_excel(zf.open('dashboard_x_usa_x_filter_nativeretweets.xlsx'))
        df.to_csv('usa_tweets.csv.gz', index=False, compression='gzip')
    df['parse'] = df['Tweet content'].apply(nlp)
    return df
def load_jp_tweets():
    """Load the geolocated Tokyo tweet dataset with a spaCy ``parse`` column.

    Tries the local gzipped-CSV cache first; on a cache miss, downloads the
    zipped Excel export from followthehashtag.com and writes the cache.

    Fixes vs. the original: the bare ``except:`` is narrowed to
    ``FileNotFoundError``, and the cache is written *before* the ``parse``
    column is added -- the original serialized spaCy Doc objects into the
    CSV as their string repr, so the cache-hit path returned a useless
    ``parse`` column.  ``parse`` is now recomputed on every call.

    Returns:
        pandas.DataFrame with the original tweet columns plus ``parse``.
    """
    try:
        df = pd.read_csv('jp_tweets.csv.gz')
    except FileNotFoundError:
        with ZipFile(io.BytesIO(urllib.request.urlopen(
                'http://followthehashtag.com/content/uploads/Tokyo-Geolocated-tweets-free-dataset-Followthehashtag.zip'
        ).read())) as zf:
            # NOTE(review): `sheetname` was renamed `sheet_name` in pandas
            # 0.21; kept as-is to match the pandas version this notebook targets.
            df = pd.read_excel(zf.open('export_dashboard_geocode_35_6894875_139_69170639999993_52km_2016_04_21_20_09_45.xlsx'), sheetname='Stream')
        # Cache BEFORE adding the parse column (spaCy Docs don't round-trip
        # through CSV).
        df.to_csv('jp_tweets.csv.gz', index=False, compression='gzip')
    df['parse'] = df['Tweet content'].apply(nlp)
    return df
def load_uk_tweets():
    """Load the geolocated UK tweet dataset with a spaCy ``parse`` column.

    Tries the local gzipped-CSV cache first; on a cache miss, downloads the
    zipped Excel export from followthehashtag.com and writes the cache.

    Fixes vs. the original: the bare ``except:`` is narrowed to
    ``FileNotFoundError``, and the cache is written *before* the ``parse``
    column is added -- the original serialized spaCy Doc objects into the
    CSV as their string repr, so the cache-hit path returned a useless
    ``parse`` column.  ``parse`` is now recomputed on every call.

    Returns:
        pandas.DataFrame with the original tweet columns plus ``parse``.
    """
    try:
        df = pd.read_csv('uk_tweets.csv.gz')
    except FileNotFoundError:
        with ZipFile(io.BytesIO(urllib.request.urlopen(
                'http://followthehashtag.com/content/uploads/UK-Geolocated-tweets-free-dataset-Followthehashtag.zip'
        ).read())) as zf:
            # NOTE(review): `sheetname` was renamed `sheet_name` in pandas
            # 0.21; kept as-is to match the pandas version this notebook targets.
            df = pd.read_excel(zf.open('export_dashboard_x_uk_x_filter_nativeretweets_geocode_55_378051_3_43597299999999_750km_2016_04_21_10_32_03.xlsx'), sheetname='Stream')
        # Cache BEFORE adding the parse column (spaCy Docs don't round-trip
        # through CSV).
        df.to_csv('uk_tweets.csv.gz', index=False, compression='gzip')
    df['parse'] = df['Tweet content'].apply(nlp)
    return df
# Combine the three country datasets into a single DataFrame.
df = pd.concat([load_jp_tweets(), load_us_tweets(), load_uk_tweets()])
# Inspect the five most common tweet origin countries.
df['Country'].value_counts().iloc[:5]
# Output (pasted notebook cell output; left as a comment so the file parses):
#   JP 182331  US 178609  GB 155695  MX 20293  IE 10580
#   Name: Country, dtype: int64
# Keep only tweets whose language matches expectations for their country:
# non-English tweets from Japan, and English-language tweets from the US/UK.
tweet_lang = df['Tweet language (ISO 639-1)']
japan_non_english = (df['Country'] == 'JP') & (tweet_lang != 'en')
english_us_or_uk = df['Country'].isin(['US', 'GB']) & (tweet_lang == 'en')
df_clean = df[japan_non_english | english_us_or_uk]
def twitter_metadata(df):
    """Return a Series of per-tweet display labels: "User Name (@nickname) date"."""
    display_name = df['User Name']
    handle_part = ' (@' + df['Nickname'] + ') '
    date_part = df['Date'].astype(str)
    return display_name + handle_part + date_part
# Japan vs. US: scattertext plot of emoji usage, using only emoji features
# and ranking terms by once-per-document frequency.
html = st.produce_scattertext_explorer(
    corpus=st.CorpusFromParsedDocuments(
        df_clean[df_clean.Country.isin(['US', 'JP'])],
        parsed_col='parse',
        category_col='Country',
        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
    ).build(),
    category='JP',
    category_name='Japan',
    not_category_name='US',
    use_full_doc=True,
    term_ranker=st.OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=twitter_metadata(df_clean[df_clean.Country.isin(['US', 'JP'])]),
    width_in_pixels=1000
)
file_name = 'output/emoji_japan-v-us.html'
# Context manager so the handle is closed and buffers are flushed before the
# IFrame below tries to read the file (the original leaked the handle).
with open(file_name, 'wb') as out:
    out.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
# UK vs. US: scattertext plot of emoji usage, same configuration as the
# Japan-vs-US plot above but comparing GB against US tweets.
html = st.produce_scattertext_explorer(
    corpus=st.CorpusFromParsedDocuments(
        df_clean[df_clean.Country.isin(['US', 'GB'])],
        parsed_col='parse',
        category_col='Country',
        feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
    ).build(),
    category='GB',
    category_name='UK',
    not_category_name='US',
    use_full_doc=True,
    term_ranker=st.OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=twitter_metadata(df_clean[df_clean.Country.isin(['US', 'GB'])]),
    width_in_pixels=1000
)
file_name = 'output/emoji_uk-v-us.html'
# Context manager so the handle is closed and buffers are flushed before the
# IFrame below tries to read the file (the original leaked the handle).
with open(file_name, 'wb') as out:
    out.write(html.encode('utf-8'))
# Reuse file_name instead of repeating the path literal (the original
# hard-coded it here, risking divergence from the write above).
IFrame(src=file_name, width=1200, height=700)