https://github.com/JasonKessler/scattertext
Cite as: Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.
Link to preprint: https://arxiv.org/abs/1703.00565
@inproceedings{kessler2017scattertext, author = {Kessler, Jason S.}, title = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ}, booktitle = {Proceedings of ACL-2017 System Demonstrations}, year = {2017}, address = {Vancouver, Canada}, publisher = {Association for Computational Linguistics}, }
%matplotlib inline
import scattertext as st
import re, io
from pprint import pprint
import pandas as pd
import numpy as np
from scipy.stats import rankdata, hmean, norm
import spacy.en
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer
# Widen the notebook container so the 1000px-wide scatterplots fit on screen.
display(HTML("<style>.container { width:98% !important; }</style>"))
# Build the spaCy English pipeline used below to parse each speech.
# NOTE(review): spacy.en.English() is the spaCy 1.x API; spaCy >= 2 removed
# spacy.en in favor of spacy.load(...) — confirm the pinned spaCy version.
nlp = spacy.en.English()
# Sample corpus bundled with scattertext: speeches from the 2012 US
# presidential nominating conventions (columns: party, speaker, text).
convention_df = st.SampleCorpora.ConventionData2012.get_data()
# Peek at the first document.
convention_df.iloc[0]
party democrat speaker BARACK OBAMA text Thank you. Thank you. Thank you. Thank you so ... Name: 0, dtype: object
# Per-party document and whitespace-token counts, then parse every speech
# with spaCy and keep the parses in a new 'parsed' column.
print("Document Count")
print(convention_df.groupby('party')['text'].count())
print("Word Count")
convention_df.groupby('party').apply(lambda grp: grp.text.str.split().str.len().sum())
convention_df['parsed'] = convention_df.text.apply(nlp)
Document Count party democrat 123 republican 66 Name: text, dtype: int64 Word Count
# Build the scattertext corpus from the parsed documents, then score each
# term by a Democratic F-score: the harmonic mean of
#   precision = P(democrat | term)  and  recall = P(term | democrat).
corpus = st.CorpusFromParsedDocuments(convention_df, category_col='party', parsed_col='parsed').build()
term_freq_df = corpus.get_term_freq_df()
dem_freq = term_freq_df['democrat freq']
rep_freq = term_freq_df['republican freq']
term_freq_df['dem_precision'] = dem_freq / (dem_freq + rep_freq)
term_freq_df['dem_recall'] = dem_freq / dem_freq.sum()
# Vectorized harmonic mean 2pr/(p+r): identical to the original row-wise
# apply-with-hmean (0 when either component is 0) but a single array pass.
precision = term_freq_df['dem_precision']
recall = term_freq_df['dem_recall']
denom = (precision + recall).mask(lambda s: s == 0, 1)  # avoid 0/0; numerator is 0 there anyway
term_freq_df['dem_f_score'] = 2 * precision * recall / denom
term_freq_df.sort_values(by='dem_f_score', ascending=False).iloc[:10]
democrat freq | republican freq | dem_precision | dem_recall | dem_f_score | |
---|---|---|---|---|---|
term | |||||
the | 3402 | 2532 | 0.573306 | 0.022343 | 0.043009 |
and | 2709 | 2233 | 0.548159 | 0.017791 | 0.034464 |
to | 2340 | 1667 | 0.583978 | 0.015368 | 0.029948 |
a | 1602 | 1345 | 0.543604 | 0.010521 | 0.020643 |
of | 1569 | 1377 | 0.532587 | 0.010304 | 0.020218 |
that | 1400 | 1051 | 0.571195 | 0.009195 | 0.018098 |
we | 1318 | 1146 | 0.534903 | 0.008656 | 0.017036 |
in | 1291 | 986 | 0.566974 | 0.008479 | 0.016708 |
i | 1098 | 851 | 0.563366 | 0.007211 | 0.014240 |
's | 1037 | 631 | 0.621703 | 0.006811 | 0.013473 |
#term_freq_df['dem_precision_pctl'] = rankdata(term_freq_df['dem_precision'])*1./len(term_freq_df)
#term_freq_df['dem_recall_pctl'] = rankdata(term_freq_df['dem_recall'])*1./len(term_freq_df)
def normcdf(x):
    """Map each value of x through the CDF of a normal distribution
    fitted to x's own mean and standard deviation, yielding values in (0, 1)."""
    center = x.mean()
    spread = x.std()
    return norm.cdf(x, loc=center, scale=spread)
# Push precision and recall through normcdf so both live on comparable
# (0, 1) scales, then combine them with a harmonic mean ("scaled F-score").
for col in ('dem_precision', 'dem_recall'):
    term_freq_df[col + '_normcdf'] = normcdf(term_freq_df[col])
term_freq_df['dem_scaled_f_score'] = hmean([term_freq_df['dem_precision_normcdf'],
                                            term_freq_df['dem_recall_normcdf']])
term_freq_df.sort_values(by='dem_scaled_f_score', ascending=False).iloc[:10]
democrat freq | republican freq | dem_precision | dem_recall | dem_f_score | dem_precision_normcdf | dem_recall_normcdf | dem_scaled_f_score | |
---|---|---|---|---|---|---|---|---|
term | ||||||||
middle class | 148 | 18 | 0.891566 | 0.000972 | 0.001942 | 0.769762 | 1.000000 | 0.869905 |
auto | 37 | 0 | 1.000000 | 0.000243 | 0.000486 | 0.836010 | 0.889307 | 0.861835 |
fair | 45 | 3 | 0.937500 | 0.000296 | 0.000591 | 0.799485 | 0.933962 | 0.861507 |
insurance | 54 | 6 | 0.900000 | 0.000355 | 0.000709 | 0.775397 | 0.965959 | 0.860251 |
forward | 105 | 16 | 0.867769 | 0.000690 | 0.001378 | 0.753443 | 0.999858 | 0.859334 |
president barack | 47 | 4 | 0.921569 | 0.000309 | 0.000617 | 0.789447 | 0.942572 | 0.859241 |
class | 161 | 25 | 0.865591 | 0.001057 | 0.002112 | 0.751919 | 1.000000 | 0.858395 |
middle | 164 | 27 | 0.858639 | 0.001077 | 0.002151 | 0.747021 | 1.000000 | 0.855194 |
the middle | 98 | 17 | 0.852174 | 0.000644 | 0.001286 | 0.742422 | 0.999640 | 0.852041 |
medicare | 84 | 15 | 0.848485 | 0.000552 | 0.001103 | 0.739778 | 0.998050 | 0.849722 |
# Rudder ("corner") score: distance to the Democratic corner of the
# scatterplot — smaller means more strongly Democratic, hence ascending sort.
term_freq_df['dem_corner_score'] = corpus.get_rudder_scores('democrat')
term_freq_df.sort_values(by='dem_corner_score').iloc[0:10]
democrat freq | republican freq | Republican Score | Democratic Score | dem_corner_score | |
---|---|---|---|---|---|
term | |||||
auto | 37 | 0 | 0.0 | 0.773567 | 0.227781 |
america forward | 28 | 0 | 0.0 | 0.740100 | 0.227870 |
insurance companies | 24 | 0 | 0.0 | 0.721765 | 0.227934 |
auto industry | 24 | 0 | 0.0 | 0.721765 | 0.227934 |
pell | 23 | 0 | 0.0 | 0.716824 | 0.227961 |
last week | 22 | 0 | 0.0 | 0.711735 | 0.227990 |
pell grants | 21 | 0 | 0.0 | 0.706498 | 0.228024 |
platform | 20 | 0 | 0.0 | 0.701110 | 0.228059 |
women 's | 20 | 0 | 0.0 | 0.701110 | 0.228059 |
coverage | 18 | 0 | 0.0 | 0.689877 | 0.228159 |
# Rebuild the frequency table and attach scattertext's built-in scaled
# F-scores for each party, then list the ten most partisan terms per side.
term_freq_df = corpus.get_term_freq_df()
for col_name, category in (('Republican Score', 'republican'),
                           ('Democratic Score', 'democrat')):
    term_freq_df[col_name] = corpus.get_scaled_f_scores(category)
print("Top 10 Democratic terms")
pprint(term_freq_df.sort_values(by='Democratic Score', ascending=False).index[:10].tolist())
print("Top 10 Republican terms")
pprint(term_freq_df.sort_values(by='Republican Score', ascending=False).index[:10].tolist())
Top 10 Democratic terms ['auto', 'america forward', 'fought for', 'fair', 'insurance companies', 'auto industry', 'president barack', 'pell', 'fighting for', 'last week'] Top 10 Republican terms ['unemployment', 'do better', 'liberty', 'olympics', 'built it', 'reagan', 'it has', 'ann', 'big government', 'story of']
# Scatterplot using linear ("scale") axis scaling.
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    # 'pmi_filter_thresold' is the library's own
                                    # (misspelled) keyword name — do not "fix" it.
                                    pmi_filter_thresold=4,
                                    transform=st.Scalers.scale,
                                    metadata=convention_df['speaker'])
file_name = 'Conventions2012ScattertextScale.html'
# Context manager closes the handle; the original open(...).write(...) leaked it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
# Scatterplot using log-scaled, standardized axes.
html = st.produce_scattertext_explorer(corpus,
                                       category='democrat',
                                       category_name='Democratic',
                                       not_category_name='Republican',
                                       minimum_term_frequency=5,
                                       # Library's own (misspelled) keyword name.
                                       pmi_filter_thresold=4,
                                       width_in_pixels=1000,
                                       transform=st.Scalers.log_scale_standardize)
file_name = 'Conventions2012ScattertextLog.html'
# Context manager closes the handle; the original open(...).write(...) leaked it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
# Scatterplot using percentile (rank) axis scaling.
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    # Library's own (misspelled) keyword name.
                                    pmi_filter_thresold=4,
                                    transform=st.Scalers.percentile,
                                    metadata=convention_df['speaker'])
file_name = 'Conventions2012ScattertextRankData.html'
# Context manager closes the handle; the original open(...).write(...) leaked it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
# Percentile scaling again, with jitter to spread terms that share a rank.
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    width_in_pixels=1000,
                                    jitter=0.1,
                                    minimum_term_frequency=5,
                                    # Library's own (misspelled) keyword name.
                                    pmi_filter_thresold=4,
                                    transform=st.Scalers.percentile,
                                    metadata=convention_df['speaker'])
file_name = 'Conventions2012ScattertextRankDataJitter.html'
# Context manager closes the handle; the original open(...).write(...) leaked it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
# Default scaling, with term significance from a log-odds-ratio with an
# uninformative Dirichlet prior (cf. Monroe et al. 2008).
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    # Library's own (misspelled) keyword name.
                                    pmi_filter_thresold=4,
                                    metadata=convention_df['speaker'],
                                    term_significance=st.LogOddsRatioUninformativeDirichletPrior())
file_name = 'Conventions2012ScattertextRankDefault.html'
# Context manager closes the handle; the original open(...).write(...) leaked it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
Similar to Monroe et al. (2008).
Burt L. Monroe, Michael P. Colaresi, and Kevin M. Quinn. 2008. Fightin’ words: Lexical feature selection and evaluation for identifying the content of political conflict. Political Analysis.
from sklearn.linear_model import LogisticRegression
def scale(ar):
    """Min-max scale *ar* into [0, 1].

    Assumes ar.max() > ar.min(); a constant array would divide by zero.
    """
    return (ar - ar.min()) / (ar.max() - ar.min())

def zero_centered_scale(ar):
    """Map a signed score array into [0, 1] with zero fixed at 0.5.

    Positive values are min-max scaled among themselves, negative values
    among themselves (sign preserved), then everything is shifted from
    [-1, 1] to [0, 1].

    Fix: operates on a copy.  The original mutated *ar* in place, silently
    altering the caller's array (e.g. the `scores` passed on to the
    scatterplot later in this file).
    """
    out = ar.astype(float, copy=True)
    pos = out > 0
    neg = out < 0
    out[pos] = scale(out[pos])
    out[neg] = -scale(-out[neg])
    return (out + 1) / 2.
# x-axis: log total term frequency, min-max scaled to [0, 1].
raw_frequencies = term_freq_df.sum(axis=1).values
frequencies_scaled = scale(np.log(raw_frequencies))
# y-axis: L2-penalized logistic-regression coefficients for 'democrat',
# rescaled so zero lands at 0.5.
classifier = LogisticRegression(penalty='l2', C=10, max_iter=10000, n_jobs=-1)
scores = corpus.get_logreg_coefs('democrat', classifier)
scores_scaled = zero_centered_scale(scores)
# Custom-coordinate scatterplot: log frequency on x, logistic-regression
# coefficient on y, colored by the raw coefficient scores.
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    minimum_term_frequency=5,
                                    # Library's own (misspelled) keyword name.
                                    pmi_filter_thresold=4,
                                    width_in_pixels=1000,
                                    x_coords=frequencies_scaled,
                                    y_coords=scores_scaled,
                                    scores=scores,
                                    sort_by_dist=False,
                                    metadata=convention_df['speaker'],
                                    x_label='Log frequency',
                                    y_label='L2-penalized logistic regression coef')
file_name = 'L2vsLog.html'
# Context manager closes the handle; the original open(...).write(...) leaked it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)