import scattertext as ST
import tarfile, urllib, io
import pandas as pd
from IPython.display import IFrame
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
/Users/kesslej/anaconda3/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20. "This module will be removed in 0.20.", DeprecationWarning)
'''From Bo Pang's website: https://www.cs.cornell.edu/people/pabo/movie-review-data/
Data from:
"A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization
Based on Minimum Cuts", Proceedings of the ACL, 2004
'''
# A bare `import urllib` does not guarantee that the `request` submodule is
# loaded, so import it explicitly before calling urlopen.
import urllib.request

SUBJECTIVITY_URL = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz'

# Pull the whole archive into memory; the context manager closes the HTTP
# response as soon as its bytes have been read.
with urllib.request.urlopen(SUBJECTIVITY_URL) as response:
    data = io.BytesIO(response.read())

# The tarball wraps the in-memory buffer and is left open so that further
# members can still be extracted from it later on.
tarball = tarfile.open(fileobj=data, mode='r:gz')
readme = tarball.extractfile('subjdata.README.1.0').read()
quote = tarball.extractfile('quote.tok.gt9.5000').read()
plot = tarball.extractfile('plot.tok.gt9.5000').read()
# Peek at the first three subjective sentences in the corpus
subjective_sentences = quote.decode('utf-8', errors='ignore').split('\n')
subjective_sentences[:3]
['smart and alert , thirteen conversations about one thing is a small gem . ', 'color , musical bounce and warm seas lapping on island shores . and just enough science to send you home thinking . ', 'it is not a mass-market entertainment but an uncompromising attempt by one artist to think about another . ']
'''Construct subjective vs. objective pandas dataframe,
treating review quotes as subjective, and plot points as objective.
'''


def _labeled_rows(raw_bytes, label):
    """Return one {'text', 'label'} record per non-blank line of raw_bytes.

    raw_bytes is decoded as UTF-8 with undecodable bytes ignored and split on
    newline characters.  Blank lines are skipped: splitting newline-terminated
    content leaves a trailing empty string, which would otherwise become an
    empty document under the given label.
    """
    stripped_lines = (
        line.strip()
        for line in raw_bytes.decode('utf-8', errors='ignore').split('\n')
    )
    return [{'text': text, 'label': label} for text in stripped_lines if text]


df = pd.DataFrame(
    _labeled_rows(quote, 'subjective') + _labeled_rows(plot, 'objective')
)
'''Turn the dataframe into a term-document matrix, with "label" as the
category column and "text" as the text column.'''
matrix_builder = ST.TermDocMatrixFromPandas(
    data_frame=df,
    category_col='label',
    text_col='text',
    # Note: use nlp=spacy.en.English() for text that's not pre-tokenized
    nlp=ST.fast_but_crap_nlp
)
term_doc_mat = matrix_builder.build()
'''
Filter out bigrams with PMI < 3, and unigrams and bigrams that occur fewer than 20 times.
The variable html is a string containing the HTML that makes up the scattertext visualization
'''
html = ST.produce_scattertext_html(term_doc_mat,
                                   category='subjective',
                                   category_name='Subjective',
                                   not_category_name='Objective',
                                   protocol='https',
                                   # The keyword is spelled "thresold" in the library's signature.
                                   pmi_filter_thresold=3,
                                   minimum_term_frequency=20,
                                   width_in_pixels=1000)
# Hack to display HTML with D3 in Jupyter Notebook.
# Write through a context manager so the file is flushed and closed before
# the IFrame below tries to load it.
with open('subj_obj_scatter.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src='subj_obj_scatter.html', width=1200, height=1000)
--------------------------------------------------------------------------- AssertionError Traceback (most recent call last) <ipython-input-9-15a4f72afaba> in <module>() 10 pmi_filter_thresold=3, 11 minimum_term_frequency=20, ---> 12 width_in_pixels=1000) 13 14 # Hack to display HTML with D3 in Jupyter Notebook /Users/kesslej/anaconda3/lib/python3.5/site-packages/scattertext-0.0.1.9.8-py3.5.egg/scattertext/__init__.py in produce_scattertext_explorer(corpus, category, category_name, not_category_name, protocol, pmi_filter_thresold, minimum_term_frequency, max_terms, filter_unigrams, height_in_pixels, width_in_pixels, max_snippets, max_docs_per_category, metadata, scores, singleScoreMode, use_full_doc, term_ranker) 149 filter_unigrams=filter_unigrams, 150 max_terms=max_terms, --> 151 term_ranker=term_ranker) 152 scatter_chart_data = scatter_chart_explorer.to_dict(category=category, 153 category_name=category_name, /Users/kesslej/anaconda3/lib/python3.5/site-packages/scattertext-0.0.1.9.8-py3.5.egg/scattertext/ScatterChartExplorer.py in __init__(self, corpus, **kwargs) 13 14 ''' ---> 15 assert isinstance(corpus, Corpus) 16 ScatterChart.__init__(self, corpus, **kwargs) 17 AssertionError:
''' Show the 20 terms with the highest subjective score, together with their
per-label frequencies.
'''
term_freq_df = term_doc_mat.get_term_freq_df()
subjective_scores = term_doc_mat.get_scaled_f_scores('subjective', scaler_algo='percentile')
term_freq_df['Subjective Score'] = subjective_scores
# Highest-scoring terms first.
term_freq_df = term_freq_df.sort_values('Subjective Score', ascending=False)
term_freq_df.head(20)
objective freq | subjective freq | Subjective Score | |
---|---|---|---|
term | |||
movie that | 0 | 75 | 0.803250 |
entertaining | 2 | 73 | 0.771629 |
film s | 2 | 69 | 0.767533 |
but it | 6 | 157 | 0.766663 |
i | 13 | 275 | 0.755910 |
interesting | 3 | 70 | 0.752203 |
film that | 4 | 77 | 0.744846 |
performances | 5 | 89 | 0.742972 |
of its | 6 | 103 | 0.742011 |
in its | 5 | 84 | 0.737945 |
me | 2 | 51 | 0.737812 |
script | 4 | 71 | 0.736981 |
movie is | 5 | 83 | 0.736840 |
if you | 6 | 96 | 0.736319 |
fascinating | 2 | 48 | 0.730420 |
cinematic | 2 | 47 | 0.727758 |
funny | 9 | 126 | 0.726650 |
laughs | 0 | 30 | 0.725776 |
movie s | 0 | 30 | 0.725776 |
you re | 4 | 64 | 0.725331 |
''' Show the first 20 rows of the corpus-vs-background scores, keeping only terms
whose background count is above zero (i.e. terms that aren't unique to the corpus).
Note: "doesn", "isn", and "didn" are a result of the pre-tokenization of the corpus.
'''
characteristic_terms = term_doc_mat.get_posterior_mean_ratio_scores_vs_background()
seen_in_background = characteristic_terms['background'] > 0
characteristic_terms[seen_in_background].head(20)
corpus | background | Log Posterior Mean Ratio | |
---|---|---|---|
doesn | 176.0 | 1101832.0 | 6.972770 |
isn | 125.0 | 1345149.0 | 6.392687 |
discovers | 70.0 | 1974534.0 | 5.356073 |
cinematic | 49.0 | 1255895.0 | 5.091466 |
filmmaker | 51.0 | 1493747.0 | 5.063639 |
cannot | 29.0 | 88737.0 | 4.860555 |
filmmaking | 37.0 | 1061519.0 | 4.768377 |
thriller | 78.0 | 5364843.0 | 4.722203 |
didn | 32.0 | 850882.0 | 4.648173 |
filmmakers | 39.0 | 1657073.0 | 4.629892 |
comedy | 229.0 | 22993280.0 | 4.591236 |
quirky | 35.0 | 1436076.0 | 4.553131 |
documentary | 113.0 | 10429008.0 | 4.547708 |
film | 1006.0 | 116097842.0 | 4.512189 |
entertaining | 75.0 | 6330073.0 | 4.503101 |
mysterious | 65.0 | 5252752.0 | 4.483029 |
decides | 58.0 | 4588774.0 | 4.447191 |
performances | 94.0 | 9272429.0 | 4.417802 |
learns | 40.0 | 2570984.0 | 4.390325 |
hasn | 20.0 | 76625.0 | 4.352190 |