import scattertext as st
import re
from pprint import pprint
import numpy as np
import pandas as pd
import spacy.en
from IPython.display import IFrame
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
nlp = spacy.en.English()
The function returns a Pandas data frame consisting of two columns, speaker and statement. Speaker is the name of the speaker, given in all caps, and statement is the speech made during a particular turn.
def debate_transcript_to_dataframe(fn, speakers):
lines = open(fn).read().split('\n')
cur_speaker = None
speaker_start_re = re.compile(r'^([(]?[A-Z][A-Z][A-Z]+):?(.+)$')
transcript = []
cur_statement = ''
cur_speaker = None
for line in lines:
match = speaker_start_re.match(line)
if match:
if match.group(1).startswith('('):
continue
if cur_speaker is not None:
transcript.append({'speaker': cur_speaker, 'statement': cur_statement})
cur_speaker = match.group(1).strip()
cur_statement = match.group(2).strip() + '\n'
for other_name in speakers:
if other_name+':' in cur_statement:
cur_statement, other_statement = cur_statement.split(other_name)
transcript.append({'speaker': cur_speaker, 'statement': cur_statement.strip()})
transcript.append({'speaker': other_name, 'statement': other_statement.strip()})
else:
cur_statement += line
df = pd.DataFrame(transcript)
return df
parties = {'QUIJANO':'Moderator',
'KAINE':'Democratic',
'PENCE':'Republican',
'HOLT':'Moderator',
'CLINTON':'Democratic',
'TRUMP':'Republican',
'COOPER':'Moderator',
'RADDATZ':'Moderator',
'WALLACE':'Moderator'}
debate_dfs = {}
for info in [
{'debate': '1st', 'fn': 'presidential-debate-2016-09-26.txt', 'participants': ['TRUMP','CLINTON','HOLT']},
{'debate': 'VP', 'fn': 'vp-debate-2016-10-04.txt', 'participants': ['PENCE','KAINE','QUIJANO']},
{'debate': '2nd', 'fn': 'debate-2016-10-09-rush.txt', 'participants': ['TRUMP','CLINTON','COOPER','RADDATZ']},
{'debate': '3rd', 'fn': 'debate-2016-10-19.txt', 'participants': ['TRUMP','CLINTON','WALLACE']}]:
cur_df = debate_transcript_to_dataframe(info['fn'], info['participants'])
cur_df['debate'] = info['debate']
cur_df['party'] = cur_df['speaker'].apply(lambda x: parties[x])
cur_df['speaker and debate']=cur_df['speaker'].apply(lambda x: x + ' ' + info['debate'])
debate_dfs[info['debate']] = cur_df
df_all = pd.concat(debate_dfs.values())
df_all.iloc[:2]
speaker | statement | debate | party | speaker and debate | |
---|---|---|---|---|---|
0 | QUIJANO | Good evening. From Longwood University in Farm... | VP | Moderator | QUIJANO VP |
1 | QUIJANO | I'm Elaine Quijano, anchor at CBSN, and corres... | VP | Moderator | QUIJANO VP |
df_all.to_csv('presidential_debates_2016.csv.gz', compression='gzip', index=False)
!cp presidential_debates_2016.csv.gz ../scattertext/scattertext/data/
Creates a chart from text in a data frame, df
. The category
and other_category
parameters are the names of the columns we'll compare. The category_col
is the column in df
that contains document categories, and contains category
and other_category
. For example, if category
is "TRUMP", then category_col
would be "speaker". extra
is append to the file name of the html file produced.
We'll look at the rest of the optional parameters later.
The function returns an iFrame containing containing the HTML visualization, and as a side-effect writes the visualization to an html file, named category.lower() + '-' + other_category.lower() + extra + '.html'
.
def draw_corpus(df, corpus, category, other_category, category_col, extra='', scores=None, singleScoreMode=False,
minimum_term_frequency=2, grey_zero_scores=False, sort_by_dist=True):
html = st.produce_scattertext_explorer(corpus,
category=category,
category_name=category.lower(),
not_category_name=other_category.lower(),
pmi_filter_thresold=2,
minimum_term_frequency=minimum_term_frequency,
metadata=df['speaker and debate'],
scores=scores,
width_in_pixels=1000,
grey_zero_scores=grey_zero_scores,
singleScoreMode=singleScoreMode,
sort_by_dist=sort_by_dist)
file_name = category.lower() + '-' + other_category.lower() + extra + '.html'
open(file_name, 'wb').write(html.encode('utf-8'))
return IFrame(src=file_name, width = 1200, height=1000)
def draw_plot(df, category, other_category, category_col, extra=''):
# Scattertext can only do a one column vs. all analysis. We're excluding any other speakrs
category_vs_other_df = df[(df[category_col] == category) | (df[category_col] == other_category)]
corpus = st.CorpusFromPandas(category_vs_other_df,
category_col = category_col,
text_col = 'statement',
nlp = nlp).build()
return draw_corpus(category_vs_other_df, corpus, category, other_category, category_col, extra=extra)
category, other_category, category_col = 'CLINTON', 'TRUMP', 'speaker'
debate_3 = st.CorpusFromPandas(data_frame = debate_dfs['3rd'][( debate_dfs['3rd'][category_col] == category)
| ( debate_dfs['3rd'][category_col] == other_category)],
category_col = category_col,
text_col = 'statement',
nlp = nlp).build()
term_df = debate_3.get_term_freq_df()
term_df['Trump'] = debate_3.get_scaled_f_scores('TRUMP')
term_df['Clinton'] = debate_3.get_scaled_f_scores('CLINTON')
print('Trump top terms')
print(term_df.sort_values(by='Trump', ascending=False).iloc[:20].index)
print('Clinton top terms')
print(term_df.sort_values(by='Clinton', ascending=False).iloc[:20].index)
Trump top terms Index(['hillary', 'bad', 'she wants', 'you have', 'the border', 'and she', 'justices', 'signed', 'percent', 'strong', 'outsmarted', 'a disaster', 'she 's', 'deals', 'no idea', 'have no', 'start', 'appoint', 'pouring', 'the baby'], dtype='object', name='term') Clinton top terms Index(['women', 'kind of', 'against', 'that is', 'work', 'stand', 'undocumented', 'also', 'most', 'guns', 'stand up', 'the debt', 'the kind', 'rights', 'against it', 'v.', 'million', 'families', 'new jobs', 'should be'], dtype='object', name='term')
draw_plot(df_all, 'CLINTON', 'TRUMP', 'speaker')
draw_plot(df_all, 'KAINE', 'PENCE', 'speaker')
draw_plot(df_all, 'Democratic', 'Republican', 'party')
df_dem_rep = df_all[df_all.party.isin({'Democratic', 'Republican'})]
corpus = st.CorpusFromPandas(df_dem_rep,
category_col = 'party',
text_col = 'statement',
nlp = nlp).build()
corpus_uni_stop
¶corpus_uni_stop = corpus.get_stoplisted_unigram_corpus()
from sklearn.decomposition import LatentDirichletAllocation
lda_models = {}
for party in ['Republican', 'Democratic', 'General']:
#subset the term-document matrix to only speech from one paraty or aanother
if party != 'General':
X = corpus_uni_stop._X[corpus_uni_stop._y == corpus_uni_stop.get_categories().index(party),:]
else:
X = corpus_uni_stop._X
lda_models[party] = (LatentDirichletAllocation(n_topics=20,
max_iter=60,
learning_method='online',
learning_offset=50.,
random_state=0)
.fit(X))
def top_words_in_topic(scores, corpus, n_top_words):
return [corpus._term_idx_store.getval(i) for i
in scores.argsort()[:-n_top_words - 1:-1]]
def print_some_topics(model):
for topic_idx, topic in list(enumerate(model.components_))[:3]:
print("Topic #%d:" % topic_idx)
print(', '.join(top_words_in_topic(model.components_[topic_idx], corpus_uni_stop, 10)))
print()
print("Some General Topics")
print_some_topics(lda_models['General'])
print("Some Republican Topics")
print_some_topics(lda_models['Republican'])
print("Some Democratic Topics")
print_some_topics(lda_models['Democratic'])
Some General Topics Topic #0: concerned, office, troubling, installers, deeply, visited, far, threat, man, met Topic #1: chicago, puppet, shootings, 4,000, guns, 1st, january, terrible, 2014, shared Topic #2: trashing, muslims, syrians, pick, bookstore, tomorrow, announced, book, called, stronger Some Republican Topics Topic #0: years, look, 30, entitled, miss, imagine, deductions, half, debt, number Topic #1: nonsense, oh, telling, puppet, cybersecurity, surge, speak, seventh, circuit, respectful Topic #2: slowest, recovery, great, economic, depression, $, audit, release, returns, audited Some Democratic Topics Topic #0: security, going, social, trade, read, solvent, book, energy, enforce, voted Topic #1: apologize, crisis, collapse, fact, worst, shared, minutes, dramatically, improved, 2014 Topic #2: think, donald, people, prepared, yes, undocumented, promised, privatize, said, jobs
topic_idx = 1
party='Democratic'
print('Top terms in Dem topic %s' % topic_idx,
top_words_in_topic(lda_models[party].components_[topic_idx], corpus_uni_stop, 10))
draw_corpus(df_dem_rep,
corpus_uni_stop,
'Democratic',
'Republican',
'party',
extra='_dem_topic_%s'%(topic_idx),
scores = lda_models[party].components_[topic_idx],
minimum_term_frequency=1,
singleScoreMode=True)
Top terms in Dem topic 1 ['actually', 'admit', 'completely', 'polished', 'values', 'antithetical', 'jeffersonian', 'bit', 'borders', 'open']
topic_idx = 7
party='Republican'
print('Top terms in Rep topic %s' % topic_idx,
top_words_in_topic(lda_models[party].components_[topic_idx], corpus_uni_stop, 10))
draw_corpus(df_dem_rep,
corpus_uni_stop,
'Democratic',
'Republican',
'party',
extra='_rep_topic_%s'%(topic_idx),
scores = lda_models[party].components_[topic_idx],
minimum_term_frequency=1,
singleScoreMode=True)
Top terms in Rep topic 7 ['doubt', 'jail', 'debunked', 'ugh', 'taunting', 'yeah', 'doing', 'just', 'defective', 'lester']
topic_idx = 2
party='General'
print('Top terms in general topic %s' % topic_idx,
top_words_in_topic(lda_models[party].components_[topic_idx], corpus_uni_stop, 10))
draw_corpus(df_dem_rep,
corpus_uni_stop,
'Democratic',
'Republican',
'party',
extra='_gen_topic_%s'%(topic_idx),
scores = lda_models[party].components_[topic_idx],
minimum_term_frequency=1,
singleScoreMode=True)
Top terms in general topic 2 ['shared', 'bet', 'u.s.', 'program', 'suspended', 'citizen', 'announced', 'website', 'think', 'actually']
base_term_text = 'job'
base_term = nlp(base_term_text)
scores=np.array([base_term.similarity(nlp(tok))
for tok
in corpus_uni_stop._term_idx_store._i2val])
print('Terms that are most similar to "%s"' % base_term)
print(top_words_in_topic(scores, corpus_uni_stop, 10))
draw_corpus(df_dem_rep,
corpus_uni_stop,
'Democratic',
'Republican',
'party',
extra = '_embedding_' + base_term_text,
scores = scores,
minimum_term_frequency=1,
singleScoreMode=True)
Terms that are most similar to "job" ['job', 'jobs', 'position', 'role', 'work', 'career', 'duty', 'responsibilities', 'tenure', 'contract']
base_term = nlp('wealth')
scores=np.array([base_term.similarity(nlp(tok))
for tok
in corpus_uni_stop._term_idx_store._i2val])
print('Terms that are most similar to "%s"' % base_term)
print(top_words_in_topic(scores, corpus_uni_stop, 10))
draw_corpus(df_dem_rep,
corpus_uni_stop,
'Democratic',
'Republican',
'party',
extra='_dem_topic_%s'%(topic_idx),
scores = scores,
minimum_term_frequency=1,
singleScoreMode=True)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-3-2ab6b232bf9f> in <module>() ----> 1 base_term = nlp('wealth') 2 scores=np.array([base_term.similarity(nlp(tok)) 3 for tok 4 in corpus_uni_stop._term_idx_store._i2val]) 5 NameError: name 'nlp' is not defined
l1scores, acc, baseline = corpus_uni_stop.get_logistic_regression_coefs_l1('Democratic')
print('Terms have the highest lasso coefficients for predicting Democrats are')
print(top_words_in_topic(l1scores, corpus_uni_stop, 10))
print('Terms have the highest lasso coefficients for predicting Republicans are')
print(top_words_in_topic(-1*l1scores, corpus_uni_stop, 10))
print('Cross-validated classification accuracy', acc)
print('Baseline (class-conditional) accuracy', baseline)
/Users/kesslej/anaconda3/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems. ConvergenceWarning)
Terms have the highest lasso coefficients for predicting Democrats are ['chief', 'kind', 'donald', 'mistake', 'intelligence', 'worked', 'debate', 'vladimir', 'good', 'book'] Terms have the highest lasso coefficients for predicting Republicans are ['tell', 'clinton', 'mean', 'tremendous', 'kaine', 'change', 'country', 'stop', 'happy', 'respond'] Cross-validated classification accuracy 0.627182044888 Baseline (class-conditional) accuracy 0.56608478803
#draw_corpus(df_dem_rep, corpus_uni_stop, 'Democratic', 'Republican', 'party', extra='lasso')
draw_corpus(df_dem_rep,
corpus_uni_stop,
'Democratic',
'Republican',
'party',
extra='lasso',
scores = l1scores,
minimum_term_frequency=1,
sort_by_dist = False,
grey_zero_scores = True)
draw_plot(debate_dfs['1st'], 'CLINTON', 'TRUMP', 'speaker', '1st')
draw_plot(df_all, 'TRUMP', 'PENCE', 'speaker', '_trumppence')
draw_plot(df_all, '1st', '2nd', 'debate', '_1st_vs_2nd')