In [2]:

import scattertext as st
import re
from pprint import pprint
import numpy as np
import pandas as pd
import spacy.en
from IPython.display import IFrame
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

In [3]:

nlp = spacy.en.English()

Parse debates and create plotting interface¶

The function returns a Pandas data frame consisting of two columns, speaker and statement. Speaker is the name of the speaker, given in all caps, and statement is the speech made during a particular turn.

In [4]:

def debate_transcript_to_dataframe(fn, speakers):
    lines = open(fn).read().split('\n')
    cur_speaker = None
    speaker_start_re = re.compile(r'^([(]?[A-Z][A-Z][A-Z]+):?(.+)$')
    transcript = []
    cur_statement = ''
    cur_speaker = None
    for line in lines:
        match = speaker_start_re.match(line)
        if match:
            if match.group(1).startswith('('):
                continue
            if cur_speaker is not None:
                transcript.append({'speaker': cur_speaker, 'statement': cur_statement})
            cur_speaker = match.group(1).strip()
            cur_statement = match.group(2).strip() + '\n'
            for other_name in speakers:
                if other_name+':' in cur_statement:
                    cur_statement, other_statement = cur_statement.split(other_name)
                    transcript.append({'speaker': cur_speaker, 'statement': cur_statement.strip()})
                    transcript.append({'speaker': other_name, 'statement': other_statement.strip()})   
        else:
            cur_statement += line 
    df = pd.DataFrame(transcript)
    return df

Read debates into Pandas data frames¶

In [5]:

parties = {'QUIJANO':'Moderator', 
           'KAINE':'Democratic', 
           'PENCE':'Republican', 
           'HOLT':'Moderator', 
           'CLINTON':'Democratic', 
           'TRUMP':'Republican',
           'COOPER':'Moderator',
           'RADDATZ':'Moderator',
           'WALLACE':'Moderator'}

debate_dfs = {}
for info in [
    {'debate': '1st', 'fn': 'presidential-debate-2016-09-26.txt', 'participants': ['TRUMP','CLINTON','HOLT']},
    {'debate': 'VP', 'fn': 'vp-debate-2016-10-04.txt', 'participants': ['PENCE','KAINE','QUIJANO']},
    {'debate': '2nd', 'fn': 'debate-2016-10-09-rush.txt', 'participants': ['TRUMP','CLINTON','COOPER','RADDATZ']},
    {'debate': '3rd', 'fn': 'debate-2016-10-19.txt', 'participants': ['TRUMP','CLINTON','WALLACE']}]:
    cur_df = debate_transcript_to_dataframe(info['fn'], info['participants'])
    cur_df['debate'] = info['debate']
    cur_df['party'] = cur_df['speaker'].apply(lambda x: parties[x])
    cur_df['speaker and debate']=cur_df['speaker'].apply(lambda x: x + ' ' + info['debate'])
    debate_dfs[info['debate']] = cur_df   
df_all = pd.concat(debate_dfs.values())
df_all.iloc[:2]

Out[5]:

	speaker	statement	debate	party	speaker and debate
0	QUIJANO	Good evening. From Longwood University in Farm...	VP	Moderator	QUIJANO VP
1	QUIJANO	I'm Elaine Quijano, anchor at CBSN, and corres...	VP	Moderator	QUIJANO VP

In [6]:

df_all.to_csv('presidential_debates_2016.csv.gz', compression='gzip', index=False)

In [7]:

!cp presidential_debates_2016.csv.gz ../scattertext/scattertext/data/

Function to draw scatter plot in notebook.¶

Creates a chart from text in a data frame, df. The category and other_category parameters are the names of the columns we'll compare. The category_col is the column in df that contains document categories, and contains category and other_category. For example, if category is "TRUMP", then category_col would be "speaker". extra is append to the file name of the html file produced.

We'll look at the rest of the optional parameters later.

The function returns an iFrame containing containing the HTML visualization, and as a side-effect writes the visualization to an html file, named category.lower() + '-' + other_category.lower() + extra + '.html'.

In [10]:

def draw_corpus(df, corpus, category, other_category, category_col, extra='', scores=None, singleScoreMode=False, 
                minimum_term_frequency=2, grey_zero_scores=False, sort_by_dist=True):
    html = st.produce_scattertext_explorer(corpus, 
                                           category=category, 
                                           category_name=category.lower(), 
                                           not_category_name=other_category.lower(),
                                           pmi_filter_thresold=2,
                                           minimum_term_frequency=minimum_term_frequency,
                                           metadata=df['speaker and debate'],
                                           scores=scores,
                                           width_in_pixels=1000,
                                           grey_zero_scores=grey_zero_scores,
                                           singleScoreMode=singleScoreMode,
                                           sort_by_dist=sort_by_dist)
    file_name = category.lower() + '-' + other_category.lower() + extra + '.html'
    open(file_name, 'wb').write(html.encode('utf-8'))
    return IFrame(src=file_name, width = 1200, height=1000)

def draw_plot(df, category, other_category, category_col, extra=''):
    # Scattertext can only do a one column vs. all analysis.  We're excluding any other speakrs
    category_vs_other_df = df[(df[category_col] == category) | (df[category_col] == other_category)]
    corpus = st.CorpusFromPandas(category_vs_other_df, 
                                 category_col = category_col, 
                                 text_col = 'statement',
                                 nlp = nlp).build()
    return draw_corpus(category_vs_other_df,  corpus, category, other_category, category_col, extra=extra)

Find the top words used by the candidates in the 3rd debate¶

In [11]:

category, other_category, category_col = 'CLINTON', 'TRUMP', 'speaker'
debate_3 = st.CorpusFromPandas(data_frame = debate_dfs['3rd'][( debate_dfs['3rd'][category_col] == category) 
                                                              | ( debate_dfs['3rd'][category_col] == other_category)], 
                               category_col = category_col, 
                               text_col = 'statement',
                               nlp = nlp).build()

term_df = debate_3.get_term_freq_df()
term_df['Trump'] = debate_3.get_scaled_f_scores('TRUMP')
term_df['Clinton'] = debate_3.get_scaled_f_scores('CLINTON')

print('Trump top terms')
print(term_df.sort_values(by='Trump', ascending=False).iloc[:20].index)
print('Clinton top terms')
print(term_df.sort_values(by='Clinton', ascending=False).iloc[:20].index)

Trump top terms
Index(['hillary', 'bad', 'she wants', 'you have', 'the border', 'and she',
       'justices', 'signed', 'percent', 'strong', 'outsmarted', 'a disaster',
       'she 's', 'deals', 'no idea', 'have no', 'start', 'appoint', 'pouring',
       'the baby'],
      dtype='object', name='term')
Clinton top terms
Index(['women', 'kind of', 'against', 'that is', 'work', 'stand',
       'undocumented', 'also', 'most', 'guns', 'stand up', 'the debt',
       'the kind', 'rights', 'against it', 'v.', 'million', 'families',
       'new jobs', 'should be'],
      dtype='object', name='term')

Clinton vs. Trump word use¶

In [12]:

draw_plot(df_all, 'CLINTON', 'TRUMP', 'speaker')

Out[12]:

In [8]:

draw_plot(df_all, 'KAINE', 'PENCE', 'speaker')

Out[8]:

In [9]:

draw_plot(df_all, 'Democratic', 'Republican', 'party')

Out[9]:

Visualize LDA topic model of the debates¶

First, create a corpus of all the 2016 debates¶

In [17]:

df_dem_rep = df_all[df_all.party.isin({'Democratic', 'Republican'})]
corpus = st.CorpusFromPandas(df_dem_rep, 
                             category_col = 'party', 
                             text_col = 'statement',
                             nlp = nlp).build()

Filter out bigrams and stopwords from the corpus, making a new one called `corpus_uni_stop`¶

In [18]:

corpus_uni_stop = corpus.get_stoplisted_unigram_corpus()

Train two, party-specifc topic models and one general model¶

In [19]:

from sklearn.decomposition import LatentDirichletAllocation
lda_models = {}
for party in ['Republican', 'Democratic', 'General']:
    #subset the term-document matrix to only speech from one paraty or aanother
    if party != 'General':
        X = corpus_uni_stop._X[corpus_uni_stop._y == corpus_uni_stop.get_categories().index(party),:]
    else:
        X = corpus_uni_stop._X
    lda_models[party] = (LatentDirichletAllocation(n_topics=20, 
                                                   max_iter=60,
                                                   learning_method='online',
                                                   learning_offset=50.,
                                                   random_state=0)
                         .fit(X))

In [20]:

def top_words_in_topic(scores, corpus, n_top_words):
    return [corpus._term_idx_store.getval(i) for i 
            in scores.argsort()[:-n_top_words - 1:-1]]

def print_some_topics(model):
    for topic_idx, topic in list(enumerate(model.components_))[:3]:
        print("Topic #%d:" % topic_idx)
        print(', '.join(top_words_in_topic(model.components_[topic_idx], corpus_uni_stop, 10)))
        
    print()
print("Some General Topics")
print_some_topics(lda_models['General'])
print("Some Republican Topics")
print_some_topics(lda_models['Republican'])
print("Some Democratic Topics")
print_some_topics(lda_models['Democratic'])

Some General Topics
Topic #0:
concerned, office, troubling, installers, deeply, visited, far, threat, man, met
Topic #1:
chicago, puppet, shootings, 4,000, guns, 1st, january, terrible, 2014, shared
Topic #2:
trashing, muslims, syrians, pick, bookstore, tomorrow, announced, book, called, stronger

Some Republican Topics
Topic #0:
years, look, 30, entitled, miss, imagine, deductions, half, debt, number
Topic #1:
nonsense, oh, telling, puppet, cybersecurity, surge, speak, seventh, circuit, respectful
Topic #2:
slowest, recovery, great, economic, depression, $, audit, release, returns, audited

Some Democratic Topics
Topic #0:
security, going, social, trade, read, solvent, book, energy, enforce, voted
Topic #1:
apologize, crisis, collapse, fact, worst, shared, minutes, dramatically, improved, 2014
Topic #2:
think, donald, people, prepared, yes, undocumented, promised, privatize, said, jobs

In [14]:

topic_idx = 1
party='Democratic'
print('Top terms in Dem topic %s' % topic_idx, 
      top_words_in_topic(lda_models[party].components_[topic_idx], corpus_uni_stop, 10))
draw_corpus(df_dem_rep, 
            corpus_uni_stop, 
            'Democratic', 
            'Republican', 
            'party', 
            extra='_dem_topic_%s'%(topic_idx), 
            scores = lda_models[party].components_[topic_idx],
            minimum_term_frequency=1,
            singleScoreMode=True)

Top terms in Dem topic 1 ['actually', 'admit', 'completely', 'polished', 'values', 'antithetical', 'jeffersonian', 'bit', 'borders', 'open']

Out[14]:

In [15]:

topic_idx = 7
party='Republican'
print('Top terms in Rep topic %s' % topic_idx, 
      top_words_in_topic(lda_models[party].components_[topic_idx], corpus_uni_stop, 10))
draw_corpus(df_dem_rep, 
            corpus_uni_stop, 
            'Democratic', 
            'Republican', 
            'party', 
            extra='_rep_topic_%s'%(topic_idx), 
            scores = lda_models[party].components_[topic_idx],
            minimum_term_frequency=1,
            singleScoreMode=True)

Top terms in Rep topic 7 ['doubt', 'jail', 'debunked', 'ugh', 'taunting', 'yeah', 'doing', 'just', 'defective', 'lester']

Out[15]:

In [16]:

topic_idx = 2
party='General'
print('Top terms in general topic %s' % topic_idx, 
      top_words_in_topic(lda_models[party].components_[topic_idx], corpus_uni_stop, 10))
draw_corpus(df_dem_rep, 
            corpus_uni_stop, 
            'Democratic', 
            'Republican', 
            'party', 
            extra='_gen_topic_%s'%(topic_idx), 
            scores = lda_models[party].components_[topic_idx],
            minimum_term_frequency=1,
            singleScoreMode=True)

Top terms in general topic 2 ['shared', 'bet', 'u.s.', 'program', 'suspended', 'citizen', 'announced', 'website', 'think', 'actually']

Out[16]:

Visualizing Word2Vec term similarity¶

Score each term in corpus against the word "job". SpaCy includes 300-dimensional word vectors and a cosine-similarity function.¶

In [23]:

base_term_text = 'job'
base_term = nlp(base_term_text)
scores=np.array([base_term.similarity(nlp(tok)) 
                 for tok 
                 in corpus_uni_stop._term_idx_store._i2val])



print('Terms that are most similar to "%s"' % base_term)
print(top_words_in_topic(scores, corpus_uni_stop, 10))
draw_corpus(df_dem_rep, 
            corpus_uni_stop, 
            'Democratic', 
            'Republican', 
            'party', 
            extra = '_embedding_' + base_term_text, 
            scores = scores,
            minimum_term_frequency=1,
            singleScoreMode=True)

Terms that are most similar to "job"
['job', 'jobs', 'position', 'role', 'work', 'career', 'duty', 'responsibilities', 'tenure', 'contract']

Out[23]:

In [3]:

base_term = nlp('wealth')
scores=np.array([base_term.similarity(nlp(tok)) 
                 for tok 
                 in corpus_uni_stop._term_idx_store._i2val])

print('Terms that are most similar to "%s"' % base_term)
print(top_words_in_topic(scores, corpus_uni_stop, 10))
draw_corpus(df_dem_rep, 
            corpus_uni_stop, 
            'Democratic', 
            'Republican', 
            'party', 
            extra='_dem_topic_%s'%(topic_idx), 
            scores = scores,
            minimum_term_frequency=1,
            singleScoreMode=True)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-2ab6b232bf9f> in <module>()
----> 1 base_term = nlp('wealth')
      2 scores=np.array([base_term.similarity(nlp(tok)) 
      3                  for tok
      4                  in corpus_uni_stop._term_idx_store._i2val])
      5 

NameError: name 'nlp' is not defined

Let's use Scattertext to inspect the coefficients from Lasso-logistic regression¶

We can see the accuracies used

In [19]:

l1scores, acc, baseline = corpus_uni_stop.get_logistic_regression_coefs_l1('Democratic')
print('Terms have the highest lasso coefficients for predicting Democrats are')
print(top_words_in_topic(l1scores, corpus_uni_stop, 10))
print('Terms have the highest lasso coefficients for predicting Republicans are')
print(top_words_in_topic(-1*l1scores, corpus_uni_stop,  10))
print('Cross-validated classification accuracy', acc)
print('Baseline (class-conditional) accuracy', baseline)

/Users/kesslej/anaconda3/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)

Terms have the highest lasso coefficients for predicting Democrats are
['chief', 'kind', 'donald', 'mistake', 'intelligence', 'worked', 'debate', 'vladimir', 'good', 'book']
Terms have the highest lasso coefficients for predicting Republicans are
['tell', 'clinton', 'mean', 'tremendous', 'kaine', 'change', 'country', 'stop', 'happy', 'respond']
Cross-validated classification accuracy 0.627182044888
Baseline (class-conditional) accuracy 0.56608478803

In [20]:

#draw_corpus(df_dem_rep,  corpus_uni_stop, 'Democratic', 'Republican', 'party', extra='lasso')
draw_corpus(df_dem_rep, 
            corpus_uni_stop, 
            'Democratic', 
            'Republican', 
            'party', 
            extra='lasso', 
            scores = l1scores,
            minimum_term_frequency=1,
            sort_by_dist = False,
            grey_zero_scores = True)

Out[20]:

In [ ]:

Comare Clinton and Trump's 1st debate¶

In [21]:

draw_plot(debate_dfs['1st'], 'CLINTON', 'TRUMP', 'speaker', '1st')

Out[21]:

Compare Trump to Pence¶

In [22]:

draw_plot(df_all, 'TRUMP', 'PENCE', 'speaker', '_trumppence')

Out[22]:

Compare the 1st to the 2nd debate¶