This notebook analyzes how presidents have used first-person pronouns, singular ("I", "me", "my") vs. plural ("we", "us", "our"), during their official news conferences. Second- and third-person pronouns are counted as well.
The analysis relies on this parsing library: https://github.com/BuzzFeedNews/whtranscripts
import pandas as pd
import sys
import whtranscripts
import re
`press_conference_data` is a directory that contains a full set of press conference transcripts downloaded with `whtranscripts.download`.
# Load the conferences stored in the press-conference data directory.
conferences = whtranscripts.Conference.from_dir("../president_speech_notebooks/press_conference_data")

# Flatten the per-conference passage lists into one list.
all_passages = [passage for conference in conferences for passage in conference.passages]

# One row per passage object; the remaining columns are attributes pulled
# off that object so they can be filtered and grouped with pandas.
passages = pd.DataFrame(all_passages, columns=["passage"])
passage_attributes = [
    ("date", lambda p: p.transcript.date),
    ("speaker", lambda p: p.speaker),
    ("text", lambda p: p.text),
    ("president", lambda p: p.transcript.president),
    ("tokens", lambda p: p.tokens),
]
for column_name, extract in passage_attributes:
    passages[column_name] = passages["passage"].apply(extract)
This eliminates everything but the words spoken by the President. Unfortunately, there are a lot of special cases, so we have to do some fancy filtering in the `is_president` function.
def is_president(row):
    """Return True when the passage in ``row`` was spoken by the president.

    ``row`` must support ``row["speaker"]`` and ``row["president"]``.
    A passage counts as the president's when any of these holds:

    * the speaker label contains "The President" but not "Secretary";
    * the speaker label is exactly "The. President" or "Mr. President";
    * the speaker label contains the last word of the president's name
      and contains neither "Mrs." nor "Governor".

    A missing or empty speaker label is never the president.
    """
    speaker = row["speaker"]
    if not speaker:
        return False

    if "The President" in speaker and "Secretary" not in speaker:
        return True

    if speaker in ("The. President", "Mr. President"):
        return True

    # Fall back to matching on the president's surname, excluding labels
    # that carry a title pointing at somebody else with the same surname.
    surname = row["president"].split()[-1]
    return surname in speaker and not any(
        title in speaker for title in ("Mrs.", "Governor")
    )
# Flag every passage row using is_president, then keep only the flagged rows.
passages["is_president"] = passages.apply(is_president, axis=1)

# .copy() makes president_passages an independent frame. Later cells add
# many columns to it; without the copy those writes target a slice of
# `passages` and pandas emits SettingWithCopyWarning.
president_passages = passages[passages["is_president"]].copy()
First person singular references
# Count each first person singular pronoun per passage, one column per pronoun.
first_person_singular_terms = ["i", "me", "my", "mine", "myself"]
for term in first_person_singular_terms:
    president_passages[term] = president_passages["passage"].apply(
        lambda passage, term=term: passage.count_token_occurrences(term)
    )

# Per-passage total across the pronoun columns (column-wise sum instead of
# a Python-level function call per row).
president_passages["first_person_singular"] = (
    president_passages[first_person_singular_terms].sum(axis=1)
)
First person plural references
# Count each first person plural pronoun per passage, one column per pronoun.
first_person_plural_terms = ["we", "our", "ours", "ourselves", "us"]
for term in first_person_plural_terms:
    president_passages[term] = president_passages["passage"].apply(
        lambda passage, term=term: passage.count_token_occurrences(term)
    )

# Per-passage total across the plural pronoun columns.
president_passages["first_person_plural"] = (
    president_passages[first_person_plural_terms].sum(axis=1)
)

# All first person references = singular + plural.
# Bug fix: this previously added first_person_singular to itself, so
# "first_person" (and pct_first downstream) was just twice the singular count.
president_passages["first_person"] = (
    president_passages["first_person_singular"]
    + president_passages["first_person_plural"]
)
# Count each second person pronoun per passage, one column per pronoun.
# NOTE(review): "yourselves" is not counted; confirm whether that is intended.
second_person_terms = ["you", "your", "yours", "yourself"]
for term in second_person_terms:
    president_passages[term] = president_passages["passage"].apply(
        lambda passage, term=term: passage.count_token_occurrences(term)
    )

# Per-passage total across the pronoun columns. The original expression
# contained a stray "+ +" (a no-op unary plus); the column-wise sum removes it.
president_passages["second_person"] = (
    president_passages[second_person_terms].sum(axis=1)
)
# Count each third person plural pronoun per passage, one column per pronoun.
# NOTE(review): "them" is not counted, although the first person lists include
# the object forms "me" and "us"; confirm whether that omission is intended.
third_person_terms = ["they", "their", "theirs", "themselves"]
for term in third_person_terms:
    president_passages[term] = president_passages["passage"].apply(
        lambda passage, term=term: passage.count_token_occurrences(term)
    )

# Per-passage total across the pronoun columns (column-wise sum instead of
# a Python-level function call per row).
president_passages["third_person"] = (
    president_passages[third_person_terms].sum(axis=1)
)
# Total words per passage; this is the denominator for every percentage below.
president_passages["word_count"] = president_passages["passage"].apply(
    lambda passage: passage.get_word_count()
)

# Keep only the columns the analysis needs.
analysis_columns = [
    "word_count", "tokens", "date", "speaker", "president",
    "passage", "first_person", "first_person_singular",
    "first_person_plural", "second_person", "third_person",
]
# .copy() because a later cell adds a "datetime" column to president_analysis;
# without it that write targets a subset of president_passages and pandas
# emits SettingWithCopyWarning.
president_analysis = president_passages[analysis_columns].copy()
# Sum the per-passage counts for each president.
# Only the numeric count columns are aggregated: president_analysis also holds
# token lists, passage objects, dates and speaker labels, which cannot be
# meaningfully summed and make a blanket .sum() fail on current pandas.
count_columns = [
    "word_count", "first_person", "first_person_singular",
    "first_person_plural", "second_person", "third_person",
]
presidents = president_analysis.groupby("president")[count_columns].sum()

# Overall share of first person singular pronouns across all presidents,
# as a whole-number percentage of all words spoken.
round(100.0 * presidents["first_person_singular"].sum() / presidents["word_count"].sum())
3.0
# For each president, express each pronoun count as a percentage of the
# president's total word count, rounded to two decimals.
percentage_columns = [
    ("pct_first", "first_person"),
    ("pct_first_singular", "first_person_singular"),
    ("pct_first_plural", "first_person_plural"),
]
for pct_column, count_column in percentage_columns:
    presidents[pct_column] = presidents.apply(
        lambda row, count_column=count_column: round(
            100.0 * row[count_column] / row["word_count"], 2
        ),
        axis=1,
    )
# Rank presidents by their share of first person singular pronouns, highest first.
# DataFrame.sort has been removed from pandas; sort_values is its replacement.
presidents[["pct_first_singular", "pct_first_plural", "pct_first", "word_count"]].sort_values("pct_first_singular", ascending=False)
pct_first_singular | pct_first_plural | pct_first | word_count | |
---|---|---|---|---|
president | ||||
Harry S. Truman | 4.85 | 0.90 | 9.70 | 366974 |
George Bush | 4.65 | 2.38 | 9.30 | 400470 |
Dwight D. Eisenhower | 4.55 | 1.75 | 9.10 | 564562 |
Gerald R. Ford | 4.18 | 2.13 | 8.36 | 126528 |
Jimmy Carter | 3.50 | 2.38 | 6.99 | 220024 |
William J. Clinton | 3.40 | 3.03 | 6.79 | 637070 |
Lyndon B. Johnson | 3.11 | 3.04 | 6.22 | 404830 |
Richard Nixon | 3.09 | 2.32 | 6.19 | 171527 |
George W. Bush | 3.03 | 2.87 | 6.07 | 615121 |
Ronald Reagan | 3.00 | 3.10 | 6.00 | 173451 |
John F. Kennedy | 2.80 | 3.09 | 5.60 | 245266 |
Barack Obama | 2.45 | 3.61 | 4.90 | 473680 |
Franklin D. Roosevelt | 2.14 | 1.30 | 4.29 | 314211 |
Herbert Hoover | 1.94 | 1.13 | 3.88 | 133108 |
# Rank presidents by their share of first person plural pronouns, highest first.
# DataFrame.sort has been removed from pandas; sort_values is its replacement.
presidents[["pct_first_singular", "pct_first_plural", "pct_first", "word_count"]].sort_values("pct_first_plural", ascending=False)
pct_first_singular | pct_first_plural | pct_first | word_count | |
---|---|---|---|---|
president | ||||
Barack Obama | 2.45 | 3.61 | 4.90 | 473680 |
Ronald Reagan | 3.00 | 3.10 | 6.00 | 173451 |
John F. Kennedy | 2.80 | 3.09 | 5.60 | 245266 |
Lyndon B. Johnson | 3.11 | 3.04 | 6.22 | 404830 |
William J. Clinton | 3.40 | 3.03 | 6.79 | 637070 |
George W. Bush | 3.03 | 2.87 | 6.07 | 615121 |
George Bush | 4.65 | 2.38 | 9.30 | 400470 |
Jimmy Carter | 3.50 | 2.38 | 6.99 | 220024 |
Richard Nixon | 3.09 | 2.32 | 6.19 | 171527 |
Gerald R. Ford | 4.18 | 2.13 | 8.36 | 126528 |
Dwight D. Eisenhower | 4.55 | 1.75 | 9.10 | 564562 |
Franklin D. Roosevelt | 2.14 | 1.30 | 4.29 | 314211 |
Herbert Hoover | 1.94 | 1.13 | 3.88 | 133108 |
Harry S. Truman | 4.85 | 0.90 | 9.70 | 366974 |
# Rank presidents by their overall share of first person pronouns, highest first.
# DataFrame.sort has been removed from pandas; sort_values is its replacement.
presidents[["pct_first_singular", "pct_first_plural", "pct_first", "word_count"]].sort_values("pct_first", ascending=False)
pct_first_singular | pct_first_plural | pct_first | word_count | |
---|---|---|---|---|
president | ||||
Harry S. Truman | 4.85 | 0.90 | 9.70 | 366974 |
George Bush | 4.65 | 2.38 | 9.30 | 400470 |
Dwight D. Eisenhower | 4.55 | 1.75 | 9.10 | 564562 |
Gerald R. Ford | 4.18 | 2.13 | 8.36 | 126528 |
Jimmy Carter | 3.50 | 2.38 | 6.99 | 220024 |
William J. Clinton | 3.40 | 3.03 | 6.79 | 637070 |
Lyndon B. Johnson | 3.11 | 3.04 | 6.22 | 404830 |
Richard Nixon | 3.09 | 2.32 | 6.19 | 171527 |
George W. Bush | 3.03 | 2.87 | 6.07 | 615121 |
Ronald Reagan | 3.00 | 3.10 | 6.00 | 173451 |
John F. Kennedy | 2.80 | 3.09 | 5.60 | 245266 |
Barack Obama | 2.45 | 3.61 | 4.90 | 473680 |
Franklin D. Roosevelt | 2.14 | 1.30 | 4.29 | 314211 |
Herbert Hoover | 1.94 | 1.13 | 3.88 | 133108 |
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import mplstyle, mplstyle.styles.simple
# Start from mplstyle's "simple" style ...
mplstyle.set(mplstyle.styles.simple)
# ... then pass overrides for figure size, the axes colour cycle and line width.
# NOTE(review): presumably these map onto matplotlib rc settings - confirm
# against the mplstyle documentation.
# NOTE(review): the plotting cell below passes explicit color= arguments
# ("r" and "b"), so the teal/red colour cycle is not what those lines use.
mplstyle.set({
"figure.figsize": (10, 6),
"axes": {
"color_cycle": [ "teal", "red" ],
},
"lines": {
"linewidth": 2
}
})
import datetime
# Add a "datetime" column holding each passage's date as a datetime.datetime
# at midnight; get_term_freq uses this column as its resampling index.
# assign() builds a new frame instead of writing into president_analysis in
# place, which avoids SettingWithCopyWarning when president_analysis is a
# column subset of another frame and keeps this cell safe to re-run.
president_analysis = president_analysis.assign(
    datetime=president_analysis["date"].apply(
        lambda day: datetime.datetime(day.year, day.month, day.day)
    )
)
def get_term_freq(df, term, resampler="AS"):
    """Return how often ``term`` occurs per 100 words in each time period.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "datetime" column, a "word_count" column and a
        column named ``term`` holding per-row occurrence counts.
    term : str
        Name of the count column to measure.
    resampler : str, default "AS"
        pandas offset alias defining the period to aggregate over.
        "AS" is the year-start alias; recent pandas releases spell it "YS".

    Returns
    -------
    pandas.Series
        Indexed by period start, with values
        ``100 * sum(term) / sum(word_count)`` for the period. Periods
        with no words yield NaN.
    """
    indexed = df.set_index("datetime")
    # resample(rule, how="sum") has been removed from pandas; aggregate by
    # calling .sum() on the resampler instead.
    total_words = indexed["word_count"].resample(resampler).sum()
    term_count = indexed[term].resample(resampler).sum()
    return 100.0 * term_count / total_words
# Yearly first person singular vs. plural usage, as a percentage of words spoken.
singular_freq = get_term_freq(president_analysis, "first_person_singular")
plural_freq = get_term_freq(president_analysis, "first_person_plural")

# Draw both series on the same axes.
ax = singular_freq.plot(kind="line", label="singular", color="r")
plural_freq.plot(kind="line", label="plural", color="b", ax=ax)
ax.legend(bbox_to_anchor=(0.2, 1))
# Trailing no-op so the cell does not echo the legend object.
pass
# Build a yearly table of first person singular vs. plural usage and export it.
terms_df = pd.DataFrame(get_term_freq(president_analysis, "first_person_singular"))
terms_df.columns = ["singular"]
terms_df["plural"] = get_term_freq(president_analysis, "first_person_plural")

# Keep the period start as a column and derive the calendar year from it.
terms_df["date"] = terms_df.index
terms_df["year"] = terms_df["date"].dt.year

# Bug fix: set_index returns a new frame. The result used to be discarded,
# so the CSV was written with the datetime index rather than the year.
terms_df = terms_df.set_index("year")
terms_df[["singular", "plural"]].to_csv("singularVsPlural2.csv")