"""Measure first-person pronoun usage in presidential speeches.

Loads the speeches JSON, tokenizes each speech with NLTK, counts
first-person singular and plural pronouns per speech, and reports each
selected president's pronoun usage as a percentage of words spoken.
"""

import json
from collections import Counter

import pandas as pd

SPEECHES_PATH = "../../vocativ_president_data/The original speeches.json"

FIRST_PERSON_SINGULAR = ("i", "me", "my", "mine", "myself")
FIRST_PERSON_PLURAL = ("we", "our", "ours", "ourselves", "us")

NEWS_CONF_PRESIDENTS = [
    "Richard Nixon",
    "Gerald Ford",
    "George H. W. Bush",
    "Lyndon B. Johnson",
    "Jimmy Carter",
    "Bill Clinton",
    "Harry S. Truman",
    "Ronald Reagan",
    "Barack Obama",
    "John F. Kennedy",
    "Franklin D. Roosevelt",
    "Dwight D. Eisenhower",
    "Herbert Hoover",
    "George W. Bush",
]

# Numeric per-speech columns that are summed per president.
COUNT_COLUMNS = [
    "word_count",
    "first_person",
    "first_person_singular",
    "first_person_plural",
]


def load_speeches(path: str = SPEECHES_PATH) -> pd.DataFrame:
    """Return the ``"objects"`` records of the speeches JSON file as a frame.

    Args:
        path: Location of the JSON file; its top-level object must have an
            ``"objects"`` key holding the list of speech records.
    """
    with open(path, encoding="utf-8") as handle:
        return pd.DataFrame(json.load(handle)["objects"])


def add_pronoun_counts(speeches: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *speeches* with first-person pronoun counts added.

    The input must have a ``"tokens"`` column holding one list of string
    tokens per speech. Matching is case-insensitive. One column is added
    per pronoun, plus ``first_person_singular``, ``first_person_plural``
    and ``first_person`` (singular + plural). The input frame is not
    modified.
    """
    # One pass per speech instead of one scan of the token list per pronoun.
    tallies = [
        Counter(token.lower() for token in tokens)
        for tokens in speeches["tokens"]
    ]

    result = speeches.copy()
    for total_column, pronouns in (
        ("first_person_singular", FIRST_PERSON_SINGULAR),
        ("first_person_plural", FIRST_PERSON_PLURAL),
    ):
        for pronoun in pronouns:
            result[pronoun] = [tally[pronoun] for tally in tallies]
        result[total_column] = result[list(pronouns)].sum(axis=1)

    result["first_person"] = (
        result["first_person_singular"] + result["first_person_plural"]
    )
    return result


def summarize_by_president(
    speeches: pd.DataFrame,
    presidents=NEWS_CONF_PRESIDENTS,
) -> pd.DataFrame:
    """Total the counts per president and express them as percentages.

    Args:
        speeches: Per-speech frame with a ``"President"`` column and the
            columns listed in ``COUNT_COLUMNS``.
        presidents: Names to keep; speeches by anyone else are dropped.

    Returns:
        A frame indexed by president with the summed count columns and
        ``pct_first``, ``pct_first_singular`` and ``pct_first_plural``:
        each count as a percentage of ``word_count``, rounded to two
        decimals.
    """
    selected = speeches[speeches["President"].isin(presidents)]
    # Sum only the numeric count columns; list-valued columns such as
    # "tokens" have no meaningful per-president total.
    totals = selected.groupby("President")[COUNT_COLUMNS].sum()

    for pct_column, count_column in (
        ("pct_first", "first_person"),
        ("pct_first_singular", "first_person_singular"),
        ("pct_first_plural", "first_person_plural"),
    ):
        share = 100.0 * totals[count_column] / totals["word_count"]
        totals[pct_column] = share.round(2)
    return totals


def overall_pct_first_singular(presidents: pd.DataFrame) -> float:
    """Return first-person-singular usage across all rows, as a percentage.

    The percentage is computed from the summed counts (not by averaging the
    per-president percentages) and rounded to two decimals.
    """
    singular = presidents["first_person_singular"].sum()
    words = presidents["word_count"].sum()
    return round(100.0 * singular / words, 2)


if __name__ == "__main__":
    # Imported here so the counting helpers above can be imported without
    # NLTK installed; only tokenization needs it.
    import nltk

    speeches_df = load_speeches()
    speeches_df["word_count"] = speeches_df["Text"].apply(
        lambda text: len(text.split())
    )
    speeches_df["tokens"] = speeches_df["Text"].apply(nltk.word_tokenize)
    speeches_df = add_pronoun_counts(speeches_df)

    speech_analysis = speeches_df[
        [
            "word_count",
            "tokens",
            "President",
            "first_person",
            "first_person_singular",
            "first_person_plural",
        ]
    ]
    presidents = summarize_by_president(speech_analysis)

    print(presidents.sort_values("pct_first_singular", ascending=False))
    print(overall_pct_first_singular(presidents))