import pandas as pd
import json
import nltk

# Load the raw speech data. The "objects" entry of the JSON document is
# handed straight to pandas (presumably a list of per-speech records --
# confirm against the data file). The context manager closes the file handle
# as soon as parsing finishes.
with open("../../vocativ_president_data/The original speeches.json") as speeches_file:
    objects = json.load(speeches_file)["objects"]

speeches_df = pd.DataFrame(objects)

# Whitespace-delimited word count of each speech's text.
speeches_df["word_count"] = speeches_df["Text"].apply(lambda text: len(text.split()))

# NOTE: a stray `json_data = open().read()` used to sit here. `open()` with no
# arguments raises TypeError, which aborted the script, and `json_data` was
# never read afterwards, so the statement was removed.

# NLTK word tokens of each speech; the pronoun counts are computed from these.
speeches_df["tokens"] = speeches_df["Text"].apply(nltk.word_tokenize)

# First-person pronouns counted per speech. Each pronoun gets its own column
# (named after the pronoun) and each group gets a total column.
first_person_singular_pronouns = ["i", "me", "my", "mine", "myself"]
first_person_plural_pronouns = ["we", "our", "ours", "ourselves", "us"]


def _count_token(tokens, word):
    """Return how many entries of *tokens* equal *word*, ignoring case.

    *word* must already be lowercase; each token is lowercased before the
    comparison.
    """
    return sum(1 for token in tokens if token.lower() == word)


# Singular pronouns: one count column each, then the group total.
for pronoun in first_person_singular_pronouns:
    speeches_df[pronoun] = speeches_df["tokens"].apply(_count_token, args=(pronoun,))

speeches_df["first_person_singular"] = speeches_df[first_person_singular_pronouns].sum(axis=1)

# Plural pronouns: one count column each, then the group total.
for pronoun in first_person_plural_pronouns:
    speeches_df[pronoun] = speeches_df["tokens"].apply(_count_token, args=(pronoun,))

speeches_df["first_person_plural"] = speeches_df[first_person_plural_pronouns].sum(axis=1)

# Overall first-person usage is singular PLUS plural. The previous version
# added the singular total to itself and never included the plural count.
speeches_df["first_person"] = (
    speeches_df["first_person_singular"] + speeches_df["first_person_plural"]
)

# Columns carried forward from the per-speech frame into the analysis.
analysis_columns = [
    "word_count",
    "tokens",
    "President",
    "first_person",
    "first_person_singular",
    "first_person_plural",
]
speech_analysis = speeches_df[analysis_columns]

# Names of the presidents whose speeches are kept for the comparison.
news_conf_presidents = [
    "Richard Nixon",
    "Gerald Ford",
    "George H. W. Bush",
    "Lyndon B. Johnson",
    "Jimmy Carter",
    "Bill Clinton",
    "Harry S. Truman",
    "Ronald Reagan",
    "Barack Obama",
    "John F. Kennedy",
    "Franklin D. Roosevelt",
    "Dwight D. Eisenhower",
    "Herbert Hoover",
    "George W. Bush",
]

# Boolean mask selecting the rows whose "President" value is in the list above.
is_listed_president = speech_analysis["President"].isin(news_conf_presidents)
modern_presidents = speech_analysis[is_listed_president]

# Sum the per-speech counts for each president (one row per president).
# NOTE(review): modern_presidents still carries the "tokens" column, so it
# takes part in this sum as well; how pandas treats a column of lists here is
# version-dependent -- consider dropping it before aggregating.
presidents = modern_presidents.groupby("President").sum()

# First-person pronouns as a percentage of all words, rounded to 2 decimals.
for pct_column, count_column in [
    ("pct_first", "first_person"),
    ("pct_first_singular", "first_person_singular"),
    ("pct_first_plural", "first_person_plural"),
]:
    presidents[pct_column] = (
        100.0 * presidents[count_column] / presidents["word_count"]
    ).round(2)

# Rank presidents by first-person-singular usage, highest first.
# DataFrame.sort() no longer exists in pandas; sort_values() is the
# replacement. The result used to be discarded, so it is now kept and printed.
presidents_by_singular = presidents.sort_values("pct_first_singular", ascending=False)
print(presidents_by_singular)

# First-person-singular percentage pooled across all selected presidents
# (total pronoun count over total word count, not a mean of the percentages).
overall_pct_first_singular = round(
    100.0 * presidents["first_person_singular"].sum() / presidents["word_count"].sum(),
    2,
)
print(overall_pct_first_singular)