# Notebook set-up, followed by two passes over the yearly GDELT files:
#   1. count every event per calendar month (monthly_data / count), and
#   2. count quad-category '1' and '4' events per calendar month
#      (material_coop / material_conf).
import datetime as dt
from collections import defaultdict

import matplotlib  # needed for matplotlib.rcParams below
import matplotlib.pyplot as plt
import pandas
from IPython.core.display import HTML

# Load the custom stylesheet; the context manager closes the file again.
with open("Style.css") as style_file:
    styles = style_file.read()
HTML(styles)

matplotlib.rcParams['figure.figsize'] = [8, 4]  # Set default figure size

# Set this variable to the directory where the GDELT data files are
PATH = "GDELT.1979-2012.reduced/"

# Peeking at the data. This is the plain-Python spelling of the IPython
# shell escape `!head -n 5 ...`, so it still requires an IPython session.
get_ipython().system("head -n 5 GDELT.1979-2012.reduced/2010.reduced.txt")

monthly_data = defaultdict(int)  # We'll use this to store the counts
count = 0  # While we're at it, let's count how many records there are, total.
for year in range(1979, 2013):
    # print(year)  # Uncomment this line to see the program's progress.
    with open(PATH + str(year) + ".reduced.txt") as f:
        next(f, None)  # Skip the header row (tolerates an empty file).
        for raw_row in f:
            row = raw_row.split("\t")
            try:
                # Get the date, which is in YYYYMMDD format:
                date_str = row[0]
                # A separate name keeps the file-loop variable `year` intact.
                event_year = int(date_str[:4])
                month = int(date_str[4:6])
                date = dt.datetime(event_year, month, 1)
            except (ValueError, IndexError):
                continue  # Skip error-generating rows for now.
            monthly_data[date] += 1
            count += 1

# Single-argument print() produces the same text on Python 2 and Python 3.
print("Total rows processed: %s" % count)
print("Total months: %s" % len(monthly_data))

monthly_events = pandas.Series(monthly_data)
monthly_events.plot()

material_coop = defaultdict(int)
material_conf = defaultdict(int)
for year in range(1979, 2013):
    with open(PATH + str(year) + ".reduced.txt") as f:
        next(f, None)  # Skip the header row (tolerates an empty file).
        for raw_row in f:
            row = raw_row.split("\t")
            try:
                # Check the quadcat, and skip if not relevant:
                quad_cat = row[4]
                if quad_cat not in ('1', '4'):
                    continue
                # Get the date, which is in YYYYMMDD format:
                date_str = row[0]
                event_year = int(date_str[:4])
                month = int(date_str[4:6])
                date = dt.datetime(event_year, month, 1)
            except (ValueError, IndexError):
                continue  # Skip error-generating rows for now.
            if quad_cat == '1':
                material_coop[date] += 1
            else:  # quad_cat == '4'
                material_conf[date] += 1
# Builds the material cooperation/conflict trend plot, then extracts every
# event between an "ISR" actor and a "PAL"/"PSE" actor, pivots the monthly
# counts per quad category, and derives a simple peace index from them.

# Convert both into time series:
monthly_coop = pandas.Series(material_coop)
monthly_conf = pandas.Series(material_conf)

# Join the time series together into a DataFrame
trends = pandas.DataFrame({"Material_Cooperation": monthly_coop,
                           "Material_Conflict": monthly_conf})
trends.plot()

data = []
for year in range(1979, 2013):
    with open(PATH + str(year) + ".reduced.txt") as f:
        next(f, None)  # Skip the header row, as the other passes do.
        for raw_row in f:
            row = raw_row.split("\t")
            try:
                actor1 = row[1][:3]
                actor2 = row[2][:3]
                # Each code is cut to 3 characters, so the concatenation is
                # at most 6 characters and can only contain both "ISR" and
                # "PAL"/"PSE" when each actor supplies one of them.
                both = actor1 + actor2
                if not ("ISR" in both and ("PAL" in both or "PSE" in both)):
                    continue
                # The date is in YYYYMMDD format. A separate name keeps the
                # file-loop variable `year` intact.
                event_year = int(row[0][:4])
                month = int(row[0][4:6])
                day = int(row[0][6:])
                quad_cat = row[4]
            except (ValueError, IndexError):
                continue  # Skip malformed rows, as the other passes do.
            data.append([event_year, month, day, actor1, actor2, quad_cat])

# Single-argument print() produces the same text on Python 2 and Python 3.
print("Israeli-Palestinian Conflict Records: %s" % len(data))

ilpalcon = pandas.DataFrame(data, columns=["Year", "Month", "Day",
                                           "Actor1", "Actor2", "QuadCat"])
ilpalcon.head()

# Count events per (Year, Month) and quad category. pandas 0.14 renamed the
# `rows`/`cols` keywords to `index`/`columns`; try the current names first
# and fall back to the old ones on releases that do not know them.
try:
    pivot = pandas.pivot_table(ilpalcon, values="Day",
                               index=["Year", "Month"], columns="QuadCat",
                               aggfunc=len)
except TypeError:
    pivot = pandas.pivot_table(ilpalcon, values="Day",
                               rows=["Year", "Month"], cols="QuadCat",
                               aggfunc=len)
pivot = pivot.fillna(0)  # Replace any missing data with zeros
pivot = pivot.reset_index()  # Make Year and Month regular columns
pivot.head()


def get_date(x):
    """Return the first day of the month described by a pivot row.

    `x` is one row of `pivot`; its "Year" and "Month" entries are read by
    label rather than by position.
    """
    return dt.datetime(year=int(x["Year"]), month=int(x["Month"]), day=1)


pivot["date"] = pivot.apply(get_date, axis=1)  # Apply row-wise
pivot = pivot.set_index("date")  # Set the new date column as the index

# Now we no longer need the Year and Month columns, so let's drop them:
pivot = pivot[["1", "2", "3", "4"]]

# Rename the QuadCat columns
pivot = pivot.rename(columns={"1": "Material Cooperation",
                              "2": "Verbal Cooperation",
                              "3": "Verbal Conflict",
                              "4": "Material Conflict"})
pivot.plot(figsize=(8, 4))

# Peace index: cooperative event counts minus conflict event counts.
pivot["Peace_Index"] = (pivot["Material Cooperation"]
                        + pivot["Verbal Cooperation"]
                        - pivot["Verbal Conflict"]
                        - pivot["Material Conflict"])
pivot["Peace_Index"].plot(figsize=(8, 4))