from IPython.core.display import HTML
# Read the notebook stylesheet. The context manager closes the file handle as
# soon as the text has been read instead of leaving it open.
with open("Style.css") as style_file:
    styles = style_file.read()
# Wrap the stylesheet text in an IPython HTML display object (it is rendered
# when this expression is the last one evaluated in a notebook cell).
HTML(styles)
import datetime as dt
from collections import defaultdict
import matplotlib.pyplot as plt
import pandas
# Set default figure size. Only `matplotlib.pyplot` is imported (as `plt`), so
# the bare name `matplotlib` is not bound by this file's imports; reach the
# rcParams mapping through `plt` instead.
plt.rcParams['figure.figsize'] = [8, 4]
# Set this variable to the directory where the GDELT data files are
PATH = "GDELT.1979-2012.reduced/"
# Peeking at the data:
!head -n 5 GDELT.1979-2012.reduced/2010.reduced.txt
# Count how many event records fall in each calendar month, across all of the
# yearly data files, then plot the monthly totals.
monthly_data = defaultdict(int)  # Maps first-of-month datetime -> record count
count = 0  # While we're at it, let's count how many records there are, total.
# The loop variable has its own name so that parsing a row's date cannot
# overwrite it.
for file_year in range(1979, 2013):
    # print(file_year)  # Uncomment this line to see the program's progress.
    # The context manager closes each yearly file once it has been read.
    with open(PATH + str(file_year) + ".reduced.txt") as f:
        next(f, None)  # Skip the header row (an empty file is tolerated).
        for raw_row in f:
            row = raw_row.split("\t")
            # Get the date, which is in YYYYMMDD format:
            date_str = row[0]
            try:
                date = dt.datetime(int(date_str[:4]), int(date_str[4:6]), 1)
            except ValueError:
                # Skip rows whose date field is not a valid year/month; any
                # other kind of error is allowed to surface.
                continue
            monthly_data[date] += 1
            count += 1
# Single-argument print calls produce the same text on Python 2 and Python 3.
print("Total rows processed: %d" % count)
print("Total months: %d" % len(monthly_data))
# A Series built from a dict may keep the dict's insertion order (newer pandas
# versions do), so sort by date explicitly before plotting.
monthly_events = pandas.Series(monthly_data).sort_index()
monthly_events.plot()
# Count, per calendar month, the records whose quad category (column 4) is
# '1' (stored as material cooperation) or '4' (stored as material conflict),
# then plot the two monthly series together.
material_coop = defaultdict(int)
material_conf = defaultdict(int)
# The loop variable has its own name so that parsing a row's date cannot
# overwrite it.
for file_year in range(1979, 2013):
    # The context manager closes each yearly file once it has been read.
    with open(PATH + str(file_year) + ".reduced.txt") as f:
        next(f, None)  # Skip the header row (an empty file is tolerated).
        for raw_row in f:
            row = raw_row.split("\t")
            try:
                quad_cat = row[4]
                # Check the quadcat, and skip if not relevant:
                if quad_cat not in ('1', '4'):
                    continue
                # Get the date, which is in YYYYMMDD format:
                date_str = row[0]
                date = dt.datetime(int(date_str[:4]), int(date_str[4:6]), 1)
            except (IndexError, ValueError):
                # Skip rows that are too short or carry an unparsable date;
                # any other kind of error is allowed to surface.
                continue
            if quad_cat == '1':
                material_coop[date] += 1
            else:  # quad_cat == '4'
                material_conf[date] += 1
# Convert both into time series. A Series built from a dict may keep the dict's
# insertion order (newer pandas versions do), so sort by date explicitly.
monthly_coop = pandas.Series(material_coop).sort_index()
monthly_conf = pandas.Series(material_conf).sort_index()
# Join the time series together into a DataFrame
trends = pandas.DataFrame({"Material_Cooperation": monthly_coop,
                           "Material_Conflict": monthly_conf})
trends.plot()
# Collect every record in which the two actor codes (first three characters of
# columns 1 and 2) together contain "ISR" and either "PAL" or "PSE", and load
# the matches into a DataFrame.
data = []
# The loop variable has its own name so that the per-row `year` parsed below
# cannot overwrite it.
for file_year in range(1979, 2013):
    # The context manager closes each yearly file once it has been read.
    with open(PATH + str(file_year) + ".reduced.txt") as f:
        # Skip the header row, as the other passes over these files do.
        next(f, None)
        for raw_row in f:
            row = raw_row.split("\t")
            try:
                actor1 = row[1][:3]
                actor2 = row[2][:3]
                both = actor1 + actor2
                if not ("ISR" in both and ("PAL" in both or "PSE" in both)):
                    continue
                # The date in column 0 is in YYYYMMDD format.
                year = int(row[0][:4])
                month = int(row[0][4:6])
                day = int(row[0][6:])
                quad_cat = row[4]
            except (IndexError, ValueError):
                # Skip rows that are too short or carry an unparsable date,
                # matching how the other passes treat malformed rows.
                continue
            data.append([year, month, day, actor1, actor2, quad_cat])
# A single-argument print call produces the same text on Python 2 and Python 3.
print("Israeli-Palestinian Conflict Records: %d" % len(data))
ilpalcon = pandas.DataFrame(
    data,
    columns=["Year", "Month", "Day", "Actor1", "Actor2", "QuadCat"])
ilpalcon.head()
# Count records per (Year, Month) for each QuadCat value: `aggfunc=len` counts
# the "Day" entries that fall in each cell.
# `index`/`columns` are the current keyword names for the grouping axes; the
# older `rows`/`cols` spellings were deprecated and later removed from pandas.
pivot = pandas.pivot_table(ilpalcon, values="Day", index=["Year", "Month"],
                           columns="QuadCat", aggfunc=len)
pivot = pivot.fillna(0)  # Replace any missing data with zeros
pivot = pivot.reset_index()  # Make Year and Month regular columns
pivot.head()
# date-generating function:
def get_date(x):
    """Return the first-of-month datetime for a record.

    The first two items of *x* are converted with ``int`` and used as the
    year and the month; the day is always 1.
    """
    year_value = int(x[0])
    month_value = int(x[1])
    return dt.datetime(year=year_value, month=month_value, day=1)
# Build a date for every row (get_date is applied row-wise) and use the new
# column as the index.
pivot["date"] = pivot.apply(get_date, axis=1)
pivot = pivot.set_index("date")
# Keep only the four QuadCat count columns, which drops Year and Month now
# that the date index carries that information, and give them readable names.
quad_codes = ["1", "2", "3", "4"]
quad_labels = ["Material Cooperation",
               "Verbal Cooperation",
               "Verbal Conflict",
               "Material Conflict"]
pivot = pivot[quad_codes].rename(columns=dict(zip(quad_codes, quad_labels)))
pivot.plot(figsize=(8,4))
# Peace index: the two cooperation counts added together, minus each of the
# two conflict counts, computed per row of the pivot table.
pivot["Peace_Index"] = (
    pivot["Material Cooperation"]
    .add(pivot["Verbal Cooperation"])
    .sub(pivot["Verbal Conflict"])
    .sub(pivot["Material Conflict"])
)
pivot["Peace_Index"].plot(figsize=(8,4))