# Render our plots inline
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Make the graphs a bit prettier. The pandas option 'display.mpl_style' was
# deprecated in pandas 0.18 and later removed, so setting it fails on current
# pandas; matplotlib's own style sheets are the replacement.
plt.style.use('ggplot')
# Set the default figure size after the style so the style cannot override it.
plt.rcParams['figure.figsize'] = (15, 5)


#this presumes you've the two data sets locally


import pandas as pd

# Consumer price index value for each year, keyed by the year as a string.
# Source: http://www.bls.gov/cpi/home.htm
CPI = {
    "2010": 218.056,
    "2011": 224.939,
    "2012": 229.594,
    "2013": 232.957,
}

CPI_series = pd.Series(CPI)

CPI_series

# Mean house sale price per year (the same nominal figure for every year).
House_sale_mean = {
    "2010": 100000,
    "2011": 100000,
    "2012": 100000,
    "2013": 100000,
}

House_sale_series = pd.Series(House_sale_mean)

House_sale_series

# Dividing one Series by another works element by element, matched on the
# index labels (here, the year).
House_sale_series.div(CPI_series)

# If we multiply each one by 100, we'll get the value of our houses in 1982-4 dollars.
House_sale_series.div(CPI_series).mul(100)

inflation_adjusted = House_sale_series.div(CPI_series).mul(100)

# Individual entries can be looked up by label and used in ordinary arithmetic.
inflation_adjusted['2013'] / inflation_adjusted['2010']

# Plotting is simple as pie: a Series draws its values against its index.
inflation_adjusted.plot()

# The same price/CPI ratio without the x100 scaling, this time with a title.
House_sale_series.div(CPI_series).plot(
    title="Sorry, kids. Blame X, where X is current politician we don't like."
)

# Let's start with yet another way to read csv files, this time from `pandas`.
df = pd.read_csv('./HMXPC13_DI_v2_5-14-14.csv', sep=",")

# Take a look--it's a biggie.
df

# Selecting one column by name gives back a Series.
df["course_id"]

# A slice after the column selection picks rows out of that column ...
df["course_id"][3340:3350]

# ... and a slice directly on the frame picks the same rows with every column.
df[3340:3350]

# A bare integer inside [] is looked up as a column name, not a row number,
# so this raises a KeyError. Catch it and show the message so that the
# demonstration does not abort the rest of the script.
try:
    df[666]
except KeyError as err:
    print("KeyError: %s" % err)

# Rows are selected by label through .loc. (The .ix indexer used here
# originally was deprecated and then removed in pandas 1.0.)
df.loc[666]    # a single label returns that row as a Series

df.loc[[666]]  # a list of labels returns a one-row DataFrame

# Column types as inferred by read_csv.
df.dtypes

# Read the file again, this time asking pandas to parse the two date columns.
df = pd.read_csv(
    'HMXPC13_DI_v2_5-14-14.csv',
    sep=",",
    parse_dates=['start_time_DI', 'last_event_DI'],
)

# Compare with the dtypes shown before the re-read.
df.dtypes

# Discard columns we will not use; axis=1 means the column names, not the rows.
df = df.drop(
    ['userid_DI', 'roles', 'incomplete_flag', 'nforum_posts'],
    axis=1,
)

df['final_cc_cname_DI'][100:110]

# Note the double [[ ]]: the inner brackets are a list [x, y, ...] of the
# columns you want; the trailing slice then picks the rows.
df[['gender', 'ndays_act', 'nplay_video']][1781:1787]

df['start_time_DI']  # This produces a series of start times

# Tally how many rows share each start time.
startdates = df['start_time_DI'].value_counts()

startdates

startdates.plot()

# Drop the missing values before plotting the column.
df['nplay_video'].dropna().plot()

# Rows per country; value_counts orders the result from most to least frequent.
country = df['final_cc_cname_DI'].value_counts()

country

# Bar chart of the first 15 entries, i.e. the 15 most frequent countries.
country.head(15).plot(kind='bar')

# Flag every row whose nplay_video value is over 10000[!].
# NOTE(review): the original comment described these as minutes of video;
# the unit of nplay_video is not shown in this file -- confirm.
df['nplay_video'].gt(10000)

# That long vector of trues and falses can be used as a mask to keep only
# the rows where it is True.
df.loc[df['nplay_video'].gt(10000)]

# The same idea with an equality test on a text column.
df['final_cc_cname_DI'].eq("France")

france = df.loc[df['final_cc_cname_DI'].eq("France")]

# Number of rows per year of birth, most common year first.
years = df['YoB'].value_counts()


years

years.plot(kind="bar")

# For a chart that reads chronologically, order the counts by their index
# (the birth year). value_counts(sort=False) only skips the sort by
# frequency; it does not promise index order, and the order it produces
# differs between pandas versions.
years = df['YoB'].value_counts().sort_index()

years

years.plot(kind="bar", title="Individual course enrollments by birth year")

# Boolean mask marking the rows whose country is India.
india = df['final_cc_cname_DI'] == "India"

india_years = df[india]['YoB'].value_counts().sort_index()

india_years.plot(kind='bar', title="Indian enrollments by birth year")

# Keep only the rows flagged as explored.
explored = df[df['explored'] == 1]

len(explored)

years_explorers = explored['YoB'].value_counts().sort_index()

# Plot all enrollments again, then the explorers.
years.plot(kind="bar")

years_explorers.plot(kind='bar', title='Age of those who explored half the chapters')

# Keep the French rows whose 'certified' value is non-zero. The mask is built
# from `france` itself so that it has exactly the same index as the frame it
# filters; a mask taken from the full `df` has a different index and makes
# pandas reindex it (with a warning) before use.
france_certified = france[france['certified'] != 0]
france_certified[:10]

len(france)  # how many?

len(france_certified)

# On Python 2 this is integer division, so a fraction below 1 shows up as 0 ...
len(france_certified)/len(france)

# ... converting one side to float gives the true fraction on any version.
float(len(france_certified))/len(france)


# Number of "explored" rows per country.
explored_by_country = explored['final_cc_cname_DI'].value_counts()

explored_by_country

# Number of rows of any kind per country.
enrolled_by_country = df['final_cc_cname_DI'].value_counts()

enrolled_by_country

# Dividing the two Series matches them up on the country labels.
explored_by_country.div(enrolled_by_country)

diligence_in_exploration = explored_by_country.div(enrolled_by_country)
# Can we plot it? Yes!

diligence_in_exploration.plot(
    kind='bar',
    title='Diligence: Exploration per enrollment',
)

from IPython.display import Image
Image("https://sslimgs.xkcd.com/comics/correlation.png")


# Repeat the exercise for rows whose 'certified' value is 1.
certified = df.loc[df['certified'].eq(1)]

len(certified)

certified_by_country = certified['final_cc_cname_DI'].value_counts()

diligence = certified_by_country.div(enrolled_by_country)
diligence.plot(
    kind='bar',
    title='Diligence redux: certification per enrollment',
)

# --- rows with gender 'm': certified count / enrolled count, per country ---
enrolled_male = df.loc[df['gender'].eq('m')]
enrolled_male_by_country = enrolled_male['final_cc_cname_DI'].value_counts()

certified_male = certified.loc[certified['gender'].eq('m')]
certified_male_by_country = certified_male['final_cc_cname_DI'].value_counts()

diligence_male = certified_male_by_country.div(enrolled_male_by_country)

diligence_male.plot(
    kind='bar',
    title='Diligence: Certification per enrollment(dudes)',
)

# --- rows with gender 'f': the same calculation ---
enrolled_female = df.loc[df['gender'].eq('f')]
enrolled_female_by_country = enrolled_female['final_cc_cname_DI'].value_counts()

certified_female = certified.loc[certified['gender'].eq('f')]
certified_female_by_country = certified_female['final_cc_cname_DI'].value_counts()

diligence_female = certified_female_by_country.div(enrolled_female_by_country)
diligence_female.plot(
    kind='bar',
    title='Diligence: Certification per enrollment(lasses)',
)

# Stack the two rate Series as the rows of one frame, labelled by gender.
diligence_genders = pd.DataFrame(
    [diligence_female, diligence_male],
    index=["female", "male"],
)

diligence_genders

# Transposed, each country becomes a row and each gender a column.
diligence_genders.T

# Plot from the transposed frame so the two genders are drawn as bars for
# each country.
diligence_genders.T[["female", "male"]].plot(
    kind="bar",
    title="Diligence: Certification per enrollment (lasses and dudes)",
)