"""Exploratory pandas walkthrough: CPI-adjusted prices and course-enrollment data.

This is a notebook-style script. Bare expressions such as ``df.dtypes`` only
display a value when the section is run as a notebook / IPython cell; when the
file is run as a plain script they are evaluated and discarded.
"""

# Render our plots inline.  ``%matplotlib inline`` is an IPython magic rather
# than Python syntax, so it is kept commented out here; uncomment it when
# running inside a notebook.
# %matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Image

# Make the graphs a bit prettier.  The ``display.mpl_style`` pandas option
# originally used here has been removed from pandas, so a matplotlib style
# sheet is selected directly instead ('ggplot' is a stand-in; adjust to taste).
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 5)

# This presumes you have the two data sets locally.

# ---------------------------------------------------------------------------
# Part 1: adjusting a price series for inflation
# ---------------------------------------------------------------------------

# CPI values keyed by year.  Source: http://www.bls.gov/cpi/home.htm
CPI = {"2010": 218.056, "2011": 224.939, "2012": 229.594, "2013": 232.957}
CPI_series = pd.Series(CPI)
CPI_series

House_sale_mean = {"2010": 100000, "2011": 100000, "2012": 100000, "2013": 100000}
House_sale_series = pd.Series(House_sale_mean)
House_sale_series

# Series arithmetic aligns on the index labels (the year strings).
(House_sale_series / CPI_series)

# If we multiply each one by 100, we'll get the value of our houses in
# 1982-4 dollars.
(House_sale_series / CPI_series) * 100
inflation_adjusted = (House_sale_series / CPI_series) * 100

# Calculations can be performed on individual entries.
inflation_adjusted['2013'] / inflation_adjusted['2010']

# Plotting is simple as pie.
inflation_adjusted.plot()
(House_sale_series / CPI_series).plot(title="Sorry, kids. Blame X, where X is current politician we don't like.")

# ---------------------------------------------------------------------------
# Part 2: loading and slicing the enrollment data
# ---------------------------------------------------------------------------

# Let's start with yet another way to read csv files, this time from `pandas`.
df = pd.read_csv('./HMXPC13_DI_v2_5-14-14.csv', sep=",")

# Take a look -- it's a biggie.
df

df["course_id"]
df["course_id"][3340:3350]
df[3340:3350]  # a slice inside [] selects rows

# ``df[666]`` asks for a *column* labelled 666, not row 666, so it raises
# KeyError unless such a column exists.  Left commented out so the script can
# run past this point.
# df[666]

# Row lookup by label.  ``.ix`` has been removed from pandas; on the default
# integer index produced by read_csv, ``.loc`` performs the same lookup.
df.loc[666]    # a single label returns the row as a Series
df.loc[[666]]  # a list of labels returns a one-row DataFrame

df.dtypes

# Re-read the file, this time parsing the two date columns as datetimes.
df = pd.read_csv('HMXPC13_DI_v2_5-14-14.csv', sep=",", parse_dates=['start_time_DI', 'last_event_DI'])
df.dtypes

# axis=1 means the column names, not the rows.
df = df.drop(['userid_DI', 'roles', 'incomplete_flag', 'nforum_posts'], axis=1)

df['final_cc_cname_DI'][100:110]

# Note the double [[]] -- you're providing a list [x, y, ...] of the columns
# you want.
df[['gender', 'ndays_act', 'nplay_video']][1781:1787]

# This produces a series of start times.
df['start_time_DI']
startdates = df['start_time_DI'].value_counts()
startdates
startdates.plot()

df['nplay_video'].dropna().plot()

country = df['final_cc_cname_DI'].value_counts()
country
country[:15].plot(kind='bar')

# Pick out all the people who watched over 10000[!] minutes of video.
df['nplay_video'] > 10000
# Use that long vector of trues and falses to pick out just those rows.
df[df['nplay_video'] > 10000]

df['final_cc_cname_DI'] == "France"
france = df[df['final_cc_cname_DI'] == "France"]

# ---------------------------------------------------------------------------
# Part 3: enrollments by birth year
# ---------------------------------------------------------------------------

years = df['YoB'].value_counts()
years
years.plot(kind="bar")

# sort=False stops value_counts from ordering the result by frequency.
years = df['YoB'].value_counts(sort=False)
years
years.plot(kind="bar", title="Individual course enrollments by birth year")

india = df['final_cc_cname_DI'] == "India"
india_years = df[india]['YoB'].value_counts(sort=False)
india_years.plot(kind='bar', title="Indian enrollments by birth year")

explored = df[df['explored'] == 1]
len(explored)
years_explorers = explored['YoB'].value_counts(sort=False)
years.plot(kind="bar")
years_explorers.plot(kind='bar', title='Age of those who explored half the chapters')

# ---------------------------------------------------------------------------
# Part 4: certification and exploration rates
# ---------------------------------------------------------------------------

# Build the mask from ``france`` itself so its index matches the frame being
# filtered (the original used a mask computed on the full ``df``).
france_certified = france[france['certified'] != 0]
france_certified[:10]
len(france)  # how many?
len(france_certified)
# Under Python 2 the first form is integer division; the float() form gives
# the true ratio on either major version.
len(france_certified) / len(france)
float(len(france_certified)) / len(france)

explored_by_country = explored['final_cc_cname_DI'].value_counts()
explored_by_country
enrolled_by_country = df['final_cc_cname_DI'].value_counts()
enrolled_by_country

# Dividing the two Series aligns them on the country labels.
explored_by_country / enrolled_by_country
diligence_in_exploration = (explored_by_country / enrolled_by_country)

# Can we plot it? Yes!
diligence_in_exploration.plot(kind='bar', title='Diligence: Exploration per enrollment')

Image("https://sslimgs.xkcd.com/comics/correlation.png")

certified = df[df['certified'] == 1]
len(certified)
certified_by_country = certified['final_cc_cname_DI'].value_counts()
diligence = (certified_by_country / enrolled_by_country)
diligence.plot(kind='bar', title='Diligence redux: certification per enrollment')

# Same ratio, restricted to rows whose gender is 'm'.
certified_male = certified[certified['gender'] == 'm']
certified_male_by_country = certified_male['final_cc_cname_DI'].value_counts()
enrolled_male = df[df['gender'] == 'm']
enrolled_male_by_country = enrolled_male['final_cc_cname_DI'].value_counts()
diligence_male = (certified_male_by_country / enrolled_male_by_country)
diligence_male.plot(kind='bar', title='Diligence: Certification per enrollment(dudes)')

# Same ratio, restricted to rows whose gender is 'f'.
certified_female = certified[certified['gender'] == 'f']
certified_female_by_country = certified_female['final_cc_cname_DI'].value_counts()
enrolled_female = df[df['gender'] == 'f']
enrolled_female_by_country = enrolled_female['final_cc_cname_DI'].value_counts()
diligence_female = (certified_female_by_country / enrolled_female_by_country)
diligence_female.plot(kind='bar', title='Diligence: Certification per enrollment(lasses)')

# Stack the two Series as rows, then transpose so each country is a row and
# the two genders are columns that can be plotted side by side.
diligence_genders = pd.DataFrame([diligence_female, diligence_male], index=["female", "male"])
diligence_genders
diligence_genders.T
diligence_genders.T[["female", "male"]].plot(kind="bar", title="Diligence: Certification per enrollment (lasses and dudes)")