#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
from pandas import Series, DataFrame
import numpy as np


# In[2]:


import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:


# Used to grab data from the web (HTTP capabilities)
import requests

# StringIO wraps the downloaded text in a file-like object (with a .read() method)
# so that it can be handed to pd.read_csv
from io import StringIO


# In[4]:


# This is the url link for the poll data in csv form
url = "http://elections.huffingtonpost.com/pollster/2012-general-election-romney-vs-obama.csv"

# Bound the wait with a timeout and fail on an HTTP error status, so that an
# error page is never parsed as if it were the CSV payload.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text

poll_data = StringIO(source)


# In[5]:


poll_df = pd.read_csv(poll_data)


# In[6]:


poll_df.head()


# In[7]:


poll_df.info()


# In[8]:


# Number of polls per affiliation
sns.catplot(x='Affiliation', data=poll_df, kind="count")


# In[9]:


# Same count, split by the polled population
sns.catplot(x='Affiliation', data=poll_df, kind="count", hue='Population')


# In[10]:


# numeric_only=True: the frame also contains non-numeric columns (e.g. 'Start Date'),
# and pandas 2.0+ raises a TypeError instead of silently skipping them.
avg = pd.DataFrame(poll_df.mean(numeric_only=True))
avg.drop('Number of Observations', axis=0, inplace=True)


# In[11]:


avg


# In[12]:


# Same treatment for the standard deviation
std = pd.DataFrame(poll_df.std(numeric_only=True))
std.drop('Number of Observations', axis=0, inplace=True)


# In[13]:


std


# In[14]:


# Bar chart of the averages with the standard deviations as error bars
avg.plot(yerr=std, kind='bar', legend=False)


# In[15]:


poll_avg = pd.concat([avg, std], axis=1)


# In[16]:


poll_avg


# In[17]:


poll_avg.columns = ['Average', 'Std']


# In[18]:


poll_avg


# In[19]:


poll_df


# In[20]:


poll_df.plot(x='End Date', y=['Obama', 'Romney', 'Undecided'], linestyle='', marker='o')


# In this plot the time runs from left to right.
# NOTE(review): the original note continued "so at the end the voter turns to submerged";
# the intended observation is unclear - confirm and reword.

# In[21]:


from datetime import datetime


# In[22]:


# Obama's lead over Romney, expressed as a fraction rather than in percentage points
poll_df['Difference'] = (poll_df.Obama - poll_df.Romney) / 100


# In[23]:


poll_df.head()


# In[24]:


# Set as_index=False to keep the 0,1,2,... index. Then we'll take the mean of the polls on that day.
# numeric_only=True keeps the aggregation restricted to the numeric columns
# (required on pandas 2.0+, where text columns are no longer dropped automatically).
poll_df = poll_df.groupby(['Start Date'], as_index=False).mean(numeric_only=True)


# In[25]:


poll_df


# Now plotting the Difference versus time should be straightforward.
# In[26]:


# Plotting the difference in polls between Obama and Romney
poll_df.plot('Start Date', 'Difference', figsize=(12, 4), marker='o', linestyle='-', color='purple')


# It would be very interesting to plot marker lines on the dates of the debates and see if there is
# any general insight to the poll results.
#
# The debate dates were Oct 3rd, Oct 11th, and Oct 22nd. Let's plot some lines as markers and then
# zoom in on the month of October.
#
# In order to find where to set the x limits for the figure we need to find out which rows belong to
# October 2012. The dates are stored as strings, so we simply match on the 'YYYY-MM' prefix.

# In[27]:


# Row positions of every entry whose start date falls in October 2012
xlimit = [
    row
    for row, date in enumerate(poll_df['Start Date'])
    if date.startswith('2012-10')
]

# Without any October rows there is nothing to zoom in on; say so explicitly
# instead of failing with an opaque "min() arg is an empty sequence".
if not xlimit:
    raise ValueError("No rows with a 'Start Date' in 2012-10 were found")

oct_first = min(xlimit)
oct_last = max(xlimit)

print(oct_first)
print(oct_last)


# Great, now we know where to set our x limits for the month of October in our figure.
#
#

# In[28]:


# Start with original figure, zoomed in on the October rows found above
fig = poll_df.plot('Start Date', 'Difference', figsize=(12, 4), marker='o', linestyle='-', color='purple',
                   xlim=(oct_first, oct_last))


# In[29]:


fig = poll_df.plot('Start Date', 'Difference', figsize=(12, 4), marker='o', linestyle='-', color='purple',
                   xlim=(oct_first, oct_last))

# Now add the debate markers (Oct 3rd, 11th and 22nd).
# NOTE(review): the offsets assume the first October row is Oct 1st and that there is
# one row per calendar day - confirm against the data.
plt.axvline(x=oct_first + 2, linewidth=4, color='grey')
plt.axvline(x=oct_first + 10, linewidth=4, color='grey')
plt.axvline(x=oct_first + 21, linewidth=4, color='grey')


# Surprisingly, these polls reflect a dip for Obama after the second debate against Romney, even
# though memory serves that he performed much worse against Romney during the first debate.
#
# For all these polls it is important to remember how geographical location can affect the value
# of a poll in predicting the outcomes of a national election.
# In[30]:


# Show the current working directory. A bare `pwd` only works through IPython's
# automagic and is an undefined name in the exported script, so the magic is
# invoked explicitly (same mechanism as the matplotlib magic above).
get_ipython().run_line_magic('pwd', '')


# ### Donor Data Set
#
# Let's go ahead and switch gears and take a look at a data set consisting of information on
# donations to the federal campaign.
#
# This is going to be the biggest data set we've looked at so far. You can download it here, then
# make sure to save it to the same folder your iPython Notebooks are in.
#
# The questions we will be trying to answer while looking at this Data Set are:
#
#
# 1.) How much was donated and what was the average donation?
#
# 2.) How did the donations differ between candidates?
#
# 3.) How did the donations differ between Democrats and Republicans?
#
# 4.) What were the demographics of the donors?
#
# 5.) Is there a pattern to donation amounts?
#
#

# In[31]:


# The CSV is expected in the current working directory
donor_df = pd.read_csv('Election_Donor_Data.csv')


# In[33]:


donor_df.info()


# In[34]:


donor_df.head()


# What might be interesting to do is get a quick glimpse of the donation amounts, and the average
# donation amount. Let's go ahead and break down the data.
#
#

# In[35]:


# Get a quick look at the various donation amounts
donor_df['contb_receipt_amt'].value_counts()


# 8079 different amounts! That's quite a variation. Let's look at the average and the std.
#
#

# In[42]:


don_mean = donor_df['contb_receipt_amt'].mean()
don_std = donor_df['contb_receipt_amt'].std()

print("The Average donation was %.2f with a standard deviation of %.2f" % (don_mean, don_std))


# Wow! That's a huge standard deviation! Let's see if there are any large donations or other
# factors messing with the distribution of the donations.

# In[54]:


# Let's make a Series from the DataFrame, use .copy() to avoid view errors
top_donor = donor_df['contb_receipt_amt'].copy()

# Display it sorted (sort_values returns a new, sorted Series; top_donor itself is unchanged)
top_donor.sort_values()


# Looks like we have some negative values, as well as some huge donation amounts!
# The negative values are due to the FEC recording refunds as well as donations, so let's go ahead
# and only look at the positive contribution amounts.
#
#

# In[57]:


# Keep strictly positive amounts only (drops the refunds)
top_donor = top_donor.loc[top_donor.gt(0)]


# In[58]:


top_donor.sort_values()


# In[65]:


# Frequency of each distinct donation amount, most common first
new_df = top_donor.value_counts()


# In[67]:


new_df.iloc[:10]


# Here we can see that the top 10 most common donations ranged from 10 to 2500 dollars.

# In[68]:


# Series of the common donations: amounts strictly below 2500
com_don = top_donor.loc[top_donor.lt(2500)]

# A high number of bins accounts for the non-round donations and makes spikes visible
com_don.hist(bins=100)


# So people mostly give donations in round numbers such as 100, 500, 1000, etc.

# In[69]:


# Unique candidate names found in the candidate column
candidates = donor_df['cand_nm'].unique()
candidates


# Let's go ahead and separate Obama from the Republican candidates by adding a Party affiliation
# column. We can do this by using map along with a dictionary of party affiliations.

# In[70]:


# Dictionary of party affiliation: every listed candidate is Republican except Obama
party_map = {
    name: 'Republican'
    for name in (
        'Bachmann, Michelle',
        'Cain, Herman',
        'Gingrich, Newt',
        'Huntsman, Jon',
        'Johnson, Gary Earl',
        'McCotter, Thaddeus G',
        'Paul, Ron',
        'Pawlenty, Timothy',
        'Perry, Rick',
        "Roemer, Charles E. 'Buddy' III",
        'Romney, Mitt',
        'Santorum, Rick',
    )
}
party_map['Obama, Barack'] = 'Democrat'

# Look up each row's candidate in the dictionary to fill the Party column
donor_df["Party"] = donor_df['cand_nm'].map(party_map)


# Let's look at our DataFrame and also make sure we clear refunds from the contribution amounts.
#
#

# In[73]:


# Clear refunds: keep only rows with a positive contribution amount
donor_df = donor_df.loc[donor_df['contb_receipt_amt'].gt(0)]


# In[77]:


donor_df.head()


# Let's start by aggregating the data by candidate. We'll take a quick look at the total amounts
# received by each candidate. First we will look at the total number of donations and then at the
# total amount.
#
#

# In[78]:


# Group by candidate and display the number of donations each one received
donor_df.groupby('cand_nm')['contb_receipt_amt'].count()


# Clearly Obama is the front-runner in number of donations, which makes sense, since he is not
# competing with any other Democratic nominees. Let's take a look at the total dollar amounts.
#
#

# In[79]:


# Group by candidate and display the total amount donated
donor_df.groupby('cand_nm')['contb_receipt_amt'].sum()


# This isn't super readable, and an important aspect of data science is to clearly present
# information. Let's go ahead and just print out these values in a clean for loop.
#
#

# In[84]:


# Total amount raised per candidate (Series indexed by candidate name)
cand_amount = donor_df.groupby('cand_nm')['contb_receipt_amt'].sum()

# items() yields (candidate name, total) pairs, so no separate index counter is needed
for cand_name, don in cand_amount.items():
    print("The candidate %s raised %.0f dollars" % (cand_name, don))
    print('\n')


# This is okay, but it's hard to do a quick comparison just by reading this information. How about
# just a quick graphic presentation?
#
#

# In[85]:


# Plot out total donation amounts
cand_amount.plot(kind='bar')


# Now the comparison is very easy to see. As we saw before, clearly Obama is the front-runner in
# donation amounts, which makes sense, since he is not competing with any other Democratic
# nominees. How about we just compare Democrat versus Republican donations?
#
#

# In[86]:


# Group by Party and sum the donations
donor_df.groupby('Party')['contb_receipt_amt'].sum().plot(kind='bar')


# Looks like Obama couldn't compete against all the Republicans, but he certainly has the
# advantage of their funding being splintered across multiple candidates.
#
#
# Finally, to start closing out the project, let's look at donations and who they came from (as
# far as occupation is concerned).
# We will start by grabbing the occupation information from the donor_df DataFrame and then using
# pivot_table to make the index defined by the various occupations and then have the columns
# defined by the Party (Republican or Democrat). Finally we'll also pass an aggregation function
# to the pivot table; in this case a simple sum function will add up all the contributions by
# anyone with the same profession.
#
#

# In[101]:


# One row per occupation, one column per party, each cell holding the summed contributions
occupation_df = donor_df.pivot_table(
    values='contb_receipt_amt',
    index=['contbr_occupation'],
    columns='Party',
    aggfunc='sum',
)


# In[102]:


occupation_df.head()


# Great! Now let's see how big the DataFrame is.
#
#

# In[103]:


occupation_df.shape


# Wow! This is probably far too large to display effectively with a small, static visualization.
# What we should do is have a cut-off for total contribution amounts. After all, small donations of
# 20 dollars by one type of occupation won't give us too much insight. So let's set our cut-off at
# 1 million dollars.
#
#

# In[104]:


# Keep only the occupations whose contributions, summed across the party columns, exceed 1 million
occupation_df = occupation_df.loc[occupation_df.sum(axis=1).gt(1000000)]


# In[105]:


# Now let's check the size
occupation_df.shape


# Great! This looks much more manageable! Now let's visualize it.
#
#

# In[106]:


# Plot out with pandas
occupation_df.plot.bar()


# This is a bit hard to read, so let's use a horizontal bar chart to put the occupations on the
# vertical axis.
#
#

# In[107]:


occupation_df.plot.barh(cmap='seismic', figsize=(10, 12))


# Looks like there are some occupations that are either mislabeled or aren't really occupations.
# Let's get rid of the "Information Requested" occupations and let's combine CEO and C.E.O.
#
#

# In[108]:


# Remove the placeholder rows in place
occupation_df.drop(
    index=['INFORMATION REQUESTED PER BEST EFFORTS', 'INFORMATION REQUESTED'],
    inplace=True,
)


# Now let's combine the CEO and C.E.O. rows.
#
#

# In[110]:


# Set new CEO row as the element-wise sum of the current two
occupation_df.loc['CEO'] = occupation_df.loc['CEO'].add(occupation_df.loc['C.E.O.'])

# Drop C.E.O.
occupation_df.drop(index='C.E.O.', inplace=True)


# Now let's repeat the same plot!

# In[111]:


# Horizontal bars, one per remaining occupation
occupation_df.plot.barh(cmap='seismic', figsize=(10, 12))


# Awesome! Looks like CEOs are a little more conservative leaning; this may be due to the tax
# philosophies of each party during the election.
#
#

# In[ ]: