import pandas as pd
from pandas import Series,DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
# Used to grab data from the web (HTTP capabilities)
import requests
# We'll also use StringIO to wrap the csv text, since pd.read_csv expects a file-like object with a .read() method
from io import StringIO
# This is the url link for the poll data in csv form
url = "http://elections.huffingtonpost.com/pollster/2012-general-election-romney-vs-obama.csv"
source=requests.get(url).text
poll_data=StringIO(source)
poll_df=pd.read_csv(poll_data)
poll_df.head()
Pollster | Start Date | End Date | Entry Date/Time (ET) | Number of Observations | Population | Mode | Obama | Romney | Undecided | Other | Pollster URL | Source URL | Partisan | Affiliation | Question Text | Question Iteration | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Politico/GWU/Battleground | 2012-11-04 | 2012-11-05 | 2012-11-06T08:40:26Z | 1000.0 | Likely Voters | Live Phone | 47.0 | 47.0 | 6.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.politico.com/news/stories/1112/8338... | Nonpartisan | None | NaN | 1 |
1 | YouGov/Economist | 2012-11-03 | 2012-11-05 | 2012-11-26T15:31:23Z | 740.0 | Likely Voters | Internet | 49.0 | 47.0 | 3.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://cdn.yougov.com/cumulus_uploads/document... | Nonpartisan | None | NaN | 1 |
2 | Gravis Marketing | 2012-11-03 | 2012-11-05 | 2012-11-06T09:22:02Z | 872.0 | Likely Voters | Automated Phone | 48.0 | 48.0 | 4.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.gravispolls.com/2012/11/gravis-mark... | Nonpartisan | None | NaN | 1 |
3 | IBD/TIPP | 2012-11-03 | 2012-11-05 | 2012-11-06T08:51:48Z | 712.0 | Likely Voters | Live Phone | 50.0 | 49.0 | NaN | 1.0 | https://elections.huffingtonpost.com/pollster/... | http://news.investors.com/special-report/50841... | Nonpartisan | None | NaN | 1 |
4 | Rasmussen | 2012-11-03 | 2012-11-05 | 2012-11-06T08:47:50Z | 1500.0 | Likely Voters | Automated Phone | 48.0 | 49.0 | NaN | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.rasmussenreports.com/public_content... | Nonpartisan | None | NaN | 1 |
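Note: the HuffPost Pollster endpoint above may no longer be reachable, since the service has not been maintained for years. If the request fails, a previously saved copy of the same CSV can be read instead; the filename below is a hypothetical local export of that URL.
# Fallback (sketch): read a locally saved copy of the same poll data
poll_df = pd.read_csv('2012-general-election-romney-vs-obama.csv')
poll_df.head()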
poll_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586 entries, 0 to 585
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   Pollster                586 non-null    object
 1   Start Date              586 non-null    object
 2   End Date                586 non-null    object
 3   Entry Date/Time (ET)    586 non-null    object
 4   Number of Observations  564 non-null    float64
 5   Population               586 non-null    object
 6   Mode                    586 non-null    object
 7   Obama                   586 non-null    float64
 8   Romney                  586 non-null    float64
 9   Undecided               423 non-null    float64
 10  Other                   202 non-null    float64
 11  Pollster URL            586 non-null    object
 12  Source URL              584 non-null    object
 13  Partisan                586 non-null    object
 14  Affiliation             586 non-null    object
 15  Question Text           0 non-null      float64
 16  Question Iteration      586 non-null    int64
dtypes: float64(6), int64(1), object(10)
memory usage: 78.0+ KB
sns.catplot(x='Affiliation',data=poll_df,kind="count")
<seaborn.axisgrid.FacetGrid at 0x12a0f9d0790>
sns.catplot(x='Affiliation',data=poll_df,kind="count",hue='Population')
<seaborn.axisgrid.FacetGrid at 0x12a101c61c0>
avg=pd.DataFrame(poll_df.mean())
avg.drop('Number of Observations',axis=0,inplace=True)
avg
0 | |
---|---|
Obama | 46.805461 |
Romney | 44.614334 |
Undecided | 6.550827 |
Other | 3.376238 |
Question Text | NaN |
Question Iteration | 1.000000 |
std=pd.DataFrame(poll_df.std())
std.drop('Number of Observations',axis=0,inplace=True)
std
0 | |
---|---|
Obama | 2.422058 |
Romney | 2.906180 |
Undecided | 3.701754 |
Other | 2.692726 |
Question Text | NaN |
Question Iteration | 0.000000 |
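Note: on recent pandas releases, calling .mean() or .std() on a DataFrame that still contains non-numeric columns raises an error instead of silently skipping them. A minimal sketch, assuming the same poll_df as above, that restricts the calculation to the numeric columns:
# Sketch for newer pandas: only aggregate the numeric columns
avg = pd.DataFrame(poll_df.mean(numeric_only=True))
avg.drop('Number of Observations', axis=0, inplace=True)
std = pd.DataFrame(poll_df.std(numeric_only=True))
std.drop('Number of Observations', axis=0, inplace=True)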
avg.plot(yerr=std,kind='bar',legend=False)
<AxesSubplot:>
poll_avg=pd.concat([avg,std],axis=1)
poll_avg
0 | 0 | |
---|---|---|
Obama | 46.805461 | 2.422058 |
Romney | 44.614334 | 2.906180 |
Undecided | 6.550827 | 3.701754 |
Other | 3.376238 | 2.692726 |
Question Text | NaN | NaN |
Question Iteration | 1.000000 | 0.000000 |
poll_avg.columns=['Average','Std']
poll_avg
Average | Std | |
---|---|---|
Obama | 46.805461 | 2.422058 |
Romney | 44.614334 | 2.906180 |
Undecided | 6.550827 | 3.701754 |
Other | 3.376238 | 2.692726 |
Question Text | NaN | NaN |
Question Iteration | 1.000000 | 0.000000 |
poll_df
Pollster | Start Date | End Date | Entry Date/Time (ET) | Number of Observations | Population | Mode | Obama | Romney | Undecided | Other | Pollster URL | Source URL | Partisan | Affiliation | Question Text | Question Iteration | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Politico/GWU/Battleground | 2012-11-04 | 2012-11-05 | 2012-11-06T08:40:26Z | 1000.0 | Likely Voters | Live Phone | 47.0 | 47.0 | 6.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.politico.com/news/stories/1112/8338... | Nonpartisan | None | NaN | 1 |
1 | YouGov/Economist | 2012-11-03 | 2012-11-05 | 2012-11-26T15:31:23Z | 740.0 | Likely Voters | Internet | 49.0 | 47.0 | 3.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://cdn.yougov.com/cumulus_uploads/document... | Nonpartisan | None | NaN | 1 |
2 | Gravis Marketing | 2012-11-03 | 2012-11-05 | 2012-11-06T09:22:02Z | 872.0 | Likely Voters | Automated Phone | 48.0 | 48.0 | 4.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.gravispolls.com/2012/11/gravis-mark... | Nonpartisan | None | NaN | 1 |
3 | IBD/TIPP | 2012-11-03 | 2012-11-05 | 2012-11-06T08:51:48Z | 712.0 | Likely Voters | Live Phone | 50.0 | 49.0 | NaN | 1.0 | https://elections.huffingtonpost.com/pollster/... | http://news.investors.com/special-report/50841... | Nonpartisan | None | NaN | 1 |
4 | Rasmussen | 2012-11-03 | 2012-11-05 | 2012-11-06T08:47:50Z | 1500.0 | Likely Voters | Automated Phone | 48.0 | 49.0 | NaN | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.rasmussenreports.com/public_content... | Nonpartisan | None | NaN | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
581 | PPP (D) | 2009-07-15 | 2009-07-16 | 2010-11-08T11:02:37Z | 577.0 | Registered Voters | Automated Phone | 49.0 | 40.0 | 11.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.publicpolicypolling.com/pdf/surveys... | Pollster | Dem | NaN | 1 |
582 | PPP (D) | 2009-06-12 | 2009-06-16 | 2010-11-08T10:58:00Z | 638.0 | Registered Voters | Automated Phone | 48.0 | 40.0 | 12.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.publicpolicypolling.com/pdf/surveys... | Pollster | Dem | NaN | 1 |
583 | PPP (D) | 2009-05-14 | 2009-05-18 | 2010-11-08T10:49:08Z | 1000.0 | Registered Voters | Automated Phone | 53.0 | 35.0 | 12.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.publicpolicypolling.com/pdf/surveys... | Pollster | Dem | NaN | 1 |
584 | PPP (D) | 2009-04-17 | 2009-04-19 | 2010-11-08T10:45:18Z | 686.0 | Registered Voters | Automated Phone | 50.0 | 39.0 | 11.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.publicpolicypolling.com/pdf/surveys... | Pollster | Dem | NaN | 1 |
585 | PPP (D) | 2009-03-13 | 2009-03-15 | 2010-11-08T10:36:31Z | 1403.0 | Registered Voters | Automated Phone | 44.0 | 44.0 | 12.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.publicpolicypolling.com/pdf/PPP_Rel... | Pollster | Dem | NaN | 1 |
586 rows × 17 columns
poll_df.plot(x='End Date',y=['Obama','Romney','Undecided'],linestyle='',marker='o')
<AxesSubplot:xlabel='End Date'>
Reading this plot across time, the share of undecided voters shrinks as the election approaches and they commit to one of the candidates.
from datetime import datetime
poll_df['Difference']=(poll_df.Obama - poll_df.Romney)/100
poll_df.head()
Pollster | Start Date | End Date | Entry Date/Time (ET) | Number of Observations | Population | Mode | Obama | Romney | Undecided | Other | Pollster URL | Source URL | Partisan | Affiliation | Question Text | Question Iteration | Difference | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Politico/GWU/Battleground | 2012-11-04 | 2012-11-05 | 2012-11-06T08:40:26Z | 1000.0 | Likely Voters | Live Phone | 47.0 | 47.0 | 6.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.politico.com/news/stories/1112/8338... | Nonpartisan | None | NaN | 1 | 0.00 |
1 | YouGov/Economist | 2012-11-03 | 2012-11-05 | 2012-11-26T15:31:23Z | 740.0 | Likely Voters | Internet | 49.0 | 47.0 | 3.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://cdn.yougov.com/cumulus_uploads/document... | Nonpartisan | None | NaN | 1 | 0.02 |
2 | Gravis Marketing | 2012-11-03 | 2012-11-05 | 2012-11-06T09:22:02Z | 872.0 | Likely Voters | Automated Phone | 48.0 | 48.0 | 4.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.gravispolls.com/2012/11/gravis-mark... | Nonpartisan | None | NaN | 1 | 0.00 |
3 | IBD/TIPP | 2012-11-03 | 2012-11-05 | 2012-11-06T08:51:48Z | 712.0 | Likely Voters | Live Phone | 50.0 | 49.0 | NaN | 1.0 | https://elections.huffingtonpost.com/pollster/... | http://news.investors.com/special-report/50841... | Nonpartisan | None | NaN | 1 | 0.01 |
4 | Rasmussen | 2012-11-03 | 2012-11-05 | 2012-11-06T08:47:50Z | 1500.0 | Likely Voters | Automated Phone | 48.0 | 49.0 | NaN | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.rasmussenreports.com/public_content... | Nonpartisan | None | NaN | 1 | -0.01 |
# Set as_index=False to keep the 0,1,2,... index. Then we'll take the mean of the polls taken on each day.
poll_df=poll_df.groupby(['Start Date'],as_index=False).mean()
poll_df
Start Date | Number of Observations | Obama | Romney | Undecided | Other | Question Text | Question Iteration | Difference | |
---|---|---|---|---|---|---|---|---|---|
0 | 2009-03-13 | 1403.000000 | 44.00 | 44.00 | 12.000000 | NaN | NaN | 1 | 0.0000 |
1 | 2009-04-17 | 686.000000 | 50.00 | 39.00 | 11.000000 | NaN | NaN | 1 | 0.1100 |
2 | 2009-05-14 | 1000.000000 | 53.00 | 35.00 | 12.000000 | NaN | NaN | 1 | 0.1800 |
3 | 2009-06-12 | 638.000000 | 48.00 | 40.00 | 12.000000 | NaN | NaN | 1 | 0.0800 |
4 | 2009-07-15 | 577.000000 | 49.00 | 40.00 | 11.000000 | NaN | NaN | 1 | 0.0900 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
352 | 2012-10-31 | 10420.250000 | 48.00 | 46.50 | 4.000000 | 2.0 | NaN | 1 | 0.0150 |
353 | 2012-11-01 | 1989.000000 | 49.00 | 47.25 | 2.400000 | 1.5 | NaN | 1 | 0.0175 |
354 | 2012-11-02 | 1041.750000 | 49.25 | 48.50 | 2.000000 | 1.0 | NaN | 1 | 0.0075 |
355 | 2012-11-03 | 1310.833333 | 48.50 | 48.00 | 4.333333 | 1.0 | NaN | 1 | 0.0050 |
356 | 2012-11-04 | 1000.000000 | 47.00 | 47.00 | 6.000000 | NaN | NaN | 1 | 0.0000 |
357 rows × 9 columns
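As above, newer pandas versions will not silently average the non-numeric columns (Pollster, End Date, the URLs, and so on) inside a groupby. A sketch that aggregates only the poll columns, run in place of the groupby cell above:
# Sketch for newer pandas: restrict the groupby mean to the numeric poll columns
numeric_cols = ['Number of Observations', 'Obama', 'Romney', 'Undecided', 'Other', 'Question Iteration', 'Difference']
poll_df = poll_df.groupby('Start Date', as_index=False)[numeric_cols].mean()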
Now plotting the Difference versus time should be straightforward.
# Plotting the difference in polls between Obama and Romney
poll_df.plot('Start Date','Difference',figsize=(12,4),marker='o',linestyle='-',color='purple')
<AxesSubplot:xlabel='Start Date'>
It would be very interesting to plot marker lines on the dates of the debates and see if there is any general insight into the poll results.
The debate dates were Oct 3rd, Oct 11th, and Oct 22nd. Let's plot some lines as markers and then zoom in on the month of October.
In order to find where to set the x limits for the figure, we need to find out which rows correspond to October 2012. Here's a simple for loop to find those rows (a vectorized alternative is sketched after the output below). Note that the string format of the dates makes this difficult to do without using a lambda expression or a map.
# Set row count and xlimit list
row_in=0
xlimit=[]
# Cycle through dates until 2012-10 is found, then print row index
for date in poll_df['Start Date']:
    if date[0:7] == '2012-10':
        xlimit.append(row_in)
        row_in += 1
    else:
        row_in += 1
print(min(xlimit))
print(max(xlimit))
325
352
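As an aside, the same row range can be found without an explicit counter by boolean indexing on the date strings. A small sketch, assuming the grouped poll_df from above:
# Vectorized alternative (sketch): mask the rows whose Start Date string begins with '2012-10'
oct_mask = poll_df['Start Date'].str.startswith('2012-10')
oct_rows = poll_df.index[oct_mask]
print(oct_rows.min(), oct_rows.max())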
Great, now we know where to set our x limits for the month of October in our figure.
# Start with original figure
fig = poll_df.plot('Start Date','Difference',figsize=(12,4),marker='o',linestyle='-',color='purple',xlim=(325,352))
# Redraw the figure and add the debate markers
fig = poll_df.plot('Start Date','Difference',figsize=(12,4),marker='o',linestyle='-',color='purple',xlim=(325,352))
plt.axvline(x=325+2, linewidth=4, color='grey')
plt.axvline(x=325+10, linewidth=4, color='grey')
plt.axvline(x=325+21, linewidth=4, color='grey')
<matplotlib.lines.Line2D at 0x12a1051a1c0>
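An alternative worth noting: converting Start Date to real datetimes lets us place the vertical lines on the actual debate dates instead of counting index positions. A sketch under that assumption, using matplotlib directly on the grouped poll_df:
# Sketch: plot October 2012 with a true datetime x-axis and mark the debate dates
oct_df = poll_df.copy()
oct_df['Start Date'] = pd.to_datetime(oct_df['Start Date'])
oct_df = oct_df[(oct_df['Start Date'] >= '2012-10-01') & (oct_df['Start Date'] <= '2012-11-06')]
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(oct_df['Start Date'], oct_df['Difference'], marker='o', linestyle='-', color='purple')
for debate_date in ['2012-10-03', '2012-10-11', '2012-10-22']:
    ax.axvline(pd.to_datetime(debate_date), linewidth=4, color='grey')
ax.set_xlabel('Start Date')
ax.set_ylabel('Difference')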
Surprisingly, these polls reflect a dip for Obama after the second debate against Romney, even though memory serves that he performed much worse against Romney during the first debate.
For all these polls it is important to remember how geographical location can affect the value of a poll in predicting the outcome of a national election.
pwd
'C:\\Users\\HP'
Let's go ahead and switch gears and take a look at a data set consisting of information on donations to the federal campaign.
This is going to be the biggest data set we've looked at so far. You can download it here, then make sure to save it to the same folder your IPython notebooks are in.
The questions we will be trying to answer while looking at this data set are:
1.) How much was donated and what was the average donation?
2.) How did the donations differ between candidates?
3.) How did the donations differ between Democrats and Republicans?
4.) What were the demographics of the donors?
5.) Is there a pattern to donation amounts?
donor_df=pd.read_csv('Election_Donor_Data.csv')
C:\Users\HP\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3165: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
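The DtypeWarning comes from the zip-code column (column 6, contbr_zip), which mixes numeric and string values. A sketch that reads it explicitly as strings, which silences the warning:
# Sketch: read the zip codes as strings to avoid the mixed-type warning
donor_df = pd.read_csv('Election_Donor_Data.csv', dtype={'contbr_zip': str})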
donor_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001731 entries, 0 to 1001730
Data columns (total 16 columns):
 #   Column             Non-Null Count    Dtype
---  ------             --------------    -----
 0   cmte_id            1001731 non-null  object
 1   cand_id            1001731 non-null  object
 2   cand_nm            1001731 non-null  object
 3   contbr_nm          1001731 non-null  object
 4   contbr_city        1001712 non-null  object
 5   contbr_st          1001727 non-null  object
 6   contbr_zip         1001620 non-null  object
 7   contbr_employer    988002 non-null   object
 8   contbr_occupation  993301 non-null   object
 9   contb_receipt_amt  1001731 non-null  float64
 10  contb_receipt_dt   1001731 non-null  object
 11  receipt_desc       14166 non-null    object
 12  memo_cd            92482 non-null    object
 13  memo_text          97770 non-null    object
 14  form_tp            1001731 non-null  object
 15  file_num           1001731 non-null  int64
dtypes: float64(1), int64(1), object(14)
memory usage: 122.3+ MB
donor_df.head()
cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290.0 | RETIRED | RETIRED | 250.0 | 20-JUN-11 | NaN | NaN | NaN | SA17A | 736166 |
1 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290.0 | RETIRED | RETIRED | 50.0 | 23-JUN-11 | NaN | NaN | NaN | SA17A | 736166 |
2 | C00410118 | P20002978 | Bachmann, Michelle | SMITH, LANIER | LANETT | AL | 368633403.0 | INFORMATION REQUESTED | INFORMATION REQUESTED | 250.0 | 05-JUL-11 | NaN | NaN | NaN | SA17A | 749073 |
3 | C00410118 | P20002978 | Bachmann, Michelle | BLEVINS, DARONDA | PIGGOTT | AR | 724548253.0 | NONE | RETIRED | 250.0 | 01-AUG-11 | NaN | NaN | NaN | SA17A | 749073 |
4 | C00410118 | P20002978 | Bachmann, Michelle | WARDENBURG, HAROLD | HOT SPRINGS NATION | AR | 719016467.0 | NONE | RETIRED | 300.0 | 20-JUN-11 | NaN | NaN | NaN | SA17A | 736166 |
What might be interesting to do is get a quick glimpse of the donation amounts, and the average donation amount. Let's go ahead and break down the data.
# Get a quick look at the various donation amounts
donor_df['contb_receipt_amt'].value_counts()
100.00      178188
50.00       137584
25.00       110345
250.00       91182
500.00       57984
             ...
58.43            1
-2115.21         1
75.64            1
142.41           1
35.83            1
Name: contb_receipt_amt, Length: 8079, dtype: int64
8079 different amounts! That's quite a variation. Let's look at the average and the std.
don_mean=donor_df['contb_receipt_amt'].mean()
don_std=donor_df['contb_receipt_amt'].std()
print("The Average donation was %.2f with a standard deviation of %.2f"%(don_mean,don_std))
The Average donation was 298.24 with a standard deviation of 3749.67
Wow! That's a huge standard deviation! Let's see if there are any large donations or other factors messing with the distribution of the donations.
# Let's make a Series from the DataFrame, use .copy() to avoid view errors
top_donor=donor_df['contb_receipt_amt'].copy()
# Now sort it (sort_values returns a sorted copy, which we display here)
top_donor.sort_values()
114604     -30800.00
226986     -25800.00
101356      -7500.00
398429      -5500.00
250737      -5455.00
              ...
319478     526246.17
344419    1511192.17
344539    1679114.65
326651    1944042.43
325136    2014490.51
Name: contb_receipt_amt, Length: 1001731, dtype: float64
Looks like we have some negative values, as well as some huge donation amounts! The negative values are due to the FEC recording refunds as well as donations. Let's go ahead and only look at the positive contribution amounts.
# Get rid of negative values
top_donor=top_donor[top_donor>0]
top_donor.sort_values()
335573          0.01
335407          0.01
335352          0.01
324596          0.01
329896          0.01
              ...
319478     526246.17
344419    1511192.17
344539    1679114.65
326651    1944042.43
325136    2014490.51
Name: contb_receipt_amt, Length: 991475, dtype: float64
new_df=top_donor.value_counts()
new_df.head(10)
100.0     178188
50.0      137584
25.0      110345
250.0      91182
500.0      57984
2500.0     49005
35.0       37237
1000.0     36494
10.0       33986
200.0      27813
Name: contb_receipt_amt, dtype: int64
Here we can see that the top 10 most common donations ranged from 10 to 2500 dollars.
# Create a Series of the common donations limited to 2500
com_don = top_donor[top_donor < 2500]
# Set a high number of bins to account for the non-round donations and check histogram for spikes.
com_don.hist(bins=100)
<AxesSubplot:>
So people mostly give donations in round numbers such as 100, 500, 1000, etc.
# Grab the unique candidate names from the candidate column
candidates=donor_df.cand_nm.unique()
candidates
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack', "Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy', 'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick', 'Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G', 'Huntsman, Jon', 'Perry, Rick'], dtype=object)
Let's go ahead and separate Obama from the Republican candidates by adding a Party Affiliation column. We can do this by using map along a dictionary of party affiliations.
# Dictionary of party affiliation
party_map = {'Bachmann, Michelle': 'Republican',
'Cain, Herman': 'Republican',
'Gingrich, Newt': 'Republican',
'Huntsman, Jon': 'Republican',
'Johnson, Gary Earl': 'Republican',
'McCotter, Thaddeus G': 'Republican',
'Obama, Barack': 'Democrat',
'Paul, Ron': 'Republican',
'Pawlenty, Timothy': 'Republican',
'Perry, Rick': 'Republican',
"Roemer, Charles E. 'Buddy' III": 'Republican',
'Romney, Mitt': 'Republican',
'Santorum, Rick': 'Republican'}
# Now map the party with candidates
donor_df["Party"]=donor_df.cand_nm.map(party_map)
Let's look at our DataFrame and also make sure we clear refunds from the contribution amounts.
# Clear refunds by keeping only positive contribution amounts
donor_df=donor_df[donor_df.contb_receipt_amt >0]
donor_df.head()
cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num | Party | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290.0 | RETIRED | RETIRED | 250.0 | 20-JUN-11 | NaN | NaN | NaN | SA17A | 736166 | Republican |
1 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290.0 | RETIRED | RETIRED | 50.0 | 23-JUN-11 | NaN | NaN | NaN | SA17A | 736166 | Republican |
2 | C00410118 | P20002978 | Bachmann, Michelle | SMITH, LANIER | LANETT | AL | 368633403.0 | INFORMATION REQUESTED | INFORMATION REQUESTED | 250.0 | 05-JUL-11 | NaN | NaN | NaN | SA17A | 749073 | Republican |
3 | C00410118 | P20002978 | Bachmann, Michelle | BLEVINS, DARONDA | PIGGOTT | AR | 724548253.0 | NONE | RETIRED | 250.0 | 01-AUG-11 | NaN | NaN | NaN | SA17A | 749073 | Republican |
4 | C00410118 | P20002978 | Bachmann, Michelle | WARDENBURG, HAROLD | HOT SPRINGS NATION | AR | 719016467.0 | NONE | RETIRED | 300.0 | 20-JUN-11 | NaN | NaN | NaN | SA17A | 736166 | Republican |
Let's start by aggregating the data by candidate. We'll take a quick look at the total amounts received by each candidate. First we will look at the total number of donations and then at the total amount.
# Groupby candidate and then display the total number of people who donated
donor_df.groupby('cand_nm')['contb_receipt_amt'].count()
cand_nm
Bachmann, Michelle                 13082
Cain, Herman                       20052
Gingrich, Newt                     46883
Huntsman, Jon                       4066
Johnson, Gary Earl                  1234
McCotter, Thaddeus G                  73
Obama, Barack                     589127
Paul, Ron                         143161
Pawlenty, Timothy                   3844
Perry, Rick                        12709
Roemer, Charles E. 'Buddy' III      5844
Romney, Mitt                      105155
Santorum, Rick                     46245
Name: contb_receipt_amt, dtype: int64
Clearly Obama is the front-runner in the number of people donating, which makes sense, since he is not competing with any other Democratic nominees. Let's take a look at the total dollar amounts.
# Groupby candidate and then display the total amount donated
donor_df.groupby('cand_nm')['contb_receipt_amt'].sum()
cand_nm
Bachmann, Michelle                2.711439e+06
Cain, Herman                      7.101082e+06
Gingrich, Newt                    1.283277e+07
Huntsman, Jon                     3.330373e+06
Johnson, Gary Earl                5.669616e+05
McCotter, Thaddeus G              3.903000e+04
Obama, Barack                     1.358774e+08
Paul, Ron                         2.100962e+07
Pawlenty, Timothy                 6.004819e+06
Perry, Rick                       2.030575e+07
Roemer, Charles E. 'Buddy' III    3.730099e+05
Romney, Mitt                      8.833591e+07
Santorum, Rick                    1.104316e+07
Name: contb_receipt_amt, dtype: float64
This isn't super readable, and an important aspect of data science is to clearly present information. Let's go ahead and just print out these values in a clean for loop.
# Start by storing the groupby result as an object
cand_amount=donor_df.groupby('cand_nm')['contb_receipt_amt'].sum()
#our index tracker
i=0
for don in cand_amount:
    print("The candidate %s raised %.0f dollars"%(cand_amount.index[i],don))
    print('\n')
    i+=1
The candidate Bachmann, Michelle raised 2711439 dollars
The candidate Cain, Herman raised 7101082 dollars
The candidate Gingrich, Newt raised 12832770 dollars
The candidate Huntsman, Jon raised 3330373 dollars
The candidate Johnson, Gary Earl raised 566962 dollars
The candidate McCotter, Thaddeus G raised 39030 dollars
The candidate Obama, Barack raised 135877427 dollars
The candidate Paul, Ron raised 21009620 dollars
The candidate Pawlenty, Timothy raised 6004819 dollars
The candidate Perry, Rick raised 20305754 dollars
The candidate Roemer, Charles E. 'Buddy' III raised 373010 dollars
The candidate Romney, Mitt raised 88335908 dollars
The candidate Santorum, Rick raised 11043159 dollars
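For what it's worth, the same report can be produced without tracking an index by iterating over the (candidate, total) pairs directly (a small sketch):
# Sketch: iterate over (candidate, total) pairs instead of keeping a counter
for cand, don in cand_amount.items():
    print("The candidate %s raised %.0f dollars" % (cand, don))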
This is okay, but it's hard to do a quick comparison just by reading this information. How about just a quick graphic presentation?
# Plot out total donation amounts
cand_amount.plot(kind='bar')
<AxesSubplot:xlabel='cand_nm'>
Now the comparison is very easy to see. As we saw before, clearly Obama is the front-runner in donation amounts, which makes sense, since he is not competing with any other Democratic nominees. How about we just compare Democrat versus Republican donations?
# Group by Party and then sum the donations
donor_df.groupby('Party')['contb_receipt_amt'].sum().plot(kind='bar')
<AxesSubplot:xlabel='Party'>
Looks like Obama couldn't compete against all the Republicans combined, but he certainly has the advantage of their funding being splintered across multiple candidates.
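To put numbers on that comparison, a short sketch that prints each party's total and its share of all (positive) donations:
# Sketch: total raised per party and each party's percentage of all donations
party_totals = donor_df.groupby('Party')['contb_receipt_amt'].sum()
print(party_totals)
print((party_totals / party_totals.sum() * 100).round(1))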
Finally, to start closing out the project, let's look at donations and who they came from (as far as occupation is concerned). We will start by grabbing the occupation information from the donor_df DataFrame and then using pivot_table to make the index defined by the various occupations, with the columns defined by the Party (Republican or Democrat). Finally, we'll also pass an aggregation function to the pivot table; in this case a simple sum function will add up all the contributions by anyone with the same profession.
# Use a pivot table to extract and organize the data by the donor occupation
occupation_df=donor_df.pivot_table('contb_receipt_amt',index=['contbr_occupation'],columns='Party',aggfunc='sum')
occupation_df.head()
Party | Democrat | Republican |
---|---|---|
contbr_occupation | ||
MIXED-MEDIA ARTIST / STORYTELLER | 100.0 | NaN |
AREA VICE PRESIDENT | 250.0 | NaN |
RESEARCH ASSOCIATE | 100.0 | NaN |
TEACHER | 500.0 | NaN |
THERAPIST | 3900.0 | NaN |
Great! Now let's see how big the DataFrame is.
occupation_df.shape
(45067, 2)
Wow! This is probably far too large to display effectively with a small, static visualization. What we should do is have a cut-off for total contribution amounts. After all, small donations of 20 dollars by one type of occupation won't give us too much insight. So let's set our cut-off at 1 million dollars.
occupation_df=occupation_df[occupation_df.sum(1)>1000000]
# Now let's check the size
occupation_df.shape
(31, 2)
Great! This looks much more manageable! Now let's visualize it.
# Plot out with pandas
occupation_df.plot(kind='bar')
<AxesSubplot:xlabel='contbr_occupation'>
This is a bit hard to read, so let's use kind='barh' (horizontal) to set the occupation on the correct axis.
occupation_df.plot(kind='barh',cmap='seismic',figsize=(10,12))
<AxesSubplot:ylabel='contbr_occupation'>
Looks like there are some occupations that are either mislabeled or aren't really occupations. Let's get rid of the 'Information Requested' occupations and combine CEO and C.E.O.
occupation_df.drop(['INFORMATION REQUESTED PER BEST EFFORTS','INFORMATION REQUESTED'],axis=0,inplace=True)
Now let's combine the CEO and C.E.O rows.
# Set new ceo row as sum of the current two
occupation_df.loc['CEO']=occupation_df.loc['CEO']+occupation_df.loc['C.E.O.']
#Drop C.E.O.
occupation_df.drop('C.E.O.',inplace=True)
Now let's repeat the same plot!
occupation_df.plot(kind='barh',cmap='seismic',figsize=(10,12))
<AxesSubplot:ylabel='contbr_occupation'>
Awesome! Looks like CEOs are a little more conservative-leaning; this may be due to the tax philosophies of each party during the election.