import pandas as pd
from pandas import Series,DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
# Used to grab data from the web (HTTP capabilities)
import requests
# We'll also use StringIO to wrap the csv text, since pd.read_csv expects a file-like object with a .read() method
from io import StringIO
# This is the url link for the poll data in csv form
url = "http://elections.huffingtonpost.com/pollster/2012-general-election-romney-vs-obama.csv"
source=requests.get(url).text
poll_data=StringIO(source)
poll_df=pd.read_csv(poll_data)
poll_df.head()
Pollster | Start Date | End Date | Entry Date/Time (ET) | Number of Observations | Population | Mode | Obama | Romney | Undecided | Other | Pollster URL | Source URL | Partisan | Affiliation | Question Text | Question Iteration | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Politico/GWU/Battleground | 2012-11-04 | 2012-11-05 | 2012-11-06T08:40:26Z | 1000.0 | Likely Voters | Live Phone | 47.0 | 47.0 | 6.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.politico.com/news/stories/1112/8338... | Nonpartisan | None | NaN | 1 |
1 | YouGov/Economist | 2012-11-03 | 2012-11-05 | 2012-11-26T15:31:23Z | 740.0 | Likely Voters | Internet | 49.0 | 47.0 | 3.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://cdn.yougov.com/cumulus_uploads/document... | Nonpartisan | None | NaN | 1 |
2 | Gravis Marketing | 2012-11-03 | 2012-11-05 | 2012-11-06T09:22:02Z | 872.0 | Likely Voters | Automated Phone | 48.0 | 48.0 | 4.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.gravispolls.com/2012/11/gravis-mark... | Nonpartisan | None | NaN | 1 |
3 | IBD/TIPP | 2012-11-03 | 2012-11-05 | 2012-11-06T08:51:48Z | 712.0 | Likely Voters | Live Phone | 50.0 | 49.0 | NaN | 1.0 | https://elections.huffingtonpost.com/pollster/... | http://news.investors.com/special-report/50841... | Nonpartisan | None | NaN | 1 |
4 | Rasmussen | 2012-11-03 | 2012-11-05 | 2012-11-06T08:47:50Z | 1500.0 | Likely Voters | Automated Phone | 48.0 | 49.0 | NaN | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.rasmussenreports.com/public_content... | Nonpartisan | None | NaN | 1 |
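Note: the HuffPost Pollster endpoint above may no longer be reachable, since the service has not been maintained for years. If the request fails, a previously saved copy of the same CSV can be read instead; the filename below is a hypothetical local export of that URL.
# Fallback (sketch): read a locally saved copy of the same poll data
poll_df = pd.read_csv('2012-general-election-romney-vs-obama.csv')
poll_df.head()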
poll_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586 entries, 0 to 585
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   Pollster                586 non-null    object
 1   Start Date              586 non-null    object
 2   End Date                586 non-null    object
 3   Entry Date/Time (ET)    586 non-null    object
 4   Number of Observations  564 non-null    float64
 5   Population               586 non-null    object
 6   Mode                    586 non-null    object
 7   Obama                   586 non-null    float64
 8   Romney                  586 non-null    float64
 9   Undecided               423 non-null    float64
 10  Other                   202 non-null    float64
 11  Pollster URL            586 non-null    object
 12  Source URL              584 non-null    object
 13  Partisan                586 non-null    object
 14  Affiliation             586 non-null    object
 15  Question Text           0 non-null      float64
 16  Question Iteration      586 non-null    int64
dtypes: float64(6), int64(1), object(10)
memory usage: 78.0+ KB
sns.catplot(x='Affiliation',data=poll_df,kind="count")
<seaborn.axisgrid.FacetGrid at 0x12a0f9d0790>
sns.catplot(x='Affiliation',data=poll_df,kind="count",hue='Population')
<seaborn.axisgrid.FacetGrid at 0x12a101c61c0>
avg=pd.DataFrame(poll_df.mean())
avg.drop('Number of Observations',axis=0,inplace=True)
avg
0 | |
---|---|
Obama | 46.805461 |
Romney | 44.614334 |
Undecided | 6.550827 |
Other | 3.376238 |
Question Text | NaN |
Question Iteration | 1.000000 |
std=pd.DataFrame(poll_df.std())
std.drop('Number of Observations',axis=0,inplace=True)
std
0 | |
---|---|
Obama | 2.422058 |
Romney | 2.906180 |
Undecided | 3.701754 |
Other | 2.692726 |
Question Text | NaN |
Question Iteration | 0.000000 |
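Note: on recent pandas releases, calling .mean() or .std() on a DataFrame that still contains non-numeric columns raises an error instead of silently skipping them. A minimal sketch, assuming the same poll_df as above, that restricts the calculation to the numeric columns:
# Sketch for newer pandas: only aggregate the numeric columns
avg = pd.DataFrame(poll_df.mean(numeric_only=True))
avg.drop('Number of Observations', axis=0, inplace=True)
std = pd.DataFrame(poll_df.std(numeric_only=True))
std.drop('Number of Observations', axis=0, inplace=True)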
avg.plot(yerr=std,kind='bar',legend=False)
<AxesSubplot:>
poll_avg=pd.concat([avg,std],axis=1)
poll_avg
0 | 0 | |
---|---|---|
Obama | 46.805461 | 2.422058 |
Romney | 44.614334 | 2.906180 |
Undecided | 6.550827 | 3.701754 |
Other | 3.376238 | 2.692726 |
Question Text | NaN | NaN |
Question Iteration | 1.000000 | 0.000000 |
poll_avg.columns=['Average','Std']
poll_avg
Average | Std | |
---|---|---|
Obama | 46.805461 | 2.422058 |
Romney | 44.614334 | 2.906180 |
Undecided | 6.550827 | 3.701754 |
Other | 3.376238 | 2.692726 |
Question Text | NaN | NaN |
Question Iteration | 1.000000 | 0.000000 |
poll_df
Pollster | Start Date | End Date | Entry Date/Time (ET) | Number of Observations | Population | Mode | Obama | Romney | Undecided | Other | Pollster URL | Source URL | Partisan | Affiliation | Question Text | Question Iteration | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Politico/GWU/Battleground | 2012-11-04 | 2012-11-05 | 2012-11-06T08:40:26Z | 1000.0 | Likely Voters | Live Phone | 47.0 | 47.0 | 6.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.politico.com/news/stories/1112/8338... | Nonpartisan | None | NaN | 1 |
1 | YouGov/Economist | 2012-11-03 | 2012-11-05 | 2012-11-26T15:31:23Z | 740.0 | Likely Voters | Internet | 49.0 | 47.0 | 3.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://cdn.yougov.com/cumulus_uploads/document... | Nonpartisan | None | NaN | 1 |
2 | Gravis Marketing | 2012-11-03 | 2012-11-05 | 2012-11-06T09:22:02Z | 872.0 | Likely Voters | Automated Phone | 48.0 | 48.0 | 4.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.gravispolls.com/2012/11/gravis-mark... | Nonpartisan | None | NaN | 1 |
3 | IBD/TIPP | 2012-11-03 | 2012-11-05 | 2012-11-06T08:51:48Z | 712.0 | Likely Voters | Live Phone | 50.0 | 49.0 | NaN | 1.0 | https://elections.huffingtonpost.com/pollster/... | http://news.investors.com/special-report/50841... | Nonpartisan | None | NaN | 1 |
4 | Rasmussen | 2012-11-03 | 2012-11-05 | 2012-11-06T08:47:50Z | 1500.0 | Likely Voters | Automated Phone | 48.0 | 49.0 | NaN | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.rasmussenreports.com/public_content... | Nonpartisan | None | NaN | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
581 | PPP (D) | 2009-07-15 | 2009-07-16 | 2010-11-08T11:02:37Z | 577.0 | Registered Voters | Automated Phone | 49.0 | 40.0 | 11.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.publicpolicypolling.com/pdf/surveys... | Pollster | Dem | NaN | 1 |
582 | PPP (D) | 2009-06-12 | 2009-06-16 | 2010-11-08T10:58:00Z | 638.0 | Registered Voters | Automated Phone | 48.0 | 40.0 | 12.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.publicpolicypolling.com/pdf/surveys... | Pollster | Dem | NaN | 1 |
583 | PPP (D) | 2009-05-14 | 2009-05-18 | 2010-11-08T10:49:08Z | 1000.0 | Registered Voters | Automated Phone | 53.0 | 35.0 | 12.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.publicpolicypolling.com/pdf/surveys... | Pollster | Dem | NaN | 1 |
584 | PPP (D) | 2009-04-17 | 2009-04-19 | 2010-11-08T10:45:18Z | 686.0 | Registered Voters | Automated Phone | 50.0 | 39.0 | 11.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.publicpolicypolling.com/pdf/surveys... | Pollster | Dem | NaN | 1 |
585 | PPP (D) | 2009-03-13 | 2009-03-15 | 2010-11-08T10:36:31Z | 1403.0 | Registered Voters | Automated Phone | 44.0 | 44.0 | 12.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.publicpolicypolling.com/pdf/PPP_Rel... | Pollster | Dem | NaN | 1 |
586 rows × 17 columns
poll_df.plot(x='End Date',y=['Obama','Romney','Undecided'],linestyle='',marker='o')
<AxesSubplot:xlabel='End Date'>
Reading this plot across time, the share of undecided voters shrinks as the election approaches and they commit to one of the candidates.
from datetime import datetime
poll_df['Difference']=(poll_df.Obama - poll_df.Romney)/100
poll_df.head()
Pollster | Start Date | End Date | Entry Date/Time (ET) | Number of Observations | Population | Mode | Obama | Romney | Undecided | Other | Pollster URL | Source URL | Partisan | Affiliation | Question Text | Question Iteration | Difference | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Politico/GWU/Battleground | 2012-11-04 | 2012-11-05 | 2012-11-06T08:40:26Z | 1000.0 | Likely Voters | Live Phone | 47.0 | 47.0 | 6.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.politico.com/news/stories/1112/8338... | Nonpartisan | None | NaN | 1 | 0.00 |
1 | YouGov/Economist | 2012-11-03 | 2012-11-05 | 2012-11-26T15:31:23Z | 740.0 | Likely Voters | Internet | 49.0 | 47.0 | 3.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://cdn.yougov.com/cumulus_uploads/document... | Nonpartisan | None | NaN | 1 | 0.02 |
2 | Gravis Marketing | 2012-11-03 | 2012-11-05 | 2012-11-06T09:22:02Z | 872.0 | Likely Voters | Automated Phone | 48.0 | 48.0 | 4.0 | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.gravispolls.com/2012/11/gravis-mark... | Nonpartisan | None | NaN | 1 | 0.00 |
3 | IBD/TIPP | 2012-11-03 | 2012-11-05 | 2012-11-06T08:51:48Z | 712.0 | Likely Voters | Live Phone | 50.0 | 49.0 | NaN | 1.0 | https://elections.huffingtonpost.com/pollster/... | http://news.investors.com/special-report/50841... | Nonpartisan | None | NaN | 1 | 0.01 |
4 | Rasmussen | 2012-11-03 | 2012-11-05 | 2012-11-06T08:47:50Z | 1500.0 | Likely Voters | Automated Phone | 48.0 | 49.0 | NaN | NaN | https://elections.huffingtonpost.com/pollster/... | http://www.rasmussenreports.com/public_content... | Nonpartisan | None | NaN | 1 | -0.01 |
# Set as_index=False to keep the 0,1,2,... index. Then we'll take the mean of the polls taken on each day.
poll_df=poll_df.groupby(['Start Date'],as_index=False).mean()
poll_df
Start Date | Number of Observations | Obama | Romney | Undecided | Other | Question Text | Question Iteration | Difference | |
---|---|---|---|---|---|---|---|---|---|
0 | 2009-03-13 | 1403.000000 | 44.00 | 44.00 | 12.000000 | NaN | NaN | 1 | 0.0000 |
1 | 2009-04-17 | 686.000000 | 50.00 | 39.00 | 11.000000 | NaN | NaN | 1 | 0.1100 |
2 | 2009-05-14 | 1000.000000 | 53.00 | 35.00 | 12.000000 | NaN | NaN | 1 | 0.1800 |
3 | 2009-06-12 | 638.000000 | 48.00 | 40.00 | 12.000000 | NaN | NaN | 1 | 0.0800 |
4 | 2009-07-15 | 577.000000 | 49.00 | 40.00 | 11.000000 | NaN | NaN | 1 | 0.0900 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
352 | 2012-10-31 | 10420.250000 | 48.00 | 46.50 | 4.000000 | 2.0 | NaN | 1 | 0.0150 |
353 | 2012-11-01 | 1989.000000 | 49.00 | 47.25 | 2.400000 | 1.5 | NaN | 1 | 0.0175 |
354 | 2012-11-02 | 1041.750000 | 49.25 | 48.50 | 2.000000 | 1.0 | NaN | 1 | 0.0075 |
355 | 2012-11-03 | 1310.833333 | 48.50 | 48.00 | 4.333333 | 1.0 | NaN | 1 | 0.0050 |
356 | 2012-11-04 | 1000.000000 | 47.00 | 47.00 | 6.000000 | NaN | NaN | 1 | 0.0000 |
357 rows × 9 columns
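As above, newer pandas versions will not silently average the non-numeric columns (Pollster, End Date, the URLs, and so on) inside a groupby. A sketch that aggregates only the poll columns, run in place of the groupby cell above:
# Sketch for newer pandas: restrict the groupby mean to the numeric poll columns
numeric_cols = ['Number of Observations', 'Obama', 'Romney', 'Undecided', 'Other', 'Question Iteration', 'Difference']
poll_df = poll_df.groupby('Start Date', as_index=False)[numeric_cols].mean()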
Now plotting the Difference versus time should be straightforward.
# Plotting the difference in polls between Obama and Romney
poll_df.plot('Start Date','Difference',figsize=(12,4),marker='o',linestyle='-',color='purple')
<AxesSubplot:xlabel='Start Date'>
It would be very interesting to plot marker lines on the dates of the debates and see if there is any general insight into the poll results.
The debate dates were Oct 3rd, Oct 11th, and Oct 22nd. Let's plot some lines as markers and then zoom in on the month of October.
In order to find where to set the x limits for the figure, we need to find out which rows correspond to October 2012. Here's a simple for loop to find those rows (a vectorized alternative is sketched after the output below). Note that the string format of the dates makes this difficult to do without using a lambda expression or a map.
# Set row count and xlimit list
row_in=0
xlimit=[]
# Cycle through dates until 2012-10 is found, then print row index
for date in poll_df['Start Date']:
    if date[0:7] == '2012-10':
        xlimit.append(row_in)
        row_in += 1
    else:
        row_in += 1
print(min(xlimit))
print(max(xlimit))
325
352
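As an aside, the same row range can be found without an explicit counter by boolean indexing on the date strings. A small sketch, assuming the grouped poll_df from above:
# Vectorized alternative (sketch): mask the rows whose Start Date string begins with '2012-10'
oct_mask = poll_df['Start Date'].str.startswith('2012-10')
oct_rows = poll_df.index[oct_mask]
print(oct_rows.min(), oct_rows.max())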
Great, now we know where to set our x limits for the month of October in our figure.
# Start with original figure
fig = poll_df.plot('Start Date','Difference',figsize=(12,4),marker='o',linestyle='-',color='purple',xlim=(325,352))
# Redraw the figure and add the debate markers
fig = poll_df.plot('Start Date','Difference',figsize=(12,4),marker='o',linestyle='-',color='purple',xlim=(325,352))
plt.axvline(x=325+2, linewidth=4, color='grey')
plt.axvline(x=325+10, linewidth=4, color='grey')
plt.axvline(x=325+21, linewidth=4, color='grey')
<matplotlib.lines.Line2D at 0x12a1051a1c0>
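An alternative worth noting: converting Start Date to real datetimes lets us place the vertical lines on the actual debate dates instead of counting index positions. A sketch under that assumption, using matplotlib directly on the grouped poll_df:
# Sketch: plot October 2012 with a true datetime x-axis and mark the debate dates
oct_df = poll_df.copy()
oct_df['Start Date'] = pd.to_datetime(oct_df['Start Date'])
oct_df = oct_df[(oct_df['Start Date'] >= '2012-10-01') & (oct_df['Start Date'] <= '2012-11-06')]
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(oct_df['Start Date'], oct_df['Difference'], marker='o', linestyle='-', color='purple')
for debate_date in ['2012-10-03', '2012-10-11', '2012-10-22']:
    ax.axvline(pd.to_datetime(debate_date), linewidth=4, color='grey')
ax.set_xlabel('Start Date')
ax.set_ylabel('Difference')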
Surprisingly, these polls reflect a dip for Obama after the second debate against Romney, even though memory serves that he performed much worse against Romney during the first debate.
For all these polls it is important to remember how geographical location can affect the value of a poll in predicting the outcome of a national election.
pwd
'C:\\Users\\HP'
Let's go ahead and switch gears and take a look at a data set consisting of information on donations to the federal campaign.
This is going to be the biggest data set we've looked at so far. You can download it here, then make sure to save it to the same folder your IPython notebooks are in.
The questions we will be trying to answer while looking at this data set are:
1.) How much was donated and what was the average donation?
2.) How did the donations differ between candidates?
3.) How did the donations differ between Democrats and Republicans?
4.) What were the demographics of the donors?
5.) Is there a pattern to donation amounts?
donor_df=pd.read_csv('Election_Donor_Data.csv')
C:\Users\HP\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3165: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
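The DtypeWarning comes from the zip-code column (column 6, contbr_zip), which mixes numeric and string values. A sketch that reads it explicitly as strings, which silences the warning:
# Sketch: read the zip codes as strings to avoid the mixed-type warning
donor_df = pd.read_csv('Election_Donor_Data.csv', dtype={'contbr_zip': str})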
donor_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001731 entries, 0 to 1001730
Data columns (total 16 columns):
 #   Column             Non-Null Count    Dtype
---  ------             --------------    -----
 0   cmte_id            1001731 non-null  object
 1   cand_id            1001731 non-null  object
 2   cand_nm            1001731 non-null  object
 3   contbr_nm          1001731 non-null  object
 4   contbr_city        1001712 non-null  object
 5   contbr_st          1001727 non-null  object
 6   contbr_zip         1001620 non-null  object
 7   contbr_employer    988002 non-null   object
 8   contbr_occupation  993301 non-null   object
 9   contb_receipt_amt  1001731 non-null  float64
 10  contb_receipt_dt   1001731 non-null  object
 11  receipt_desc       14166 non-null    object
 12  memo_cd            92482 non-null    object
 13  memo_text          97770 non-null    object
 14  form_tp            1001731 non-null  object
 15  file_num           1001731 non-null  int64
dtypes: float64(1), int64(1), object(14)
memory usage: 122.3+ MB
donor_df.head()
cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290.0 | RETIRED | RETIRED | 250.0 | 20-JUN-11 | NaN | NaN | NaN | SA17A | 736166 |
1 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290.0 | RETIRED | RETIRED | 50.0 | 23-JUN-11 | NaN | NaN | NaN | SA17A | 736166 |
2 | C00410118 | P20002978 | Bachmann, Michelle | SMITH, LANIER | LANETT | AL | 368633403.0 | INFORMATION REQUESTED | INFORMATION REQUESTED | 250.0 | 05-JUL-11 | NaN | NaN | NaN | SA17A | 749073 |
3 | C00410118 | P20002978 | Bachmann, Michelle | BLEVINS, DARONDA | PIGGOTT | AR | 724548253.0 | NONE | RETIRED | 250.0 | 01-AUG-11 | NaN | NaN | NaN | SA17A | 749073 |
4 | C00410118 | P20002978 | Bachmann, Michelle | WARDENBURG, HAROLD | HOT SPRINGS NATION | AR | 719016467.0 | NONE | RETIRED | 300.0 | 20-JUN-11 | NaN | NaN | NaN | SA17A | 736166 |
What might be interesting to do is get a quick glimpse of the donation amounts, and the average donation amount. Let's go ahead and break down the data.
# Get a quick look at the various donation amounts
donor_df['contb_receipt_amt'].value_counts()
100.00      178188
50.00       137584
25.00       110345
250.00       91182
500.00       57984
             ...
58.43            1
-2115.21         1
75.64            1
142.41           1
35.83            1
Name: contb_receipt_amt, Length: 8079, dtype: int64
8079 different amounts! That's quite a variation. Let's look at the average and the std.
don_mean=donor_df['contb_receipt_amt'].mean()
don_std=donor_df['contb_receipt_amt'].std()
print("The Average donation was %.2f with a standard deviation of %.2f"%(don_mean,don_std))
The Average donation was 298.24 with a standard deviation of 3749.67
Wow! That's a huge standard deviation! Let's see if there are any large donations or other factors messing with the distribution of the donations.
# Let's make a Series from the DataFrame, use .copy() to avoid view errors
top_donor=donor_df['contb_receipt_amt'].copy()
# Now sort it (sort_values returns a sorted copy, which we display here)
top_donor.sort_values()
114604     -30800.00
226986     -25800.00
101356      -7500.00
398429      -5500.00
250737      -5455.00
              ...
319478     526246.17
344419    1511192.17
344539    1679114.65
326651    1944042.43
325136    2014490.51
Name: contb_receipt_amt, Length: 1001731, dtype: float64
Looks like we have some negative values, as well as some huge donation amounts! The negative values are due to the FEC recording refunds as well as donations. Let's go ahead and only look at the positive contribution amounts.
# Get rid of negative values
top_donor=top_donor[top_donor>0]
top_donor.sort_values()
335573          0.01
335407          0.01
335352          0.01
324596          0.01
329896          0.01
              ...
319478     526246.17
344419    1511192.17
344539    1679114.65
326651    1944042.43
325136    2014490.51
Name: contb_receipt_amt, Length: 991475, dtype: float64
new_df=top_donor.value_counts()
new_df.head(10)
100.0     178188
50.0      137584
25.0      110345
250.0      91182
500.0      57984
2500.0     49005
35.0       37237
1000.0     36494
10.0       33986
200.0      27813
Name: contb_receipt_amt, dtype: int64
Here we can see that the top 10 most common donations ranged from 10 to 2500 dollars.
# Create a Series of the common donations limited to 2500
com_don = top_donor[top_donor < 2500]
# Set a high number of bins to account for the non-round donations and check histogram for spikes.
com_don.hist(bins=100)
<AxesSubplot:>
So people mostly give donations in round numbers such as 100, 500, 1000, etc.
# Grab the unique candidate names from the candidate column
candidates=donor_df.cand_nm.unique()
candidates
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack', "Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy', 'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick', 'Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G', 'Huntsman, Jon', 'Perry, Rick'], dtype=object)
Let's go ahead and separate Obama from the Republican candidates by adding a Party Affiliation column. We can do this by using map along a dictionary of party affiliations.
# Dictionary of party affiliation
party_map = {'Bachmann, Michelle': 'Republican',
'Cain, Herman': 'Republican',
'Gingrich, Newt': 'Republican',
'Huntsman, Jon': 'Republican',
'Johnson, Gary Earl': 'Republican',
'McCotter, Thaddeus G': 'Republican',
'Obama, Barack': 'Democrat',
'Paul, Ron': 'Republican',
'Pawlenty, Timothy': 'Republican',
'Perry, Rick': 'Republican',
"Roemer, Charles E. 'Buddy' III": 'Republican',
'Romney, Mitt': 'Republican',
'Santorum, Rick': 'Republican'}
# Now map the party with candidates
donor_df["Party"]=donor_df.cand_nm.map(party_map)
Let's look at our DataFrame and also make sure we clear refunds from the contribution amounts.
# Clear refunds by keeping only positive contribution amounts
donor_df=donor_df[donor_df.contb_receipt_amt >0]
donor_df.head()
cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num | Party | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290.0 | RETIRED | RETIRED | 250.0 | 20-JUN-11 | NaN | NaN | NaN | SA17A | 736166 | Republican |
1 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290.0 | RETIRED | RETIRED | 50.0 | 23-JUN-11 | NaN | NaN | NaN | SA17A | 736166 | Republican |
2 | C00410118 | P20002978 | Bachmann, Michelle | SMITH, LANIER | LANETT | AL | 368633403.0 | INFORMATION REQUESTED | INFORMATION REQUESTED | 250.0 | 05-JUL-11 | NaN | NaN | NaN | SA17A | 749073 | Republican |
3 | C00410118 | P20002978 | Bachmann, Michelle | BLEVINS, DARONDA | PIGGOTT | AR | 724548253.0 | NONE | RETIRED | 250.0 | 01-AUG-11 | NaN | NaN | NaN | SA17A | 749073 | Republican |
4 | C00410118 | P20002978 | Bachmann, Michelle | WARDENBURG, HAROLD | HOT SPRINGS NATION | AR | 719016467.0 | NONE | RETIRED | 300.0 | 20-JUN-11 | NaN | NaN | NaN | SA17A | 736166 | Republican |
Let's start by aggregating the data by candidate. We'll take a quick look at the total amounts received by each candidate. First we will look at the total number of donations and then at the total amount.
# Groupby candidate and then display the total number of people who donated
donor_df.groupby('cand_nm')['contb_receipt_amt'].count()
cand_nm
Bachmann, Michelle                 13082
Cain, Herman                       20052
Gingrich, Newt                     46883
Huntsman, Jon                       4066
Johnson, Gary Earl                  1234
McCotter, Thaddeus G                  73
Obama, Barack                     589127
Paul, Ron                         143161
Pawlenty, Timothy                   3844
Perry, Rick                        12709
Roemer, Charles E. 'Buddy' III      5844
Romney, Mitt                      105155
Santorum, Rick                     46245
Name: contb_receipt_amt, dtype: int64
Clearly Obama is the front-runner in the number of people donating, which makes sense, since he is not competing with any other Democratic nominees. Let's take a look at the total dollar amounts.
# Groupby candidate and then display the total amount donated
donor_df.groupby('cand_nm')['contb_receipt_amt'].sum()
cand_nm
Bachmann, Michelle                2.711439e+06
Cain, Herman                      7.101082e+06
Gingrich, Newt                    1.283277e+07
Huntsman, Jon                     3.330373e+06
Johnson, Gary Earl                5.669616e+05
McCotter, Thaddeus G              3.903000e+04
Obama, Barack                     1.358774e+08
Paul, Ron                         2.100962e+07
Pawlenty, Timothy                 6.004819e+06
Perry, Rick                       2.030575e+07
Roemer, Charles E. 'Buddy' III    3.730099e+05
Romney, Mitt                      8.833591e+07
Santorum, Rick                    1.104316e+07
Name: contb_receipt_amt, dtype: float64
This isn't super readable, and an important aspect of data science is to clearly present information. Let's go ahead and just print out these values in a clean for loop.
# Start by storing the groupby result as an object
cand_amount=donor_df.groupby('cand_nm')['contb_receipt_amt'].sum()
#our index tracker
i=0
for don in cand_amount:
    print("The candidate %s raised %.0f dollars"%(cand_amount.index[i],don))
    print('\n')
    i+=1
The candidate Bachmann, Michelle raised 2711439 dollars
The candidate Cain, Herman raised 7101082 dollars
The candidate Gingrich, Newt raised 12832770 dollars
The candidate Huntsman, Jon raised 3330373 dollars
The candidate Johnson, Gary Earl raised 566962 dollars
The candidate McCotter, Thaddeus G raised 39030 dollars
The candidate Obama, Barack raised 135877427 dollars
The candidate Paul, Ron raised 21009620 dollars
The candidate Pawlenty, Timothy raised 6004819 dollars
The candidate Perry, Rick raised 20305754 dollars
The candidate Roemer, Charles E. 'Buddy' III raised 373010 dollars
The candidate Romney, Mitt raised 88335908 dollars
The candidate Santorum, Rick raised 11043159 dollars
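For what it's worth, the same report can be produced without tracking an index by iterating over the (candidate, total) pairs directly (a small sketch):
# Sketch: iterate over (candidate, total) pairs instead of keeping a counter
for cand, don in cand_amount.items():
    print("The candidate %s raised %.0f dollars" % (cand, don))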
This is okay, but it's hard to do a quick comparison just by reading this information. How about just a quick graphic presentation?
# Plot out total donation amounts
cand_amount.plot(kind='bar')
<AxesSubplot:xlabel='cand_nm'>
Now the comparison is very easy to see. As we saw before, clearly Obama is the front-runner in donation amounts, which makes sense, since he is not competing with any other Democratic nominees. How about we just compare Democrat versus Republican donations?
# Group by Party and then sum the donations
donor_df.groupby('Party')['contb_receipt_amt'].sum().plot(kind='bar')
<AxesSubplot:xlabel='Party'>
Looks like Obama couldn't compete against all the Republicans combined, but he certainly has the advantage of their funding being splintered across multiple candidates.
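To put numbers on that comparison, a short sketch that prints each party's total and its share of all (positive) donations:
# Sketch: total raised per party and each party's percentage of all donations
party_totals = donor_df.groupby('Party')['contb_receipt_amt'].sum()
print(party_totals)
print((party_totals / party_totals.sum() * 100).round(1))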
Finally, to start closing out the project, let's look at donations and who they came from (as far as occupation is concerned). We will start by grabbing the occupation information from the donor_df DataFrame and then using pivot_table to make the index defined by the various occupations, with the columns defined by the Party (Republican or Democrat). Finally, we'll also pass an aggregation function to the pivot table; in this case a simple sum function will add up all the contributions by anyone with the same profession.
# Use a pivot table to extract and organize the data by the donor occupation
occupation_df=donor_df.pivot_table('contb_receipt_amt',index=['contbr_occupation'],columns='Party',aggfunc='sum')
occupation_df.head()
Party | Democrat | Republican |
---|---|---|
contbr_occupation | ||
MIXED-MEDIA ARTIST / STORYTELLER | 100.0 | NaN |
AREA VICE PRESIDENT | 250.0 | NaN |
RESEARCH ASSOCIATE | 100.0 | NaN |
TEACHER | 500.0 | NaN |
THERAPIST | 3900.0 | NaN |
Great! Now let's see how big the DataFrame is.
occupation_df.shape
(45067, 2)
Wow! This is probably far too large to display effectively with a small, static visualization. What we should do is have a cut-off for total contribution amounts. After all, small donations of 20 dollars by one type of occupation won't give us too much insight. So let's set our cut-off at 1 million dollars.
occupation_df=occupation_df[occupation_df.sum(1)>1000000]
# Now let's check the size
occupation_df.shape
(31, 2)
Great! This looks much more manageable! Now let's visualize it.
# Plot out with pandas
occupation_df.plot(kind='bar')
<AxesSubplot:xlabel='contbr_occupation'>
This is a bit hard to read, so let's use kind='barh' (horizontal) to set the occupation on the correct axis.
occupation_df.plot(kind='barh',cmap='seismic',figsize=(10,12))
<AxesSubplot:ylabel='contbr_occupation'>
Looks like there are some occupations that are either mislabeled or aren't really occupations. Let's get rid of the 'Information Requested' occupations and combine CEO and C.E.O.
occupation_df.drop(['INFORMATION REQUESTED PER BEST EFFORTS','INFORMATION REQUESTED'],axis=0,inplace=True)
Now let's combine the CEO and C.E.O rows.
# Set new ceo row as sum of the current two
occupation_df.loc['CEO']=occupation_df.loc['CEO']+occupation_df.loc['C.E.O.']
#Drop C.E.O.
occupation_df.drop('C.E.O.',inplace=True)
Now let's repeat the same plot!
occupation_df.plot(kind='barh',cmap='seismic',figsize=(10,12))
<AxesSubplot:ylabel='contbr_occupation'>
Awesome! Looks like CEOs are a little more conservative-leaning; this may be due to the tax philosophies of each party during the election.