import datetime
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas
from scipy import stats
# Console display settings: four digits of precision and no scientific
# notation for NumPy arrays.
np.set_printoptions(precision=4, suppress=True)
# NOTE(review): pandas.set_printoptions is the display-configuration call of
# the early pandas API this notebook targets — confirm it exists in the
# installed pandas version.
pandas.set_printoptions(notebook_repr_html=False,
precision=4,
max_columns=12, column_space=10,
max_colwidth=25)
from matplotlib import rcParams
#rcParams['text.usetex'] = False
#rcParams['text.latex.unicode'] = False
We have a snapshot of what would happen if the election were held today (don't go bet on Intrade based on this model). Historically, polls have narrowed as the election nears.
# Reference dates for the 2012 forecast: the "as of" day and election day.
election = datetime.datetime(2012, 11, 6)
today = datetime.datetime(2012, 10, 2)
# Lead time (a timedelta) between the snapshot and the 2012 election.
days_before = election - today

# Election days of the two training cycles.
date2004, date2008 = (datetime.datetime(2004, 11, 2),
                      datetime.datetime(2008, 11, 4))
# Despite the names, these are datetimes: the day lying the same lead time
# ahead of each earlier election.
days_before2004 = date2004 - days_before
days_before2008 = date2008 - days_before
# Load the raw poll tables from the author's local data directory.
# The national 2004 file is read with read_table rather than read_csv.
national_2004 = pandas.read_table("/home/skipper/school/talks/538model/data/2004_poll_data.csv")
# Use the same "Pollster" column name as the state-level frames.
national_2004.rename(columns={"Poll" : "Pollster"}, inplace=True);
state_data2004 = pandas.read_csv("/home/skipper/school/talks/538model/data/2004-pres-polls.csv")
state_data2008 = pandas.read_csv("/home/skipper/school/talks/538model/data/2008-pres-polls.csv")
# Notebook echo of the 2004 state frame.
state_data2004
<class 'pandas.core.frame.DataFrame'> Int64Index: 879 entries, 0 to 878 Data columns: State 879 non-null values Kerry 879 non-null values Bush 879 non-null values Date 879 non-null values Pollster 879 non-null values dtypes: int64(2), object(3)
# Put the 2004 state polls into the generic challenger/incumbent layout.
state_data2004.rename(columns={"Kerry" : "challenger",
"Bush" : "incumbent"},
inplace=True);
# Democratic margin: Kerry (challenger) minus Bush (incumbent).
state_data2004["dem_spread"] = (state_data2004["challenger"] -
state_data2004["incumbent"])
# Entries dated on day "00" are moved to the 1st of the same month.
state_data2004.Date.replace({"Nov 00" : "Nov 01", "Oct 00" : "Oct 01"},
inplace=True);
# Append the year to each date string and parse it.
state_data2004.Date = (state_data2004.Date + ", 2004").apply(
pandas.datetools.parse)
def median_date(row, year="2008"):
    """Return the middle day of a poll's field period.

    Parameters
    ----------
    row : mapping
        Must provide ``"Start"`` and ``"End"`` entries holding date strings
        without a year; ``year`` is appended to both before parsing.
    year : str, optional
        Year appended to the start and end strings.

    Returns
    -------
    The element at position ``len(dates) // 2`` of the daily range running
    from start to end, so an even-length period yields the later of its two
    middle days.

    Raises
    ------
    ValueError
        If the end date falls before the start date.
    """
    start = pandas.to_datetime(row["Start"] + ", " + year)
    end = pandas.to_datetime(row["End"] + ", " + year)
    dates = pandas.date_range(start, end)
    if len(dates) == 0:
        # A reversed period used to surface as an opaque NaN-to-int error.
        raise ValueError("poll ends (%s) before it starts (%s)"
                         % (row["End"], row["Start"]))
    # Same index as int(median(range(n)) + .5), without the float detour.
    return dates[len(dates) // 2]
# Collapse each 2008 poll's Start/End window into a single "Date" column,
# then drop the two source columns.
state_data2008["Date"] = state_data2008.apply(median_date, axis=1)
del state_data2008["Start"]
del state_data2008["End"]
# Keep the first row of the national table aside.
# NOTE(review): presumably this is the "Final Results" row — confirm
# against the data file.
actual = national_2004.head(1)
# Remove the summary rows so only individual polls remain.
national_2004 = national_2004.ix[national_2004.index[~national_2004.Pollster.isin(["Final Results", "RCP Average"])]]
def split_median_date(row, year="2004"):
    """Return the middle day of a poll whose period is one "start - end" string.

    Parameters
    ----------
    row : mapping
        Must provide a ``"Date"`` entry of the form ``"<start> - <end>"``,
        with neither part carrying a year.
    year : str, optional
        Year appended to both parts before building the date range. The
        default keeps the original 2004-only behaviour.

    Returns
    -------
    The element at position ``len(dates) // 2`` of the daily range running
    from start to end, so an even-length period yields the later of its two
    middle days.

    Raises
    ------
    ValueError
        If the string does not split into exactly two parts, or the end
        date falls before the start date.
    """
    start_text, end_text = row["Date"].split(" - ")
    dates = pandas.date_range(start_text + ", " + year,
                              end_text + ", " + year)
    if len(dates) == 0:
        # A reversed period used to surface as an opaque NaN-to-int error.
        raise ValueError("poll period %r ends before it starts"
                         % (row["Date"],))
    # Same index as int(median(range(n)) + .5), without the float detour.
    return dates[len(dates) // 2]
# Replace each national poll's date-range string with its middle day.
national_2004["Date"] = national_2004.apply(split_median_date, axis=1)
# Democratic margin: Kerry minus Bush.
national_2004["dem_spread"] = national_2004["Kerry (D)"] - national_2004["Bush (R)"]
# Notebook echo of the 2008 state frame.
state_data2008
<class 'pandas.core.frame.DataFrame'> Int64Index: 1189 entries, 0 to 1188 Data columns: State 1189 non-null values Obama 1189 non-null values McCain 1189 non-null values Pollster 1189 non-null values Date 1189 non-null values dtypes: int64(2), object(3)
# Give the 2008 frame the same column names as the 2004 frame.
# NOTE(review): Obama is mapped to "challenger" and McCain to "incumbent";
# presumably "incumbent" means the incumbent party here — confirm.
state_data2008.rename(columns={"Obama" : "challenger",
"McCain" : "incumbent"},
inplace=True);
# Democratic margin: Obama minus McCain.
state_data2008["dem_spread"] = (state_data2008["challenger"] -
state_data2008["incumbent"])
import pickle
# Load the mapping used to rewrite pollster names in all three poll frames.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if its origin is trusted.
with open("/home/skipper/school/talks/538model/data/pollster_map.pkl",
          "rb") as pollster_file:
    # The context manager closes the handle, which the bare open() never did.
    pollster_map = pickle.load(pollster_file)
state_data2004.Pollster.replace(pollster_map, inplace=True);
state_data2008.Pollster.replace(pollster_map, inplace=True);
national_2004.Pollster.replace(pollster_map, inplace=True);
These are old weights obtained from the 538 web site. New weights are not published anywhere to my knowledge.
# Read the pollster weight table and attach it to each state poll.
weights = pandas.read_table("/home/skipper/school/talks/538model/"
"data/pollster_weights.csv")
# Inner joins: polls whose pollster has no entry in the weight table are
# dropped from the state frames.
state_data2004 = state_data2004.merge(weights, on="Pollster", how="inner");
state_data2008 = state_data2008.merge(weights, on="Pollster", how="inner");
def edit_tick_label(tick_val, tick_pos):
    """Format a spread tick as a party lead (used with ``FuncFormatter``).

    The value is truncated to an integer. Non-negative ticks read
    ``"Democrat+N"``; for negative ticks the minus sign of the truncated
    value is replaced by ``"Republican+"``. ``tick_pos`` is accepted for
    the formatter callback signature and is not used.
    """
    whole = str(int(tick_val))
    if tick_val >= 0:
        return "Democrat+" + whole
    return whole.replace("-", "Republican+")
from pandas import lib
from matplotlib.ticker import FuncFormatter
# Scatter of the 2004 national polls with a LOWESS trend line.
fig, axes = plt.subplots(figsize=(12,8))
data = national_2004[["Date", "dem_spread"]]
#data = data.ix[data.Date >= days_before2004]
#data = pandas.concat((data, national_data2012[["Date", "dem_spread"]]))
data.sort("Date", inplace=True)
# Dates as int64 values so they can serve as the numeric regressor.
dates = pandas.DatetimeIndex(data.Date).asi8
x = data.dem_spread.values.astype(float)
# Column 1 of the lowess result is taken as the smoothed spread.
lowess_res = sm.nonparametric.lowess(x, dates,
frac=.2, it=3)[:,1]
# Back to datetime objects for plotting on the x axis.
dates_x = lib.ints_to_pydatetime(dates)
axes.scatter(dates_x, data["dem_spread"])
axes.plot(dates_x, lowess_res, color='r', lw=4)
axes.yaxis.get_major_locator().set_params(nbins=12)
# Label y ticks as "Democrat+N" / "Republican+N".
axes.yaxis.set_major_formatter(FuncFormatter(edit_tick_label))
axes.grid(False, axis='x')
# NOTE(review): -1.21 is presumably a reference spread for 2004 and the
# vertical line a campaign event date — confirm the intended meaning.
axes.hlines(-1.21, dates_x[0], dates_x[-1], color='black', lw=3)
axes.vlines(datetime.datetime(2004, 8, 5), -20, 15, lw=3)
axes.margins(0, .00)
from pandas import lib
from matplotlib.ticker import FuncFormatter
# Scatter of the 2004 state polls from mid-July on, with a LOWESS trend line.
fig, axes = plt.subplots(figsize=(12,8))
data = state_data2004[["Date", "dem_spread"]]
#data = data.ix[data.Date >= days_before2004]
# Keep polls dated 15 July 2004 or later.
data = data.ix[data.Date >= datetime.datetime(2004, 7, 15)]
#data = pandas.concat((data, national_data2012[["Date", "dem_spread"]]))
data.sort("Date", inplace=True)
# Dates as int64 values so they can serve as the numeric regressor.
dates = pandas.DatetimeIndex(data.Date).asi8
x = data.dem_spread.values.astype(float)
# Column 1 of the lowess result is taken as the smoothed spread.
lowess_res = sm.nonparametric.lowess(x, dates,
frac=.2, it=3)[:,1]
# Back to datetime objects for plotting on the x axis.
dates_x = lib.ints_to_pydatetime(dates)
axes.scatter(dates_x, data["dem_spread"])
axes.plot(dates_x, lowess_res, color='r', lw=4)
axes.yaxis.get_major_locator().set_params(nbins=12)
# Label y ticks as "Democrat+N" / "Republican+N".
axes.yaxis.set_major_formatter(FuncFormatter(edit_tick_label))
axes.grid(False, axis='x')
# NOTE(review): -1.21 is presumably a reference spread for 2004 — confirm.
axes.hlines(-1.21, dates_x[0], dates_x[-1], color='black', lw=3)
axes.margins(0, .05)
from pandas import lib
from matplotlib.ticker import FuncFormatter
# Scatter of the 2008 state polls from mid-July on, with a LOWESS trend line.
fig, axes = plt.subplots(figsize=(12,8))
data = state_data2008[["Date", "dem_spread"]]
# Keep polls dated 15 July 2008 or later.
data = data.ix[data.Date >= datetime.datetime(2008, 7, 15)]
#data = data.ix[data.Date >= days_before2008]
#data = pandas.concat((data, national_data2012[["Date", "dem_spread"]]))
data.sort("Date", inplace=True)
# Dates as int64 values so they can serve as the numeric regressor.
dates = pandas.DatetimeIndex(data.Date).asi8
x = data.dem_spread.values.astype(float)
# Column 1 of the lowess result is taken as the smoothed spread.
lowess_res = sm.nonparametric.lowess(x, dates,
frac=.2, it=3)[:,1]
# Back to datetime objects for plotting on the x axis.
dates_x = lib.ints_to_pydatetime(dates)
axes.scatter(dates_x, data["dem_spread"])
axes.plot(dates_x, lowess_res, color='r', lw=4)
axes.yaxis.get_major_locator().set_params(nbins=12)
# Label y ticks as "Democrat+N" / "Republican+N".
axes.yaxis.set_major_formatter(FuncFormatter(edit_tick_label))
axes.grid(False, axis='x')
# NOTE(review): 3.65 is presumably a reference spread for 2008 and the two
# vertical lines campaign event dates — confirm the intended meaning.
axes.hlines(3.65, dates_x[0], dates_x[-1], color='black', lw=3)
axes.vlines(datetime.datetime(2008, 8, 29), -45, 70, lw=3)
axes.vlines(datetime.datetime(2008, 9, 24), -45, 70, lw=3)
axes.margins(0, .0)
#loadpy https://raw.github.com/gist/3912533/d958b515f602f6e73f7b16d8bc412bc8d1f433d9/state_abbrevs.py;
# Two-letter code -> full name lookup. Besides the states it covers DC,
# several territories, and the pseudo-code 'NA' for national entries.
states_abbrev_dict = {
'AK': 'Alaska',
'AL': 'Alabama',
'AR': 'Arkansas',
'AS': 'American Samoa',
'AZ': 'Arizona',
'CA': 'California',
'CO': 'Colorado',
'CT': 'Connecticut',
'DC': 'District of Columbia',
'DE': 'Delaware',
'FL': 'Florida',
'GA': 'Georgia',
'GU': 'Guam',
'HI': 'Hawaii',
'IA': 'Iowa',
'ID': 'Idaho',
'IL': 'Illinois',
'IN': 'Indiana',
'KS': 'Kansas',
'KY': 'Kentucky',
'LA': 'Louisiana',
'MA': 'Massachusetts',
'MD': 'Maryland',
'ME': 'Maine',
'MI': 'Michigan',
'MN': 'Minnesota',
'MO': 'Missouri',
'MP': 'Northern Mariana Islands',
'MS': 'Mississippi',
'MT': 'Montana',
'NA': 'National',
'NC': 'North Carolina',
'ND': 'North Dakota',
'NE': 'Nebraska',
'NH': 'New Hampshire',
'NJ': 'New Jersey',
'NM': 'New Mexico',
'NV': 'Nevada',
'NY': 'New York',
'OH': 'Ohio',
'OK': 'Oklahoma',
'OR': 'Oregon',
'PA': 'Pennsylvania',
'PR': 'Puerto Rico',
'RI': 'Rhode Island',
'SC': 'South Carolina',
'SD': 'South Dakota',
'TN': 'Tennessee',
'TX': 'Texas',
'UT': 'Utah',
'VA': 'Virginia',
'VI': 'Virgin Islands',
'VT': 'Vermont',
'WA': 'Washington',
'WI': 'Wisconsin',
'WV': 'West Virginia',
'WY': 'Wyoming'
}
# Replace state codes that match a key of states_abbrev_dict with the
# corresponding full name.
state_data2004.State.replace(states_abbrev_dict, inplace=True);
state_data2008.State.replace(states_abbrev_dict, inplace=True);
# Time left between each poll and the election of its own cycle.
state_data2004["days_until"] = date2004 - state_data2004.Date
# The 2008 column must be derived from the 2008 poll dates; it was
# previously computed from state_data2004.Date by copy-paste.
state_data2008["days_until"] = date2008 - state_data2008.Date
#state_data2004 = state_data2004.drop(
# state_data2004.index[state_data2004.days_until > days_before])
#state_data2008 = state_data2008.drop(
# state_data2008.index[state_data2008.days_until > days_before])
def exp_decay(days):
    """Return a weight that halves for every 30 days of age.

    ``days`` may be a plain number or any object exposing a ``days``
    attribute (such as a timedelta), in which case that attribute is used.
    """
    elapsed = days.days if hasattr(days, "days") else days
    return 0.5 ** (elapsed / 30.)
# Recency weights for every poll, relative to two reference dates per cycle:
# the cutoff date (days_before2004 / days_before2008, which are datetimes)
# and election day itself. Polls dated after a reference date produce a
# negative age and therefore a weight above 1.
state_data2004["time_weight_oct2"] = (days_before2004 -
state_data2004["Date"]).apply(exp_decay)
state_data2004["time_weight_election"] = (date2004 -
state_data2004["Date"]).apply(exp_decay)
state_data2008["time_weight_oct2"] = (days_before2008 -
state_data2008["Date"]).apply(exp_decay)
state_data2008["time_weight_election"] = (date2008 -
state_data2008["Date"]).apply(exp_decay)
def weighted_mean(group, weights_name):
    """Return the weighted average of ``group["dem_spread"]``.

    ``weights_name`` names the column of ``group`` holding the weights;
    each weight is normalised by the sum of all weights in the group.
    """
    w = group[weights_name]
    contributions = w * group["dem_spread"] / np.sum(w)
    return np.sum(contributions)
def get_state_averages(dframe, time_weight_name):
    """Return one weighted average spread per state.

    Works in two stages:

    1. Within every (State, Pollster) pair, average ``dem_spread`` using
       the weights in column ``time_weight_name``.
    2. Within every state, average those per-pollster figures using each
       pollster's ``Weight``.

    Parameters
    ----------
    dframe : DataFrame
        Needs the columns ``State``, ``Pollster``, ``Weight``,
        ``dem_spread`` and the column named by ``time_weight_name``.
    time_weight_name : str
        Name of the per-poll weight column used in stage 1.

    Returns
    -------
    The result of the state-level groupby/apply, indexed by ``State``.
    """
    by_pollster = dframe.groupby(["State", "Pollster"]).apply(
        weighted_mean, time_weight_name)
    by_pollster.name = "dem_spread"
    by_pollster = by_pollster.reset_index()
    # dframe has one row per poll. Merging against it directly repeated
    # every pollster average once per poll that pollster ran anywhere, which
    # silently scaled its weight by its poll count. Deduplicate first.
    pollster_weights = dframe[["Pollster", "Weight"]].drop_duplicates()
    by_pollster = by_pollster.merge(pollster_weights, on="Pollster")
    return by_pollster.groupby("State").apply(weighted_mean, "Weight")
# 2004: mask of polls dated on or before the cutoff date.
oct2 = state_data2004.Date <= days_before2004
# Averages as of the cutoff (only polls up to it) and as of election day
# (all polls), each with its matching recency weights.
state_polls_oct2_2004 = get_state_averages(state_data2004.ix[oct2], "time_weight_oct2")
state_polls_election_2004 = get_state_averages(state_data2004, "time_weight_election")
# States with at least one poll after the cutoff, sorted in place.
updated2004 = state_data2004.ix[~oct2].State.unique()
updated2004.sort()
# 2008: same steps; the oct2 mask is rebound to the 2008 frame.
oct2 = state_data2008.Date <= days_before2008
state_polls_oct2_2008 = get_state_averages(state_data2008.ix[oct2], "time_weight_oct2")
state_polls_election_2008 = get_state_averages(state_data2008, "time_weight_election")
updated2008 = state_data2008.ix[~oct2].State.unique()
updated2008.sort()
FRED Variable | Explanation |
---|---|
PAYEMS | Nonfarm-Payrolls (Job Growth) |
PI | Personal Income |
INDPRO | Industrial Production |
PCEC96 | Consumption |
CPIAUCSL | Inflation |
from pandas.io.data import DataReader
# Short column name -> FRED series identifier.
series = dict(jobs = "PAYEMS",
              income = "PI",
              prod = "INDPRO",
              cons = "PCEC96",
              prices = "CPIAUCSL")
# Local cache written after a successful download and read when it fails.
indicators_cache = ("/home/skipper/school/talks/538model/"
                    "tmp_indicators_full.csv")
try:
    indicators = []
    for variable in series:
        data = DataReader(series[variable], "fred", start="2000-10-1")
        # Rename the single column from the FRED id to the short name.
        data.rename(columns={series[variable] : variable}, inplace=True)
        indicators.append(data)
    indicators = pandas.concat(indicators, axis=1)
    indicators.to_csv(indicators_cache)
except Exception:  # probably not online
    # Exception rather than a bare except, so KeyboardInterrupt and
    # SystemExit still propagate instead of triggering the fallback.
    # NOTE(review): indentation was lost in this export; the index fix-up
    # below is assumed to belong to the fallback branch — confirm.
    indicators = pandas.read_csv(indicators_cache, parse_dates=True)
    indicators.set_index("DATE", inplace=True)
    # DATE is an ordinary column when the file is read, so the index has to
    # be converted to a DatetimeIndex explicitly after it is set.
    indicators.index = pandas.DatetimeIndex(indicators.index)
For stock variables, just compute annualized quarterly growth rates (end - beginning)/beginning * 400 and average.
# Quarterly means of each indicator, then the log difference between
# consecutive quarters times 400 (annualized percent growth).
quarterly_growth = np.log(indicators.resample("Q",
how="mean")).diff() * 400
# Yearly mean of the quarterly growth rates, computed before the NaN rows
# are dropped. Not referenced again in the visible part of the file.
annualized = quarterly_growth.resample("A", how="mean")
quarterly_growth = quarterly_growth.dropna()
Try to be rigorous about what the voters know at the time of election.
# Per-indicator average of the annual mean growth rates over the rows
# selected for each election cycle.
# NOTE(review): the integer slices are assumed to be positional (rows 0-14
# and 15-30); row 31 falls in neither these slices nor the 2012 one —
# confirm the boundaries against the actual quarterly index.
econ2004 = quarterly_growth.ix[:15].resample('A', 'mean').mean()
econ2008 = quarterly_growth.ix[15:31].resample('A', 'mean').mean()
Leave out last quarter 2008 because that's on Bush? Do voters see it that way...?
# Same averaging for the 2012 cycle, from row 32 onward.
# NOTE(review): slice assumed positional — confirm.
econ2012 = quarterly_growth.ix[32:].resample('A', 'mean').mean()
For flow variables, sum the quarters and get annualized quarter over quarter changes then average.
Partisan voting index
# Partisan voting index per state, converted from strings such as "R+13",
# "D+7" and "EVEN" to signed floats (Republican negative).
pvi = pandas.read_csv("/home/skipper/school/talks/538model/data/partisan_voting.csv")
pvi.set_index("State", inplace=True);
pvi.PVI = pvi.PVI.replace({"EVEN" : "0"})
# Raw strings: "\+" is a regex escape, not a Python string escape, and the
# non-raw spelling is an invalid escape sequence.
# NOTE(review): newer pandas releases treat str.replace patterns literally
# unless regex=True is passed — confirm against the installed version.
pvi.PVI = pvi.PVI.str.replace(r"R\+", "-")
pvi.PVI = pvi.PVI.str.replace(r"D\+", "")
pvi.PVI = pvi.PVI.astype(float)
# Notebook echo of the parsed column.
pvi.PVI
State Alabama -13 Alaska -13 Arizona -6 Arkansas -9 California 7 Colorado 0 Connecticut 7 Delaware 7 District of Columbia 39 Florida -2 Georgia -7 Hawaii 12 Idaho -17 Illinois 8 Indiana -6 Iowa 1 Kansas -12 Kentucky -10 Louisiana -10 Maine 5 Maryland 9 Massachusetts 12 Michigan 4 Minnesota 2 Mississippi -10 Missouri -3 Montana -7 Nebraska -13 Nevada 1 New Hampshire 2 New Jersey 4 New Mexico 2 New York 10 North Carolina -4 North Dakota -10 Ohio -1 Oklahoma -17 Oregon 4 Pennsylvania 2 Rhode Island 11 South Carolina -8 South Dakota -9 Tennessee -9 Texas -10 Utah -20 Vermont 13 Virginia -2 Washington 5 West Virginia -8 Wisconsin 2 Wyoming -20 Name: PVI, Length: 51
Gallup party affiliation (Poll Jan.-Jun. 2012)
# Party identification by state.
party_affil = pandas.read_csv("/home/skipper/school/talks/538model/"
"data/gallup_electorate.csv")
# Strip the percent signs and convert the shares to floats.
party_affil.Democrat = party_affil.Democrat.str.replace("%", "").astype(float)
party_affil.Republican = party_affil.Republican.str.replace("%", "").astype(float)
party_affil.set_index("State", inplace=True);
party_affil.rename(columns={"Democrat Advantage" : "dem_adv"}, inplace=True);
# Share identifying with neither party: 100 minus the two party shares.
party_affil["no_party"] = 100 - party_affil.Democrat - party_affil.Republican
# Notebook echo of the two columns used later.
party_affil[["dem_adv", "no_party"]]
dem_adv no_party State District of Columbia 66.30 8.3 Rhode Island 26.00 21.0 Hawaii 25.60 17.0 New York 21.20 17.2 Maryland 20.20 12.2 Massachusetts 19.10 14.1 Delaware 17.40 16.4 Connecticut 15.40 15.8 Vermont 13.90 16.3 California 13.70 17.1 Illinois 12.60 15.8 New Jersey 11.50 16.7 Michigan 11.10 15.7 Minnesota 10.20 13.4 Washington 9.80 14.8 Oregon 8.10 13.7 Pennsylvania 5.20 12.4 Maine 4.40 16.8 New Mexico 3.60 14.2 Ohio 3.60 15.4 West Virginia 3.40 12.8 Wisconsin 2.80 12.8 Iowa 1.80 15.4 Florida 0.70 14.7 Arkansas 0.70 17.7 Kentucky 0.40 13.4 North Carolina 0.20 13.4 New Hampshire -1.50 13.9 Virginia -3.00 14.6 Missouri -3.90 15.9 Georgia -4.00 15.4 Nevada -4.20 17.4 Louisiana -4.80 14.6 Colorado -5.20 15.0 Texas -5.80 17.6 South Dakota -6.00 11.0 Indiana -6.70 15.3 Mississippi -7.00 12.8 Arizona -7.50 12.9 Tennessee -8.40 15.4 Alaska -8.44 19.8 Oklahoma -9.40 13.4 South Carolina -11.90 14.3 North Dakota -13.20 15.2 Alabama -13.60 14.4 Montana -13.70 14.5 Kansas -16.90 14.3 Nebraska -19.00 14.8 Wyoming -29.90 16.7 Idaho -30.30 14.7 Utah -39.30 11.7
Census data
# Demographics table stored under the 2012 name.
census_data_2012 = pandas.read_csv("/home/skipper/school/talks/"
"538model/data/census_demographics.csv")
def capitalize(s):
    """Title-case ``s``, then turn every "Of" back into "of".

    For example, "district of columbia" becomes "District of Columbia".
    """
    return s.title().replace("Of", "of")
# Re-key the 2012 table by a properly capitalised "State" column.
census_data_2012["State"] = census_data_2012.state.map(capitalize)
del census_data_2012["state"]
census_data_2012.set_index("State", inplace=True);
census_data_2000 = pandas.read_csv("/home/skipper/school/talks/"
"538model/data/census_data_2000.csv")
census_data_2000.set_index("State", inplace=True);
# Element-wise midpoint of the 2000 and 2012 tables, stored under the 2005
# name.
census_data_2005 = (census_data_2000 + census_data_2012) / 2.
# Poll movement per state: election-day average minus cutoff average,
# restricted to states that had at least one poll after the cutoff.
changes_2004 = state_polls_election_2004.ix[updated2004].sub(
state_polls_oct2_2004)
# Drop states where the subtraction produced no value.
changes_2004 = changes_2004.dropna()
changes_2008 = state_polls_election_2008.ix[updated2008].sub(
state_polls_oct2_2008)
changes_2008 = changes_2008.dropna()
# Notebook echo.
changes_2004
Alabama -1.533 Arizona -2.196 Arkansas -0.278 California -0.022 Colorado -0.615 Florida 0.170 Georgia 2.119 Illinois 3.262 Indiana 2.566 Iowa 0.154 Kansas 1.640 Kentucky -3.251 Maine 3.151 Maryland 2.609 Michigan -0.471 Minnesota 0.363 Missouri -1.095 Montana 0.495 Nevada -1.676 New Hampshire 0.689 New Jersey 3.254 New Mexico -2.374 New York 2.350 North Carolina -0.304 Ohio 2.134 Oklahoma -1.377 Oregon 3.295 Pennsylvania 0.210 Rhode Island -2.322 South Carolina -0.051 South Dakota 0.009 Tennessee -3.368 Texas -0.620 Virginia -1.851 Washington -0.155 West Virginia -0.254 Wisconsin -0.022
changes_2008
Alabama 1.409 Alaska 1.032 Arizona 4.619 Arkansas 0.016 California 6.518 Colorado 1.487 Connecticut 1.022 Delaware 4.191 Florida 2.063 Georgia 2.469 Illinois 1.660 Indiana 1.171 Iowa 1.709 Kansas 0.727 Kentucky 1.636 Louisiana -0.125 Maine 3.708 Massachusetts 4.180 Michigan 1.890 Minnesota 1.464 Mississippi 0.339 Missouri 5.364 Montana 0.403 Nevada 2.548 New Hampshire 6.714 New Jersey 0.812 New Mexico 2.179 New York 5.212 North Carolina 4.124 Ohio 3.271 Oklahoma 1.392 Oregon 5.742 Pennsylvania 3.088 South Carolina 1.726 South Dakota 1.904 Tennessee 1.873 Texas -0.120 Utah 0.236 Vermont 0.008 Virginia 2.594 Washington 3.398 West Virginia 0.667 Wisconsin 3.615 Wyoming -0.821
# Add each economic indicator as a constant column: the 2004-cycle values go
# on the 2000 census frame, the 2008-cycle values on the 2005 frame.
for name in econ2004.index:
census_data_2000[name] = econ2004.ix[name]
for name in econ2008.index:
census_data_2005[name] = econ2008.ix[name]
# Response variable: the poll change of the matching cycle, assigned from
# the state-indexed change series.
census_data_2000["poll_change"] = changes_2004
census_data_2005["poll_change"] = changes_2008
#changes_2008 = changes_2008.join(census_data_2005)
#years = pandas.DataFrame([2004]*len(changes_2004), columns=["Year"], index=changes_2004.index)
#years["poll_change"] = changes_2004
#changes_2004 = years
#years = pandas.DataFrame([2008]*len(changes_2008), columns=["Year"], index=changes_2008.index)
#years["poll_change"] = changes_2008
#changes_2008 = years
#changes_2004
#changes_2004 = changes_2004.join(census_data_2000, how="left")
#changes_2008 = changes_2008.join(census_data_2000, how="left")
# Tag each frame with its election year and stack them into the training
# set.
census_data_2000["year"] = 2004
census_data_2005["year"] = 2008
changes = pandas.concat((census_data_2000.reset_index(), census_data_2005.reset_index()))
changes.reset_index(drop=True, inplace=True);
changes = changes.dropna() # don't have polls for all the states
# Prediction set: the 2012 demographics, tagged with the year.
predict = census_data_2012.reset_index()
predict["year"] = 2012
Add in Partisan information
# Attach the PVI table to the training and prediction sets, matching on
# State.
changes = changes.merge(pvi.reset_index(), on="State")
predict = predict.merge(pvi.reset_index(), on="State")
Add in Party affiliation information
# Attach the dem_adv and no_party columns to both sets, matching on State.
changes = changes.merge(party_affil[["dem_adv", "no_party"]].reset_index(), on="State")
predict = predict.merge(party_affil[["dem_adv", "no_party"]].reset_index(), on="State")
from scipy.cluster import vq
from sklearn import cluster
# Cluster the states on demographic and partisan features.
clstr_dta = predict[["per_black", "per_hisp", "per_white", "educ_coll", "pop_density", "per_older", "PVI", "dem_adv"]].values
clstr_dta = vq.whiten(clstr_dta) # might want to play with this to emphasize dimensions?
# NOTE(review): no random_state is passed, so the cluster numbering (and
# possibly the membership) may differ between runs — confirm whether the
# downstream regression terms rely on specific labels.
kmeans = cluster.KMeans(n_clusters=7, n_init=100)
kmeans.fit(clstr_dta)
values = kmeans.cluster_centers_
labels = kmeans.labels_
predict["kmeans_groups"] = labels
# Print the member states of each cluster (Python 2 print statement).
for key, grp in predict.groupby("kmeans_groups"): print key, grp.State.tolist()
0 ['Arizona', 'California', 'Nevada', 'New Mexico', 'Texas'] 1 ['Iowa', 'Maine', 'Michigan', 'Minnesota', 'New Hampshire', 'Ohio', 'Oregon', 'Pennsylvania', 'Vermont', 'Wisconsin'] 2 ['District of Columbia'] 3 ['Arkansas', 'Idaho', 'Indiana', 'Kansas', 'Kentucky', 'Missouri', 'Montana', 'Nebraska', 'North Dakota', 'Oklahoma', 'South Dakota', 'Tennessee', 'West Virginia', 'Wyoming'] 4 ['Alabama', 'Georgia', 'Louisiana', 'Mississippi', 'North Carolina', 'South Carolina'] 5 ['Colorado', 'Connecticut', 'Delaware', 'Florida', 'Hawaii', 'Illinois', 'Maryland', 'Massachusetts', 'New Jersey', 'New York', 'Rhode Island', 'Virginia', 'Washington'] 6 ['Alaska', 'Utah']
# Copy each state's cluster label onto the training rows.
changes = changes.merge(predict[["kmeans_groups", "State"]], on="State")
Drop D.C. because it's not in the training data.
# Index both sets by (State, year) and remove the D.C. row from the
# prediction set.
predict.set_index(["State", "year"], inplace=True);
predict = predict.drop(("District of Columbia", 2012))
changes.set_index(["State", "year"], inplace=True);
from statsmodels.formula.api import ols
changes
<class 'pandas.core.frame.DataFrame'> MultiIndex: 81 entries, ('Alabama', 2004) to ('Wyoming', 2008) Data columns: average_income 81 non-null values cons 81 non-null values educ_coll 81 non-null values educ_hs 81 non-null values income 81 non-null values jobs 81 non-null values median_income 81 non-null values older_pop 81 non-null values per_black 81 non-null values per_hisp 81 non-null values per_older 81 non-null values per_vote 81 non-null values per_white 81 non-null values poll_change 81 non-null values pop_density 81 non-null values prices 81 non-null values prod 81 non-null values vote_pop 81 non-null values PVI 81 non-null values dem_adv 81 non-null values no_party 81 non-null values kmeans_groups 81 non-null values dtypes: float64(21), int64(1)
changes[["dem_adv", "PVI"]].corr()
dem_adv PVI dem_adv 1.000 0.877 PVI 0.877 1.000
# OLS of poll change on cluster dummies, demographics (with two
# interactions), log median income and PVI.
formula = ("poll_change ~ C(kmeans_groups) + per_older*per_white + "
"per_hisp + no_party*np.log(median_income) + PVI")
mod = ols(formula, data=changes).fit()
# Python 2 print statement.
print mod.summary()
OLS Regression Results ============================================================================== Dep. Variable: poll_change R-squared: 0.498 Model: OLS Adj. R-squared: 0.400 Method: Least Squares F-statistic: 5.103 Date: Sat, 27 Oct 2012 Prob (F-statistic): 3.47e-06 Time: 12:46:19 Log-Likelihood: -147.99 No. Observations: 81 AIC: 324.0 Df Residuals: 67 BIC: 357.5 Df Model: 13 ================================================================================================== coef std err t P>|t| [95.0% Conf. Int.] -------------------------------------------------------------------------------------------------- Intercept -129.4535 67.810 -1.909 0.061 -264.802 5.896 C(kmeans_groups)[T.1] 2.1833 1.534 1.423 0.159 -0.879 5.246 C(kmeans_groups)[T.3] 2.8481 1.575 1.808 0.075 -0.296 5.992 C(kmeans_groups)[T.4] 2.2163 1.644 1.348 0.182 -1.065 5.498 C(kmeans_groups)[T.5] 0.3561 1.197 0.297 0.767 -2.033 2.746 C(kmeans_groups)[T.6] 1.7437 1.952 0.893 0.375 -2.152 5.639 per_older -0.3697 0.380 -0.974 0.334 -1.127 0.388 per_white -0.0950 0.055 -1.733 0.088 -0.204 0.014 per_older:per_white 0.0064 0.004 1.425 0.159 -0.003 0.015 per_hisp 0.0276 0.056 0.490 0.626 -0.085 0.140 no_party 4.9148 4.124 1.192 0.238 -3.317 13.147 np.log(median_income) 13.4102 6.651 2.016 0.048 0.136 26.685 no_party:np.log(median_income) -0.4960 0.402 -1.235 0.221 -1.298 0.306 PVI 0.1257 0.046 2.747 0.008 0.034 0.217 ============================================================================== Omnibus: 1.558 Durbin-Watson: 1.886 Prob(Omnibus): 0.459 Jarque-Bera (JB): 1.217 Skew: 0.054 Prob(JB): 0.544 Kurtosis: 2.410 Cond. No. 2.92e+05 ============================================================================== The condition number is large, 2.92e+05. This might indicate that there are strong multicollinearity or other numerical problems.
# Joint hypothesis that every cluster dummy is zero. The terms are picked
# by name: the former positional slice [:5] included the Intercept and left
# out the last dummy.
group_terms = [term for term in mod.model.exog_names
               if term.startswith("C(kmeans_groups)")]
hyp = ", ".join(group_terms)
print(hyp)
Intercept, C(kmeans_groups)[T.1], C(kmeans_groups)[T.3], C(kmeans_groups)[T.4], C(kmeans_groups)[T.5]
# F test of the joint hypothesis built in hyp (Python 2 print statement).
print mod.f_test(hyp)
<F test: F=array([[ 1.6831]]), p=[[ 0.1507]], df_denom=67, df_num=5>
# Poll values for 2012 read from file, indexed by (State, year) to line up
# with the prediction set.
predicted2012 = pandas.read_csv("/home/skipper/school/talks/538model/2012-predicted.csv")
predicted2012["year"] = 2012
predicted2012 = predicted2012.set_index(["State", "year"])["poll"]
# Model-predicted poll change for every row of the prediction set.
predicted_change = pandas.Series(mod.predict(predict), index=predict.index)
# Notebook echo.
predicted_change
State year Alabama 2012 2.408 Alaska 2012 1.938 Arizona 2012 4.496 Arkansas 2012 1.388 California 2012 7.027 Colorado 2012 3.907 Connecticut 2012 5.086 Delaware 2012 4.265 Florida 2012 3.977 Georgia 2012 4.918 Hawaii 2012 9.279 Idaho 2012 1.652 Illinois 2012 4.905 Indiana 2012 3.014 Iowa 2012 2.634 Kansas 2012 3.370 Kentucky 2012 1.863 Louisiana 2012 3.549 Maine 2012 1.658 Maryland 2012 9.220 Massachusetts 2012 5.800 Michigan 2012 3.936 Minnesota 2012 5.226 Mississippi 2012 3.455 Missouri 2012 2.945 Montana 2012 2.046 Nebraska 2012 2.599 Nevada 2012 4.072 New Hampshire 2012 4.682 New Jersey 2012 5.568 New Mexico 2012 6.087 New York 2012 4.985 North Carolina 2012 4.740 North Dakota 2012 1.645 Ohio 2012 2.841 Oklahoma 2012 3.058 Oregon 2012 4.968 Pennsylvania 2012 5.191 Rhode Island 2012 1.271 South Carolina 2012 3.645 South Dakota 2012 3.876 Tennessee 2012 2.552 Texas 2012 3.181 Utah 2012 3.259 Vermont 2012 3.415 Virginia 2012 4.519 Washington 2012 4.241 West Virginia 2012 1.071 Wisconsin 2012 4.811 Wyoming 2012 0.955
# Projected spread: the 2012 poll value plus the predicted change. Entries
# present in only one of the two series come out as NaN.
results = predicted2012 + predicted_change
# Notebook echo.
results
State year Alabama 2012 NaN Alaska 2012 NaN Arizona 2012 -1.855 Arkansas 2012 NaN California 2012 26.821 Colorado 2012 10.854 Connecticut 2012 19.053 Delaware 2012 NaN Florida 2012 6.057 Georgia 2012 -4.050 Hawaii 2012 40.512 Idaho 2012 NaN Illinois 2012 31.774 Indiana 2012 -3.856 Iowa 2012 4.959 Kansas 2012 -6.171 Kentucky 2012 NaN Louisiana 2012 NaN Maine 2012 14.393 Maryland 2012 38.075 Massachusetts 2012 27.616 Michigan 2012 12.497 Minnesota 2012 13.273 Mississippi 2012 -5.182 Missouri 2012 0.970 Montana 2012 -4.989 Nebraska 2012 -6.064 Nevada 2012 13.094 New Hampshire 2012 3.549 New Jersey 2012 19.113 New Mexico 2012 15.232 New York 2012 28.191 North Carolina 2012 4.150 North Dakota 2012 -7.493 Ohio 2012 7.224 Oklahoma 2012 NaN Oregon 2012 14.085 Pennsylvania 2012 10.883 Rhode Island 2012 27.202 South Carolina 2012 -3.122 South Dakota 2012 2.739 Tennessee 2012 -0.331 Texas 2012 0.603 Utah 2012 -25.883 Vermont 2012 20.227 Virginia 2012 9.504 Washington 2012 20.359 West Virginia 2012 -8.706 Wisconsin 2012 9.720 Wyoming 2012 NaN
electoral_votes = pandas.read_csv("/home/skipper/school/seaboldgit/talks/pydata/data/electoral_votes.csv")
# Sort by state and renumber the rows. These are separate statements
# because an in-place method is not guaranteed to return the frame, so
# chaining reset_index onto sort(..., inplace=True) can fail on None.
electoral_votes.sort("State", inplace=True)
electoral_votes.reset_index(drop=True, inplace=True);
# States assigned by hand; their projected spread is NaN in the printed
# results.
red_states = ["Alabama", "Alaska", "Arkansas", "Idaho", "Kentucky", "Louisiana",
"Oklahoma", "Wyoming"]
blue_states = ["Delaware"]#, "District of Columbia"]
results.name = "Poll"
results = results.reset_index()
# Left join keeps every projected state; the frame is then indexed by State.
results = results.merge(electoral_votes, on="State", how="left").set_index("State")
# 0/1 indicator columns for the projected winner of each state.
results["obama"] = 0
results["romney"] = 0
results.ix[results["Poll"] > 0, ["obama"]] = 1
results.ix[results["Poll"] < 0, ["romney"]] = 1
# Manual assignments for the hand-picked states.
results.ix[red_states, ["romney"]] = 1
results.ix[blue_states, ["obama"]] = 1
# Electoral-vote totals (Python 2 print statements).
# NOTE(review): the "+ 3" presumably adds D.C.'s electoral votes, since D.C.
# was dropped from the prediction set — confirm.
print results["Votes"].mul(results["obama"]).sum() + 3
print results["Votes"].mul(results["romney"]).sum()
398 140
Component-plus-residual (CCPR) plots, also known as partial residual plots, attempt to show the relationship between a given independent variable and the response variable, given that the other independent variables are also in the model.
from statsmodels.graphics.regressionplots import plot_ccpr_ax
# CCPR plot for regressor 11, which is np.log(median_income) in the order
# of the printed regression summary (Intercept is regressor 0).
fig, ax = plt.subplots(figsize=(12,8))
fig = plot_ccpr_ax(mod, 11, ax=ax)
ax = fig.axes[0]
ax.set_title("log(median_income)*B_11 + Resid vs log(median_income)");
# CCPR (partial residual) plot for regressor 9 of `mod`; per the title set
# below, that regressor is per_hisp.
# This import repeats the one in the previous plotting cell, so the cell can
# be run on its own.
from statsmodels.graphics.regressionplots import plot_ccpr_ax
fig, ax = plt.subplots(figsize=(12,8))
# Draw onto the axes created above; `fig` is rebound to the returned figure.
fig = plot_ccpr_ax(mod, 9, ax=ax)
ax = fig.axes[0]
# Trailing semicolon: presumably suppresses the notebook's echo of the
# set_title return value.
ax.set_title("per_hisp*B_9 + resid vs per_hisp");
# Regressor matrix kept on the model's data object
# (presumably the labelled, pre-conversion exog -- confirm against statsmodels).
X = mod.model.data.orig_exog
# Bare expression: the notebook displays the first six columns.
X[X.columns[:6]]
Intercept C(kmeans_groups)[T.1] C(kmeans_groups)[T.3] C(kmeans_groups)[T.4] C(kmeans_groups)[T.5] C(kmeans_groups)[T.6] State year Alabama 2004 1 0 0 1 0 0 2008 1 0 0 1 0 0 Alaska 2008 1 0 0 0 0 1 Arizona 2004 1 0 0 0 0 0 2008 1 0 0 0 0 0 Arkansas 2004 1 0 1 0 0 0 2008 1 0 1 0 0 0 California 2004 1 0 0 0 0 0 2008 1 0 0 0 0 0 Colorado 2004 1 0 0 0 1 0 2008 1 0 0 0 1 0 Connecticut 2008 1 0 0 0 1 0 Delaware 2008 1 0 0 0 1 0 Florida 2004 1 0 0 0 1 0 2008 1 0 0 0 1 0 Georgia 2004 1 0 0 1 0 0 2008 1 0 0 1 0 0 Illinois 2004 1 0 0 0 1 0 2008 1 0 0 0 1 0 Indiana 2004 1 0 1 0 0 0 2008 1 0 1 0 0 0 Iowa 2004 1 1 0 0 0 0 2008 1 1 0 0 0 0 Kansas 2004 1 0 1 0 0 0 2008 1 0 1 0 0 0 Kentucky 2004 1 0 1 0 0 0 2008 1 0 1 0 0 0 Louisiana 2008 1 0 0 1 0 0 Maine 2004 1 1 0 0 0 0 2008 1 1 0 0 0 0 Maryland 2004 1 0 0 0 1 0 Massachusetts 2008 1 0 0 0 1 0 Michigan 2004 1 1 0 0 0 0 2008 1 1 0 0 0 0 Minnesota 2004 1 1 0 0 0 0 2008 1 1 0 0 0 0 Mississippi 2008 1 0 0 1 0 0 Missouri 2004 1 0 1 0 0 0 2008 1 0 1 0 0 0 Montana 2004 1 0 1 0 0 0 2008 1 0 1 0 0 0 Nevada 2004 1 0 0 0 0 0 2008 1 0 0 0 0 0 New Hampshire 2004 1 1 0 0 0 0 2008 1 1 0 0 0 0 New Jersey 2004 1 0 0 0 1 0 2008 1 0 0 0 1 0 New Mexico 2004 1 0 0 0 0 0 2008 1 0 0 0 0 0 New York 2004 1 0 0 0 1 0 2008 1 0 0 0 1 0 North Carolina 2004 1 0 0 1 0 0 2008 1 0 0 1 0 0 Ohio 2004 1 1 0 0 0 0 2008 1 1 0 0 0 0 Oklahoma 2004 1 0 1 0 0 0 2008 1 0 1 0 0 0 Oregon 2004 1 1 0 0 0 0 2008 1 1 0 0 0 0 Pennsylvania 2004 1 1 0 0 0 0 2008 1 1 0 0 0 0 Rhode Island 2004 1 0 0 0 1 0 South Carolina 2004 1 0 0 1 0 0 2008 1 0 0 1 0 0 South Dakota 2004 1 0 1 0 0 0 2008 1 0 1 0 0 0 Tennessee 2004 1 0 1 0 0 0 2008 1 0 1 0 0 0 Texas 2004 1 0 0 0 0 0 2008 1 0 0 0 0 0 Utah 2008 1 0 0 0 0 1 Vermont 2008 1 1 0 0 0 0 Virginia 2004 1 0 0 0 1 0 2008 1 0 0 0 1 0 Washington 2004 1 0 0 0 1 0 2008 1 0 0 0 1 0 West Virginia 2004 1 0 1 0 0 0 2008 1 0 1 0 0 0 Wisconsin 2004 1 1 0 0 0 0 2008 1 1 0 0 0 0 Wyoming 2008 1 0 1 0 0 0
# Bare expression: the notebook displays the remaining columns (index 6 onward).
X[X.columns[6:]]
per_older per_white per_older:per_white per_hisp no_party np.log(median_income) no_party:np.log(median_income) PVI State year Alabama 2004 13.038 70.289 916.404 1.705 14.4 9.961 143.441 -13 2008 6.589 68.544 451.628 2.853 14.4 10.362 149.213 -13 Alaska 2008 2.888 65.649 189.568 4.962 19.8 10.740 212.644 -13 Arizona 2004 13.017 63.818 830.698 25.253 12.9 10.018 129.233 -6 2008 6.579 60.609 398.767 27.676 12.9 10.503 135.493 -6 Arkansas 2004 13.990 78.557 1099.039 3.249 17.7 9.883 174.925 -9 2008 7.068 76.378 539.857 4.925 17.7 10.290 182.128 -9 California 2004 10.616 46.696 495.706 32.377 17.1 10.128 173.183 7 2008 5.366 43.198 231.813 35.238 17.1 10.668 182.421 7 Colorado 2004 9.673 74.464 720.309 17.102 15.0 10.139 152.089 0 2008 4.893 72.082 352.707 19.001 15.0 10.619 159.279 0 Connecticut 2008 6.975 74.193 517.509 11.603 15.8 10.801 170.657 7 Delaware 2008 6.564 68.791 451.576 6.579 16.4 10.640 174.489 7 Florida 2004 17.567 65.438 1149.534 16.785 14.7 10.001 147.016 -2 2008 8.871 61.469 545.316 19.843 14.7 10.459 153.747 -2 Georgia 2004 9.592 62.648 600.944 5.316 15.4 10.090 155.393 -7 2008 4.851 59.074 286.579 7.208 15.4 10.511 161.874 -7 Illinois 2004 12.078 67.831 819.276 12.322 15.8 10.162 160.553 8 2008 6.103 65.566 400.120 14.261 15.8 10.617 167.745 8 Indiana 2004 12.381 85.838 1062.770 3.528 15.3 10.053 153.813 -6 2008 6.257 83.569 522.854 4.864 15.3 10.476 160.287 -6 Iowa 2004 14.907 92.619 1380.633 2.818 15.4 9.971 153.560 1 2008 7.528 90.510 681.335 4.009 15.4 10.467 161.193 1 Kansas 2004 13.251 83.097 1101.078 7.002 14.3 10.006 143.079 -12 2008 6.692 80.449 538.342 8.901 14.3 10.485 149.940 -12 Kentucky 2004 12.489 89.268 1114.906 1.483 13.4 9.950 133.329 -10 2008 6.312 87.684 553.480 2.341 13.4 10.350 138.693 -10 Louisiana 2008 5.846 61.314 358.445 3.405 14.6 10.373 151.445 -10 Maine 2004 14.385 96.500 1388.181 0.734 16.8 9.966 167.425 5 2008 7.274 95.400 693.955 1.067 16.8 10.437 175.347 5 Maryland 2004 11.315 62.051 702.124 4.303 12.2 10.284 
125.465 9 Massachusetts 2008 6.844 79.138 541.610 8.326 14.1 10.746 151.525 12 Michigan 2004 12.266 78.550 963.475 3.259 15.7 10.137 159.157 4 2008 6.203 77.475 480.605 3.879 15.7 10.515 165.080 4 Minnesota 2004 12.080 88.163 1064.992 2.915 13.4 10.147 135.965 2 2008 6.105 85.481 521.900 3.907 13.4 10.630 142.447 2 Mississippi 2008 6.103 59.221 361.429 2.145 12.8 10.268 131.431 -10 Missouri 2004 13.500 83.759 1130.780 2.120 15.9 9.987 158.800 -3 2008 6.821 82.279 561.246 2.910 15.9 10.434 165.905 -3 Montana 2004 13.406 89.540 1200.377 2.004 14.5 9.755 141.441 -7 2008 6.779 88.520 600.080 2.552 14.5 10.327 149.744 -7 Nevada 2004 10.956 65.207 714.406 19.716 17.4 10.111 175.933 1 2008 5.540 59.403 329.125 23.408 17.4 10.601 184.455 1 New Hampshire 2004 11.974 95.102 1138.723 1.658 13.9 10.162 141.254 2 2008 6.057 93.651 567.231 2.279 13.9 10.705 148.803 2 New Jersey 2004 13.229 66.044 873.703 13.277 16.7 10.323 172.402 4 2008 6.683 62.472 417.502 15.689 16.7 10.822 180.732 4 New Mexico 2004 11.667 44.721 521.752 42.076 14.2 9.874 140.217 2 2008 5.901 42.460 250.577 44.388 14.2 10.362 147.136 2 New York 2004 12.902 61.977 799.626 15.111 17.2 10.175 175.015 10 2008 6.520 59.988 391.096 16.556 17.2 10.619 182.655 10 North Carolina 2004 12.039 70.157 844.612 4.708 13.4 10.011 134.151 -4 2008 6.085 67.578 411.245 6.654 13.4 10.432 139.787 -4 Ohio 2004 13.281 84.013 1115.737 1.912 15.4 10.084 155.289 -1 2008 6.712 82.506 553.764 2.556 15.4 10.482 161.417 -1 Oklahoma 2004 13.213 74.084 978.899 5.196 13.4 9.901 132.680 -17 2008 6.675 71.142 474.887 7.198 13.4 10.357 138.781 -17 Oregon 2004 12.807 83.522 1069.661 8.047 13.7 10.008 137.108 4 2008 6.475 80.811 523.249 10.023 13.7 10.484 143.627 4 Pennsylvania 2004 15.627 84.052 1313.482 3.209 12.4 10.074 124.915 2 2008 7.892 81.626 644.153 4.554 12.4 10.520 130.450 2 Rhode Island 2004 14.538 81.887 1190.447 8.663 21.0 10.086 211.808 11 South Carolina 2004 12.097 66.109 799.717 2.370 14.3 9.979 142.701 -8 2008 6.119 65.054 
398.068 3.835 14.3 10.397 148.674 -8 South Dakota 2004 14.325 88.043 1261.207 1.444 11.0 9.867 108.533 -9 2008 7.234 86.221 623.766 2.172 11.0 10.399 114.388 -9 Tennessee 2004 12.362 79.200 979.077 2.177 15.4 9.985 153.770 -9 2008 6.250 77.300 483.089 3.438 15.4 10.389 159.994 -9 Texas 2004 9.939 52.433 521.153 31.986 17.6 10.005 176.092 -10 2008 5.022 48.617 244.161 35.043 17.6 10.488 184.595 -10 Utah 2008 4.305 82.686 355.964 11.113 11.7 10.557 123.520 -20 Vermont 2008 6.441 95.179 613.000 1.252 16.3 10.510 171.308 13 Virginia 2004 11.193 70.151 785.233 4.655 14.6 10.141 148.056 -2 2008 5.659 67.325 381.011 6.428 14.6 10.678 155.896 -2 Washington 2004 11.234 78.934 886.753 7.491 14.8 10.146 150.166 5 2008 5.681 75.517 428.977 9.545 14.8 10.630 157.329 5 West Virginia 2004 15.312 94.560 1447.906 0.679 12.8 9.861 126.215 -8 2008 7.737 93.780 725.579 0.990 12.8 10.267 131.419 -8 Wisconsin 2004 13.098 87.284 1143.276 3.597 12.8 10.069 128.884 2 2008 6.619 85.192 563.858 4.848 12.8 10.535 134.845 2 Wyoming 2008 5.905 87.182 514.852 7.757 16.7 10.513 175.563 -20
# Outlier test on `mod` with the "fdr_bh" p-value adjustment
# (a false-discovery-rate correction, going by the variable name).
false_disc = mod.outlier_test("fdr_bh")
# Sort rows in place by unadjusted p-value. As the last expression of the
# cell, the call's return value is what the notebook displays.
false_disc.sort("unadj_p", inplace=True)
student_resid unadj_p fdr_bh(p) State year Missouri 2008 2.125 0.037 0.748 New Hampshire 2008 2.099 0.040 0.748 Vermont 2008 -2.080 0.041 0.748 Rhode Island 2004 -2.063 0.043 0.748 Tennessee 2004 -2.032 0.046 0.748 New Jersey 2008 -1.934 0.057 0.774 Arizona 2008 1.861 0.067 0.778 Connecticut 2008 -1.727 0.089 0.784 Kentucky 2004 -1.657 0.102 0.784 Maine 2004 1.553 0.125 0.784 Illinois 2004 1.548 0.126 0.784 California 2008 1.521 0.133 0.784 Minnesota 2008 -1.500 0.138 0.784 Oregon 2008 1.480 0.144 0.784 2004 1.400 0.166 0.784 Maine 2008 1.392 0.169 0.784 New Jersey 2004 1.371 0.175 0.784 Michigan 2004 -1.345 0.183 0.784 New Mexico 2004 -1.339 0.185 0.784 Indiana 2004 1.313 0.194 0.784 New York 2008 1.221 0.226 0.818 Montana 2004 1.198 0.235 0.818 Georgia 2004 1.188 0.239 0.818 Kansas 2004 1.180 0.242 0.818 Louisiana 2008 -1.142 0.258 0.831 Missouri 2004 -1.120 0.267 0.831 Pennsylvania 2004 -1.024 0.310 0.873 Delaware 2008 1.014 0.314 0.873 Illinois 2008 -0.966 0.338 0.873 Ohio 2004 0.876 0.384 0.873 North Carolina 2008 0.861 0.392 0.873 Texas 2004 0.848 0.399 0.873 Wisconsin 2004 -0.797 0.429 0.873 Kansas 2008 -0.789 0.433 0.873 Ohio 2008 0.768 0.445 0.873 Virginia 2004 -0.764 0.447 0.873 New York 2004 0.758 0.451 0.873 California 2004 -0.742 0.461 0.873 Michigan 2008 -0.730 0.468 0.873 Texas 2008 -0.728 0.469 0.873 New Mexico 2008 -0.724 0.472 0.873 Florida 2004 0.723 0.472 0.873 Nevada 2004 -0.721 0.473 0.873 Mississippi 2008 -0.707 0.482 0.873 Minnesota 2004 -0.703 0.485 0.873 Washington 2008 0.563 0.576 0.951 South Dakota 2004 0.550 0.584 0.951 Indiana 2008 -0.542 0.590 0.951 Alaska 2008 0.536 0.593 0.951 Utah 2008 -0.536 0.593 0.951 Arizona 2004 -0.520 0.605 0.951 Wyoming 2008 -0.499 0.619 0.951 Pennsylvania 2008 -0.492 0.624 0.951 Kentucky 2008 0.447 0.656 0.951 Nevada 2008 0.437 0.664 0.951 Colorado 2008 -0.426 0.672 0.951 New Hampshire 2004 -0.414 0.680 0.951 Arkansas 2008 -0.380 0.705 0.951 Alabama 2008 0.358 0.722 0.951 Montana 2008 -0.357 0.722 0.951 
North Carolina 2004 -0.348 0.729 0.951 Iowa 2004 -0.325 0.746 0.951 Maryland 2004 0.304 0.762 0.951 South Carolina 2004 0.302 0.764 0.951 Tennessee 2008 0.278 0.782 0.951 Georgia 2008 -0.251 0.802 0.951 Iowa 2008 -0.197 0.844 0.951 South Dakota 2008 -0.196 0.845 0.951 Florida 2008 0.173 0.863 0.951 Arkansas 2004 0.170 0.865 0.951 Alabama 2004 -0.166 0.869 0.951 West Virginia 2004 0.153 0.879 0.951 Oklahoma 2008 0.150 0.881 0.951 Virginia 2008 0.137 0.891 0.951 Washington 2004 -0.133 0.895 0.951 Colorado 2004 -0.121 0.904 0.951 South Carolina 2008 -0.121 0.904 0.951 Wisconsin 2008 0.096 0.924 0.960 Massachusetts 2008 -0.075 0.940 0.964 Oklahoma 2004 0.055 0.957 0.969 West Virginia 2008 -0.005 0.996 0.996
# Outlier test on `mod` with the "sidak" p-value adjustment.
# NOTE(review): the variable is named `bonf`, but the method requested is
# "sidak", not Bonferroni -- confirm which correction was intended.
bonf = mod.outlier_test("sidak")
# Sort rows in place by unadjusted p-value. As the last expression of the
# cell, the call's return value is what the notebook displays.
bonf.sort("unadj_p", inplace=True)
student_resid unadj_p sidak(p) State year Missouri 2008 2.125 0.037 0.954 New Hampshire 2008 2.099 0.040 0.962 Vermont 2008 -2.080 0.041 0.967 Rhode Island 2004 -2.063 0.043 0.972 Tennessee 2004 -2.032 0.046 0.978 New Jersey 2008 -1.934 0.057 0.992 Arizona 2008 1.861 0.067 0.996 Connecticut 2008 -1.727 0.089 0.999 Kentucky 2004 -1.657 0.102 1.000 Maine 2004 1.553 0.125 1.000 Illinois 2004 1.548 0.126 1.000 California 2008 1.521 0.133 1.000 Minnesota 2008 -1.500 0.138 1.000 Oregon 2008 1.480 0.144 1.000 2004 1.400 0.166 1.000 Maine 2008 1.392 0.169 1.000 New Jersey 2004 1.371 0.175 1.000 Michigan 2004 -1.345 0.183 1.000 New Mexico 2004 -1.339 0.185 1.000 Indiana 2004 1.313 0.194 1.000 New York 2008 1.221 0.226 1.000 Montana 2004 1.198 0.235 1.000 Georgia 2004 1.188 0.239 1.000 Kansas 2004 1.180 0.242 1.000 Louisiana 2008 -1.142 0.258 1.000 Missouri 2004 -1.120 0.267 1.000 Pennsylvania 2004 -1.024 0.310 1.000 Delaware 2008 1.014 0.314 1.000 Illinois 2008 -0.966 0.338 1.000 Ohio 2004 0.876 0.384 1.000 North Carolina 2008 0.861 0.392 1.000 Texas 2004 0.848 0.399 1.000 Wisconsin 2004 -0.797 0.429 1.000 Kansas 2008 -0.789 0.433 1.000 Ohio 2008 0.768 0.445 1.000 Virginia 2004 -0.764 0.447 1.000 New York 2004 0.758 0.451 1.000 California 2004 -0.742 0.461 1.000 Michigan 2008 -0.730 0.468 1.000 Texas 2008 -0.728 0.469 1.000 New Mexico 2008 -0.724 0.472 1.000 Florida 2004 0.723 0.472 1.000 Nevada 2004 -0.721 0.473 1.000 Mississippi 2008 -0.707 0.482 1.000 Minnesota 2004 -0.703 0.485 1.000 Washington 2008 0.563 0.576 1.000 South Dakota 2004 0.550 0.584 1.000 Indiana 2008 -0.542 0.590 1.000 Alaska 2008 0.536 0.593 1.000 Utah 2008 -0.536 0.593 1.000 Arizona 2004 -0.520 0.605 1.000 Wyoming 2008 -0.499 0.619 1.000 Pennsylvania 2008 -0.492 0.624 1.000 Kentucky 2008 0.447 0.656 1.000 Nevada 2008 0.437 0.664 1.000 Colorado 2008 -0.426 0.672 1.000 New Hampshire 2004 -0.414 0.680 1.000 Arkansas 2008 -0.380 0.705 1.000 Alabama 2008 0.358 0.722 1.000 Montana 2008 -0.357 0.722 1.000 
North Carolina 2004 -0.348 0.729 1.000 Iowa 2004 -0.325 0.746 1.000 Maryland 2004 0.304 0.762 1.000 South Carolina 2004 0.302 0.764 1.000 Tennessee 2008 0.278 0.782 1.000 Georgia 2008 -0.251 0.802 1.000 Iowa 2008 -0.197 0.844 1.000 South Dakota 2008 -0.196 0.845 1.000 Florida 2008 0.173 0.863 1.000 Arkansas 2004 0.170 0.865 1.000 Alabama 2004 -0.166 0.869 1.000 West Virginia 2004 0.153 0.879 1.000 Oklahoma 2008 0.150 0.881 1.000 Virginia 2008 0.137 0.891 1.000 Washington 2004 -0.133 0.895 1.000 Colorado 2004 -0.121 0.904 1.000 South Carolina 2008 -0.121 0.904 1.000 Wisconsin 2008 0.096 0.924 1.000 Massachusetts 2008 -0.075 0.940 1.000 Oklahoma 2004 0.055 0.957 1.000 West Virginia 2008 -0.005 0.996 1.000
# Per-observation influence diagnostics for `mod`, gathered into one frame.
infl = mod.get_influence()
table = infl.summary_frame()

# List the available diagnostics, one column name per line.
for stat in table.columns:
    print(stat)
dfb_Intercept dfb_C(kmeans_groups)[T.1] dfb_C(kmeans_groups)[T.3] dfb_C(kmeans_groups)[T.4] dfb_C(kmeans_groups)[T.5] dfb_C(kmeans_groups)[T.6] dfb_per_older dfb_per_white dfb_per_older:per_white dfb_per_hisp dfb_no_party dfb_np.log(median_income) dfb_no_party:np.log(median_income) dfb_PVI cooks_d dffits dffits_internal hat_diag standard_resid student_resid
Measure the influence of points on prediction
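$$\text{DFFITS}_i=\frac{\hat{y}_i-\hat{y}_{i(i)}}{s_{(i)}\sqrt{h_{ii}}}$$
where $\hat{y}_{i(i)}$ and $s_{(i)}$ are the fitted value and the residual standard error computed with observation $i$ deleted. Points with $|\text{DFFITS}|$ greater than
$$2\left(\frac{p}{\text{nobs}}\right)^{1/2}$$
might be cause for concern.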
# Rule-of-thumb DFFITS cutoff, 2 * sqrt(p / nobs).
# NOTE(review): `df_model` is used as p here; if it excludes the intercept,
# p is one less than the number of estimated parameters -- confirm.
print(2 * np.sqrt(mod.df_model / mod.nobs))
0.80123361677
# Rank observations by |DFFITS|, taken from the influence table.
dffits = np.abs(table['dffits'].copy())
# Sorts in place (ascending, per the reversed slice below).
dffits.sort()
# Reverse to descending and keep the 15 largest; the bare expression is
# displayed by the notebook.
dffits[::-1][:15]
State year Rhode Island 2004 1.526 Vermont 2008 1.133 New Mexico 2004 1.022 Arizona 2008 0.989 California 2008 0.945 New Jersey 2008 0.909 New Hampshire 2008 0.872 Alaska 2008 0.844 Utah 2008 0.844 Maine 2004 0.810 Connecticut 2008 0.719 Missouri 2008 0.704 Maine 2008 0.611 Tennessee 2004 0.605 New Jersey 2004 0.565 Name: dffits
Indicate influential observations, where you might want more data.
Overall fit change with deleted observation.
$$\text{Cook's D}_i=\frac{e_i^2}{p\,\text{MSE}}\cdot\frac{h_{ii}}{(1-h_{ii})^2}$$
print 4/mod.nobs
0.0493827160494
# Rank observations by Cook's distance, taken from the influence table.
cooks_d = table["cooks_d"].copy()
# In-place sort (ascending, per the reversed slice below).
cooks_d.sort()
# Reverse to descending order and show the 15 largest values.
print(cooks_d[::-1][:15])
State year Rhode Island 2004 0.159 Vermont 2008 0.087 New Mexico 2004 0.074 Arizona 2008 0.067 California 2008 0.063 New Jersey 2008 0.057 New Hampshire 2008 0.052 Alaska 2008 0.051 Utah 2008 0.051 Maine 2004 0.046 Connecticut 2008 0.036 Missouri 2008 0.034 Maine 2008 0.026 Tennessee 2004 0.025 New Jersey 2004 0.023 Name: cooks_d
# Rank observations by the absolute value of the `student_resid` column of
# the influence table.
student_resid = np.abs(table.student_resid.copy())
# Sorts in place (ascending, per the reversed slice below).
student_resid.sort()
# Reverse to descending and keep the 15 largest; the bare expression is
# displayed by the notebook.
student_resid[::-1][:15]
State year Missouri 2008 2.125 New Hampshire 2008 2.099 Vermont 2008 2.080 Rhode Island 2004 2.063 Tennessee 2004 2.032 New Jersey 2008 1.934 Arizona 2008 1.861 Connecticut 2008 1.727 Kentucky 2004 1.657 Maine 2004 1.553 Illinois 2004 1.548 California 2008 1.521 Minnesota 2008 1.500 Oregon 2008 1.480 2004 1.400 Name: student_resid