%pylab inline import numpy as np from pandas import * from statsmodels.formula.api import logit from statsmodels.nonparametric import KDE import matplotlib.pyplot as plt from patsy import dmatrix, dmatrices df = read_csv('data/wells.dat', sep = ' ', header = 0, index_col = 0) print df.head() model1 = logit('switch ~ I(dist/100.)', df = df).fit() print model1.summary() def binary_jitter(x, jitter_amount = .05): ''' Add jitter to a 0/1 vector of data for plotting. ''' jitters = np.random.rand(*x.shape) * jitter_amount x_jittered = x + np.where(x == 1, -1, 1) * jitters return x_jittered dist_logit_par = model1.params['I(dist / 100.)'] plt.plot(df['dist'], binary_jitter(df['switch'], .1), '.', alpha = .1) plt.plot(np.sort(df['dist']), model1.predict()[np.argsort(df['dist'])], lw = 2) plt.ylabel('Switched Wells') plt.xlabel('Distance from safe well (meters)') kde_sw = KDE(df['dist'][df['switch'] == 1]) kde_nosw = KDE(df['dist'][df['switch'] == 0]) kde_sw.fit() kde_nosw.fit() plt.plot(kde_sw.support, kde_sw.density, label = 'Switch') plt.plot(kde_nosw.support, kde_nosw.density, color = 'red', label = 'No Switch') plt.xlabel('Distance (meters)') plt.legend(loc = 'best') model2 = logit('switch ~ I(dist / 100.) + arsenic', df = df).fit() print model2.summary() model2.margeff(at = 'mean') logit_pars = model2.params intercept = -logit_pars[0] / logit_pars[2] slope = -logit_pars[1] / logit_pars[2] dist_sw = df['dist'][df['switch'] == 1] dist_nosw = df['dist'][df['switch'] == 0] arsenic_sw = df['arsenic'][df['switch'] == 1] arsenic_nosw = df['arsenic'][df['switch'] == 0] plt.figure(figsize = (12, 8)) plt.plot(dist_sw, arsenic_sw, '.', mec = 'purple', mfc = 'None', label = 'Switch') plt.plot(dist_nosw, arsenic_nosw, '.', mec = 'orange', mfc = 'None', label = 'No switch') plt.plot(np.arange(0, 350, 1), intercept + slope * np.arange(0, 350, 1) / 100., '-k', label = 'Separating line') plt.ylim(0, 10) plt.xlabel('Distance to safe well (meters)') plt.ylabel('Arsenic level') plt.legend(loc = 'best') model3 = logit('switch ~ I(dist / 100.) + arsenic + I(dist / 100.):arsenic', df = df).fit() print model3.summary() model_form = ('switch ~ center(I(dist / 100.)) + center(arsenic) + ' + 'center(I(educ / 4.)) + ' + 'center(I(dist / 100.)) : center(arsenic) + ' + 'center(I(dist / 100.)) : center(I(educ / 4.)) + ' + 'center(arsenic) : center(I(educ / 4.))' ) model4 = logit(model_form, df = df).fit() print model4.summary() def bin_residuals(resid, var, bins): ''' Compute average residuals within bins of a variable. Returns a dataframe indexed by the bins, with the bin midpoint, the residual average within the bin, and the confidence interval bounds. ''' resid_df = DataFrame({'var': var, 'resid': resid}) resid_df['bins'] = qcut(var, bins) bin_group = resid_df.groupby('bins') bin_df = bin_group['var', 'resid'].mean() bin_df['count'] = bin_group['resid'].count() bin_df['lower_ci'] = -2 * (bin_group['resid'].std() / np.sqrt(bin_group['resid'].count())) bin_df['upper_ci'] = 2 * (bin_group['resid'].std() / np.sqrt(bin_df['count'])) bin_df = bin_df.sort('var') return(bin_df) def plot_binned_residuals(bin_df): ''' Plotted binned residual averages and confidence intervals. ''' plt.plot(bin_df['var'], bin_df['resid'], '.') plt.plot(bin_df['var'], bin_df['lower_ci'], '-r') plt.plot(bin_df['var'], bin_df['upper_ci'], '-r') plt.axhline(0, color = 'gray', lw = .5) arsenic_resids = bin_residuals(model4.resid, df['arsenic'], 40) dist_resids = bin_residuals(model4.resid, df['dist'], 40) plt.figure(figsize = (12, 5)) plt.subplot(121) plt.ylabel('Residual (bin avg.)') plt.xlabel('Arsenic (bin avg.)') plot_binned_residuals(arsenic_resids) plt.subplot(122) plot_binned_residuals(dist_resids) plt.ylabel('Residual (bin avg.)') plt.xlabel('Distance (bin avg.)') model_form = ('switch ~ center(I(dist / 100.)) + center(np.log(arsenic)) + ' + 'center(I(educ / 4.)) + ' + 'center(I(dist / 100.)) : center(np.log(arsenic)) + ' + 'center(I(dist / 100.)) : center(I(educ / 4.)) + ' + 'center(np.log(arsenic)) : center(I(educ / 4.))' ) model5 = logit(model_form, df = df).fit() print model5.summary() arsenic_resids = bin_residuals(model5.resid, df['arsenic'], 40) dist_resids = bin_residuals(model5.resid, df['dist'], 40) plt.figure(figsize = (12, 5)) plt.subplot(121) plot_binned_residuals(arsenic_resids) plt.ylabel('Residual (bin avg.)') plt.xlabel('Arsenic (bin avg.)') plt.subplot(122) plot_binned_residuals(dist_resids) plt.ylabel('Residual (bin avg.)') plt.xlabel('Distance (bin avg.)') print model5.pred_table() print 'Model Error rate: {0: 3.0%}'.format( 1 - np.diag(model5.pred_table()).sum() / model5.pred_table().sum()) print 'Null Error Rate: {0: 3.0%}'.format( 1 - df['switch'].mean())