#!/usr/bin/env python # coding: utf-8 # In[1]: import pandas as pd from IPython.core.display import display, HTML # We'll pull population density for counties from [this Census ESRI API](https://covid19.census.gov/datasets/21843f238cbb46b08615fc53e19e0daf_1). # In[2]: county_pop_df = pd.read_csv('https://opendata.arcgis.com/datasets/21843f238cbb46b08615fc53e19e0daf_1.csv' ,usecols=['GEOID','NAME','State','B01001_001E','B01001_calc_PopDensity'] ,dtype={'GEOID':str}) # 1. Filter out Puerto Rico _(but support Puerto Rican self-determination!)_, # 1. sort the records by their calculated population density, # 1. and then add a cumulative sum column. # In[3]: county_pop_df = county_pop_df[county_pop_df['State']!='Puerto Rico'] county_pop_df.sort_values(by='B01001_calc_PopDensity', ascending=False, inplace=True) county_pop_df['cumcount']=county_pop_df['B01001_001E'].cumsum() # In[4]: county_pop_df.head() # It's surprisingly maddening to find a good source for county-level election data. We're going to use [this one](https://github.com/tonmcg/US_County_Level_Election_Results_08-20/), though we need to repair the FIPS codes so they're 5-character strings. # In[5]: election_results_df = pd.read_csv('https://raw.githubusercontent.com/tonmcg/US_County_Level_Election_Results_08-20/master/2016_US_County_Level_Presidential_Results.csv' ,usecols=['votes_dem','votes_gop','total_votes','state_abbr','county_name','combined_fips'] ,dtype={'combined_fips':str,'votes_dem':'int64','votes_gop':'int64','total_votes':'int64'}) election_results_df['fips'] = election_results_df['combined_fips'].apply(lambda x: '0' + x if len(x)==4 else x) election_results_df.head() # There are two Census FIPS that mismatch between the tables; we'll coerce them to matchin values. # In[6]: election_results_df.loc[election_results_df['fips']=='46113','fips'] = '46102' #Oglala Lakota County election_results_df.loc[election_results_df['fips']=='02270','fips'] = '02158' #Kusilvak Census Area # Join them and see if anything went wrong. There's one county that's still mismatched, but, with only 75 people, it isn't going to affect any of our election results. Out of exhaustion, we'll leave it be. # In[7]: joined = county_pop_df.set_index('GEOID').join(election_results_df.set_index('fips')) joined[pd.isna(joined['votes_dem'])] # Define a function that will run an election for an arbitrary number of district. To do so, we break the density-sorted dataframe by using `pd.cut` on the cumulative population field. Then we write a nice little HTML table to output the results. # In[8]: def run_election(number_districts): joined['electorate'] = pd.cut(joined['cumcount'], bins=number_districts, labels=range(1,number_districts+1)) results_table = joined.groupby(by='electorate').sum() results_table['pctdem'] = results_table['votes_dem']/results_table['total_votes'] results_table['pctgop'] = results_table['votes_gop']/results_table['total_votes'] display(HTML('

Election with {} districts

'.format(number_districts))) html_table = '' html_table += '' for i,r in results_table.iterrows(): most_dense = joined[joined['electorate']==i].iloc[0] least_dense = joined[joined['electorate']==i].iloc[-1] most_dense_printable = '{}, {}'.format(most_dense['NAME'], most_dense['State']) least_dense_printable = '{}, {}'.format(least_dense['NAME'], least_dense['State']) winner = 'D' if r['pctdem'] > r['pctgop'] else 'R' html_table += ''.format(i, f"{int(r['B01001_001E']):,}", most_dense_printable, least_dense_printable, round(r['pctdem']*100,2),round(r['pctgop']*100,2), winner) html_table +='
DistrictTotal PopulationDensest CountyLeast Dense CountyDem %Rep %Winner
{}{}{}{}{}{}{}
' display(HTML(html_table)) # Run the election with 11 districts ... # In[9]: run_election(11) # How about 9? # In[10]: run_election(9) # With 17? # In[11]: run_election(17) # In[12]: run_election(21)