Link census tract populations, total and by race, into police beats. Attributes population from tracts to beats by the areas of the overlaps. The basic procedure is to find the overlaps between beats and Census tracts, then addign a portion of the population of the tract to the beat, based on the raio of the size of overlap to the size of the tract.
import seaborn as sns
import metapack as mp
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import utm
%matplotlib inline
sns.set_context('notebook')
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
pkg
# Convert to EPSG:26911, ( A randomly selected UTM Zone 11N CRS) so area calculations
# will be in square meters, rather than square degrees
beats = pkg.resource('pd_beats').geoframe()
beats.head()
# Figure out the UTM zone, and from that the CRS EPSG for the UTM zone
xmin, ymin, xmax, ymax = beats.to_crs(4326).envelope.total_bounds
band = utm.from_latlon((ymax-ymin)/2+ymin, (xmax-xmin)/2+xmin )[2]
epsg = int('326'+str(band))
assert(epsg == 32611) # It's always San Diego, so we already know what it is ...
# Convert to a UTM EPSG, so that we can do calculations in meters.
beats = beats.to_crs(epsg)
# There are beats that are way off in east county. Get rid of them.
#rightmost_centroid = beats.centroid.x.sort_values(ascending=False).iloc[:6].max()
#beats = beats[beats.centroid.x <rightmost_centroid]
# It looks like the dataset has multiple rows per beat, one feature per row. We need
# it to have one row per beat, with multiple features combined together.
beats = beats.dissolve(by='beat').reset_index()
# Add the area
beats['beat_area'] = beats.area / 1_000_000
beats.plot()
tracts = pkg.reference('tracts').geoframe().to_crs(epsg)
# Add the area
tracts['tract_area'] = tracts.area / 1_000_000
tracts.plot()
# White, black, asian, etc are all non hispanic.
col_map = {
'b03002_001':'total',
'b03002_003':'white',
'b03002_004':'black',
'b03002_005':'aian',
'b03002_006':'asian',
'b03002_007':'nhopi',
'b03002_012':'hisp'
}
for k,v in list(col_map.items()):
col_map[k+'_m90'] = col_map[k]+'_m90'
t = pkg.reference('race').dataframe()
race_tracts = t[t.county=='073'].rename(columns=col_map).reset_index().rename(columns={'GEOID':'geoid'})
race_tracts = race_tracts[['geoid', 'total', 'white', 'black', 'aian', 'asian', 'nhopi', 'hisp']]
race_tracts.titles.head().T
t = gpd.sjoin(beats, tracts)
ax = t.plot()
beats.centroid.plot(ax=ax, color='red')
t = t[['geoid', 'beat']].drop_duplicates()\
.merge(tracts[['geoid','geometry', 'tract_area']],on='geoid')\
.merge(beats[['beat','geometry', 'beat_area']],on='beat')
# merge by finding intersections. Each record in intr will be an area that is entirely
# in both one tract and one beat.
intr = gpd.overlay(beats, tracts, how='intersection')[['beat','geoid','geometry']]
print(len(beats), len(tracts), len(intr))
intr['intr_area'] = (intr.geometry.area/1_000_000.0).astype(float)
# Get rid of really small intersections
intr = intr[intr.intr_area >= .01]
# Merge back with beats and tracts to get beat and tract areas.
merged = intr[['beat','geoid', 'intr_area']]\
.merge(tracts[['geoid', 'tract_area']],on='geoid')\
.merge(beats[['beat', 'beat_area']],on='beat')\
.merge(race_tracts, on='geoid')
merged = merged.drop_duplicates(subset=['beat','geoid'])
merged['tract_overlap_proportion'] = merged.intr_area/merged.tract_area
merged['beat_overlap_proportion'] = merged.intr_area/merged.beat_area
# The intersection areas must be smaller than both of the areas being intersected
assert(not any(merged.intr_area > merged.beat_area))
assert(not any(merged.intr_area > merged.tract_area))
# Check that all of the areas of the beats are accounted for
assert(all(merged.groupby('beat').beat_overlap_proportion.sum().round(1) == 1))
# Multiply by the overlap portion to get the final values.
merged.loc[:,'total':'hisp'] = merged.loc[:,'total':'hisp'].multiply(merged.tract_overlap_proportion, axis=0)
merged.head().T
beat_demographics = merged.groupby('beat').sum()[['total', 'white', 'black', 'aian', 'asian', 'nhopi', 'hisp']].round()
# Sanity checks. SD population is about 1.3M
assert beat_demographics.sum().total > 1_300_000
rel_err = np.abs(beat_demographics.loc[:,'white':].sum().sum() - beat_demographics.total.sum()) / beat_demographics.total.sum()
assert rel_err < 0.05
merged.loc[:,'total':'hisp'].multiply(merged.tract_overlap_proportion, axis=0)
beat_demographics
merged