January 15, 2019
Brian Dew, brian.w.dew@gmail.com
This code creates a map showing the share of each center-based statistical area that is not in the labor force due to disability or illness.
%config Completer.use_jedi = False
# Import python packages
import pandas as pd
import numpy as np
import calendar
# Generate a list of file names matching the Census microdata files
latest_month = '12/1/2017'
cps_files = [f'{calendar.month_abbr[m.month].lower()}{m.year % 100}pub.dat'
for m in pd.date_range(end=latest_month, periods=24, freq='MS')]
ldate = pd.to_datetime(latest_month).strftime('%B %Y')
pdate = (pd.to_datetime(latest_month) -
pd.DateOffset(months=23)).strftime('%B %Y')
# Microdata files are fixed-width
# The variable locations are from the data dictionary
cps_vars = [('GTCBSA', 95, 100), # CBSA (geo area)
('PRTAGE', 121, 123), # Person's age
('PEMLR', 179, 181), # Person's labor market status
('PWCMPWGT', 845, 855)] # Person's composite weight
path = '/home/brian/Documents/CPS/data' # Location of microdata files
d = {} # Empty dictionary to store monthly files
for file in cps_files:
d[file] = (pd.read_fwf(f'{path}/{file}',
colspecs=[[v[1], v[2]] for v in cps_vars],
names=[v[0] for v in cps_vars])
.query('PEMLR != -1 and PWCMPWGT > 0 and GTCBSA > 0 '
'and PRTAGE > 15 and PRTAGE < 65'))
df = pd.concat(d, ignore_index=True) # Combine into one dataframe
len(df.groupby('GTCBSA')['PEMLR'].sum().unique())
258
# Check sample size to make sure I have at least 500 observations
min_obs = df.groupby('GTCBSA')['PEMLR'].agg('count').min()
max_obs = df.groupby('GTCBSA')['PEMLR'].agg('count').max()
print(f'Between {min_obs} and {max_obs} observations per CBSA')
Between 589 and 83326 observations per CBSA
# Calculate disabled NILF share of age group, by CBSA
dis = lambda x: np.average(np.where(x['PEMLR'] == 6, 1, 0), weights=x['PWCMPWGT'])
dis_list = df.groupby('GTCBSA').apply(dis)
dis_list.index = dis_list.index.map(str)
# Decide where to split the color codes for the map
bins = [0.004, 0.02, 0.04, 0.06, 0.08, 0.1, 0.12, 0.2]
colors = ['#4d9221', '#a1d76a', '#e6f5d0', '#f7f7f7', '#fde0ef', '#e9a3c9', '#c51b7d']
colormap = pd.cut(dis_list, bins, labels=colors).to_dict()
%matplotlib inline
import os
os.environ['PROJ_LIB'] = '/home/brian/miniconda3/share/proj'
from mpl_toolkits.basemap import Basemap
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
# Show contiguous US (and move Hawaii, no data for Alaska):
m = Basemap(llcrnrlon=-121, llcrnrlat=20, urcrnrlon=-64, urcrnrlat=49,
projection='lcc', lat_1=33, lat_2=45, lon_0=-95)
fig, ax = plt.subplots(figsize=(16,8))
m.readshapefile('shapefiles/cb_2019_us_nation_20m', 'nation', color='gray', linewidth=0.3)
m.readshapefile('shapefiles/cb_2019_us_state_20m', 'states', drawbounds=False)
m.readshapefile('shapefiles/cb_2019_us_cbsa_20m', 'cbsa', drawbounds=False)
for info, shape in zip(m.states_info, m.states):
seg = shape
if info['NAME'] == 'Hawaii' and info['ALAND'] > 100:
seg = [[x + 5200000, y-1400000] for x,y in shape]
ax.add_patch(Polygon(seg, facecolor='white', edgecolor='gray', linewidth=0.3))
ax.add_patch(Polygon(seg, facecolor='white', edgecolor='gray', linewidth=0.3))
for info, shape in zip(m.cbsa_info, m.cbsa):
if info['GEOID'] in colormap.keys():
color = colormap[info['GEOID']]
seg = shape
if info['NAME'][-2:] == 'HI' and info['RINGNUM'] == 1:
seg = [[x + 5200000, y-1400000] for x,y in shape]
ax.add_patch(Polygon(seg, facecolor=color, edgecolor='gray', linewidth=0.18))
plt.title('Share of age 16-64 population that is out of work due to disability',
fontsize=20, fontweight=800, loc='left', color='dimgrey', fontname='Lato')
plt.annotate(f'Source: Current Population Survey microdata, {pdate} to {ldate}',
(0,0), (0, 0), fontsize=10, xycoords='axes fraction',
textcoords='offset points', va='top')
ax.axis('off')
ax_inset = inset_axes(ax, width="16%", height="26%", loc=3, borderpad=3.6)
bars = list(pd.cut(dis_list, bins).reset_index().groupby(0)['GTCBSA'].nunique().values)
labels = [f'{int(x.left*100)} – {int(x.right*100)}%'
for x in pd.cut(dis_list, bins).sort_values().unique()[:-1]]
labels.append('12%+')
ax_inset.barh(list(range(0, len(bars))), bars, align='center', color=colors)
ax_inset.axis('off')
for i, bar in enumerate(bars):
ax_inset.text(bar+2, i, bar, fontsize=9, va='center')
for i, label in enumerate(labels):
ax_inset.text(-2, i, label, fontsize=9, va='center', ha='right')
ax_inset.text(-25, 7, 'Value Count')
plt.show();