This notebook is a replication attempt of Stier et al.'s preprint on arXiv [1]. This notebook brings together MSA definitions and census data to allow demographic calculations for MSAs in relation to the coronavirus outbreak. The MSAs are a county-level unit delineated by the Census Bureau (see https://www.census.gov/programs-surveys/metro-micro/about/delineation-files.html). The coronavirus outbreak data are provided by USAFacts (https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/, last downloaded March 25th, 2020 14:34 EDT).
.. [1] Andrew J Stier, Marc G. Berman, and Luis M. A. Bettencourt. March 23rd, 2020. COVID-19 attack rate increases with city size. arXiv:2003.10376v1 [q-bio.PE] https://arxiv.org/abs/2003.10376v1
#import key packages
import numpy as np
import pandas
import datetime #the covid dataset uses datetimes as column indices
import string
import scipy.stats
import matplotlib.pyplot as plt
import ipywidgets as widgets
#Load the datasets: the MSA delineations, the ACS 2018 population estimates, and the USAFacts Coronavirus dataset
#header=2 skips the title rows at the top of the Census delineation spreadsheet,
#so the CBSA/FIPS column names land on the DataFrame header
msas = pandas.read_excel(r'data/USCensus/Delineations/list1_Sep_2018.xls',header=2)
#County-level population estimates; the POPESTIMATE2018 column is read later
pop = pandas.read_excel(r'data/USCensus/Population/co-est2018-alldata.xls')
#Load the different Coronavirus datasets
edt = datetime.timezone(datetime.timedelta(hours = -4),name = 'US Eastern Daylight Time')
#I've created a standardized notation for up-to-date coronavirus datasets
## These are dictionaries with the following keyterms:
#cases - the cases per day per county dataset
#deaths - deaths per day per county
#credit - attribution
#daterange - the range of dates that can be queried. If dates are out of range, won't work
#dateformat - how to format a date into a column name for a particular day's data. Lambda function
#get_county_cases - a lambda function that takes cases or deaths and a row of the MSAS dataset to return the matching row
#This is the USAFacts dataset
## This seems to be updated by 1 am EDT
#latestdate is the most recent date with published data: midnight (naive) of
#yesterday, or of the day before yesterday when it is not yet 1 am EDT.
#(The original wrapped the choice in a one-element list and indexed [0];
#a plain conditional expression says the same thing directly.)
latestdate = (datetime.datetime.combine(datetime.datetime.now(tz=edt).date(), datetime.time())
              + (datetime.timedelta(days=-2)
                 if datetime.datetime.now(tz=edt).time() < datetime.time(1, 0)
                 else datetime.timedelta(days=-1)))
#USAFacts dataset dict, following the standardized notation described above.
usafacts = {'cases' : pandas.read_csv('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_confirmed_usafacts.csv'),
            'deaths': pandas.read_csv('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_deaths_usafacts.csv'),
            'credit': 'USAFacts. https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/',
            'daterange' : pandas.date_range(datetime.datetime(2020, 1, 22, 0, 0), latestdate),
            #Case columns use a two-digit year (e.g. '3/25/20'). BUG FIX: the
            #original took str(x.year)[:2], which equals the last two digits
            #only in 2020 ('20'/'20'); [-2:] is correct for any year.
            'dateformat_cases' : lambda x: '{dt.month}/{dt.day}/{short}'.format(dt=x,short=str(x.year)[-2:]),
            #Death columns use a four-digit year per the downloaded file
            'dateformat_deaths': lambda x: '{dt.month}/{dt.day}/{dt.year}'.format(dt=x),
            #Match rows by 5-digit FIPS code: state*1000 + county, against a
            #row m of the MSA delineations table
            'get_county' : lambda data, m: data.countyFIPS == int(m['FIPS State Code']*1000 + m['FIPS County Code'])
            }
#This is the dataset made by Tom Quisel, CSBS, and 1point3acres
## This seems to update as of 2 am eastern the next day
## Note that county names do not include the designation for counties/parishes/etc.
### in that state, so they must be removed from the MSAs data labels
#Same latest-available-date logic as for USAFacts, but with a 2 am EDT cutoff;
#the one-element-list-and-[0] hack is replaced with a conditional expression.
latestdate = (datetime.datetime.combine(datetime.datetime.now(tz=edt).date(), datetime.time())
              + (datetime.timedelta(days=-2)
                 if datetime.datetime.now(tz=edt).time() < datetime.time(2, 0)
                 else datetime.timedelta(days=-1)))
acres = {'cases' : pandas.read_csv('https://raw.githubusercontent.com/tomquisel/covid19-data/master/data/csv/timeseries_confirmed.csv'),
         'deaths' : pandas.read_csv('https://raw.githubusercontent.com/tomquisel/covid19-data/master/data/csv/timeseries_death.csv'),
         'credit' : 'Tom Quisel, CSBS (Conference of State Bank Supervisors), and 1point3acres. https://github.com/tomquisel/covid19-data, https://www.csbs.org/information-covid-19-coronavirus, and https://coronavirus.1point3acres.com/en',
         'daterange' : pandas.date_range(datetime.datetime(2020, 3, 14, 0, 0), latestdate),
         #Both tables use ISO-style date columns, e.g. '2020-03-25'
         'dateformat_cases' : lambda x: x.strftime('%Y-%m-%d'),
         'dateformat_deaths': lambda x: x.strftime('%Y-%m-%d'),
         #Match by county name (with the trailing 'County'/'Parish' word from
         #the MSA table dropped) plus state name
         'get_county' : lambda data, m: (data['County_Name'] == ' '.join(m['County/County Equivalent'].split(' ')[:-1])) & (data['State_Name'] == m['State Name'])
         }
Now that the data are loaded, pick a dataset and dates to run the COVID-19 regression on. (This is a linear regression of normalized, natural-log active case counts, i.e., confirmed tests with deaths removed.) These assumptions are all replicating reference [1], above.
Note: this takes a few minutes to run! Set chatty to True to see all the places that can't be included.
#First, select a dataset to explore.
#A simple single-choice widget; the chosen value is read in the next cell.
dataset_options = ['USAFacts', '1point3acres']
dataset_select = widgets.Select(description='Dataset:',
                                options=dataset_options,
                                value='USAFacts',
                                disabled=False)
dataset_select
Select(description='Dataset:', options=('USAFacts', '1point3acres'), value='USAFacts')
#Assemble the state, metro/micro areas, pop data, and covid cases
## First define a selection slider for the analysis
#Map the widget choice onto one of the dataset dicts. The two options are
#mutually exclusive, so use if/elif (the original used two independent ifs).
if dataset_select.value == 'USAFacts':
    dataset = usafacts
elif dataset_select.value == '1point3acres':
    dataset = acres
#Range slider over the dataset's queryable dates; labels are 'MM/DD', values
#are the underlying Timestamps
date_range_slider = widgets.SelectionRangeSlider(options=[(d.strftime('%m/%d'), d) for d in dataset['daterange']],
                                                 description='Dates:',
                                                 orientation='horizontal',
                                                 layout={'width': '500px'})
def coallate_analyze(dateTuple, msas, pop, dataset, remove_deaths = True, filtering = 'None', chatty=False):
    """Utility for ipywidgets: collate per-MSA population and COVID-19 counts, then regress.

    This function takes as input the US Census MSA definitions and 2018 ACS data, the
    chosen coronavirus case and death counts at the county level, and the start and end
    dates for the statistical regression in keeping with the methodology outlined in [1].
    For each MSA it sums its counties' population and daily case counts, fits an OLS
    regression of ln(normalized active cases) against day number, and stores the results
    in the module-level DataFrame ``df`` (columns: CBSA, Title, MetroMicro, Pop2018,
    COVIDEnd, AttackRate, r).

    dateTuple  -- (start, end) pair of dates from the range-slider widget.
    msas       -- Census CBSA delineations DataFrame.
    pop        -- ACS county population estimates (POPESTIMATE2018 column).
    dataset    -- one of the dataset dicts defined above (usafacts or acres).
    remove_deaths -- if True, subtract deaths so the series approximates active cases.
    filtering  -- 'Chicago' applies the inclusion criteria of [1]; 'None' regresses all.
    chatty     -- if True, lots of output about which counties/cities aren't included.
    """
    global df #to remain accessible after the function is run
    dateStart = datetime.datetime.combine(dateTuple[0],datetime.time())
    dateEnd = datetime.datetime.combine(dateTuple[1],datetime.time())
    output = [] #to store the actual data to be exported and used
    popnotfound = [] #to store fips of counties that can't be found
    covidnotfound = [] #to store fips of counties without cases
    covidnotincl = [] #to store CBSA of metros excluded
    days = (dateEnd - dateStart).days+1 #number of days to be included in the analysis
    col_names = ['CBSA','Title','MetroMicro','Pop2018','COVIDEnd','AttackRate','r']
    #Leaving out Puerto Rico because it is not in the census data; I'm sorry! I hope to remedy this.
    #NOTE(review): the [:-4] slice presumably drops footnote rows at the bottom of the
    #Census delineation spreadsheet -- confirm against the file before changing.
    for cbsa in msas.loc[(msas['FIPS State Code'] != 72)]['CBSA Code'][:-4].unique():
        #Get the MSA information
        #cbsa = '10740' #ABQ metro CBSA code for testing
        counties = msas.loc[msas['CBSA Code'] == cbsa]
        if chatty == True:
            print('Starting on ' + str(counties.iloc[0]['CBSA Title']))
        #Add the metro area name and its metro/micro status to the output
        row = [cbsa,counties.loc[counties.index[0]]['CBSA Title'],counties.loc[counties.index[0]]['Metropolitan/Micropolitan Statistical Area']]
        #for all state and county codes, go through and select the relevant pop data
        pop_total = 0 #to sum the population
        covid_last = 0 #the total number of cases in the last day of the series
        covid_series = [0]*days #This stores just cases, not people who died
        #Loop through every constituent county to get the population as well as the COVID cases
        for ci in counties.index:
            #Get the FIPS code for working with census data
            county = counties.loc[ci]
            s, c = (county['FIPS State Code'],county['FIPS County Code'])
            fips = int(s*1000 + c) #str(int(s)) + '0'*(3-len(str(int(c))))+str(int(c))
            #Calculate the population from this county towards the MSA
            if any((pop.STATE == int(s)) & (pop.COUNTY == int(c))):
                pop_total += int(pop.loc[(pop.STATE == int(s)) & (pop.COUNTY == int(c))]['POPESTIMATE2018'])
            else: #If the fips code doesn't exist, skip this county but record why.
                if chatty == True:
                    print(str(fips) + ' was not found in the ACS 2018 data.')
                popnotfound.append(fips)
            #Go through the chosen COVID-19 data and add the confirmed case numbers
            #If the county is in the data, add it to the total for this MSA
            if any(dataset['get_county'](dataset['cases'], county)):
                #Remove deaths if called for by the methodology
                if remove_deaths == True:
                    #When data contain duplicates; it is assumed that together these are the true total.
                    covid_last += int(sum(dataset['cases'].loc[dataset['get_county'](dataset['cases'], county)][dataset['dateformat_cases'](dateEnd)]))-int(sum(dataset['deaths'].loc[dataset['get_county'](dataset['deaths'], county)][dataset['dateformat_deaths'](dateEnd)]))
                    #Add the total number of active cases (cases minus deaths) to each day
                    for i,d in zip(range(days),pandas.date_range(dateStart,dateEnd)):
                        covid_series[i] += int(sum(dataset['cases'].loc[(dataset['get_county'](dataset['cases'], county))][dataset['dateformat_cases'](d)])) - int(sum(dataset['deaths'].loc[dataset['get_county'](dataset['deaths'], county)][dataset['dateformat_deaths'](d)]))
                #Otherwise, don't remove the deaths, just count confirmed cases
                else:
                    #When data contain duplicates; it is assumed that together these are the true total.
                    covid_last += int(sum(dataset['cases'].loc[dataset['get_county'](dataset['cases'], county)][dataset['dateformat_cases'](dateEnd)]))
                    #Add the total number of confirmed cases to each day
                    for i,d in zip(range(days),pandas.date_range(dateStart,dateEnd)):
                        covid_series[i] += int(sum(dataset['cases'].loc[(dataset['get_county'](dataset['cases'], county))][dataset['dateformat_cases'](d)]))
            else:
                if chatty == True:
                    print(str(fips) + ' was not found in the COVID data.')
                covidnotfound.append(fips)
        row.append(pop_total)
        row.append(covid_last)
        #Now filter the data using any of the predefined criteria
        if filtering == 'Chicago':
            #Exclusion criteria from [1]: any NaN, final count <= 3, or any
            #non-positive day (which would also break the log below)
            if any(pandas.isna(covid_series)) or (covid_series[-1] <= 3) or any([cs <= 0 for cs in covid_series]):
                if chatty == True:
                    print(row[1] + ' had too few cases for inclusion')
                covidnotincl.append(cbsa)
                row.append(np.nan)
                row.append(np.nan)
            else:
                #normalize the covid_series so that the first day of the window is 1
                covid_series = [cs * 1. / covid_series[0] for cs in covid_series]
                #run a regression of ln(normalized cases) on day number
                slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(range(days),[np.log(cs) for cs in covid_series])
                row.append(slope)
                row.append(r_value)
        elif filtering == 'None':
            #run a regression with no exclusions or normalization
            #NOTE(review): a zero anywhere in covid_series makes np.log return -inf
            #here, which propagates into the regression -- verify intended.
            slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(range(days),[np.log(cs) for cs in covid_series])
            row.append(slope)
            row.append(r_value)
        output.append(row)
    df = pandas.DataFrame(output,columns=col_names)
    print('\n \n \n Data are gathered for cases from ' + dateStart.strftime("%B %d, %Y") +' to ' + dateEnd.strftime("%B %d, %Y") + '! Run the next box when ready.')
#Run with some interactability for the dates
#interact_manual defers execution until the "Run Interact" button is clicked;
#msas/pop/dataset are fixed (not shown as widgets), while remove_deaths,
#filtering, and chatty get checkbox/select widgets from their given values.
print('Change the dates to your window of interest, then click "Run Interact" to run the regression.')
widgets.interact_manual(coallate_analyze, dateTuple = date_range_slider, msas = widgets.fixed(msas), pop = widgets.fixed(pop), dataset = widgets.fixed(dataset), remove_deaths = True, filtering = widgets.Select(options=['None','Chicago']), chatty = False)
print('')
Change the dates to your window of interest, then click "Run Interact" to run the regression.
interactive(children=(SelectionRangeSlider(description='Dates:', index=(0, 0), layout=Layout(width='500px'), o…
def plot_some_figures(df):
    """Plot the replication figures from the collated MSA DataFrame ``df``.

    Panels: (1) log attack rate vs. log 2018 population with the fitted
    power law, (2) histogram of the per-city growth-regression correlations,
    (3) Q-Q plot of the power-law residuals against a normal distribution.
    The fourth axis is created but left unused. The fitted ``slope`` and
    ``intercept`` are stored as module globals so lookup_a_city can report
    residuals.
    """
    ####################################################
    #Now that the data are collated, generate the graphs
    fig, ((ax1,ax2),(ax3,ax4)) = plt.subplots(2,2)
    fig.set_figheight(12)
    fig.set_figwidth(12)
    #Plot the chart from figure 1a from the arXiv preprint
    points, = ax1.plot([np.log(x) for x in df.Pop2018],[np.log(y) for y in df.AttackRate],'bo')
    #Now run a linear regression to calculate any potential power law behavior.
    #Only include rows where an attack rate could be estimated and the area is
    #a Metropolitan (not Micropolitan) Statistical Area.
    which = df.index[(pandas.isna(df.AttackRate)==False) & (df.MetroMicro == 'Metropolitan Statistical Area')]
    global slope, intercept #shared with lookup_a_city for residual reporting
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress([np.log(x) for x in df.Pop2018[which]],[np.log(y) for y in df.AttackRate[which]])
    line, = ax1.plot([np.log(x) for x in df.Pop2018],[np.log(x)*slope + intercept for x in df.Pop2018],'k-')
    ax1.set_xlabel('log(MSA Population)')
    ax1.set_ylabel('log(Estimated attack rate)')
    ax1.set_title('Attack rate vs. pop (r = {:0.3f}, p = {:0.3f}, exponent {:1.4f})'.format(r_value,p_value,slope))
    # Look at how well correlation describes the growth curves for the selected dates
    ax2.hist(df.r[which][pandas.isna(df.r[which]) == False],bins=np.arange(0.0,1.10,.10))
    ax2.set_title('Correlations for city-by-city exponential growth')
    # Make a q-q plot of the residual distribution of the power-law fit
    residuals = [np.log(y) - (intercept + slope*np.log(x)) for x,y in zip( df.Pop2018[which], df.AttackRate[which])]
    (osm, osr), (qqslope, qqintercept, qqr) = scipy.stats.probplot(residuals,fit=True, plot=ax3)
    ax3.set_title('Q-Q plot against normal (slope = %f, r = %f)' % (qqslope, qqr))
    #BUG FIX: the original referenced plt.show without calling it, so the
    #statement was a no-op; the figure only appeared via notebook inline display.
    plt.show()
#Render the figures; df is fixed so no input widgets are created
widgets.interact(plot_some_figures,df = widgets.fixed(df))
print('')
interactive(children=(Output(),), _dom_classes=('widget-interact',))
def lookup_a_city(df,pickacity):
    """Print the statistics for the MSA titled ``pickacity`` from DataFrame ``df``.

    Reports the area's metro/micro status, 2018 population, end-of-window case
    count, estimated attack rate and its regression correlation, the implied
    reproduction number R, and the residual from the population power-law fit.
    Relies on the module globals ``date_range_slider`` (for the end date) and
    ``slope``/``intercept`` (set by plot_some_figures).
    """
    w = df.index[df.Title == pickacity]
    if len(w) > 1:
        print('Uh-oh, more than one city by that name!')
    elif len(w) == 0:
        #ROBUSTNESS FIX: previously an unmatched title fell through to an
        #IndexError on the [0] lookup below.
        print('Uh-oh, no city by that name was found!')
    else:
        #BUG FIX: use the pickacity argument rather than re-reading the
        #cityselector widget global, which was only equal by coincidence.
        w = w[0]
        print(str(df.Title[w]))
        print(str(df.MetroMicro[w]))
        print('Population (2018 ACS estimate): %d ' % (df.Pop2018[w]))
        print('Covid cases by ' + date_range_slider.value[1].strftime("%B %d, %Y") + ': ' + str(df.COVIDEnd[w]))
        if pandas.isna(df.AttackRate[w]):
            print('There was not sufficient data (or another error occurred) to estimate a growth rate')
        else:
            print('COVID-19 attack rate (from regression): %f' % (df.AttackRate[w]))
            print('Correlation for that regression: %f' % (df.r[w]))
            #4.5 days is the serial-interval scaling used to turn the growth
            #rate into a reproduction number -- matches reference [1]
            print('Subsequent R: %f' % ((df.AttackRate[w]*4.5) + 1))
            print('Residual for the power-law regression: %f' % (np.log(df.AttackRate[w]) - (intercept + slope*np.log(df.Pop2018[w]))))
#Build the city picker from the Metropolitan (not Micropolitan) rows of df
cityselector = widgets.Select(options = df.Title[df.MetroMicro == 'Metropolitan Statistical Area'],description = 'Pick a city to examine specifics')
#BUG FIX: the original passed 'output = False' to interact, which forwards it
#as a keyword to lookup_a_city; that function has no 'output' parameter, so
#each interaction raised a TypeError.
widgets.interact(lookup_a_city,df = widgets.fixed(df),pickacity = cityselector)
print('')
interactive(children=(Select(description='Pick a city to examine specifics', options=('Abilene, TX', 'Akron, O…
Output(outputs=({'output_type': 'stream', 'text': '\n \n \n Data are gathered for cases from March 14, 2020 to March 30, 2020! Run the next box when ready.\n', 'name': 'stdout'}, {'output_type': 'display_data', 'data': {'text/plain': ' CBSA Title MetroMicro \\\n0 10100 Aberdeen, SD Micropolitan Statistical Area \n1 10140 Aberdeen, WA Micropolitan Statistical Area \n2 10180 Abilene, TX Metropolitan Statistical Area \n3 10220 Ada, OK Micropolitan Statistical Area \n4 10300 Adrian, MI Micropolitan Statistical Area \n.. ... ... ... \n921 49660 Youngstown-Warren-Boardman, OH-PA Metropolitan Statistical Area \n922 49700 Yuba City, CA Metropolitan Statistical Area \n923 49740 Yuma, AZ Metropolitan Statistical Area \n924 49780 Zanesville, OH Micropolitan Statistical Area \n925 49820 Zapata, TX Micropolitan Statistical Area \n\n Pop2018 COVIDEnd AttackRate r \n0 43191 3 NaN NaN \n1 73901 1 NaN NaN \n2 171451 11 NaN NaN \n3 38247 3 NaN NaN \n4 98266 15 NaN NaN \n.. ... ... ... ... \n921 538952 160 0.315987 0.992818 \n922 174848 15 NaN NaN \n923 212128 6 NaN NaN \n924 86183 2 NaN NaN \n925 14190 0 NaN NaN \n\n[926 rows x 7 columns]', 'text/html': '<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border="1" class="dataframe">\n <thead>\n <tr style="text-align: right;">\n <th></th>\n <th>CBSA</th>\n <th>Title</th>\n <th>MetroMicro</th>\n <th>Pop2018</th>\n <th>COVIDEnd</th>\n <th>AttackRate</th>\n <th>r</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>10100</td>\n <td>Aberdeen, SD</td>\n <td>Micropolitan Statistical Area</td>\n <td>43191</td>\n <td>3</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>1</th>\n <td>10140</td>\n <td>Aberdeen, WA</td>\n <td>Micropolitan Statistical Area</td>\n <td>73901</td>\n <td>1</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>2</th>\n <td>10180</td>\n 
<td>Abilene, TX</td>\n <td>Metropolitan Statistical Area</td>\n <td>171451</td>\n <td>11</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>3</th>\n <td>10220</td>\n <td>Ada, OK</td>\n <td>Micropolitan Statistical Area</td>\n <td>38247</td>\n <td>3</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>4</th>\n <td>10300</td>\n <td>Adrian, MI</td>\n <td>Micropolitan Statistical Area</td>\n <td>98266</td>\n <td>15</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>921</th>\n <td>49660</td>\n <td>Youngstown-Warren-Boardman, OH-PA</td>\n <td>Metropolitan Statistical Area</td>\n <td>538952</td>\n <td>160</td>\n <td>0.315987</td>\n <td>0.992818</td>\n </tr>\n <tr>\n <th>922</th>\n <td>49700</td>\n <td>Yuba City, CA</td>\n <td>Metropolitan Statistical Area</td>\n <td>174848</td>\n <td>15</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>923</th>\n <td>49740</td>\n <td>Yuma, AZ</td>\n <td>Metropolitan Statistical Area</td>\n <td>212128</td>\n <td>6</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>924</th>\n <td>49780</td>\n <td>Zanesville, OH</td>\n <td>Micropolitan Statistical Area</td>\n <td>86183</td>\n <td>2</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>925</th>\n <td>49820</td>\n <td>Zapata, TX</td>\n <td>Micropolitan Statistical Area</td>\n <td>14190</td>\n <td>0</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n </tbody>\n</table>\n<p>926 rows × 7 columns</p>\n</div>'}, 'metadata': {}}))