#!/usr/bin/env python
# coding: utf-8

# # Urban scaling and coronavirus
#
# This notebook is a replication attempt of Stier et al.'s preprint on arXiv [1]. It brings together MSA definitions and census data to allow demographic calculations for MSAs in relation to the coronavirus outbreak. The MSAs are county-level units delineated by the Census Bureau (see https://www.census.gov/programs-surveys/metro-micro/about/delineation-files.html). The coronavirus outbreak data are provided by USAFacts (https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/, last downloaded March 25th, 2020 14:34 EDT).
#
# .. [1] Andrew J. Stier, Marc G. Berman, and Luis M. A. Bettencourt. March 23rd, 2020. COVID-19 attack rate increases with city size. arXiv:2003.10376v1 [q-bio.PE] https://arxiv.org/abs/2003.10376v1

# In[8]:

# Import key packages
import numpy as np
import pandas
import datetime  # the covid datasets use datetimes as column indices
import string
import scipy.stats
import matplotlib.pyplot as plt
import ipywidgets as widgets

# Load the datasets: the MSA delineations, the ACS 2018 population estimates, and the USAFacts coronavirus dataset
msas = pandas.read_excel(r'data/USCensus/Delineations/list1_Sep_2018.xls', header=2)
pop = pandas.read_excel(r'data/USCensus/Population/co-est2018-alldata.xls')

# Load the different coronavirus datasets
edt = datetime.timezone(datetime.timedelta(hours=-4), name='US Eastern Daylight Time')

# I've created a standardized notation for up-to-date coronavirus datasets.
# These are dictionaries with the following keys:
#   cases            - the cases per day per county dataset
#   deaths           - deaths per day per county
#   credit           - attribution
#   daterange        - the range of dates that can be queried; dates out of range won't work
#   dateformat_cases / dateformat_deaths - lambdas that format a date into the column name for that day's data
#   get_county       - a lambda that takes cases or deaths and a row of the msas dataset and returns a mask for the matching row(s)

# This is the USAFacts dataset
## This seems to be updated by 1 am EDT
latestdate = (datetime.datetime.combine(datetime.datetime.now(tz=edt).date(), datetime.time())
              + (datetime.timedelta(days=-2) if datetime.datetime.now(tz=edt).time() < datetime.time(1, 0)
                 else datetime.timedelta(days=-1)))
usafacts = {'cases' : pandas.read_csv('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_confirmed_usafacts.csv'),
            'deaths': pandas.read_csv('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_deaths_usafacts.csv'),
            'credit': 'USAFacts. https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/',
            'daterange': pandas.date_range(datetime.datetime(2020, 1, 22, 0, 0), latestdate),
            'dateformat_cases' : lambda x: '{dt.month}/{dt.day}/{short}'.format(dt=x, short=str(x.year)[-2:]),  # two-digit year
            'dateformat_deaths': lambda x: '{dt.month}/{dt.day}/{dt.year}'.format(dt=x),
            'get_county': lambda data, m: data.countyFIPS == int(m['FIPS State Code'] * 1000 + m['FIPS County Code'])
            }
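# In[ ]:

# A minimal usage sketch of the standardized dataset dictionaries defined above
# (illustration only, not part of the analysis). It pulls the confirmed-case count
# for a single county on a single day. The county chosen here, Bernalillo County, NM
# (FIPS 35001), is an arbitrary example; this assumes the FIPS columns load as
# numbers (as the code above requires) and that the downloaded CSV includes a
# column for the last day of 'daterange'.
example_day = usafacts['daterange'][-1]
example_county = msas.loc[(msas['FIPS State Code'] == 35) & (msas['FIPS County Code'] == 1)].iloc[0]
example_mask = usafacts['get_county'](usafacts['cases'], example_county)
example_column = usafacts['dateformat_cases'](example_day)
print('Cases in ' + example_county['County/County Equivalent'] + ' on ' + example_column + ': '
      + str(int(usafacts['cases'].loc[example_mask, example_column].sum())))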
# This is the dataset made by Tom Quisel, CSBS, and 1point3acres
## This seems to update as of 2 am Eastern the next day
## Note that county names do not include the designation for counties/parishes/etc.
## in that state, so those designations must be removed from the msas data labels
latestdate = (datetime.datetime.combine(datetime.datetime.now(tz=edt).date(), datetime.time())
              + (datetime.timedelta(days=-2) if datetime.datetime.now(tz=edt).time() < datetime.time(2, 0)
                 else datetime.timedelta(days=-1)))
acres = {'cases' : pandas.read_csv('https://raw.githubusercontent.com/tomquisel/covid19-data/master/data/csv/timeseries_confirmed.csv'),
         'deaths': pandas.read_csv('https://raw.githubusercontent.com/tomquisel/covid19-data/master/data/csv/timeseries_death.csv'),
         'credit': 'Tom Quisel, CSBS (Conference of State Bank Supervisors), and 1point3acres. https://github.com/tomquisel/covid19-data, https://www.csbs.org/information-covid-19-coronavirus, and https://coronavirus.1point3acres.com/en',
         'daterange': pandas.date_range(datetime.datetime(2020, 3, 14, 0, 0), latestdate),
         'dateformat_cases' : lambda x: x.strftime('%Y-%m-%d'),
         'dateformat_deaths': lambda x: x.strftime('%Y-%m-%d'),
         'get_county': lambda data, m: ((data['County_Name'] == ' '.join(m['County/County Equivalent'].split(' ')[:-1]))
                                        & (data['State_Name'] == m['State Name']))
         }

# Now that the data are loaded, pick a dataset and dates to run the COVID-19 regression on. This is a linear regression of the natural log of normalized active case counts (i.e., confirmed cases with deaths removed) against time. These assumptions all replicate reference [1], above.
#
# Note: this takes a few minutes to run! Set chatty to True to see all the places that can't be included.

# In[2]:

# First, select a dataset to explore.
dataset_select = widgets.Select(
    options=['USAFacts', '1point3acres'],
    value='USAFacts',
    description='Dataset:',
    disabled=False)
dataset_select
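# In[ ]:

# A toy sketch of the regression described above (illustration only; these numbers
# are made up). The series is normalized so the first day equals 1, and ln(counts)
# is regressed on the day index; the slope estimates the exponential growth
# ("attack") rate that the analysis below computes per MSA.
toy_series = [12, 18, 25, 40, 61, 90]  # hypothetical active case counts
toy_norm = [c / toy_series[0] for c in toy_series]
toy_slope, toy_intercept, toy_r, toy_p, toy_se = scipy.stats.linregress(
    range(len(toy_norm)), [np.log(c) for c in toy_norm])
print('Estimated growth rate: {:.3f}/day (r = {:.3f})'.format(toy_slope, toy_r))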
# In[21]:

# Assemble the state, metro/micro areas, pop data, and covid cases
## First pick the dataset and define a selection slider for the analysis
if dataset_select.value == 'USAFacts':
    dataset = usafacts
elif dataset_select.value == '1point3acres':
    dataset = acres

date_range_slider = widgets.SelectionRangeSlider(
    options=[(d.strftime('%m/%d'), d) for d in dataset['daterange']],
    #index=(min([d for d in dataset['daterange']]), max([d for d in dataset['daterange']])),
    description='Dates:',
    orientation='horizontal',
    layout={'width': '500px'})

def collate_analyze(dateTuple, msas, pop, dataset, remove_deaths=True, filtering='None', chatty=False):
    """Utility for ipywidgets.

    This function takes as input the US Census MSA definitions and 2018 ACS data,
    county-level coronavirus case and death counts, and the start and end dates
    for the statistical regression, in keeping with the methodology outlined in
    [1]. It tabulates these data into the global DataFrame df. If chatty is True,
    it prints lots of output about which cities aren't included.
    """
    global df  # to remain accessible after the function is run
    dateStart = datetime.datetime.combine(dateTuple[0], datetime.time())
    dateEnd = datetime.datetime.combine(dateTuple[1], datetime.time())
    output = []         # to store the actual data to be exported and used
    popnotfound = []    # to store FIPS codes of counties that can't be found
    covidnotfound = []  # to store FIPS codes of counties without cases
    covidnotincl = []   # to store CBSA codes of metros excluded
    days = (dateEnd - dateStart).days + 1  # number of days to be included in the analysis
    col_names = ['CBSA', 'Title', 'MetroMicro', 'Pop2018', 'COVIDEnd', 'AttackRate', 'r']

    # Leaving out Puerto Rico because it is not in the census data; I'm sorry! I hope to remedy this.
    # The [:-4] drops the footnote rows at the bottom of the delineation spreadsheet.
    for cbsa in msas.loc[(msas['FIPS State Code'] != 72)]['CBSA Code'][:-4].unique():
        # Get the MSA information
        #cbsa = '10740'  # ABQ metro CBSA code, for testing
        counties = msas.loc[msas['CBSA Code'] == cbsa]
        if chatty == True:
            print('Starting on ' + str(counties.iloc[0]['CBSA Title']))
        # Add the metro area name and its metro/micro status to the output
        row = [cbsa,
               counties.loc[counties.index[0]]['CBSA Title'],
               counties.loc[counties.index[0]]['Metropolitan/Micropolitan Statistical Area']]
        # For all state and county codes, go through and select the relevant pop data
        pop_total = 0              # to sum the population
        covid_last = 0             # the total number of cases on the last day of the series
        covid_series = [0] * days  # this stores active cases only, not people who died

        # Loop through every constituent county to get the population as well as the COVID cases
        for ci in counties.index:
            # Get the FIPS code for working with census data
            county = counties.loc[ci]
            s, c = (county['FIPS State Code'], county['FIPS County Code'])
            fips = int(s * 1000 + c)  # str(int(s)) + '0'*(3-len(str(int(c)))) + str(int(c))
            # Add this county's population to the MSA total
            if any((pop.STATE == int(s)) & (pop.COUNTY == int(c))):
                pop_total += int(pop.loc[(pop.STATE == int(s)) & (pop.COUNTY == int(c))]['POPESTIMATE2018'])
            else:
                # If the FIPS code doesn't exist, skip this county but record why.
                if chatty == True:
                    print(str(fips) + ' was not found in the ACS 2018 data.')
                popnotfound.append(fips)

            # Go through the chosen COVID-19 data and add the confirmed case numbers.
            # If the county is in the data, add it to the total for this MSA.
            if any(dataset['get_county'](dataset['cases'], county)):
                # Remove deaths if called for by the methodology
                if remove_deaths == True:
                    # When data contain duplicates, it is assumed that together they are the true total.
                    covid_last += (int(sum(dataset['cases'].loc[dataset['get_county'](dataset['cases'], county)][dataset['dateformat_cases'](dateEnd)]))
                                   - int(sum(dataset['deaths'].loc[dataset['get_county'](dataset['deaths'], county)][dataset['dateformat_deaths'](dateEnd)])))
                    # Add the total number of active cases to each day
                    for i, d in zip(range(days), pandas.date_range(dateStart, dateEnd)):
                        covid_series[i] += (int(sum(dataset['cases'].loc[dataset['get_county'](dataset['cases'], county)][dataset['dateformat_cases'](d)]))
                                            - int(sum(dataset['deaths'].loc[dataset['get_county'](dataset['deaths'], county)][dataset['dateformat_deaths'](d)])))
                # Otherwise, don't remove the deaths; just count confirmed cases
                else:
                    # When data contain duplicates, it is assumed that together they are the true total.
                    covid_last += int(sum(dataset['cases'].loc[dataset['get_county'](dataset['cases'], county)][dataset['dateformat_cases'](dateEnd)]))
                    # Add the total number of cases to each day
                    for i, d in zip(range(days), pandas.date_range(dateStart, dateEnd)):
                        covid_series[i] += int(sum(dataset['cases'].loc[dataset['get_county'](dataset['cases'], county)][dataset['dateformat_cases'](d)]))
            else:
                if chatty == True:
                    print(str(fips) + ' was not found in the COVID data.')
                covidnotfound.append(fips)

        row.append(pop_total)
        row.append(covid_last)

        # Now filter the data using any of the predefined criteria
        if filtering == 'Chicago':
            if any(pandas.isna(covid_series)) or (covid_series[-1] <= 3) or any([cs <= 0 for cs in covid_series]):
                if chatty == True:
                    print(row[1] + ' had too few cases for inclusion')
                covidnotincl.append(cbsa)
                row.append(np.nan)
                row.append(np.nan)
            else:
                # Normalize covid_series so that the first day's count is 1
                covid_series = [cs * 1. / covid_series[0] for cs in covid_series]
                # Run a regression
                slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(range(days), [np.log(cs) for cs in covid_series])
                row.append(slope)
                row.append(r_value)
        elif filtering == 'None':
            # Run a regression
            slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(range(days), [np.log(cs) for cs in covid_series])
            row.append(slope)
            row.append(r_value)
        output.append(row)

    df = pandas.DataFrame(output, columns=col_names)
    print('\n \n \n Data are gathered for cases from ' + dateStart.strftime("%B %d, %Y") + ' to ' + dateEnd.strftime("%B %d, %Y") + '! Run the next box when ready.')

# Run with some interactivity for the dates
print('Change the dates to your window of interest, then click "Run Interact" to run the regression.')
widgets.interact_manual(collate_analyze,
                        dateTuple=date_range_slider,
                        msas=widgets.fixed(msas),
                        pop=widgets.fixed(pop),
                        dataset=widgets.fixed(dataset),
                        remove_deaths=True,
                        filtering=widgets.Select(options=['None', 'Chicago']),
                        chatty=False)
print('')
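# In[ ]:

# Optional: collate_analyze can also be called directly, without the widget
# machinery. A minimal sketch, assuming the (arbitrary, illustrative) dates below
# fall inside the selected dataset's 'daterange'; like the interactive version,
# this takes a few minutes to run and stores its result in the global df.
collate_analyze((datetime.date(2020, 3, 14), datetime.date(2020, 3, 24)),
                msas, pop, dataset, remove_deaths=True, filtering='Chicago', chatty=False)
print(df.head())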
# In[14]:

def plot_some_figures(df):
    """Make plots."""
    ####################################################
    # Now that the data are collated, generate the graphs

    # Plot the chart from figure 1a of the arXiv preprint
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
    fig.set_figheight(12)
    fig.set_figwidth(12)
    points, = ax1.plot([np.log(x) for x in df.Pop2018], [np.log(y) for y in df.AttackRate], 'bo')

    # Now run a linear regression to estimate any potential power-law behavior.
    # Only include rows where an attack rate could be estimated and the area is a
    # Metropolitan (not Micropolitan) Statistical Area.
    which = df.index[(pandas.isna(df.AttackRate) == False) & (df.MetroMicro == 'Metropolitan Statistical Area')]
    global slope, intercept
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress([np.log(x) for x in df.Pop2018[which]],
                                                                         [np.log(y) for y in df.AttackRate[which]])
    line, = ax1.plot([np.log(x) for x in df.Pop2018], [np.log(x) * slope + intercept for x in df.Pop2018], 'k-')
    ax1.set_xlabel('log(MSA Population)')
    ax1.set_ylabel('log(Estimated attack rate)')
    ax1.set_title('Attack rate vs. pop (r = {:0.3f}, p = {:0.3f}, exponent {:1.4f})'.format(r_value, p_value, slope))

    # Look at how well correlation describes the growth curves for the selected dates
    ax2.hist(df.r[which][pandas.isna(df.r[which]) == False], bins=np.arange(0.0, 1.10, .10))
    ax2.set_title('Correlations for city-by-city exponential growth')

    # Make a Q-Q plot
    # Calculate the residual distribution of the data
    residuals = [np.log(y) - (intercept + slope * np.log(x)) for x, y in zip(df.Pop2018[which], df.AttackRate[which])]
    (osm, osr), (qqslope, qqintercept, qqr) = scipy.stats.probplot(residuals, fit=True, plot=ax3)
    ax3.set_title('Q-Q plot against normal (slope = %f, r = %f)' % (qqslope, qqr))
    plt.show()

widgets.interact(plot_some_figures, df=widgets.fixed(df))
print('')
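# In[ ]:

# A small sketch of what the fitted power law implies (illustration only; assumes
# the plotting cell above has been run so that slope and intercept are defined).
# The fit models AttackRate ~ exp(intercept) * Pop**slope, so an expected attack
# rate for a hypothetical MSA population can be read off the regression line.
example_pop = 10**6  # hypothetical MSA population
print('Predicted attack rate for a population of {:,}: {:.4f}'.format(
    example_pop, np.exp(intercept + slope * np.log(example_pop))))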
# In[20]:

def lookup_a_city(df, pickacity):
    w = df.index[df.Title == pickacity]
    if len(w) > 1:
        print('Uh-oh, more than one city by that name!')
    else:
        w = w[0]
        print(str(df.Title[w]))
        print(str(df.MetroMicro[w]))
        print('Population (2018 ACS estimate): %d ' % (df.Pop2018[w]))
        print('COVID cases by ' + date_range_slider.value[1].strftime("%B %d, %Y") + ': ' + str(df.COVIDEnd[w]))
        if pandas.isna(df.AttackRate[w]):
            print('There was not sufficient data (or another error occurred) to estimate a growth rate')
        else:
            print('COVID-19 attack rate (from regression): %f' % (df.AttackRate[w]))
            print('Correlation for that regression: %f' % (df.r[w]))
            # R approximated as 1 + growth rate * 4.5-day serial interval
            print('Subsequent R: %f' % ((df.AttackRate[w] * 4.5) + 1))
            print('Residual for the power-law regression: %f' % (np.log(df.AttackRate[w]) - (intercept + slope * np.log(df.Pop2018[w]))))

cityselector = widgets.Select(options=df.Title[df.MetroMicro == 'Metropolitan Statistical Area'],
                              description='Pick a city to examine specifics')
widgets.interact(lookup_a_city, df=widgets.fixed(df), pickacity=cityselector)
print('')
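# In[ ]:

# A worked example of the 'Subsequent R' figure printed above (illustration only;
# the growth rate here is made up). The hard-coded 4.5 appears to act as a serial
# interval in days, using the standard linearization R ~= 1 + r * tau between an
# exponential growth rate r and the reproduction number R.
example_rate = 0.25    # hypothetical exponential growth rate per day
serial_interval = 4.5  # days, matching the constant in lookup_a_city above
print('Implied R: %.2f' % (1 + example_rate * serial_interval))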