#!/usr/bin/env python
# coding: utf-8

# # Urban scaling and coronavirus
#
# This notebook is a replication attempt of Stier et al.'s preprint on arXiv [1]. It brings together MSA definitions and census data to allow demographic calculations for MSAs in relation to the coronavirus outbreak. The MSAs are county-level units delineated by the Census Bureau (see https://www.census.gov/programs-surveys/metro-micro/about/delineation-files.html). The coronavirus outbreak data are provided by USAFacts (https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/, last downloaded March 25th, 2020 14:34 EDT).
#
# .. [1] Andrew J. Stier, Marc G. Berman, and Luis M. A. Bettencourt. March 23rd, 2020. COVID-19 attack rate increases with city size. arXiv:2003.10376v1 [q-bio.PE] https://arxiv.org/abs/2003.10376v1

# In[8]:

# Import key packages
import numpy as np
import pandas
import datetime  # the covid datasets use datetimes as column indices
import string
import scipy.stats
import matplotlib.pyplot as plt
import ipywidgets as widgets

# Load the datasets: the MSA delineations, the ACS 2018 population estimates, and the USAFacts coronavirus dataset
msas = pandas.read_excel(r'data/USCensus/Delineations/list1_Sep_2018.xls', header=2)
pop = pandas.read_excel(r'data/USCensus/Population/co-est2018-alldata.xls')

# Load the different coronavirus datasets
edt = datetime.timezone(datetime.timedelta(hours=-4), name='US Eastern Daylight Time')

# I've created a standardized notation for up-to-date coronavirus datasets.
# These are dictionaries with the following keys:
#   cases            - the cases per day per county dataset
#   deaths           - deaths per day per county
#   credit           - attribution
#   daterange        - the range of dates that can be queried; dates out of range won't work
#   dateformat_cases / dateformat_deaths - lambdas that format a date into the column name for that day's data
#   get_county       - a lambda that takes cases or deaths and a row of the msas dataset and returns a mask for the matching row(s)

# This is the USAFacts dataset
## This seems to be updated by 1 am EDT
latestdate = (datetime.datetime.combine(datetime.datetime.now(tz=edt).date(), datetime.time())
              + (datetime.timedelta(days=-2) if datetime.datetime.now(tz=edt).time() < datetime.time(1, 0)
                 else datetime.timedelta(days=-1)))
usafacts = {'cases' : pandas.read_csv('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_confirmed_usafacts.csv'),
            'deaths': pandas.read_csv('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_deaths_usafacts.csv'),
            'credit': 'USAFacts. https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/',
            'daterange': pandas.date_range(datetime.datetime(2020, 1, 22, 0, 0), latestdate),
            'dateformat_cases' : lambda x: '{dt.month}/{dt.day}/{short}'.format(dt=x, short=str(x.year)[-2:]),  # two-digit year
            'dateformat_deaths': lambda x: '{dt.month}/{dt.day}/{dt.year}'.format(dt=x),
            'get_county': lambda data, m: data.countyFIPS == int(m['FIPS State Code'] * 1000 + m['FIPS County Code'])
            }
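# In[ ]:

# A minimal usage sketch of the standardized dataset dictionaries defined above
# (illustration only, not part of the analysis). It pulls the confirmed-case count
# for a single county on a single day. The county chosen here, Bernalillo County, NM
# (FIPS 35001), is an arbitrary example; this assumes the FIPS columns load as
# numbers (as the code above requires) and that the downloaded CSV includes a
# column for the last day of 'daterange'.
example_day = usafacts['daterange'][-1]
example_county = msas.loc[(msas['FIPS State Code'] == 35) & (msas['FIPS County Code'] == 1)].iloc[0]
example_mask = usafacts['get_county'](usafacts['cases'], example_county)
example_column = usafacts['dateformat_cases'](example_day)
print('Cases in ' + example_county['County/County Equivalent'] + ' on ' + example_column + ': '
      + str(int(usafacts['cases'].loc[example_mask, example_column].sum())))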
# This is the dataset made by Tom Quisel, CSBS, and 1point3acres
## This seems to update as of 2 am Eastern the next day
## Note that county names do not include the designation for counties/parishes/etc.
## in that state, so those designations must be removed from the msas data labels
latestdate = (datetime.datetime.combine(datetime.datetime.now(tz=edt).date(), datetime.time())
              + (datetime.timedelta(days=-2) if datetime.datetime.now(tz=edt).time() < datetime.time(2, 0)
                 else datetime.timedelta(days=-1)))
acres = {'cases' : pandas.read_csv('https://raw.githubusercontent.com/tomquisel/covid19-data/master/data/csv/timeseries_confirmed.csv'),
         'deaths': pandas.read_csv('https://raw.githubusercontent.com/tomquisel/covid19-data/master/data/csv/timeseries_death.csv'),
         'credit': 'Tom Quisel, CSBS (Conference of State Bank Supervisors), and 1point3acres. https://github.com/tomquisel/covid19-data, https://www.csbs.org/information-covid-19-coronavirus, and https://coronavirus.1point3acres.com/en',
         'daterange': pandas.date_range(datetime.datetime(2020, 3, 14, 0, 0), latestdate),
         'dateformat_cases' : lambda x: x.strftime('%Y-%m-%d'),
         'dateformat_deaths': lambda x: x.strftime('%Y-%m-%d'),
         'get_county': lambda data, m: ((data['County_Name'] == ' '.join(m['County/County Equivalent'].split(' ')[:-1]))
                                        & (data['State_Name'] == m['State Name']))
         }

# Now that the data are loaded, pick a dataset and dates to run the COVID-19 regression on. This is a linear regression of the natural log of normalized active case counts (i.e., confirmed cases with deaths removed) against time. These assumptions all replicate reference [1], above.
#
# Note: this takes a few minutes to run! Set chatty to True to see all the places that can't be included.

# In[2]:

# First, select a dataset to explore.
dataset_select = widgets.Select(
    options=['USAFacts', '1point3acres'],
    value='USAFacts',
    description='Dataset:',
    disabled=False)
dataset_select
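# In[ ]:

# A toy sketch of the regression described above (illustration only; these numbers
# are made up). The series is normalized so the first day equals 1, and ln(counts)
# is regressed on the day index; the slope estimates the exponential growth
# ("attack") rate that the analysis below computes per MSA.
toy_series = [12, 18, 25, 40, 61, 90]  # hypothetical active case counts
toy_norm = [c / toy_series[0] for c in toy_series]
toy_slope, toy_intercept, toy_r, toy_p, toy_se = scipy.stats.linregress(
    range(len(toy_norm)), [np.log(c) for c in toy_norm])
print('Estimated growth rate: {:.3f}/day (r = {:.3f})'.format(toy_slope, toy_r))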
# In[21]:

# Assemble the state, metro/micro areas, pop data, and covid cases
## First pick the dataset and define a selection slider for the analysis
if dataset_select.value == 'USAFacts':
    dataset = usafacts
elif dataset_select.value == '1point3acres':
    dataset = acres

date_range_slider = widgets.SelectionRangeSlider(
    options=[(d.strftime('%m/%d'), d) for d in dataset['daterange']],
    #index=(min([d for d in dataset['daterange']]), max([d for d in dataset['daterange']])),
    description='Dates:',
    orientation='horizontal',
    layout={'width': '500px'})

def collate_analyze(dateTuple, msas, pop, dataset, remove_deaths=True, filtering='None', chatty=False):
    """Utility for ipywidgets.

    This function takes as input the US Census MSA definitions and 2018 ACS data,
    county-level coronavirus case and death counts, and the start and end dates
    for the statistical regression, in keeping with the methodology outlined in
    [1]. It tabulates these data into the global DataFrame df. If chatty is True,
    it prints lots of output about which cities aren't included.
    """
    global df  # to remain accessible after the function is run
    dateStart = datetime.datetime.combine(dateTuple[0], datetime.time())
    dateEnd = datetime.datetime.combine(dateTuple[1], datetime.time())
    output = []         # to store the actual data to be exported and used
    popnotfound = []    # to store FIPS codes of counties that can't be found
    covidnotfound = []  # to store FIPS codes of counties without cases
    covidnotincl = []   # to store CBSA codes of metros excluded
    days = (dateEnd - dateStart).days + 1  # number of days to be included in the analysis
    col_names = ['CBSA', 'Title', 'MetroMicro', 'Pop2018', 'COVIDEnd', 'AttackRate', 'r']

    # Leaving out Puerto Rico because it is not in the census data; I'm sorry! I hope to remedy this.
    # The [:-4] drops the footnote rows at the bottom of the delineation spreadsheet.
    for cbsa in msas.loc[(msas['FIPS State Code'] != 72)]['CBSA Code'][:-4].unique():
        # Get the MSA information
        #cbsa = '10740'  # ABQ metro CBSA code, for testing
        counties = msas.loc[msas['CBSA Code'] == cbsa]
        if chatty == True:
            print('Starting on ' + str(counties.iloc[0]['CBSA Title']))
        # Add the metro area name and its metro/micro status to the output
        row = [cbsa,
               counties.loc[counties.index[0]]['CBSA Title'],
               counties.loc[counties.index[0]]['Metropolitan/Micropolitan Statistical Area']]
        # For all state and county codes, go through and select the relevant pop data
        pop_total = 0              # to sum the population
        covid_last = 0             # the total number of cases on the last day of the series
        covid_series = [0] * days  # this stores active cases only, not people who died

        # Loop through every constituent county to get the population as well as the COVID cases
        for ci in counties.index:
            # Get the FIPS code for working with census data
            county = counties.loc[ci]
            s, c = (county['FIPS State Code'], county['FIPS County Code'])
            fips = int(s * 1000 + c)  # str(int(s)) + '0'*(3-len(str(int(c)))) + str(int(c))
            # Add this county's population to the MSA total
            if any((pop.STATE == int(s)) & (pop.COUNTY == int(c))):
                pop_total += int(pop.loc[(pop.STATE == int(s)) & (pop.COUNTY == int(c))]['POPESTIMATE2018'])
            else:
                # If the FIPS code doesn't exist, skip this county but record why.
                if chatty == True:
                    print(str(fips) + ' was not found in the ACS 2018 data.')
                popnotfound.append(fips)

            # Go through the chosen COVID-19 data and add the confirmed case numbers.
            # If the county is in the data, add it to the total for this MSA.
            if any(dataset['get_county'](dataset['cases'], county)):
                # Remove deaths if called for by the methodology
                if remove_deaths == True:
                    # When data contain duplicates, it is assumed that together they are the true total.
                    covid_last += (int(sum(dataset['cases'].loc[dataset['get_county'](dataset['cases'], county)][dataset['dateformat_cases'](dateEnd)]))
                                   - int(sum(dataset['deaths'].loc[dataset['get_county'](dataset['deaths'], county)][dataset['dateformat_deaths'](dateEnd)])))
                    # Add the total number of active cases to each day
                    for i, d in zip(range(days), pandas.date_range(dateStart, dateEnd)):
                        covid_series[i] += (int(sum(dataset['cases'].loc[dataset['get_county'](dataset['cases'], county)][dataset['dateformat_cases'](d)]))
                                            - int(sum(dataset['deaths'].loc[dataset['get_county'](dataset['deaths'], county)][dataset['dateformat_deaths'](d)])))
                # Otherwise, don't remove the deaths; just count confirmed cases
                else:
                    # When data contain duplicates, it is assumed that together they are the true total.
                    covid_last += int(sum(dataset['cases'].loc[dataset['get_county'](dataset['cases'], county)][dataset['dateformat_cases'](dateEnd)]))
                    # Add the total number of cases to each day
                    for i, d in zip(range(days), pandas.date_range(dateStart, dateEnd)):
                        covid_series[i] += int(sum(dataset['cases'].loc[dataset['get_county'](dataset['cases'], county)][dataset['dateformat_cases'](d)]))
            else:
                if chatty == True:
                    print(str(fips) + ' was not found in the COVID data.')
                covidnotfound.append(fips)

        row.append(pop_total)
        row.append(covid_last)

        # Now filter the data using any of the predefined criteria
        if filtering == 'Chicago':
            if any(pandas.isna(covid_series)) or (covid_series[-1] <= 3) or any([cs <= 0 for cs in covid_series]):
                if chatty == True:
                    print(row[1] + ' had too few cases for inclusion')
                covidnotincl.append(cbsa)
                row.append(np.nan)
                row.append(np.nan)
            else:
                # Normalize covid_series so that the first day's count is 1
                covid_series = [cs * 1. / covid_series[0] for cs in covid_series]
                # Run a regression
                slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(range(days), [np.log(cs) for cs in covid_series])
                row.append(slope)
                row.append(r_value)
        elif filtering == 'None':
            # Run a regression
            slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(range(days), [np.log(cs) for cs in covid_series])
            row.append(slope)
            row.append(r_value)
        output.append(row)

    df = pandas.DataFrame(output, columns=col_names)
    print('\n \n \n Data are gathered for cases from ' + dateStart.strftime("%B %d, %Y") + ' to ' + dateEnd.strftime("%B %d, %Y") + '! Run the next box when ready.')

# Run with some interactivity for the dates
print('Change the dates to your window of interest, then click "Run Interact" to run the regression.')
widgets.interact_manual(collate_analyze,
                        dateTuple=date_range_slider,
                        msas=widgets.fixed(msas),
                        pop=widgets.fixed(pop),
                        dataset=widgets.fixed(dataset),
                        remove_deaths=True,
                        filtering=widgets.Select(options=['None', 'Chicago']),
                        chatty=False)
print('')
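# In[ ]:

# Optional: collate_analyze can also be called directly, without the widget
# machinery. A minimal sketch, assuming the (arbitrary, illustrative) dates below
# fall inside the selected dataset's 'daterange'; like the interactive version,
# this takes a few minutes to run and stores its result in the global df.
collate_analyze((datetime.date(2020, 3, 14), datetime.date(2020, 3, 24)),
                msas, pop, dataset, remove_deaths=True, filtering='Chicago', chatty=False)
print(df.head())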
# In[14]:

def plot_some_figures(df):
    """Make plots."""
    ####################################################
    # Now that the data are collated, generate the graphs

    # Plot the chart from figure 1a of the arXiv preprint
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
    fig.set_figheight(12)
    fig.set_figwidth(12)
    points, = ax1.plot([np.log(x) for x in df.Pop2018], [np.log(y) for y in df.AttackRate], 'bo')

    # Now run a linear regression to estimate any potential power-law behavior.
    # Only include rows where an attack rate could be estimated and the area is a
    # Metropolitan (not Micropolitan) Statistical Area.
    which = df.index[(pandas.isna(df.AttackRate) == False) & (df.MetroMicro == 'Metropolitan Statistical Area')]
    global slope, intercept
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress([np.log(x) for x in df.Pop2018[which]],
                                                                         [np.log(y) for y in df.AttackRate[which]])
    line, = ax1.plot([np.log(x) for x in df.Pop2018], [np.log(x) * slope + intercept for x in df.Pop2018], 'k-')
    ax1.set_xlabel('log(MSA Population)')
    ax1.set_ylabel('log(Estimated attack rate)')
    ax1.set_title('Attack rate vs. pop (r = {:0.3f}, p = {:0.3f}, exponent {:1.4f})'.format(r_value, p_value, slope))

    # Look at how well correlation describes the growth curves for the selected dates
    ax2.hist(df.r[which][pandas.isna(df.r[which]) == False], bins=np.arange(0.0, 1.10, .10))
    ax2.set_title('Correlations for city-by-city exponential growth')

    # Make a Q-Q plot
    # Calculate the residual distribution of the data
    residuals = [np.log(y) - (intercept + slope * np.log(x)) for x, y in zip(df.Pop2018[which], df.AttackRate[which])]
    (osm, osr), (qqslope, qqintercept, qqr) = scipy.stats.probplot(residuals, fit=True, plot=ax3)
    ax3.set_title('Q-Q plot against normal (slope = %f, r = %f)' % (qqslope, qqr))
    plt.show()

widgets.interact(plot_some_figures, df=widgets.fixed(df))
print('')
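# In[ ]:

# A small sketch of what the fitted power law implies (illustration only; assumes
# the plotting cell above has been run so that slope and intercept are defined).
# The fit models AttackRate ~ exp(intercept) * Pop**slope, so an expected attack
# rate for a hypothetical MSA population can be read off the regression line.
example_pop = 10**6  # hypothetical MSA population
print('Predicted attack rate for a population of {:,}: {:.4f}'.format(
    example_pop, np.exp(intercept + slope * np.log(example_pop))))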
# In[20]:

def lookup_a_city(df, pickacity):
    w = df.index[df.Title == pickacity]
    if len(w) > 1:
        print('Uh-oh, more than one city by that name!')
    else:
        w = w[0]
        print(str(df.Title[w]))
        print(str(df.MetroMicro[w]))
        print('Population (2018 ACS estimate): %d ' % (df.Pop2018[w]))
        print('COVID cases by ' + date_range_slider.value[1].strftime("%B %d, %Y") + ': ' + str(df.COVIDEnd[w]))
        if pandas.isna(df.AttackRate[w]):
            print('There was not sufficient data (or another error occurred) to estimate a growth rate')
        else:
            print('COVID-19 attack rate (from regression): %f' % (df.AttackRate[w]))
            print('Correlation for that regression: %f' % (df.r[w]))
            # R approximated as 1 + growth rate * 4.5-day serial interval
            print('Subsequent R: %f' % ((df.AttackRate[w] * 4.5) + 1))
            print('Residual for the power-law regression: %f' % (np.log(df.AttackRate[w]) - (intercept + slope * np.log(df.Pop2018[w]))))

cityselector = widgets.Select(options=df.Title[df.MetroMicro == 'Metropolitan Statistical Area'],
                              description='Pick a city to examine specifics')
widgets.interact(lookup_a_city, df=widgets.fixed(df), pickacity=cityselector)
print('')
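# In[ ]:

# A worked example of the 'Subsequent R' figure printed above (illustration only;
# the growth rate here is made up). The hard-coded 4.5 appears to act as a serial
# interval in days, using the standard linearization R ~= 1 + r * tau between an
# exponential growth rate r and the reproduction number R.
example_rate = 0.25    # hypothetical exponential growth rate per day
serial_interval = 4.5  # days, matching the constant in lookup_a_city above
print('Implied R: %.2f' % (1 + example_rate * serial_interval))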