#!/usr/bin/env python
# coding: utf-8
# # COVID-19 Data Analytics
# ### Annie Bryan
# #### Imports
# In[1]:
import csv
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
from scipy.signal import argrelextrema
from datetime import datetime, timedelta
# #### Obtaining data
# In[2]:
url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv"
df_raw = pd.read_csv(url)
df = df_raw.iloc[:,np.r_[6,11:len(df_raw.columns)]]
# #### Cleaning data
# In[3]:
# aggregate by state
aggregated_df = df.groupby(by="Province_State").sum().reset_index()
states = set(pd.read_csv("https://raw.githubusercontent.com/jasonong/List-of-US-States/master/states.csv")["State"])
is_state = aggregated_df["Province_State"].isin(states)
df_by_state = aggregated_df[is_state].reset_index().iloc[:,1:]
# #### Exploring data
# In[4]:
# new daily cases
states = df_by_state.iloc[:,0]
daily_data = df_by_state.iloc[:,1:len(df_by_state.columns)].diff(axis=1).iloc[:,1:len(df_by_state.columns)].applymap(lambda i: int(i))
daily_cases = pd.concat([states, daily_data], axis=1)
# In[5]:
def df_n_day_avg(df, n):
states = df.iloc[:,0]
data = df.rolling(n, axis=1, center=True).mean().iloc[:,int((n-1)/2):len(df.columns)-int((n+1)/2)].applymap(lambda i: int(i))
return pd.concat([states, data], axis=1)
# In[6]:
# 3 day average
df_3_day_avg = df_n_day_avg(daily_cases, 3)
# 5 day average
df_5_day_avg = df_n_day_avg(daily_cases, 5)
# 7 day average
df_7_day_avg = df_n_day_avg(daily_cases, 7)
# In[7]:
def series_n_day_avg(series, n):
return series.rolling(n, center=True).mean().iloc[int((n-1)/2):len(series)-int((n-1)/2)].apply(lambda i: int(i))
# In[8]:
# aggregate across country
us_daily_cases = daily_cases.sum()[1:]
# 3 day average
series_3_day_avg = series_n_day_avg(us_daily_cases, 3)
# 5 day average
series_5_day_avg = series_n_day_avg(us_daily_cases, 5)
# 7 day average
series_7_day_avg = series_n_day_avg(us_daily_cases, 7)
# #### Visualizing data
# In[9]:
# initialize plot
plt.figure(figsize=(20,10))
# plot daily cases as bar graph
dates = us_daily_cases.index
cases = list(us_daily_cases)
plt.bar(dates, cases, color='r')
# plot 3 day average as line graph
dates_3_day_avg = series_3_day_avg.index
cases_3_day_avg = list(series_3_day_avg)
plt.plot(dates_3_day_avg, cases_3_day_avg, color='y')
# plot 5 day average as line graph
dates_5_day_avg = series_5_day_avg.index
cases_5_day_avg = list(series_5_day_avg)
plt.plot(dates_5_day_avg, cases_5_day_avg, color='g')
# plot 7 day average as line graph
dates_7_day_avg = series_7_day_avg.index
cases_7_day_avg = list(series_7_day_avg)
plt.plot(dates_7_day_avg, cases_7_day_avg, color='b')
# define x-axis ticks
ticks = list(us_daily_cases.index)[::7]
plt.xticks(ticks, rotation=90)
# define title and labels
plt.title("COVID-19 US Cases")
plt.ylabel('Daily Cases')
plt.legend(["3 day average", "5 day average", "7 day average", "Daily cases"])
# save to png
filename = "us-covid-cases.png"
plt.savefig(filename, bbox_inches = 'tight', optimize=True)
# plot chart below
plt.show()
# In[10]:
# get local maxima
maxima = argrelextrema(np.array(cases_7_day_avg), np.greater)
# get local minima
minima = argrelextrema(np.array(cases_7_day_avg), np.less)
# Based on visual observation of the plotted data along with the maxima and minima values, I was able to divide the time series into 4 groups.
#
# - the initial rise of the virus
# - the first decline
# - the second wave
# - the second decline
#
# I will analyze these 4 groups separately in order to more accurately model the data with best-fit curves.
# In[14]:
# initial rise (feb 25 - apr 7)
initial_rise_dates = dates_7_day_avg[30:73]
initial_rise_cases = cases_7_day_avg[30:73]
# first decline (apr 7 - june 6)
first_decline_dates = dates_7_day_avg[72:133]
first_decline_cases = cases_7_day_avg[72:133]
# second wave (june 6 - july 19)
second_wave_dates = dates_7_day_avg[132:176]
second_wave_cases = cases_7_day_avg[132:176]
# second decline (july 19 - present)
second_decline_dates = dates_7_day_avg[160:]
second_decline_cases = cases_7_day_avg[160:]
# In[15]:
# defining functions to represent data
def logistic(x, yo, k, r):
return (k*yo)/(yo + (k-yo)*np.exp(-r*x))
def exponential(x, a, b):
return a*np.exp(b*x)
def parabola(x, a, b, c):
return a*x**2 + b*x + c
def linear(x, a, b):
return a*x + b
# In[16]:
def get_best_fit(data, best_fit_fn):
popt, pcov = curve_fit(best_fit_fn, list(range(len(data))), data)
best_fit_vals = [best_fit_fn(t, *popt) for t in range(len(data))]
residuals = [d - b for d, b in zip(data, best_fit_vals)]
ss_res = np.sum([i**2 for i in residuals])
ss_tot = np.sum((data-np.mean(data))**2)
r_squared = 1 - (ss_res / ss_tot)
return best_fit_vals, r_squared
# In[17]:
# Initial Rise
plt.rcParams['figure.figsize'] = [18, 5]
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3)
best_fit_vals, r_squared = get_best_fit(initial_rise_cases, logistic)
ax1.plot(initial_rise_dates, initial_rise_cases, color='k')
ax1.plot(initial_rise_dates, best_fit_vals, color='b')
ticks = list(initial_rise_dates)[::7]
ax1.set_xticks(ticks)
ax1.set_xticklabels(ticks, rotation=90)
ax1.legend(["7 day average of daily cases", "logistic best fit"])
ax1.set_title("R2: " + str(format(r_squared, '.3f')))
best_fit_vals, r_squared = get_best_fit(initial_rise_cases, parabola)
ax2.plot(initial_rise_dates, initial_rise_cases, color='k')
ax2.plot(initial_rise_dates, best_fit_vals, color='b')
ticks = list(initial_rise_dates)[::7]
ax2.set_xticks(ticks)
ax2.set_xticklabels(ticks, rotation=90)
ax2.legend(["7 day average of daily cases", "parabolic best fit"])
ax2.set_title("R2: " + str(format(r_squared, '.3f')))
best_fit_vals, r_squared = get_best_fit(initial_rise_cases, exponential)
ax3.plot(initial_rise_dates, initial_rise_cases, color='k')
ax3.plot(initial_rise_dates, best_fit_vals, color='b')
ticks = list(initial_rise_dates)[::7]
ax3.set_xticks(ticks)
ax3.set_xticklabels(ticks, rotation=90)
ax3.legend(["7 day average of daily cases", "exponential best fit"])
ax3.set_title("R2: " + str(format(r_squared, '.3f')))
plt.show()
# In[18]:
# First Decline
plt.rcParams['figure.figsize'] = [10, 5]
fig, ax1 = plt.subplots(ncols=1)
best_fit_vals, r_squared = get_best_fit(first_decline_cases, linear)
ax1.plot(first_decline_dates, first_decline_cases, color='k')
ax1.plot(first_decline_dates, best_fit_vals, color='b')
ax1.legend(["7 day average of daily cases", "linear best fit"])
ax1.set_title("R2: " + str(format(r_squared, '.3f')))
ticks = list(first_decline_dates)[::7]
ax1.set_xticks(ticks)
ax1.set_xticklabels(ticks, rotation=90)
plt.show()
# In[19]:
# Second Wave
plt.rcParams['figure.figsize'] = [15, 5]
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3)
best_fit_vals, r_squared = get_best_fit(second_wave_cases, logistic)
ax1.plot(second_wave_dates, second_wave_cases, color='k')
ax1.plot(second_wave_dates, best_fit_vals, color='b')
ax1.legend(["7 day average of daily cases", "logistic best fit"])
ax1.set_title("R2: " + str(format(r_squared, '.3f')))
best_fit_vals, r_squared = get_best_fit(second_wave_cases, parabola)
ax2.plot(second_wave_dates, second_wave_cases, color='k')
ax2.plot(second_wave_dates, best_fit_vals, color='b')
ax2.legend(["7 day average of daily cases", "parabolic best fit"])
ax2.set_title("R2: " + str(format(r_squared, '.3f')))
best_fit_vals, r_squared = get_best_fit(second_wave_cases, linear)
ax3.plot(second_wave_dates, second_wave_cases, color='k')
ax3.plot(second_wave_dates, best_fit_vals, color='b')
ax3.legend(["7 day average of daily cases", "linear best fit"])
ax3.set_title("R2: " + str(format(r_squared, '.3f')))
ticks = list(second_wave_dates)[::7]
ax1.set_xticks(ticks)
ax1.set_xticklabels(ticks, rotation=90)
ax2.set_xticks(ticks)
ax2.set_xticklabels(ticks, rotation=90)
ax3.set_xticks(ticks)
ax3.set_xticklabels(ticks, rotation=90)
plt.show()
# ### Analyzing the effects of a state-mandated stay-at-home order
# **Null Hypothesis:** Stay-at-home policies do not slow the rate of spread of the COVID-19 virus.
# **Alternative Hypothesis:** Stay-at-home policies slow the rate of spread of the COVID-19 virus.
# **Significance Level:** $\alpha$ = 0.05
# States that did not implement a stay at home order:
#
# - Arkansas
# - Iowa
# - Nebraska
# - North Dakota
# - South Dakota
#
# #### \#TODO calculate each of the following
# growth rate before policy
# growth rate during policy
# growth rate after policy
# In[20]:
# reading in CSV file as pandas dataframe
stay_at_home_df = pd.read_csv(open('stay-at-home-orders.csv'))
states = set(stay_at_home_df['State'])
stay_at_home_df
# In[21]:
is_state = df_by_state["Province_State"].isin(states)
df = df_by_state[is_state].reset_index().iloc[:,1:]
df
# In[22]:
def get_enacted_dates(state):
dates = stay_at_home_df.loc[stay_at_home_df["State"]==state, ["Date Enacted", "Date Lifted"]].values.flatten().tolist()
return dates
def get_all_state_data(state):
is_state = df_7_day_avg["Province_State"]==state
all_state_data = df_7_day_avg[is_state].reset_index().iloc[:,1:]
return all_state_data
def get_data_before_enacted(state):
all_dates = list(df_7_day_avg.columns[1:])
date_enacted = get_enacted_dates(state)[0]
dates_before_enacted = all_dates[:all_dates.index(date_enacted)+1]
all_state_data = get_all_state_data(state)
state_data_before_enacted = all_state_data[dates_before_enacted].values.flatten().tolist()
first_non_zero_day = next((index for index,value in enumerate(state_data_before_enacted) if value != 0), None)
return state_data_before_enacted[first_non_zero_day:]
def get_data_while_enacted(state):
all_dates = list(df_7_day_avg.columns[1:])
date_enacted, date_lifted = get_enacted_dates(state)
dates_while_enacted = all_dates[all_dates.index(date_enacted):all_dates.index(date_lifted)+1]
all_state_data = get_all_state_data(state)
state_data_while_enacted = all_state_data[dates_while_enacted].values.flatten().tolist()
return state_data_while_enacted
def get_data_after_lifted(state):
all_dates = list(df_7_day_avg.columns[1:])
date_lifted = get_enacted_dates(state)[1]
dates_after_lifted = all_dates[all_dates.index(date_lifted):]
all_state_data = get_all_state_data(state)
state_data_after_lifted = all_state_data[dates_after_lifted].values.flatten().tolist()
return state_data_after_lifted