#!/usr/bin/env python # coding: utf-8 # # Estimating the Date of COVID-19 Changes # In[1]: import pandas as pd import numpy as np import seaborn as sns; sns.set() import matplotlib.pyplot as plt from sklearn.linear_model import LinearRegression from scipy import stats import statsmodels.api as sm import pylab # for fancy python printing from IPython.display import Markdown, display def printmd(string): display(Markdown(string)) import warnings warnings.filterwarnings('ignore') import matplotlib as mpl mpl.rcParams['figure.dpi'] = 250 # In[2]: data = pd.read_csv("/Users/jonny/Desktop/Dataset/covid_19_counts_may2.csv") data.Date = pd.to_datetime(data.Date) # use only canada for now cad = data.loc[data["Country/Region"] == "Canada", ["Country/Region", "Date", "Confirmed", "Deaths", "Recovered"]] cad.columns = ["country", "date", "confirmed", "deaths", "recovered"] # group by country and date, sum(confirmed, deaths, recovered) cad = cad.groupby(['country','date'])['confirmed', 'deaths', 'recovered'].sum().reset_index() # convert date string to datetime cad.date = pd.to_datetime(cad.date) cad = cad.sort_values(by = "date") cad.tail() # In[4]: # countries with the most cases: data.loc[data["Date"] == "2020-04-30", ["Country/Region", "Confirmed"]].sort_values(by = "Confirmed", ascending = False)[:7] # In[5]: # function to make the time series of confirmed and daily confirmed cases for a specific country def create_country (country, end_date, state = False) : if state : df = data.loc[data["Province/State"] == country, ["Province/State", "Date", "Confirmed", "Deaths", "Recovered"]] else : df = data.loc[data["Country/Region"] == country, ["Country/Region", "Date", "Confirmed", "Deaths", "Recovered"]] df.columns = ["country", "date", "confirmed", "deaths", "recovered"] # group by country and date, sum(confirmed, deaths, recovered). do this because countries have multiple cities df = df.groupby(['country','date'])['confirmed', 'deaths', 'recovered'].sum().reset_index() # convert date string to datetime df.date = pd.to_datetime(df.date) df = df.sort_values(by = "date") df = df[df.date <= end_date] df.tail() # make new confirmed cases every day: cases_shifted = np.array([0] + list(df.confirmed[:-1])) daily_confirmed = np.array(df.confirmed) - cases_shifted df["daily_confirmed"] = daily_confirmed # moving average for daily confirmed cases df["moving_avg"] = df.daily_confirmed.rolling(window=4).mean() fig, ax = plt.subplots(1,2, figsize=(15, 6)) # plot daily confirmed cases, along with moving average #plt.figure(figsize=(11, 5)) sns.lineplot(x = df.date, y = df.daily_confirmed, #label = "Raw Data", ax = ax[1]) # sns.lineplot(x = df.date, # y = df.moving_avg, # label = "Moving Average", # legend = "full", # ax = ax[0]).set_title("Daily New Confirmed COVID-19 Cases in %s" % country) ax[1].set(ylabel='Daily Confirmed Cases', xlabel='Date', title = "Daily New Confirmed COVID-19 Cases in %s" % country) sns.lineplot(x="date", y="confirmed", data= df, ax = ax[0] ).set_title("Total Confirmed COVID-19 Cases in %s" % country) ax[0].set(ylabel='Daily Confirmed Cases', xlabel='Date'); return df def summary(samples): site_stats = {} for k, v in samples.items(): site_stats[k] = { "mean": torch.mean(v, 0), "std": torch.std(v, 0), "5%": v.kthvalue(int(len(v) * 0.05), dim=0)[0], "95%": v.kthvalue(int(len(v) * 0.95), dim=0)[0], } return site_stats # ### US # In[6]: us = create_country("US", end_date = "2020-04-30") # In[7]: us_start = "2020-02-26" # 51 confirmed cases us = us[us.date >= us_start].reset_index(drop = True) us["days_since_start"] = np.arange(us.shape[0]) + 1 # ### Canada # In[8]: cad = create_country("Canada", end_date = "2020-04-30") # In[9]: cad_start = "2020-02-27" # 13 confirmed cases cad = cad[cad.date >= cad_start].reset_index(drop = True) cad["days_since_start"] = np.arange(cad.shape[0]) + 1 # In[ ]: cad.shape cad_tmp = cad[cad.date < "2020-03-28"] cad_tmp.shape # In[ ]: ontario = create_country("Ontario", end_date = "2020-04-15", state = True) #data[data["Province/State"] == "Ontario"] # In[ ]: ontario[ontario.date > "2020-04-01"] # ### Italy # In[10]: italy = create_country("Italy", end_date = "2020-04-30") # In[11]: italy_start = "2020-02-21" italy = italy[italy.date >= italy_start].reset_index(drop = True) italy["days_since_start"] = np.arange(italy.shape[0]) + 1 # no daily confirmed cases on March 12th, probably some sort of admin error because there are 5100 cases # on march 13th. # take 2500 cases from march 13th and put them in march 12th. This will be "wrong", however hopefully any error w num_cases = 2500 italy.loc[italy.date == "2020-03-12", "daily_confirmed"] = num_cases # add cases to march12th italy.loc[italy.date == "2020-03-13", "daily_confirmed"] -= num_cases # subtract the cases from march13th # ### Spain # In[12]: spain = create_country("Spain", end_date = "2020-04-30") # In[13]: spain_start = "2020-02-25" spain = spain[spain.date >= spain_start].reset_index(drop = True) spain["days_since_start"] = np.arange(spain.shape[0]) + 1 # no cases on march 12th but 2900 cases on march 13th num_cases = 1450 spain.loc[spain.date == "2020-03-12", "daily_confirmed"] = num_cases # add cases to march12th spain.loc[spain.date == "2020-03-13", "daily_confirmed"] -= num_cases # subtract the cases from march13th # spain2 = spain2[spain2.date >= spain_start].reset_index(drop = True) # spain2["days_since_start"] = np.arange(spain2.shape[0]) + 1 # # no cases on march 12th but 2900 cases on march 13th # spain2.loc[spain2.date == "2020-03-12", "daily_confirmed"] = num_cases # add cases to march12th # spain2.loc[spain2.date == "2020-03-13", "daily_confirmed"] -= num_cases # subtract the cases from march13th # ### Germany # In[14]: germany = create_country("Germany", end_date = "2020-04-30") # In[15]: germany_start = "2020-02-25" germany = germany[germany.date >= germany_start].reset_index(drop = True) germany["days_since_start"] = np.arange(germany.shape[0]) + 1 # ### United Kingdom # In[17]: uk = create_country("United Kingdom", end_date = "2020-04-30") # In[ ]: # ### South Korea # In[18]: #"Country/Region" sk = create_country("South Korea", end_date = "2020-04-30") # ##### Data for Regression # In[ ]: # variable for data to easily swap it out: country_ = "Canada (Before March 28th)" reg_data = cad_tmp.copy() # #### POC # In[ ]: ### testing out piecewise regression to see if its reasonnable # just choose a random point to split on. difference here is that we have a closed form solution _date = 25 # piece 1 df = reg_data[reg_data.days_since_start < _date] x = np.array(df["days_since_start"]).reshape(-1, 1) # predictor is the day number, t y = np.log(np.array(df["daily_confirmed"]).reshape(-1, 1)) # response is the number of confirmed cases reg = LinearRegression() _ = reg.fit(x, y) print("Regression 1 weight: ", round(reg.coef_[0][0], 2)) print("Regression 1 bias: ", round(reg.intercept_[0], 2)) # piece 2 df2 = reg_data[reg_data.days_since_start >= _date] x2 = np.array(df2["days_since_start"]).reshape(-1, 1) # predictor is the day number, t y2 = np.log(np.array(df2["daily_confirmed"]).reshape(-1, 1)) # response is the number of confirmed cases reg2 = LinearRegression() _ = reg2.fit(x2, y2) print("Regression 2 weight: ", round(reg2.coef_[0][0], 2)) print("Regression 2 bias: ", round(reg2.intercept_[0], 2)) # plot data: fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 5)) # log regression model ax[0].scatter(x, y, s = 15); ax[0].scatter(x2, y2, s = 15, color = "red"); ax[0].plot(x, x*reg.coef_[0][0] + reg.intercept_) ax[0].plot(x2, x2*reg2.coef_[0][0] + reg2.intercept_, color = "red") ax[0].set(title = "Daily Cases: Piecewise Regression Model - Log Data") ax[1].scatter(x, np.exp(y), s = 15) ax[1].scatter(x2, np.exp(y2), s = 15, color = "red") ax[1].plot(x, np.exp(x*reg.coef_[0][0] + reg.intercept_)) ax[1].plot(x2, np.exp(x2*reg2.coef_[0][0] + reg2.intercept_), color = "red"); ax[1].set(title = "Daily Cases: Piecewise Regression Model - Original Data"); # ## Change Point Estimation in Pyro # In[ ]: import torch import pyro import pyro.distributions as dist from torch import nn from pyro.nn import PyroModule, PyroSample from pyro.infer import MCMC, NUTS, HMC from pyro.infer.autoguide import AutoGuide, AutoDiagonalNormal from pyro.infer import SVI, Trace_ELBO from pyro.infer import Predictive # In[ ]: # we should be able to have an empirical estimate for the mean of the prior for the 2nd regression bias term # this will be something like b = log(max(daily_confirmed)) # might be able to have 1 regression model but change the data so that we have new terms for (tau < t) # like an interaction term class COVID_change(PyroModule): def __init__(self, in_features, out_features, b1_mu, b2_mu): super().__init__() self.linear1 = PyroModule[nn.Linear](in_features, out_features, bias = False) self.linear1.weight = PyroSample(dist.Normal(0.5, 0.25).expand([1, 1]).to_event(1)) self.linear1.bias = PyroSample(dist.Normal(b1_mu, 1.)) # could possibly have stronger priors for the 2nd regression line, because we wont have as much data self.linear2 = PyroModule[nn.Linear](in_features, out_features, bias = False) self.linear2.weight = PyroSample(dist.Normal(0., 0.25).expand([1, 1])) #.to_event(1)) self.linear2.bias = PyroSample(dist.Normal(b2_mu, b2_mu/4)) def forward(self, x, y=None): tau = pyro.sample("tau", dist.Beta(4, 3)) sigma = pyro.sample("sigma", dist.Uniform(0., 3.)) # fit lm's to data based on tau sep = int(np.ceil(tau.detach().numpy() * len(x))) mean1 = self.linear1(x[:sep]).squeeze(-1) mean2 = self.linear2(x[sep:]).squeeze(-1) mean = torch.cat((mean1, mean2)) obs = pyro.sample("obs", dist.Normal(mean, sigma), obs=y) return mean # In[ ]: tensor_data = torch.tensor(reg_data[["daily_confirmed", "days_since_start"]].values, dtype=torch.float) x_data = tensor_data[:, 1].unsqueeze_(1) y_data = np.log(tensor_data[:, 0]) # prior hyper params # take log of the average of the 1st quartile to get the prior mean for the bias of the 2nd regression line q1 = np.quantile(y_data, q = 0.25) bias_1_mean = np.mean(y_data.numpy()[y_data <= q1]) print("Prior mean for Bias 1: ", bias_1_mean) # take log of the average of the 4th quartile to get the prior mean for the bias of the 2nd regression line q4 = np.quantile(y_data, q = 0.75) bias_2_mean = np.mean(y_data.numpy()[y_data >= q4]) print("Prior mean for Bias 2: ", bias_2_mean) # #### Approximate Inference with Stochastic Variational Inference # In[ ]: model = COVID_change(1, 1, b1_mu = bias_1_mean, b2_mu = bias_2_mean) auto_guide = AutoDiagonalNormal(model) svi = SVI(model = model, # bayesian regression class guide = auto_guide, # using auto guide optim = pyro.optim.Adam({"lr": 0.1}), # optimizer loss=Trace_ELBO()) # loss function num_iterations = 7500 # param_store is where pyro stores param estimates pyro.clear_param_store() # inference loop for j in range(num_iterations): # calculate the loss and take a gradient step loss = svi.step(x_data, y_data) if j % 750 == 0: print("[iteration %04d] loss: %.4f" % (j + 1, loss / len(y_data))) auto_guide.requires_grad_(False) num_samples = 2000 predictive = Predictive(model = model, guide = auto_guide, num_samples = num_samples, return_sites=("linear1.weight", "linear1.bias", "linear2.weight", "linear2.bias", "tau", "sigma", "_RETURN", "obs")) samples = predictive(x_data) pred_summary = summary(samples) mu = pred_summary["_RETURN"] # mean y = pred_summary["obs"] # samples from likelihood: mu + sigma predictions = pd.DataFrame({ "days_since_start": x_data[:, 0], "mu_mean": mu["mean"], # mean of likelihood "mu_perc_5": mu["5%"], "mu_perc_95": mu["95%"], "y_mean": y["mean"], # mean of likelihood + noise "y_perc_5": y["5%"], "y_perc_95": y["95%"], "true_daily_confirmed": y_data, }) predictions.head() # #### HMC with NUTS # In[ ]: model = COVID_change(1, 1, b1_mu = bias_1_mean, b2_mu = bias_2_mean) # need more than 400 samples/chain if we want to use a flat prior on b_2 and w_2 num_samples =800 # mcmc nuts_kernel = NUTS(model) mcmc = MCMC(nuts_kernel, num_samples=num_samples, warmup_steps = 150, num_chains = 4) mcmc.run(x_data, y_data) samples = mcmc.get_samples() # In[ ]: # extract individual posteriors weight_1_post = samples["linear1.weight"].detach().numpy() weight_2_post = samples["linear2.weight"].detach().numpy() bias_1_post = samples["linear1.bias"].detach().numpy() bias_2_post = samples["linear2.bias"].detach().numpy() tau_post = samples["tau"].detach().numpy() sigma_post = samples["sigma"].detach().numpy() # build likelihood distribution: tau_days = list(map(int, np.ceil(tau_post * len(x_data)))) mean_ = torch.zeros(len(tau_days), len(x_data)) obs_ = torch.zeros(len(tau_days), len(x_data)) for i in range(len(tau_days)) : mean_[i, :] = torch.cat((x_data[:tau_days[i]] * weight_1_post[i] + bias_1_post[i], x_data[tau_days[i]:] * weight_2_post[i] + bias_2_post[i])).reshape(len(x_data)) obs_[i, :] = dist.Normal(mean_[i, :], sigma_post[i]).sample() samples["_RETURN"] = mean_ samples["obs"] = obs_ pred_summary = summary(samples) mu = pred_summary["_RETURN"] # mean y = pred_summary["obs"] # samples from likelihood: mu + sigma predictions = pd.DataFrame({ "days_since_start": x_data[:, 0], "mu_mean": mu["mean"], # mean of likelihood "mu_perc_5": mu["5%"], "mu_perc_95": mu["95%"], "y_mean": y["mean"], # mean of likelihood + noise "y_perc_5": y["5%"], "y_perc_95": y["95%"], "true_daily_confirmed": y_data, }) predictions.head() w1_ = pred_summary["linear1.weight"] w2_ = pred_summary["linear2.weight"] b1_ = pred_summary["linear1.bias"] b2_ = pred_summary["linear2.bias"] tau_ = pred_summary["tau"] sigma_ = pred_summary["sigma"] ind = int(np.ceil(tau_["mean"] * len(x_data))) # ## Model Diagnostics # # - Residual plots: Should these be samples from the likelihood compared with the actual data? Or just the mean of the likelihood? # - $\hat{R}$: The factor that the scale of the current distribution will be reduced by if we were to run the simulations forever. As n tends to $\inf$, $\hat{R}$ tends to 1. So we want values close to 1. # - Mixing and Stationarity: I sampled 4 chains. Do I then take these chains, split them in half and plot them. If they converge to the same stationary distribution, does that mean the MCMC converged? What do I do with more sampled chains? # In[ ]: mcmc.summary() diag = mcmc.diagnostics() # In[ ]: n_samples = np.arange(len(weight_1_post)) _half = int(len(n_samples) / 2) fig, ax = plt.subplots(3,2, figsize=(20, 14)) ax[0,0].plot(n_samples[:_half], weight_1_post[:_half, 0]) ax[0,0].plot(n_samples[:_half], weight_1_post[_half:, 0]); ax[0,0].set(title="Weight 1, R_hat: %f" % diag["linear1.weight"]['r_hat'].numpy()) #ax[0,1].plot(n_samples, weight_2_post[:, 0]); ax[0,1].plot(n_samples[:_half], weight_2_post[:_half, 0]) ax[0,1].plot(n_samples[:_half], weight_2_post[_half:, 0]); ax[0,1].set(title="Weight 2, R_hat: %f" % diag["linear2.weight"]['r_hat'].numpy()) ax[1,0].plot(n_samples[:_half], bias_1_post[:_half]); ax[1,0].plot(n_samples[:_half], bias_1_post[_half:]); ax[1,0].set(title="Bias 1, R_hat: %f" % diag["linear1.bias"]['r_hat'].numpy()) ax[1,1].plot(n_samples[:_half], bias_2_post[:_half]); ax[1,1].plot(n_samples[:_half], bias_2_post[_half:]); ax[1,1].set(title="Bias 2, R_hat: %f" % diag["linear2.bias"]['r_hat'].numpy()) ax[2,0].plot(n_samples[:_half], tau_post[:_half]); ax[2,0].plot(n_samples[:_half], tau_post[_half:]); ax[2,0].set(title="Tau, R_hat: %f" % diag["tau"]['r_hat'].numpy()) ax[2,1].plot(n_samples[:_half], sigma_post[:_half]); ax[2,1].plot(n_samples[:_half], sigma_post[_half:]); ax[2,1].set(title="Sigma, R_hat: %f" % diag["sigma"]['r_hat'].numpy()); # ## Posterior Plots # In[ ]: print(ind) print(reg_data.date[ind]) fig, ax = plt.subplots(2,2, figsize=(18, 7)) plt.suptitle("Posterior Distributions for %s" % country_, fontsize=15) sns.distplot(weight_1_post, kde_kws = {"label": "Weight 1"}, color = "green", norm_hist = True, kde = True, ax = ax[0, 0]) #ax[0, 0].axvline(x = w1_["mean"], linestyle = '--') sns.distplot(weight_2_post, kde_kws = {"label": "Weight 2"}, color = "teal", norm_hist = True, kde = True, ax = ax[0, 0]) #ax[0, 0].axvline(x = w2_["mean"], linestyle = '--') sns.distplot(bias_1_post, kde_kws = {"label": "Bias 1"}, color = "red", norm_hist = True, kde = True, ax = ax[0, 1]); #ax[0, 1].axvline(x = b1_["mean"], linestyle = '--') sns.distplot(bias_2_post, kde_kws = {"label": "Bias 2"}, color = "orange", norm_hist = True, kde = True, ax = ax[0, 1]) #ax[0, 1].axvline(x = b2_["mean"], linestyle = '--'); sns.distplot(tau_post, kde_kws = {"label": "Tau"}, color = "blue", norm_hist = True, kde = True, ax = ax[1, 0]) # ax[1, 0].axvline(x = tau_["mean"], linestyle = '--', color = "red") # ax[1, 0].axvline(x = np.median(tau_post), linestyle = '--', color = "blue") sns.distplot(sigma_post, kde_kws = {"label": "Sigma"}, color = "orange", norm_hist = True, kde = True, ax = ax[1,1]); #ax[1, 1].axvline(x = sigma_["mean"], linestyle = '--'); # In[ ]: start_date_ = str(reg_data.date[0]).split(' ')[0] change_date_ = str(reg_data.date[ind]).split(' ')[0] print("Date of change for {}: {}".format(country_, change_date_)) # plot data: fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5)) # log regression model ax[0].scatter(y = y_data[:ind], x = x_data[:ind], s = 15); ax[0].scatter(y = y_data[ind:], x = x_data[ind:], s = 15, color = "red"); ax[0].plot(predictions["days_since_start"], predictions["y_mean"], color = "green", label = "Mean") ax[0].axvline(ind, linestyle = '--', linewidth = 1, label = "Day of Change", color = "black") ax[0].fill_between(predictions["days_since_start"], predictions["y_perc_5"], predictions["y_perc_95"], alpha = 0.25, label = "90% Credible Interval", color = "teal"); ax[0].fill_betweenx([0, 1], tau_["5%"] * len(x_data), tau_["95%"] * len(x_data), alpha = 0.25, label = "90% Credible Interval \n for tau", color = "lightcoral", transform=ax[0].get_xaxis_transform()); ax[0].set(ylabel = "log (Daily New Cases)", xlabel = "Days since %s" % start_date_, title = "Log Daily Confirmed Cases: %s" % country_) ax[0].legend(loc = "upper left") # plot daily confirmed cases, along with moving average sns.lineplot(x = "date", y = "daily_confirmed", data = reg_data, label = "New Cases", ax = ax[1]) sns.lineplot(x = "date", y = "moving_avg", data = reg_data, label = "Moving Average", ax = ax[1]).set_title("Daily New Cases in %s" % country_); ax[1].fill_betweenx([0, 1], reg_data.date[int(tau_["5%"] * len(x_data))], reg_data.date[int(tau_["95%"] * len(x_data))], alpha = 0.25, label = "90% Credible Interval \n for tau", color = "lightcoral", transform=ax[1].get_xaxis_transform()) ax[1].axvline(reg_data.date[ind], linestyle = '--', linewidth = 1, label = "Day of Change", color = "black"); plt.xticks(reg_data.date) ax[1].xaxis.set_major_locator(plt.MaxNLocator(6)) # reduce number of xticks ax[1].legend(); # In[ ]: # residual and qq plot resid = predictions["true_daily_confirmed"] - predictions["y_mean"] fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5)) ax[0].scatter(predictions["days_since_start"], resid, s = 15) ax[0].axvline(ind, linestyle = '--', linewidth = 1, label = "Est. Date of Change"); ax[0].axhline(0, linewidth = 1, color = "red"); ax[0].legend(); ax[0].set(title="Residual Plot - %s" % country_, xlabel = "Days since %s" % start_date_) sm.qqplot(resid, line='45', ax = ax[1]); ax[1].set(title = "QQ Plot of Residuals - %s" % country_); # In[ ]: fig, ax = plt.subplots(1,2, figsize=(15, 6)) #plt.figure(figsize=(11, 5)) sns.lineplot(x="date", y="confirmed", data= reg_data, ax = ax[0] ).set_title("Confirmed COVID-19 Cases in %s" % country_) ax[0].axvline(reg_data.date[ind], color="red", linestyle="--") ax[1].scatter(y = reg_data.daily_confirmed[:ind], x = x_data[:ind], s = 15); ax[1].scatter(y = reg_data.daily_confirmed[ind:], x = x_data[ind:], s = 15, color = "red"); ax[1].plot(predictions["days_since_start"], np.exp(predictions["y_mean"]), color = "green", label = "Mean") # ax[1].fill_between(predictions["days_since_start"], # np.exp(predictions["y_perc_5"]), # np.exp(predictions["y_perc_95"]), # alpha = 0.25, # label = "90% Credible Interval", # color = "teal"); ax[1].axvline(ind, linestyle = '--', linewidth = 1, label = "Day of Change") ax[1].legend(loc = "upper left") ax[1].set(ylabel = "Daily Confirmed Cases", xlabel = "Days since %s" % start_date_, title = "Daily Confirmed Cases: %s" % country_); printmd("**Date of change for {}: {}**".format(country_, change_date_)); # In[ ]: