import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from scipy import stats
import statsmodels.api as sm
import pylab
# for fancy python printing
from IPython.display import Markdown, display
def printmd(string):
display(Markdown(string))
import warnings
warnings.filterwarnings('ignore')
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 250
data = pd.read_csv("/Users/jonny/Desktop/Dataset/covid_19_counts_may2.csv")
data.Date = pd.to_datetime(data.Date)
# use only canada for now
cad = data.loc[data["Country/Region"] == "Canada", ["Country/Region", "Date", "Confirmed", "Deaths", "Recovered"]]
cad.columns = ["country", "date", "confirmed", "deaths", "recovered"]
# group by country and date, sum(confirmed, deaths, recovered)
cad = cad.groupby(['country','date'])['confirmed', 'deaths', 'recovered'].sum().reset_index()
# convert date string to datetime
cad.date = pd.to_datetime(cad.date)
cad = cad.sort_values(by = "date")
cad.tail()
country | date | confirmed | deaths | recovered | |
---|---|---|---|---|---|
95 | Canada | 2020-04-26 | 48033 | 2687 | 0 |
96 | Canada | 2020-04-27 | 49616 | 2841 | 0 |
97 | Canada | 2020-04-28 | 51150 | 2983 | 0 |
98 | Canada | 2020-04-29 | 52865 | 3155 | 0 |
99 | Canada | 2020-04-30 | 54457 | 3310 | 0 |
# countries with the most cases:
data.loc[data["Date"] == "2020-04-30", ["Country/Region", "Confirmed"]].sort_values(by = "Confirmed",
ascending = False)[:7]
Country/Region | Confirmed | |
---|---|---|
26361 | US | 1069424 |
26337 | Spain | 213435 |
26273 | Italy | 205463 |
26359 | United Kingdom | 171253 |
26252 | France | 165764 |
26256 | Germany | 163009 |
26349 | Turkey | 120204 |
# function to make the time series of confirmed and daily confirmed cases for a specific country
def create_country (country, end_date, state = False) :
if state :
df = data.loc[data["Province/State"] == country, ["Province/State", "Date", "Confirmed", "Deaths", "Recovered"]]
else :
df = data.loc[data["Country/Region"] == country, ["Country/Region", "Date", "Confirmed", "Deaths", "Recovered"]]
df.columns = ["country", "date", "confirmed", "deaths", "recovered"]
# group by country and date, sum(confirmed, deaths, recovered). do this because countries have multiple cities
df = df.groupby(['country','date'])['confirmed', 'deaths', 'recovered'].sum().reset_index()
# convert date string to datetime
df.date = pd.to_datetime(df.date)
df = df.sort_values(by = "date")
df = df[df.date <= end_date]
df.tail()
# make new confirmed cases every day:
cases_shifted = np.array([0] + list(df.confirmed[:-1]))
daily_confirmed = np.array(df.confirmed) - cases_shifted
df["daily_confirmed"] = daily_confirmed
# moving average for daily confirmed cases
df["moving_avg"] = df.daily_confirmed.rolling(window=4).mean()
fig, ax = plt.subplots(1,2, figsize=(15, 6))
# plot daily confirmed cases, along with moving average
#plt.figure(figsize=(11, 5))
sns.lineplot(x = df.date,
y = df.daily_confirmed,
#label = "Raw Data",
ax = ax[1])
# sns.lineplot(x = df.date,
# y = df.moving_avg,
# label = "Moving Average",
# legend = "full",
# ax = ax[0]).set_title("Daily New Confirmed COVID-19 Cases in %s" % country)
ax[1].set(ylabel='Daily Confirmed Cases',
xlabel='Date',
title = "Daily New Confirmed COVID-19 Cases in %s" % country)
sns.lineplot(x="date",
y="confirmed",
data= df,
ax = ax[0]
).set_title("Total Confirmed COVID-19 Cases in %s" % country)
ax[0].set(ylabel='Daily Confirmed Cases', xlabel='Date');
return df
def summary(samples):
site_stats = {}
for k, v in samples.items():
site_stats[k] = {
"mean": torch.mean(v, 0),
"std": torch.std(v, 0),
"5%": v.kthvalue(int(len(v) * 0.05), dim=0)[0],
"95%": v.kthvalue(int(len(v) * 0.95), dim=0)[0],
}
return site_stats
us = create_country("US", end_date = "2020-04-30")
us_start = "2020-02-26" # 51 confirmed cases
us = us[us.date >= us_start].reset_index(drop = True)
us["days_since_start"] = np.arange(us.shape[0]) + 1
cad = create_country("Canada", end_date = "2020-04-30")
cad_start = "2020-02-27" # 13 confirmed cases
cad = cad[cad.date >= cad_start].reset_index(drop = True)
cad["days_since_start"] = np.arange(cad.shape[0]) + 1
cad.shape
cad_tmp = cad[cad.date < "2020-03-28"]
cad_tmp.shape
ontario = create_country("Ontario", end_date = "2020-04-15", state = True) #data[data["Province/State"] == "Ontario"]
ontario[ontario.date > "2020-04-01"]
italy = create_country("Italy", end_date = "2020-04-30")
italy_start = "2020-02-21"
italy = italy[italy.date >= italy_start].reset_index(drop = True)
italy["days_since_start"] = np.arange(italy.shape[0]) + 1
# no daily confirmed cases on March 12th, probably some sort of admin error because there are 5100 cases
# on march 13th.
# take 2500 cases from march 13th and put them in march 12th. This will be "wrong", however hopefully any error w
num_cases = 2500
italy.loc[italy.date == "2020-03-12", "daily_confirmed"] = num_cases # add cases to march12th
italy.loc[italy.date == "2020-03-13", "daily_confirmed"] -= num_cases # subtract the cases from march13th
spain = create_country("Spain", end_date = "2020-04-30")
spain_start = "2020-02-25"
spain = spain[spain.date >= spain_start].reset_index(drop = True)
spain["days_since_start"] = np.arange(spain.shape[0]) + 1
# no cases on march 12th but 2900 cases on march 13th
num_cases = 1450
spain.loc[spain.date == "2020-03-12", "daily_confirmed"] = num_cases # add cases to march12th
spain.loc[spain.date == "2020-03-13", "daily_confirmed"] -= num_cases # subtract the cases from march13th
# spain2 = spain2[spain2.date >= spain_start].reset_index(drop = True)
# spain2["days_since_start"] = np.arange(spain2.shape[0]) + 1
# # no cases on march 12th but 2900 cases on march 13th
# spain2.loc[spain2.date == "2020-03-12", "daily_confirmed"] = num_cases # add cases to march12th
# spain2.loc[spain2.date == "2020-03-13", "daily_confirmed"] -= num_cases # subtract the cases from march13th
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-13-36a9eaecb3c1> in <module> 9 spain.loc[spain.date == "2020-03-13", "daily_confirmed"] -= num_cases # subtract the cases from march13th 10 ---> 11 spain2 = spain2[spain2.date >= spain_start].reset_index(drop = True) 12 spain2["days_since_start"] = np.arange(spain2.shape[0]) + 1 13 # no cases on march 12th but 2900 cases on march 13th NameError: name 'spain2' is not defined
germany = create_country("Germany", end_date = "2020-04-30")
germany_start = "2020-02-25"
germany = germany[germany.date >= germany_start].reset_index(drop = True)
germany["days_since_start"] = np.arange(germany.shape[0]) + 1
uk = create_country("United Kingdom", end_date = "2020-04-30")
#"Country/Region"
sk = create_country("South Korea", end_date = "2020-04-30")
# variable for data to easily swap it out:
country_ = "Canada (Before March 28th)"
reg_data = cad_tmp.copy()
### testing out piecewise regression to see if its reasonnable
# just choose a random point to split on. difference here is that we have a closed form solution
_date = 25
# piece 1
df = reg_data[reg_data.days_since_start < _date]
x = np.array(df["days_since_start"]).reshape(-1, 1) # predictor is the day number, t
y = np.log(np.array(df["daily_confirmed"]).reshape(-1, 1)) # response is the number of confirmed cases
reg = LinearRegression()
_ = reg.fit(x, y)
print("Regression 1 weight: ", round(reg.coef_[0][0], 2))
print("Regression 1 bias: ", round(reg.intercept_[0], 2))
# piece 2
df2 = reg_data[reg_data.days_since_start >= _date]
x2 = np.array(df2["days_since_start"]).reshape(-1, 1) # predictor is the day number, t
y2 = np.log(np.array(df2["daily_confirmed"]).reshape(-1, 1)) # response is the number of confirmed cases
reg2 = LinearRegression()
_ = reg2.fit(x2, y2)
print("Regression 2 weight: ", round(reg2.coef_[0][0], 2))
print("Regression 2 bias: ", round(reg2.intercept_[0], 2))
# plot data:
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
# log regression model
ax[0].scatter(x, y, s = 15);
ax[0].scatter(x2, y2, s = 15, color = "red");
ax[0].plot(x, x*reg.coef_[0][0] + reg.intercept_)
ax[0].plot(x2, x2*reg2.coef_[0][0] + reg2.intercept_, color = "red")
ax[0].set(title = "Daily Cases: Piecewise Regression Model - Log Data")
ax[1].scatter(x, np.exp(y), s = 15)
ax[1].scatter(x2, np.exp(y2), s = 15, color = "red")
ax[1].plot(x, np.exp(x*reg.coef_[0][0] + reg.intercept_))
ax[1].plot(x2, np.exp(x2*reg2.coef_[0][0] + reg2.intercept_), color = "red");
ax[1].set(title = "Daily Cases: Piecewise Regression Model - Original Data");
import torch
import pyro
import pyro.distributions as dist
from torch import nn
from pyro.nn import PyroModule, PyroSample
from pyro.infer import MCMC, NUTS, HMC
from pyro.infer.autoguide import AutoGuide, AutoDiagonalNormal
from pyro.infer import SVI, Trace_ELBO
from pyro.infer import Predictive
# we should be able to have an empirical estimate for the mean of the prior for the 2nd regression bias term
# this will be something like b = log(max(daily_confirmed))
# might be able to have 1 regression model but change the data so that we have new terms for (tau < t)
# like an interaction term
class COVID_change(PyroModule):
def __init__(self, in_features, out_features, b1_mu, b2_mu):
super().__init__()
self.linear1 = PyroModule[nn.Linear](in_features, out_features, bias = False)
self.linear1.weight = PyroSample(dist.Normal(0.5, 0.25).expand([1, 1]).to_event(1))
self.linear1.bias = PyroSample(dist.Normal(b1_mu, 1.))
# could possibly have stronger priors for the 2nd regression line, because we wont have as much data
self.linear2 = PyroModule[nn.Linear](in_features, out_features, bias = False)
self.linear2.weight = PyroSample(dist.Normal(0., 0.25).expand([1, 1])) #.to_event(1))
self.linear2.bias = PyroSample(dist.Normal(b2_mu, b2_mu/4))
def forward(self, x, y=None):
tau = pyro.sample("tau", dist.Beta(4, 3))
sigma = pyro.sample("sigma", dist.Uniform(0., 3.))
# fit lm's to data based on tau
sep = int(np.ceil(tau.detach().numpy() * len(x)))
mean1 = self.linear1(x[:sep]).squeeze(-1)
mean2 = self.linear2(x[sep:]).squeeze(-1)
mean = torch.cat((mean1, mean2))
obs = pyro.sample("obs", dist.Normal(mean, sigma), obs=y)
return mean
tensor_data = torch.tensor(reg_data[["daily_confirmed", "days_since_start"]].values, dtype=torch.float)
x_data = tensor_data[:, 1].unsqueeze_(1)
y_data = np.log(tensor_data[:, 0])
# prior hyper params
# take log of the average of the 1st quartile to get the prior mean for the bias of the 2nd regression line
q1 = np.quantile(y_data, q = 0.25)
bias_1_mean = np.mean(y_data.numpy()[y_data <= q1])
print("Prior mean for Bias 1: ", bias_1_mean)
# take log of the average of the 4th quartile to get the prior mean for the bias of the 2nd regression line
q4 = np.quantile(y_data, q = 0.75)
bias_2_mean = np.mean(y_data.numpy()[y_data >= q4])
print("Prior mean for Bias 2: ", bias_2_mean)
model = COVID_change(1, 1,
b1_mu = bias_1_mean,
b2_mu = bias_2_mean)
auto_guide = AutoDiagonalNormal(model)
svi = SVI(model = model, # bayesian regression class
guide = auto_guide, # using auto guide
optim = pyro.optim.Adam({"lr": 0.1}), # optimizer
loss=Trace_ELBO()) # loss function
num_iterations = 7500
# param_store is where pyro stores param estimates
pyro.clear_param_store()
# inference loop
for j in range(num_iterations):
# calculate the loss and take a gradient step
loss = svi.step(x_data, y_data)
if j % 750 == 0:
print("[iteration %04d] loss: %.4f" % (j + 1, loss / len(y_data)))
auto_guide.requires_grad_(False)
num_samples = 2000
predictive = Predictive(model = model,
guide = auto_guide,
num_samples = num_samples,
return_sites=("linear1.weight", "linear1.bias",
"linear2.weight", "linear2.bias",
"tau", "sigma", "_RETURN", "obs"))
samples = predictive(x_data)
pred_summary = summary(samples)
mu = pred_summary["_RETURN"] # mean
y = pred_summary["obs"] # samples from likelihood: mu + sigma
predictions = pd.DataFrame({
"days_since_start": x_data[:, 0],
"mu_mean": mu["mean"], # mean of likelihood
"mu_perc_5": mu["5%"],
"mu_perc_95": mu["95%"],
"y_mean": y["mean"], # mean of likelihood + noise
"y_perc_5": y["5%"],
"y_perc_95": y["95%"],
"true_daily_confirmed": y_data,
})
predictions.head()
model = COVID_change(1, 1,
b1_mu = bias_1_mean,
b2_mu = bias_2_mean)
# need more than 400 samples/chain if we want to use a flat prior on b_2 and w_2
num_samples =800
# mcmc
nuts_kernel = NUTS(model)
mcmc = MCMC(nuts_kernel,
num_samples=num_samples,
warmup_steps = 150,
num_chains = 4)
mcmc.run(x_data, y_data)
samples = mcmc.get_samples()
# extract individual posteriors
weight_1_post = samples["linear1.weight"].detach().numpy()
weight_2_post = samples["linear2.weight"].detach().numpy()
bias_1_post = samples["linear1.bias"].detach().numpy()
bias_2_post = samples["linear2.bias"].detach().numpy()
tau_post = samples["tau"].detach().numpy()
sigma_post = samples["sigma"].detach().numpy()
# build likelihood distribution:
tau_days = list(map(int, np.ceil(tau_post * len(x_data))))
mean_ = torch.zeros(len(tau_days), len(x_data))
obs_ = torch.zeros(len(tau_days), len(x_data))
for i in range(len(tau_days)) :
mean_[i, :] = torch.cat((x_data[:tau_days[i]] * weight_1_post[i] + bias_1_post[i],
x_data[tau_days[i]:] * weight_2_post[i] + bias_2_post[i])).reshape(len(x_data))
obs_[i, :] = dist.Normal(mean_[i, :], sigma_post[i]).sample()
samples["_RETURN"] = mean_
samples["obs"] = obs_
pred_summary = summary(samples)
mu = pred_summary["_RETURN"] # mean
y = pred_summary["obs"] # samples from likelihood: mu + sigma
predictions = pd.DataFrame({
"days_since_start": x_data[:, 0],
"mu_mean": mu["mean"], # mean of likelihood
"mu_perc_5": mu["5%"],
"mu_perc_95": mu["95%"],
"y_mean": y["mean"], # mean of likelihood + noise
"y_perc_5": y["5%"],
"y_perc_95": y["95%"],
"true_daily_confirmed": y_data,
})
predictions.head()
w1_ = pred_summary["linear1.weight"]
w2_ = pred_summary["linear2.weight"]
b1_ = pred_summary["linear1.bias"]
b2_ = pred_summary["linear2.bias"]
tau_ = pred_summary["tau"]
sigma_ = pred_summary["sigma"]
ind = int(np.ceil(tau_["mean"] * len(x_data)))
mcmc.summary()
diag = mcmc.diagnostics()
n_samples = np.arange(len(weight_1_post))
_half = int(len(n_samples) / 2)
fig, ax = plt.subplots(3,2, figsize=(20, 14))
ax[0,0].plot(n_samples[:_half], weight_1_post[:_half, 0])
ax[0,0].plot(n_samples[:_half], weight_1_post[_half:, 0]);
ax[0,0].set(title="Weight 1, R_hat: %f" % diag["linear1.weight"]['r_hat'].numpy())
#ax[0,1].plot(n_samples, weight_2_post[:, 0]);
ax[0,1].plot(n_samples[:_half], weight_2_post[:_half, 0])
ax[0,1].plot(n_samples[:_half], weight_2_post[_half:, 0]);
ax[0,1].set(title="Weight 2, R_hat: %f" % diag["linear2.weight"]['r_hat'].numpy())
ax[1,0].plot(n_samples[:_half], bias_1_post[:_half]);
ax[1,0].plot(n_samples[:_half], bias_1_post[_half:]);
ax[1,0].set(title="Bias 1, R_hat: %f" % diag["linear1.bias"]['r_hat'].numpy())
ax[1,1].plot(n_samples[:_half], bias_2_post[:_half]);
ax[1,1].plot(n_samples[:_half], bias_2_post[_half:]);
ax[1,1].set(title="Bias 2, R_hat: %f" % diag["linear2.bias"]['r_hat'].numpy())
ax[2,0].plot(n_samples[:_half], tau_post[:_half]);
ax[2,0].plot(n_samples[:_half], tau_post[_half:]);
ax[2,0].set(title="Tau, R_hat: %f" % diag["tau"]['r_hat'].numpy())
ax[2,1].plot(n_samples[:_half], sigma_post[:_half]);
ax[2,1].plot(n_samples[:_half], sigma_post[_half:]);
ax[2,1].set(title="Sigma, R_hat: %f" % diag["sigma"]['r_hat'].numpy());
print(ind)
print(reg_data.date[ind])
fig, ax = plt.subplots(2,2, figsize=(18, 7))
plt.suptitle("Posterior Distributions for %s" % country_, fontsize=15)
sns.distplot(weight_1_post,
kde_kws = {"label": "Weight 1"},
color = "green",
norm_hist = True,
kde = True,
ax = ax[0, 0])
#ax[0, 0].axvline(x = w1_["mean"], linestyle = '--')
sns.distplot(weight_2_post,
kde_kws = {"label": "Weight 2"},
color = "teal",
norm_hist = True,
kde = True,
ax = ax[0, 0])
#ax[0, 0].axvline(x = w2_["mean"], linestyle = '--')
sns.distplot(bias_1_post,
kde_kws = {"label": "Bias 1"},
color = "red",
norm_hist = True,
kde = True,
ax = ax[0, 1]);
#ax[0, 1].axvline(x = b1_["mean"], linestyle = '--')
sns.distplot(bias_2_post,
kde_kws = {"label": "Bias 2"},
color = "orange",
norm_hist = True,
kde = True,
ax = ax[0, 1])
#ax[0, 1].axvline(x = b2_["mean"], linestyle = '--');
sns.distplot(tau_post,
kde_kws = {"label": "Tau"},
color = "blue",
norm_hist = True,
kde = True,
ax = ax[1, 0])
# ax[1, 0].axvline(x = tau_["mean"], linestyle = '--', color = "red")
# ax[1, 0].axvline(x = np.median(tau_post), linestyle = '--', color = "blue")
sns.distplot(sigma_post,
kde_kws = {"label": "Sigma"},
color = "orange",
norm_hist = True,
kde = True,
ax = ax[1,1]);
#ax[1, 1].axvline(x = sigma_["mean"], linestyle = '--');
start_date_ = str(reg_data.date[0]).split(' ')[0]
change_date_ = str(reg_data.date[ind]).split(' ')[0]
print("Date of change for {}: {}".format(country_, change_date_))
# plot data:
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
# log regression model
ax[0].scatter(y = y_data[:ind], x = x_data[:ind], s = 15);
ax[0].scatter(y = y_data[ind:], x = x_data[ind:], s = 15, color = "red");
ax[0].plot(predictions["days_since_start"],
predictions["y_mean"],
color = "green",
label = "Mean")
ax[0].axvline(ind,
linestyle = '--', linewidth = 1,
label = "Day of Change",
color = "black")
ax[0].fill_between(predictions["days_since_start"],
predictions["y_perc_5"],
predictions["y_perc_95"],
alpha = 0.25,
label = "90% Credible Interval",
color = "teal");
ax[0].fill_betweenx([0, 1],
tau_["5%"] * len(x_data),
tau_["95%"] * len(x_data),
alpha = 0.25,
label = "90% Credible Interval \n for tau",
color = "lightcoral",
transform=ax[0].get_xaxis_transform());
ax[0].set(ylabel = "log (Daily New Cases)",
xlabel = "Days since %s" % start_date_,
title = "Log Daily Confirmed Cases: %s" % country_)
ax[0].legend(loc = "upper left")
# plot daily confirmed cases, along with moving average
sns.lineplot(x = "date",
y = "daily_confirmed",
data = reg_data,
label = "New Cases",
ax = ax[1])
sns.lineplot(x = "date",
y = "moving_avg",
data = reg_data,
label = "Moving Average",
ax = ax[1]).set_title("Daily New Cases in %s" % country_);
ax[1].fill_betweenx([0, 1],
reg_data.date[int(tau_["5%"] * len(x_data))],
reg_data.date[int(tau_["95%"] * len(x_data))],
alpha = 0.25,
label = "90% Credible Interval \n for tau",
color = "lightcoral",
transform=ax[1].get_xaxis_transform())
ax[1].axvline(reg_data.date[ind],
linestyle = '--', linewidth = 1,
label = "Day of Change",
color = "black");
plt.xticks(reg_data.date)
ax[1].xaxis.set_major_locator(plt.MaxNLocator(6)) # reduce number of xticks
ax[1].legend();
# residual and qq plot
resid = predictions["true_daily_confirmed"] - predictions["y_mean"]
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
ax[0].scatter(predictions["days_since_start"],
resid,
s = 15)
ax[0].axvline(ind, linestyle = '--',
linewidth = 1, label = "Est. Date of Change");
ax[0].axhline(0, linewidth = 1,
color = "red");
ax[0].legend();
ax[0].set(title="Residual Plot - %s" % country_,
xlabel = "Days since %s" % start_date_)
sm.qqplot(resid, line='45', ax = ax[1]);
ax[1].set(title = "QQ Plot of Residuals - %s" % country_);
fig, ax = plt.subplots(1,2, figsize=(15, 6))
#plt.figure(figsize=(11, 5))
sns.lineplot(x="date",
y="confirmed",
data= reg_data,
ax = ax[0]
).set_title("Confirmed COVID-19 Cases in %s" % country_)
ax[0].axvline(reg_data.date[ind], color="red", linestyle="--")
ax[1].scatter(y = reg_data.daily_confirmed[:ind], x = x_data[:ind], s = 15);
ax[1].scatter(y = reg_data.daily_confirmed[ind:], x = x_data[ind:], s = 15, color = "red");
ax[1].plot(predictions["days_since_start"],
np.exp(predictions["y_mean"]),
color = "green",
label = "Mean")
# ax[1].fill_between(predictions["days_since_start"],
# np.exp(predictions["y_perc_5"]),
# np.exp(predictions["y_perc_95"]),
# alpha = 0.25,
# label = "90% Credible Interval",
# color = "teal");
ax[1].axvline(ind, linestyle = '--',
linewidth = 1,
label = "Day of Change")
ax[1].legend(loc = "upper left")
ax[1].set(ylabel = "Daily Confirmed Cases",
xlabel = "Days since %s" % start_date_,
title = "Daily Confirmed Cases: %s" % country_);
printmd("**Date of change for {}: {}**".format(country_, change_date_));