import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from scipy import stats
import statsmodels.api as sm
import pylab
# for fancy python printing
from IPython.display import Markdown, display
def printmd(string):
    """Render ``string`` as Markdown in the notebook output area."""
    rendered = Markdown(string)
    display(rendered)
import warnings
import matplotlib as mpl

# Suppress every warning category so notebook output stays uncluttered.
warnings.simplefilter('ignore')
# Raise the default figure resolution for all subsequent plots.
mpl.rc('figure', dpi=250)
# Load the raw COVID-19 counts. NOTE: the path is specific to the author's machine.
data = pd.read_csv("/Users/jonny/Desktop/Dataset/covid_19_counts_may2.csv")
data.Date = pd.to_datetime(data.Date)

# use only canada for now
# .copy() so the column rename below acts on an independent frame, not a slice of `data`.
cad = data.loc[
    data["Country/Region"] == "Canada",
    ["Country/Region", "Date", "Confirmed", "Deaths", "Recovered"],
].copy()
cad.columns = ["country", "date", "confirmed", "deaths", "recovered"]

# group by country and date, sum(confirmed, deaths, recovered)
# The columns are selected with a list: selecting several groupby columns with
# a bare tuple (['a', 'b'] without the extra brackets) is rejected by pandas 2.x.
cad = (
    cad.groupby(["country", "date"])[["confirmed", "deaths", "recovered"]]
    .sum()
    .reset_index()
)

# `date` is already datetime (converted on `data` above); this is a harmless safeguard.
cad.date = pd.to_datetime(cad.date)
cad = cad.sort_values(by="date")
cad.tail()
| | country | date | confirmed | deaths | recovered |
|---|---|---|---|---|---|
| 95 | Canada | 2020-04-26 | 48033 | 2687 | 0 |
| 96 | Canada | 2020-04-27 | 49616 | 2841 | 0 |
| 97 | Canada | 2020-04-28 | 51150 | 2983 | 0 |
| 98 | Canada | 2020-04-29 | 52865 | 3155 | 0 |
| 99 | Canada | 2020-04-30 | 54457 | 3310 | 0 |
# Seven rows with the highest confirmed counts on 2020-04-30.
# NOTE(review): `data` is not aggregated per country here, so each result is a
# single row of the raw table — confirm this is the intended ranking.
(
    data.loc[data["Date"] == "2020-04-30", ["Country/Region", "Confirmed"]]
    .sort_values(by="Confirmed", ascending=False)
    .head(7)
)
| | Country/Region | Confirmed |
|---|---|---|
| 26361 | US | 1069424 |
| 26337 | Spain | 213435 |
| 26273 | Italy | 205463 |
| 26359 | United Kingdom | 171253 |
| 26252 | France | 165764 |
| 26256 | Germany | 163009 |
| 26349 | Turkey | 120204 |
# function to make the time series of confirmed and daily confirmed cases for a specific country
def create_country(country, end_date, state=False):
    """Build and plot the confirmed-case time series for one region.

    Reads the module-level ``data`` frame, keeps the rows whose
    "Country/Region" (or "Province/State" when ``state`` is true) equals
    ``country``, sums the counts per date, and keeps dates up to and
    including ``end_date``. Two panels are drawn on a new figure: cumulative
    confirmed cases (left) and daily new confirmed cases (right).

    Parameters
    ----------
    country : str
        Value to match in the region column.
    end_date : str or datetime-like
        Last date (inclusive) to keep.
    state : bool, default False
        Match against "Province/State" instead of "Country/Region".

    Returns
    -------
    pandas.DataFrame
        Columns ``country``, ``date``, ``confirmed``, ``deaths``,
        ``recovered``, ``daily_confirmed`` and ``moving_avg`` (4-day rolling
        mean of ``daily_confirmed``), sorted by date.
    """
    region_col = "Province/State" if state else "Country/Region"
    count_cols = ["confirmed", "deaths", "recovered"]

    df = data.loc[
        data[region_col] == country,
        [region_col, "Date", "Confirmed", "Deaths", "Recovered"],
    ].copy()
    df.columns = ["country", "date"] + count_cols

    # A region can span several rows per date, so sum them. The columns are
    # selected with a list because pandas 2.x rejects a bare tuple here.
    df = df.groupby(["country", "date"])[count_cols].sum().reset_index()

    df["date"] = pd.to_datetime(df["date"])
    df = df.sort_values(by="date")
    # .copy() so the new columns below are written to an independent frame.
    df = df[df["date"] <= end_date].copy()

    # Daily new cases: difference of the cumulative series, treating the count
    # before the first date as 0 (so the first value equals the first total).
    df["daily_confirmed"] = np.diff(df["confirmed"].to_numpy(), prepend=0)
    # moving average for daily confirmed cases
    df["moving_avg"] = df["daily_confirmed"].rolling(window=4).mean()

    _, ax = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: cumulative confirmed cases.
    sns.lineplot(x="date", y="confirmed", data=df, ax=ax[0])
    ax[0].set(
        title="Total Confirmed COVID-19 Cases in %s" % country,
        ylabel="Total Confirmed Cases",
        xlabel="Date",
    )

    # Right panel: daily new confirmed cases.
    sns.lineplot(x="date", y="daily_confirmed", data=df, ax=ax[1])
    ax[1].set(
        title="Daily New Confirmed COVID-19 Cases in %s" % country,
        ylabel="Daily Confirmed Cases",
        xlabel="Date",
    )
    return df
def summary(samples):
    """Summarise each entry of ``samples`` along its first dimension.

    Parameters
    ----------
    samples : dict
        Maps a name to a tensor; dimension 0 is reduced over (the statistics
        below are all taken along ``dim=0``).

    Returns
    -------
    dict
        For every key, a dict with ``"mean"``, ``"std"``, ``"5%"`` and
        ``"95%"`` tensors. The percentiles are order statistics obtained with
        ``kthvalue`` (no interpolation).
    """
    # Local import: torch is not among the imports visible at the top of this file.
    import torch

    def _kth_rank(count, fraction):
        # kthvalue is 1-based; with few samples int(count * fraction) would be
        # 0 and raise, so fall back to the smallest element.
        return max(1, int(count * fraction))

    site_stats = {}
    for name, values in samples.items():
        count = len(values)
        site_stats[name] = {
            "mean": torch.mean(values, 0),
            "std": torch.std(values, 0),
            "5%": values.kthvalue(_kth_rank(count, 0.05), dim=0)[0],
            "95%": values.kthvalue(_kth_rank(count, 0.95), dim=0)[0],
        }
    return site_stats
# Build (and plot) the US time series for dates up to and including 2020-04-30.
us = create_country(country="US", end_date="2020-04-30")