In [1]:
# Stamp the notebook with author, last-updated date, Python/IPython versions and
# the versions of the listed packages, so the run environment is documented.
%load_ext watermark
%watermark -a 'cs224' -u -d -v -p numpy,xarray,scipy,pandas,sklearn,matplotlib,seaborn,pymc3
cs224 
last updated: 2020-08-03 

CPython 3.6.10
IPython 7.15.0

numpy 1.18.5
xarray 0.15.1
scipy 1.5.0
pandas 1.0.5
sklearn 0.23.1
matplotlib 3.2.2
seaborn 0.10.1
pymc3 3.9.2
In [2]:
%matplotlib inline
import numpy as np, scipy, scipy.stats as stats, scipy.special, scipy.misc, pandas as pd, matplotlib.pyplot as plt, seaborn as sns, xarray as xr
import matplotlib as mpl

import pymc3 as pm

import theano as thno
import theano.tensor as T

import sklearn, sklearn.linear_model

import datetime, time, math
from dateutil import relativedelta

from collections import OrderedDict

# Seed NumPy's global random generator so stochastic results are repeatable.
SEED = 42
np.random.seed(SEED)

# pandas display: allow up to 500 columns and 1000 characters per output line.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# pd.set_option('display.float_format', lambda x: '%.2f' % x)

# NumPy printing: more edge items, long lines, no scientific notation for small values.
np.set_printoptions(edgeitems=10, linewidth=1000, suppress=True)
# NOTE(review): this writes a private module attribute right after linewidth=1000
# was set through the public API; confirm whether it still has any effect.
np.core.arrayprint._line_width = 180

# Apply seaborn's default plot styling.
sns.set()
In [3]:
from IPython.display import display, HTML

from IPython.display import display_html
def display_side_by_side(*args):
    """Render several tables next to each other in the notebook output.

    Parameters
    ----------
    *args : pandas.DataFrame or numpy.ndarray
        Objects to show. NumPy arrays (including subclasses) are wrapped in a
        ``pandas.DataFrame`` first; everything else must provide ``to_html()``.

    Returns
    -------
    None
        The combined HTML is sent to the notebook via ``display_html``.
    """
    fragments = []
    for obj in args:
        if isinstance(obj, np.ndarray):
            obj = pd.DataFrame(obj)
        # Patch only the opening <table ...> tag. Replacing the bare word 'table'
        # would also rewrite the closing </table> tag and any column name or cell
        # value that happens to contain "table".
        fragments.append(obj.to_html().replace('<table', '<table style="display:inline"'))
    display_html(''.join(fragments), raw=True)

# CSS rule that gives elements of class "output" a row flex direction.
# NOTE(review): CSS is not referenced anywhere else in the visible part of this
# notebook; confirm it is still needed.
CSS = """
.output {
    flex-direction: row;
}
"""

def display_graphs_side_by_side(*args):
    """Show several SVG-renderable objects in one row of an HTML table.

    Parameters
    ----------
    *args
        Objects exposing a ``_repr_svg_()`` method that returns SVG markup
        (presumably graphviz graphs; confirm against the callers).

    Returns
    -------
    None
        The assembled single-row table is sent to ``display_html``.
    """
    # One <td> per object, in the order the objects were passed.
    cells = ''.join('<td>' + graph._repr_svg_() + '</td>' for graph in args)
    display_html('<table><tr>' + cells + '</tr></table>', raw=True)
    

# Inject a style rule that forces elements of class "container" to 70% width.
display(HTML("<style>.container { width:70% !important; }</style>"))
In [4]:
# Autoreload mode 1 reloads only modules registered with %aimport before each
# cell runs, so edits to covid19.py are picked up without a kernel restart.
# %aimport is also what makes `covid19` available here: the visible cells
# contain no separate `import covid19`.
%load_ext autoreload
%autoreload 1
%aimport covid19

Data Source: Bayerisches Landesamt für Gesundheit und Lebensmittelsicherheit (LGL): Übersicht der Fallzahlen von Coronavirusinfektionen

In [5]:
import requests
import urllib.request
import time
import bs4
import re
# from bs4 import BeautifulSoup
In [6]:
import decimal
import locale
In [7]:
import dateparser
In [8]:
# LGL overview page that contains the Bavarian case-count table.
url = 'https://www.lgl.bayern.de/gesundheit/infektionsschutz/infektionskrankheiten_a_z/coronavirus/karte_coronavirus/index.htm'
# requests has no default timeout, so an unresponsive server would block the
# notebook indefinitely; bound the wait explicitly.
response = requests.get(url, timeout=30)
# Fail here on 4xx/5xx instead of parsing an error page further down.
response.raise_for_status()
In [9]:
response
Out[9]:
<Response [200]>
In [10]:
# Parse the downloaded page text with the "html.parser" backend.
soup = bs4.BeautifulSoup(response.text, "html.parser")
In [11]:
# soup.find_all(class_="accordion")
In [12]:
# Look up the element with id "tableFaelle"; soup.find returns None if the page has no such element.
tableFaelle = soup.find(id="tableFaelle")
In [13]:
# tableFaelle
In [14]:
# dateparser.parse('01. Mrz 2020', date_formats=['%d. %b %Y'], languages=['de'])
In [15]:
# dateparser.parse('01. Mar 2020', date_formats=['%d. %b %Y'], languages=['de'])
In [16]:
# dateparser.parse('01. März 2020', languages=['de'])
In [17]:
# Parse the rows of the scraped table into a frame indexed by report date with
# the columns 'confirmed' (running total) and 'new_confirmed' (per report).

# locale.atof below parses the counts using German number conventions.
# Note that this changes the locale for the whole process.
locale.setlocale(locale.LC_ALL, 'de_DE.UTF8')

if tableFaelle is None:
    # Without this guard a changed page layout surfaces as an opaque AttributeError.
    raise ValueError('element with id "tableFaelle" was not found in the downloaded page')

# The year is appended to each date cell before parsing; every row is assumed
# to belong to this year. Revisit once the table spans more than one year.
report_year = '2020'

records = []
for tr in tableFaelle.find_all('tr')[1:]:  # the first row is skipped as the header
    td = tr.find_all('td')
    # 'Mrz' is spelled out as 'März' before parsing (presumably because
    # dateparser does not recognise the abbreviation; confirm).
    dtstr = re.sub('Mrz', 'März', td[0].text + report_year)
    dt = pd.to_datetime(dateparser.parse(dtstr, languages=['de']))
    ct = locale.atof(td[3].text, decimal.Decimal)
    records.append((dt, ct))

# Build the frame once from the collected rows; appending row by row with
# df.loc[i] = ... re-allocates the frame on every iteration.
df = pd.DataFrame(records, columns=['report_date', 'new_confirmed'])
df = df.set_index('report_date')
df['confirmed'] = df.new_confirmed.cumsum()
df = df[['confirmed', 'new_confirmed']]
# Built-in float replaces the deprecated np.float alias (removed in NumPy 1.24).
df = df.astype(float)
# df = df[df.index <= pd.to_datetime(datetime.datetime.now().date() - datetime.timedelta(days=2))]
In [18]:
# Local calendar date of this run, formatted as YYYY-MM-DD.
today_string = datetime.date.today().isoformat()
today_string
Out[18]:
'2020-08-03'
In [19]:
# Keep only rows whose report date is at least two days before today. The copy
# makes bavaria_df independent of df, so columns added later do not touch df.
bavaria_cutoff_date = pd.to_datetime(datetime.date.today() - datetime.timedelta(days=2))
bavaria_df = df.loc[df.index <= bavaria_cutoff_date].copy()
In [20]:
# Write the complete scraped table (df, not the date-truncated bavaria_df) to an
# Excel file in the current working directory, overwriting any previous export.
df.to_excel("bavaria_covid19_data.xlsx") 
In [21]:
# df.to_excel(today_string +"_bavaria_covid19_data.xlsx") 
In [22]:
# Show the last rows of the full table next to those of the date-truncated copy.
display_side_by_side(df.tail(), bavaria_df.tail())
confirmed new_confirmed
report_date
2020-07-27 50632.0 76.0
2020-07-28 50732.0 100.0
2020-07-29 50845.0 113.0
2020-07-30 50944.0 99.0
2020-07-31 51063.0 119.0
confirmed new_confirmed
report_date
2020-07-27 50632.0 76.0
2020-07-28 50732.0 100.0
2020-07-29 50845.0 113.0
2020-07-30 50944.0 99.0
2020-07-31 51063.0 119.0
In [23]:
# The scraped table only yields confirmed counts, so the remaining columns are
# filled with zeros (presumably covid19.CasesByRegion expects the same column
# set as the Austrian frame; confirm against covid19.py).
for placeholder_column in ('recovered', 'death', 'new_recovered', 'new_death'):
    bavaria_df[placeholder_column] = 0

cbr_bavaria = covid19.CasesByRegion('Bavaria', df=bavaria_df)

Bavaria first dead person date: 2020-03-12

In [24]:
# Wide single-axes figure for the Bavarian daily statistics (days=40 is passed through).
fig, ax = plt.subplots(figsize=(32, 8), dpi=80, facecolor='w', edgecolor='k')
cbr_bavaria.plot_daily_stats(ax=ax, days=40)
Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f85a1deeda0>
In [25]:
# Population figures (inhabitants) for Germany and Bavaria.
einwohner_deutschland = 83_019_213.0
einwohner_bayern      = 13_076_721.0
# Despite the name this is a fraction between 0 and 1, not a percentage.
prozent_bayern        = einwohner_bayern / einwohner_deutschland
prozent_bayern
Out[25]:
0.15751439368619405
In [26]:
# Austria's population and its size relative to Germany's population
# (a fraction, not a percentage).
einwohner_österreich = 8_793_370.0
prozent_österreich   = einwohner_österreich / einwohner_deutschland
prozent_österreich
Out[26]:
0.10591969837150829
In [27]:
# Scale a reference value of 100 by Bavaria's population share (presumably 100
# new cases per day for all of Germany; confirm), floor it and add one.
bavaria_scaled_reference = 100.0 * prozent_bayern
bavaria_new_confirmed_threshold = (bavaria_scaled_reference // 1) + 1
bavaria_new_confirmed_threshold
Out[27]:
16.0
prediction date predicted day for reaching threshold predicted max growth rate steady state rate
2020-04-02 2020-04-20 23948 0.059 -
2020-04-03 2020-04-21 24967 0.062 -
2020-04-04 2020-04-24 28251 0.063 -
2020-04-05 2020-04-27 30820 0.060 -
2020-04-07 2020-04-29 33170 0.049 -
2020-04-08 2020-04-29 33148 0.042 -
2020-04-09 2020-04-29 33534 0.036 -
2020-04-10 - - 0.046 1211
2020-04-11 - - 0.043 1195
2020-04-12 - - 0.039 1129
2020-04-14 - - 0.030 817
2020-04-15 2020-05-05 37238 0.017 -
2020-04-16 2020-05-05 37391 0.014 -
2020-04-19 - - 0.017 500
2020-04-21 - - 0.015 492
2020-04-22 - - 0.015 499
2020-04-23 - - 0.014 485
2020-04-24 - - 0.014 481
2020-04-26 2020-05-23 42926 0.009 -
2020-04-27 2020-05-25 43216 0.008 -
2020-04-28 2020-05-26 43529 0.008 -
2020-04-29 - - 0.012 405
2020-04-30 - - 0.012 364
2020-05-02 - - 0.011 355
2020-05-03 - - 0.010 347
2020-05-04 - - 0.010 339
2020-05-05 - - 0.009 322
2020-05-07 - - 0.009 285
2020-05-08 - - 0.008 274
2020-05-09 - - 0.008 265
2020-05-11 - - 0.007 262
2020-05-12 - - 0.007 256
2020-05-13 - - 0.007 248
2020-05-15 - - 0.004 157
2020-05-17 - - 0.004 162
2020-05-19 - - 0.004 161
2020-05-20 - - 0.003 152
2020-05-23 - - 0.003 128
2020-05-29 - - 0.003 119
2020-06-07 - - 0.003 99
2020-06-14 - - 0.002 87
2020-06-24 - - 0.002 73
In [28]:
# Fit the Bavarian series starting at 2020-03-09 using the threshold computed above.
bavaria_fit_start = pd.to_datetime('2020-03-09')
cbr_bavaria.fit(first_date=bavaria_fit_start, new_confirmed_threshold=bavaria_new_confirmed_threshold)

# Plot the data with the fits; 2020-03-22 is passed as the restriction start date.
fig, ax = plt.subplots(figsize=(32, 8), dpi=80, facecolor='w', edgecolor='k')
bavaria_restriction_start = datetime.datetime(2020, 3, 22)
cbr_bavaria.plot_with_fits(ax=ax, restriction_start_date=bavaria_restriction_start)
 /home/local/cs/workspaces/covid-19-data-analysis/covid19.py:1526: UserWarning:Exception in curve_fit: Optimal parameters not found: Number of calls to function has reached maxfev = 600./ None
sigmoid+asymmetric+linear: seor: 165.40695572785646; growth-rate: 0.001, date:2020-09-08 00:00:00, projected value: 73.48845146958611
In [29]:
# Compute the R estimates for Bavaria, then display them rounded to three decimals.
cbr_bavaria.calculate_R_estimates()
cbr_bavaria.R().round(3)
 /home/local/cs/local/install/anaconda3-2020.02-Linux-x86_64/envs/py36ds/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:218: ValueWarning:A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.
Out[29]:
fit_R gp_R kf_R ll_R mean_R
report_date
2020-07-29 1.0 1.294 1.267 1.281 1.192
In [30]:
# Wide single-axes figure for the Bavarian R estimates.
fig, ax = plt.subplots(figsize=(32, 8), dpi=80, facecolor='w', edgecolor='k')
cbr_bavaria.plot_R(ax=ax) # , plot_start_date='2020-03-10'
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f859bc10c88>
In [31]:
# Load the Austrian case data and wrap it in the same helper class used for Bavaria.
# NOTE: this rebinds `df`, which until here held the scraped Bavarian table.
df = covid19.get_austria_df()
cbr_austria = covid19.CasesByRegion('Austria', df=df)
cbr_austria.tail()
Out[31]:
confirmed recovered death new_confirmed new_recovered new_death
index
2020-07-29 20832 18528 698 180 149 10
2020-07-30 20935 18628 698 103 100 0
2020-07-31 21110 18758 698 175 130 0
2020-08-01 21192 18911 698 82 153 0
2020-08-02 21284 18984 698 92 73 0
In [32]:
# Scale a reference value of 100 by Austria's population relative to Germany
# (presumably 100 new cases per day for all of Germany; confirm), floor it and add one.
austria_scaled_reference = 100.0 * prozent_österreich
austria_new_confirmed_threshold = (austria_scaled_reference // 1) + 1
austria_new_confirmed_threshold
Out[32]:
11.0
prediction date predicted day for reaching 100 threshold predicted max growth rate steady state rate
2020-04-02 2020-04-23 13860 0.042 -
2020-04-03 2020-04-23 13864 0.036 -
2020-04-04 2020-04-23 14082 0.030 -
2020-04-05 2020-04-22 13978 0.025 -
2020-04-07 2020-04-22 14033 0.017 -
2020-04-08 2020-04-23 14121 0.014 -
2020-04-09 2020-04-23 14229 0.012 -
2020-04-10 2020-04-23 14350 0.010 -
2020-04-11 2020-04-24 14488 0.009 -
2020-04-12 - - 0.012 106
2020-04-14 - - 0.011 126
2020-04-15 - - 0.010 114
2020-04-16 - - 0.009 108
2020-04-19 - - 0.007 95
2020-04-21 - - 0.006 83
2020-04-22 - - 0.006 77
2020-04-23 - - 0.006 73
2020-04-24 - - 0.005 69
2020-04-26 2020-05-01 15208 0.002 -
2020-04-27 2020-05-01 15096 0.002 -
2020-04-28 2020-05-01 15129 0.002 -
2020-04-29 - - 0.005 84
2020-04-30 - - 0.005 81
2020-05-02 - - 0.005 76
2020-05-03 - - 0.004 75
2020-05-04 - - 0.004 73
2020-05-05 - - 0.004 70
2020-05-07 - - 0.004 63
2020-05-08 - - 0.004 60
2020-05-09 - - 0.003 58
2020-05-11 - - 0.003 58
2020-05-12 - - 0.003 57
2020-05-13 - - 0.003 55
2020-05-15 - - 0.003 55
2020-05-17 - - 0.003 55
2020-05-19 - - 0.003 56
2020-05-20 - - 0.003 55
2020-05-23 - - 0.003 55
2020-05-29 - - 0.003 51
2020-06-07 - - 0.003 46
2020-06-14 - - 0.002 44
2020-06-24 - - 0.002 42
In [33]:
# Fit the Austrian series starting at 2020-03-09 using the threshold computed above.
austria_fit_start = pd.to_datetime('2020-03-09')
cbr_austria.fit(first_date=austria_fit_start, new_confirmed_threshold=austria_new_confirmed_threshold)

# Plot the data with the fits; 2020-03-15 is passed as the restriction start date.
fig, ax = plt.subplots(figsize=(32, 8), dpi=80, facecolor='w', edgecolor='k')
austria_restriction_start = datetime.datetime(2020, 3, 15)
cbr_austria.plot_with_fits(ax=ax, restriction_start_date=austria_restriction_start)
 /home/local/cs/workspaces/covid-19-data-analysis/covid19.py:1526: UserWarning:Exception in curve_fit: Optimal parameters not found: Number of calls to function has reached maxfev = 600./ None
sigmoid+asymmetric+linear: seor: 60.83140122644205; growth-rate: 0.003, date:2020-09-10 00:00:00, projected value: 63.10625191231651