In [ ]:
from catboost import CatBoostRegressor
from datetime import datetime, timedelta
from fbprophet import Prophet
from fbprophet.diagnostics import cross_validation, performance_metrics
from fbprophet.plot import plot_cross_validation_metric
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from matplotlib import pyplot
from matplotlib.pylab import rcParams
from pandas import DataFrame
from pmdarima.arima import auto_arima, ARIMA
from pylab import rcParams
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, explained_variance_score, mean_squared_error, mean_absolute_error, r2_score, mean_squared_log_error
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import PolynomialFeatures
from statistics import mean
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from time import sleep
from tqdm import tqdm, tqdm_notebook
from tqdm.notebook import trange, tqdm, tnrange
from xgboost import plot_importance, plot_tree, XGBClassifier
from xgboost.sklearn import XGBClassifier
from IPython.display import clear_output
import datetime as dt
import getpass
import graphviz
import hyperopt
import itertools
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import pmdarima as pm
import pycountry
import pycountry_convert as pc
import pydotplus
import seaborn as sns
import sklearn.linear_model as lm
import statsmodels.api as sm
import warnings
import xgboost as xgb
from sklearn import preprocessing
# Notebook-wide display and plotting defaults.
pd.options.display.float_format = '{:.2f}'.format   # render all floats with 2 decimals
pd.plotting.register_matplotlib_converters()  # let matplotlib plot pandas datetimes directly
rcParams['figure.figsize'] = 15, 5  # default figure size for all plots
warnings.filterwarnings('ignore')  # NOTE(review): hides ALL warnings, incl. deprecations — consider narrowing

Data load

In [ ]:
# Load the JHU CSSE global confirmed-cases time series and reshape it
# from wide (one column per date) to long (one row per location/date).
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
df = (
    pd.read_csv(url)
      .melt(id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'],
            var_name='Date', value_name='Confirmed')
)
# Date columns arrive as strings like '1/22/20'; parse to datetime64.
df['Date'] = pd.to_datetime(df['Date'])
df.head(5)
In [ ]:
# Most recent date present in the loaded series.
df.Date.max()
In [4]:
# Schema overview: column dtypes and non-null counts.
df.info(verbose=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66234 entries, 0 to 66233
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Province/State  20169 non-null  object        
 1   Country/Region  66234 non-null  object        
 2   Lat             66234 non-null  float64       
 3   Long            66234 non-null  float64       
 4   Date            66234 non-null  datetime64[ns]
 5   Confirmed       66234 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(1), object(2)
memory usage: 3.0+ MB
In [5]:
# Shorten the slash-containing column names used throughout the notebook.
df = df.rename({'Province/State': 'Province', 'Country/Region': 'Country'}, axis='columns')
df.shape
Out[5]:
(66234, 6)
In [6]:
# Synthetic 'Worldwide' rows: total confirmed across all locations per
# date, appended so 'Worldwide' can be selected like any other country.
# as_index=False keeps Date as a column, replacing the original
# index-copy/reset dance.
df_worldwide = df.groupby('Date', as_index=False)['Confirmed'].sum()
df_worldwide['Country'] = 'Worldwide'
# DataFrame.append is deprecated (removed in pandas 2.0); pd.concat is the
# supported replacement. ignore_index avoids the duplicate index labels
# the old append produced (the index is reset again later anyway).
df = pd.concat([df, df_worldwide], ignore_index=True)
In [7]:
# Sanity check: worldwide total on 2020-09-25 from the appended rows.
df.loc[(df.Country == 'Worldwide') & (df.Date == '2020-09-25'), 'Confirmed'].sum()
Out[7]:
32476713
In [8]:
# Country metadata lookup (ISO codes, population); the table has one row
# per province for some countries, so keep only the first row per country.
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv'
df_country = pd.read_csv(url).drop_duplicates(subset=['Country_Region'], keep='first')
df_country
Out[8]:
UID iso2 iso3 code3 FIPS Admin2 Province_State Country_Region Lat Long_ Combined_Key Population
0 4 AF AFG 4.00 nan NaN NaN Afghanistan 33.94 67.71 Afghanistan 38928341.00
1 8 AL ALB 8.00 nan NaN NaN Albania 41.15 20.17 Albania 2877800.00
2 12 DZ DZA 12.00 nan NaN NaN Algeria 28.03 1.66 Algeria 43851043.00
3 20 AD AND 20.00 nan NaN NaN Andorra 42.51 1.52 Andorra 77265.00
4 24 AO AGO 24.00 nan NaN NaN Angola -11.20 17.87 Angola 32866268.00
... ... ... ... ... ... ... ... ... ... ... ... ...
689 716 ZW ZWE 716.00 nan NaN NaN Zimbabwe -19.02 29.15 Zimbabwe 14862927.00
690 36 AU AUS 36.00 nan NaN NaN Australia -25.00 133.00 Australia 25459700.00
699 124 CA CAN 124.00 nan NaN NaN Canada 60.00 -95.00 Canada 37855702.00
715 156 CN CHN 156.00 nan NaN NaN China 30.59 114.31 China 1404676330.00
749 840 US USA 840.00 nan NaN NaN US 40.00 -100.00 US 329466283.00

188 rows × 12 columns

In [9]:
# Attach each country's ISO-3166 alpha-2 code via a left join on the
# country name (rows without a match, e.g. 'Worldwide', get NaN).
df = df.merge(df_country[['Country_Region', 'iso2']],
              left_on='Country', right_on='Country_Region', how='left')
df
Out[9]:
Province Country Lat Long Date Confirmed Country_Region iso2
0 NaN Afghanistan 33.94 67.71 2020-01-22 0 Afghanistan AF
1 NaN Albania 41.15 20.17 2020-01-22 0 Albania AL
2 NaN Algeria 28.03 1.66 2020-01-22 0 Algeria DZ
3 NaN Andorra 42.51 1.52 2020-01-22 0 Andorra AD
4 NaN Angola -11.20 17.87 2020-01-22 0 Angola AO
... ... ... ... ... ... ... ... ...
66478 NaN Worldwide nan nan 2020-09-22 31516787 NaN NaN
66479 NaN Worldwide nan nan 2020-09-23 31779835 NaN NaN
66480 NaN Worldwide nan nan 2020-09-24 32141225 NaN NaN
66481 NaN Worldwide nan nan 2020-09-25 32476713 NaN NaN
66482 NaN Worldwide nan nan 2020-09-26 32751412 NaN NaN

66483 rows × 8 columns

In [10]:
def alpha2_to_continent(iso):
    """Map an ISO-3166 alpha-2 country code to a continent code.

    Parameters
    ----------
    iso : str or float
        Two-letter country code; may be NaN for rows without a match.

    Returns
    -------
    str or float
        pycountry_convert continent code ('AF', 'AS', 'EU', 'NA', 'OC',
        'SA'), or NaN when the code is missing or unrecognised.
    """
    try:
        return pc.country_alpha2_to_continent_code(iso)
    except (KeyError, TypeError):
        # Narrowed from a bare `except:`: pycountry_convert raises
        # KeyError for unknown codes; TypeError covers NaN/non-string
        # input. A bare except would also swallow KeyboardInterrupt etc.
        return float('NaN')

df['Continent'] = df['iso2'].apply(alpha2_to_continent) # get continent code

# Manual fixes for rows the iso2 -> continent lookup cannot resolve
# (cruise ships, names missing from the lookup table) plus the synthetic
# 'Worldwide' aggregate.
df.loc[df['Country'] == "Diamond Princess", 'Continent'] = "Diamond Princess"
df.loc[df['Country'] == "MS Zaandam", 'Continent'] = "MS Zaandam"
df.loc[df['Country'] == "Netherlands", 'Continent'] = "EU"
# NOTE(review): Holy See (Vatican City) is in Europe, not Asia — "AS"
# looks like a typo for "EU"; confirm before relying on continent totals.
df.loc[df['Country'] == "Holy See", 'Continent'] = "AS"
df.loc[df['Country'] == "Namibia", 'Continent'] = "AF"
df.loc[df['Country'] == "Timor-Leste", 'Continent'] = "AS"
df.loc[df['Country'] == "Western Sahara", 'Continent'] = "AF"
df.loc[df['Country'] == "Worldwide", 'Continent'] = "WL"


# Translate the two-letter codes into display names; both ship rows
# collapse into a single 'Ships' category. Unmapped values become NaN.
df['Continent'] = df['Continent'].map({'WL':'Worldwide','MS Zaandam':'Ships','Diamond Princess':'Ships','AF':'Africa','AS':'Asia','EU':'Europe','NA':'North America','OC':'Oceania','SA':'South America'})
In [11]:
# Continent-level pseudo-countries: sum confirmed per (Date, Continent)
# and append them so continents can be charted like countries.
# (The upfront df.copy() was dead weight — groupby never mutates its
# input. Rows with NaN Continent are dropped by groupby, as before.)
df_missing_continents = df.groupby(['Date', 'Continent'], as_index=False)['Confirmed'].sum()
df_missing_continents['Country'] = df_missing_continents['Continent']
df_missing_continents = df_missing_continents.drop(columns=['Continent'])
# DataFrame.append is deprecated (removed in pandas 2.0); use pd.concat.
df = pd.concat([df, df_missing_continents], ignore_index=True)
In [12]:
# Unique per-row location key: 'Country_Province', or just 'Country'
# when Province is missing (str(NaN) yields 'nan', stripped back out).
combined_key = df['Country'].map(str) + "_" + df['Province'].map(str)
df['Province_and_country'] = combined_key.str.replace('_nan', '').map(str)
In [13]:
# Drop coordinates (columns.difference also sorts the remaining columns
# alphabetically, as before), keep one row per (location, date), and
# number each location's rows 1..N from the start of the series.
kept_columns = df.columns.difference(['Lat', 'Long'])
df = df[kept_columns].drop_duplicates(subset=['Province_and_country', 'Date'])
df['Days_since_outbreak_global'] = df.groupby(['Province_and_country']).cumcount() + 1
In [14]:
# Flatten the duplicate index labels introduced by the appended frames.
df = df.reset_index(drop=True)
In [15]:
# Preview the enriched frame.
df.head()
Out[15]:
Confirmed Continent Country Country_Region Date Province Province_and_country iso2 Days_since_outbreak_global
0 0 Asia Afghanistan Afghanistan 2020-01-22 NaN Afghanistan AF 1
1 0 Europe Albania Albania 2020-01-22 NaN Albania AL 1
2 0 Africa Algeria Algeria 2020-01-22 NaN Algeria DZ 1
3 0 Europe Andorra Andorra 2020-01-22 NaN Andorra AD 1
4 0 Africa Angola Angola 2020-01-22 NaN Angola AO 1
In [16]:
# Schema after enrichment: note the NaNs in Continent/iso2 for the
# synthetic aggregate rows.
df.info(verbose=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68226 entries, 0 to 68225
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Confirmed                   68226 non-null  int64         
 1   Continent                   66483 non-null  object        
 2   Country                     68226 non-null  object        
 3   Country_Region              66234 non-null  object        
 4   Date                        68226 non-null  datetime64[ns]
 5   Province                    20169 non-null  object        
 6   Province_and_country        68226 non-null  object        
 7   iso2                        65487 non-null  object        
 8   Days_since_outbreak_global  68226 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(6)
memory usage: 4.7+ MB
In [17]:
# Persist the enriched frame for reuse outside this notebook.
df.to_csv('df_m.CSV',sep=',')

Analysis - Amount of cases

In [18]:
# Number of distinct location keys (dropna=False counts NaN like
# len(unique()) did).
df['Province_and_country'].nunique(dropna=False)
Out[18]:
274
In [19]:
# Number of distinct countries, including the synthetic aggregates.
df['Country'].nunique(dropna=False)
Out[19]:
196
In [20]:
# Number of distinct provinces (dropna=False keeps NaN in the count,
# matching len(unique())).
df['Province'].nunique(dropna=False)
Out[20]:
82
In [21]:
# Summary statistics of the numeric columns.
df.describe()
Out[21]:
Confirmed Days_since_outbreak_global
count 68226.00 68226.00
mean 102323.82 125.00
std 947878.10 71.88
min 0.00 1.00
25% 14.00 63.00
50% 384.00 125.00
75% 4303.00 187.00
max 32751412.00 249.00
In [22]:
# Confirmed counts are extremely right-skewed, so show the density on a
# log-scaled y-axis to make the tail visible.
plt.hist(df['Confirmed'], density=True, bins=40) 
plt.yscale('log')
In [23]:
# One line per country over outbreak days; the legend is removed because
# hundreds of country entries would swamp the figure.
ax = sns.lineplot(data=df, x="Days_since_outbreak_global", y="Confirmed", hue="Country", ci=None)
ax.get_legend().remove()
In [31]:
def group_and_chart(Geography, Figure, Data):
    """Plot a heatmap of `Figure` summed per (`Geography`, outbreak day),
    min-max scaled within each geography so growth shapes are comparable
    regardless of absolute case counts.

    Parameters
    ----------
    Geography : str
        Column to group rows by (e.g. 'Country' or 'Continent').
    Figure : str
        Numeric column to aggregate (e.g. 'Confirmed').
    Data : pd.DataFrame
        Frame containing `Geography`, `Figure` and
        'Days_since_outbreak_global'.
    """
    # Wide matrix: one row per geography, one column per outbreak day.
    # (The original Data.copy() was dead code — immediately overwritten.)
    pivoted = (
        Data.groupby(['Days_since_outbreak_global', Geography])[Figure]
            .sum()
            .reset_index()
            .pivot(index=Geography, columns='Days_since_outbreak_global', values=Figure)
    )
    # Keep only day columns with at least one non-zero entry (NaN != 0 is
    # True, so all-NaN columns are also kept — same as the original
    # double-transpose filter).
    pivoted = pivoted.loc[:, (pivoted != 0).any()]
    # MinMaxScaler scales column-wise, so transpose first (columns become
    # geographies), fill gaps with 0, scale, then transpose back.
    transposed = pivoted.transpose().fillna(0)
    scaled = preprocessing.MinMaxScaler().fit_transform(transposed)
    scaled = pd.DataFrame(scaled, index=transposed.index, columns=transposed.columns).transpose()
    rcParams['figure.figsize'] = 20, 14
    sns.heatmap(scaled, cmap="Blues", linewidth=.5)

group_and_chart(Geography='Country', Figure='Confirmed', Data=df)
In [32]:
# Asia only. The original cell copied df and then filtered the copy with
# a mask built from df — that only worked because the copy shared df's
# index. Filter directly, consistent with the sibling continent cells.
df_ = df.loc[df.Continent == 'Asia']
group_and_chart(Geography='Country', Figure='Confirmed', Data=df_)
In [33]:
# Europe only.
df_ = df[df['Continent'] == 'Europe']
group_and_chart(Geography='Country', Figure='Confirmed', Data=df_)
In [34]:
# Africa only.
df_ = df[df['Continent'] == 'Africa']
group_and_chart(Geography='Country', Figure='Confirmed', Data=df_)
In [35]:
# North America only.
df_ = df[df['Continent'] == 'North America']
group_and_chart(Geography='Country', Figure='Confirmed', Data=df_)
In [36]:
# Oceania only.
df_ = df[df['Continent'] == 'Oceania']
group_and_chart(Geography='Country', Figure='Confirmed', Data=df_)
In [37]:
# South America only.
df_ = df[df['Continent'] == 'South America']
group_and_chart(Geography='Country', Figure='Confirmed', Data=df_)

Analysis - daily increase

In [38]:
# Calendar features derived from Date, plus the day-over-day increase
# within each location.
df['Day'] = df['Date'].dt.day
df['Weekday'] = df['Date'].dt.weekday
# Series.dt.week is deprecated (removed in pandas 2.0);
# dt.isocalendar().week is the supported equivalent (ISO week number;
# dtype is UInt32 rather than int64).
df['Week_number'] = df['Date'].dt.isocalendar().week
df['Quarter'] = df['Date'].dt.quarter
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
# Previous day's cumulative count within each location; NaN on each
# location's first row.
df['Confirmed_lag_1'] = df.groupby('Province_and_country')['Confirmed'].shift(1)
df['Daily_increase_confirmed'] = df['Confirmed'] - df['Confirmed_lag_1']
# Neutralise infinities (defensive — none are produced by a plain
# subtraction) and fill the first-day NaNs with 0. (Commented-out dead
# code referencing nonexistent 'Deaths' columns removed.)
df = df.replace([np.inf, -np.inf], np.nan)
df = df.fillna(0)
df
Out[38]:
Confirmed Continent Country Country_Region Date Province Province_and_country iso2 Days_since_outbreak_global Day Weekday Week_number Quarter Month Year Confirmed_lag_1 Daily_increase_confirmed
0 0 Asia Afghanistan Afghanistan 2020-01-22 0 Afghanistan AF 1 22 2 4 1 1 2020 0.00 0.00
1 0 Europe Albania Albania 2020-01-22 0 Albania AL 1 22 2 4 1 1 2020 0.00 0.00
2 0 Africa Algeria Algeria 2020-01-22 0 Algeria DZ 1 22 2 4 1 1 2020 0.00 0.00
3 0 Europe Andorra Andorra 2020-01-22 0 Andorra AD 1 22 2 4 1 1 2020 0.00 0.00
4 0 Africa Angola Angola 2020-01-22 0 Angola AO 1 22 2 4 1 1 2020 0.00 0.00
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
68221 4829522 0 Europe 0 2020-09-26 0 Europe 0 249 26 5 39 3 9 2020 4791932.00 37590.00
68222 8479504 0 North America 0 2020-09-26 0 North America 0 249 26 5 39 3 9 2020 8423415.00 56089.00
68223 29437 0 Oceania 0 2020-09-26 0 Oceania 0 249 26 5 39 3 9 2020 29411.00 26.00
68224 721 0 Ships 0 2020-09-26 0 Ships 0 249 26 5 39 3 9 2020 721.00 0.00
68225 7864065 0 South America 0 2020-09-26 0 South America 0 249 26 5 39 3 9 2020 7811306.00 52759.00

68226 rows × 17 columns

In [39]:
# Re-save with the new calendar / daily-increase columns (overwrites the
# earlier export).
df.to_csv('df_m.CSV',sep=',')
In [40]:
# Heatmap of per-country daily increases, min-max scaled per country.
group_and_chart(Geography='Country', Figure='Daily_increase_confirmed', Data=df)