from catboost import CatBoostRegressor
from datetime import datetime, timedelta
from fbprophet import Prophet
from fbprophet.diagnostics import cross_validation, performance_metrics
from fbprophet.plot import plot_cross_validation_metric
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from matplotlib import pyplot
from matplotlib.pylab import rcParams
from pandas import DataFrame
from pmdarima.arima import auto_arima, ARIMA
from pylab import rcParams
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, explained_variance_score, mean_squared_error, mean_absolute_error, r2_score, mean_squared_log_error
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import PolynomialFeatures
from statistics import mean
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from time import sleep
from tqdm import tqdm, tqdm_notebook
from tqdm.notebook import trange, tqdm, tnrange
from xgboost import plot_importance, plot_tree, XGBClassifier
from xgboost.sklearn import XGBClassifier
from IPython.display import clear_output
import datetime as dt
import getpass
import graphviz
import hyperopt
import itertools
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import pmdarima as pm
import pycountry
import pycountry_convert as pc
import pydotplus
import seaborn as sns
import sklearn.linear_model as lm
import statsmodels.api as sm
import warnings
import xgboost as xgb
from sklearn import preprocessing
# Notebook-wide display settings: two-decimal floats and matplotlib datetime support.
pd.set_option('display.float_format', '{:.2f}'.format)
pd.plotting.register_matplotlib_converters()
rcParams['figure.figsize'] = 15, 5
warnings.filterwarnings('ignore')
# JHU CSSE global confirmed-cases time series (one column per date, wide format).
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
df = pd.read_csv(url)
# Unpivot the per-date columns into long format: one row per location per day.
df = df.melt(id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'],
             var_name='Date', value_name='Confirmed')
df['Date'] = pd.to_datetime(df['Date'])
df.head(5)
# Most recent reporting date in the dataset.
df['Date'].max()
df.info(verbose=True)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 66234 entries, 0 to 66233 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Province/State 20169 non-null object 1 Country/Region 66234 non-null object 2 Lat 66234 non-null float64 3 Long 66234 non-null float64 4 Date 66234 non-null datetime64[ns] 5 Confirmed 66234 non-null int64 dtypes: datetime64[ns](1), float64(2), int64(1), object(2) memory usage: 3.0+ MB
# Shorten the JHU column names for easier attribute-style access.
column_aliases = {'Province/State': 'Province', 'Country/Region': 'Country'}
df = df.rename(columns=column_aliases)
df.shape
(66234, 6)
# Aggregate the daily totals across all locations into a synthetic
# 'Worldwide' country so it can be charted alongside real countries.
df_worldwide = pd.DataFrame(df.groupby('Date')['Confirmed'].sum())
df_worldwide['Date'] = df_worldwide.index
df_worldwide = df_worldwide.reset_index(drop=True)
df_worldwide['Country'] = 'Worldwide'
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and aligns columns by name exactly as append did.
df = pd.concat([df, df_worldwide])
# Sanity check: worldwide cumulative confirmed cases on 2020-09-25.
sum(df.Confirmed.loc[(df.Country == 'Worldwide')&(df.Date == '2020-09-25') ])
32476713
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv'
df_country = pd.read_csv(url)
df_country = df_country.drop_duplicates(subset=['Country_Region'], keep='first')
df_country
UID | iso2 | iso3 | code3 | FIPS | Admin2 | Province_State | Country_Region | Lat | Long_ | Combined_Key | Population | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4 | AF | AFG | 4.00 | nan | NaN | NaN | Afghanistan | 33.94 | 67.71 | Afghanistan | 38928341.00 |
1 | 8 | AL | ALB | 8.00 | nan | NaN | NaN | Albania | 41.15 | 20.17 | Albania | 2877800.00 |
2 | 12 | DZ | DZA | 12.00 | nan | NaN | NaN | Algeria | 28.03 | 1.66 | Algeria | 43851043.00 |
3 | 20 | AD | AND | 20.00 | nan | NaN | NaN | Andorra | 42.51 | 1.52 | Andorra | 77265.00 |
4 | 24 | AO | AGO | 24.00 | nan | NaN | NaN | Angola | -11.20 | 17.87 | Angola | 32866268.00 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
689 | 716 | ZW | ZWE | 716.00 | nan | NaN | NaN | Zimbabwe | -19.02 | 29.15 | Zimbabwe | 14862927.00 |
690 | 36 | AU | AUS | 36.00 | nan | NaN | NaN | Australia | -25.00 | 133.00 | Australia | 25459700.00 |
699 | 124 | CA | CAN | 124.00 | nan | NaN | NaN | Canada | 60.00 | -95.00 | Canada | 37855702.00 |
715 | 156 | CN | CHN | 156.00 | nan | NaN | NaN | China | 30.59 | 114.31 | China | 1404676330.00 |
749 | 840 | US | USA | 840.00 | nan | NaN | NaN | US | 40.00 | -100.00 | US | 329466283.00 |
188 rows × 12 columns
# Attach each country's ISO-3166 alpha-2 code by left-joining the lookup table;
# rows without a match (e.g. the synthetic 'Worldwide' rows) get NaN.
df = df.merge(df_country[['Country_Region', 'iso2']],
              how='left', left_on='Country', right_on='Country_Region')
df
Province | Country | Lat | Long | Date | Confirmed | Country_Region | iso2 | |
---|---|---|---|---|---|---|---|---|
0 | NaN | Afghanistan | 33.94 | 67.71 | 2020-01-22 | 0 | Afghanistan | AF |
1 | NaN | Albania | 41.15 | 20.17 | 2020-01-22 | 0 | Albania | AL |
2 | NaN | Algeria | 28.03 | 1.66 | 2020-01-22 | 0 | Algeria | DZ |
3 | NaN | Andorra | 42.51 | 1.52 | 2020-01-22 | 0 | Andorra | AD |
4 | NaN | Angola | -11.20 | 17.87 | 2020-01-22 | 0 | Angola | AO |
... | ... | ... | ... | ... | ... | ... | ... | ... |
66478 | NaN | Worldwide | nan | nan | 2020-09-22 | 31516787 | NaN | NaN |
66479 | NaN | Worldwide | nan | nan | 2020-09-23 | 31779835 | NaN | NaN |
66480 | NaN | Worldwide | nan | nan | 2020-09-24 | 32141225 | NaN | NaN |
66481 | NaN | Worldwide | nan | nan | 2020-09-25 | 32476713 | NaN | NaN |
66482 | NaN | Worldwide | nan | nan | 2020-09-26 | 32751412 | NaN | NaN |
66483 rows × 8 columns
def alpha2_to_continent(iso):
    """Map an ISO-3166 alpha-2 country code to a continent code.

    Returns NaN when the code is unknown or the input is not a string
    (e.g. the NaN iso2 values produced by the left merge above).
    """
    # pycountry_convert raises KeyError for unrecognized codes; non-string
    # inputs such as NaN raise TypeError. Catch only those instead of a
    # bare except, which would also hide genuine bugs (e.g. NameError).
    try:
        return pc.country_alpha2_to_continent_code(iso)
    except (KeyError, TypeError):
        return float('NaN')
# Resolve a continent for every row, patch the cases pycountry_convert
# cannot map, then translate the codes into readable names.
df['Continent'] = df['iso2'].apply(alpha2_to_continent)  # get continent code
df.loc[df['Country'] == "Diamond Princess", 'Continent'] = "Diamond Princess"
df.loc[df['Country'] == "MS Zaandam", 'Continent'] = "MS Zaandam"
df.loc[df['Country'] == "Netherlands", 'Continent'] = "EU"
# BUGFIX: Holy See (Vatican City) is in Europe; it was mislabeled "AS" (Asia).
df.loc[df['Country'] == "Holy See", 'Continent'] = "EU"
df.loc[df['Country'] == "Namibia", 'Continent'] = "AF"
df.loc[df['Country'] == "Timor-Leste", 'Continent'] = "AS"
df.loc[df['Country'] == "Western Sahara", 'Continent'] = "AF"
df.loc[df['Country'] == "Worldwide", 'Continent'] = "WL"
df['Continent'] = df['Continent'].map({'WL':'Worldwide','MS Zaandam':'Ships','Diamond Princess':'Ships','AF':'Africa','AS':'Asia','EU':'Europe','NA':'North America','OC':'Oceania','SA':'South America'})
# Build per-continent aggregate rows (Country = continent name) so that
# continents can be charted with the same machinery as countries.
df_missing_continents = df.copy()
df_missing_continents = pd.DataFrame(df_missing_continents.groupby(['Date','Continent'])['Confirmed'].sum())
df_missing_continents.reset_index(inplace=True)
df_missing_continents['Country'] = df_missing_continents['Continent']
df_missing_continents = df_missing_continents[df_missing_continents.columns.difference(['Continent'])]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df = pd.concat([df, df_missing_continents])
# Unique time-series key: "Country_Province", or just "Country" when Province is NaN.
df['Province_and_country'] = df['Country'].map(str) + "_" + df['Province'].map(str)
df['Province_and_country'] = df['Province_and_country'].str.replace('_nan', '').map(str)
df = df[df.columns.difference(['Lat','Long'])]  # drop coordinates (also sorts columns alphabetically)
df = df.drop_duplicates(subset=['Province_and_country','Date'])
# Per-series day counter starting at 1 on each series' first row
# (all series start at the dataset's first reporting date, 2020-01-22).
df['Days_since_outbreak_global'] = df.groupby(['Province_and_country']).cumcount()+1
df = df.reset_index(drop=True)
df.head()
Confirmed | Continent | Country | Country_Region | Date | Province | Province_and_country | iso2 | Days_since_outbreak_global | |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | Asia | Afghanistan | Afghanistan | 2020-01-22 | NaN | Afghanistan | AF | 1 |
1 | 0 | Europe | Albania | Albania | 2020-01-22 | NaN | Albania | AL | 1 |
2 | 0 | Africa | Algeria | Algeria | 2020-01-22 | NaN | Algeria | DZ | 1 |
3 | 0 | Europe | Andorra | Andorra | 2020-01-22 | NaN | Andorra | AD | 1 |
4 | 0 | Africa | Angola | Angola | 2020-01-22 | NaN | Angola | AO | 1 |
df.info(verbose=True)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 68226 entries, 0 to 68225 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Confirmed 68226 non-null int64 1 Continent 66483 non-null object 2 Country 68226 non-null object 3 Country_Region 66234 non-null object 4 Date 68226 non-null datetime64[ns] 5 Province 20169 non-null object 6 Province_and_country 68226 non-null object 7 iso2 65487 non-null object 8 Days_since_outbreak_global 68226 non-null int64 dtypes: datetime64[ns](1), int64(2), object(6) memory usage: 4.7+ MB
# Persist the prepared dataframe for later modelling steps.
df.to_csv('df_m.CSV', sep=',')
# Number of distinct time series (country/province combinations).
len(pd.unique(df['Province_and_country']))
274
# Number of distinct countries (including the synthetic aggregates).
len(pd.unique(df['Country']))
196
# Number of distinct provinces; NaN counts as one value here, matching len(unique()).
len(pd.unique(df['Province']))
82
# Summary statistics; only the numeric columns (Confirmed, day counter) appear.
df.describe()
Confirmed | Days_since_outbreak_global | |
---|---|---|
count | 68226.00 | 68226.00 |
mean | 102323.82 | 125.00 |
std | 947878.10 | 71.88 |
min | 0.00 | 1.00 |
25% | 14.00 | 63.00 |
50% | 384.00 | 125.00 |
75% | 4303.00 | 187.00 |
max | 32751412.00 | 249.00 |
# Confirmed counts are extremely right-skewed, so plot the histogram on a log y-axis.
plt.hist(df['Confirmed'], bins=40, density=True)
plt.yscale('log')
# One growth curve per country/aggregate; drop the legend, which would be unreadably large.
axes = sns.lineplot(data=df, x="Days_since_outbreak_global", y="Confirmed", hue="Country", ci=None)
axes.legend_.remove()
def group_and_chart(Geography, Figure, Data):
    """Draw a heatmap of `Figure` summed per `Geography` over outbreak days.

    Each geography's series is min-max scaled to [0, 1] so growth *shapes*
    are comparable regardless of absolute magnitude. Outbreak days on which
    every geography is zero are dropped from the chart.
    """
    # Sum the figure per (day, geography) and pivot to geography x day.
    grouped = Data.groupby(['Days_since_outbreak_global', Geography])[Figure].sum()
    grouped = pd.DataFrame(grouped)
    grouped.reset_index(inplace=True)
    grouped = grouped.pivot(index=Geography, columns='Days_since_outbreak_global', values=Figure)
    # Keep only days where at least one geography is non-zero. The original
    # `.T[...].T` followed by `.transpose()` cancels to a single transpose:
    # the result here is day x geography, which is what the scaler needs.
    grouped = grouped.T[(grouped != 0).any()]
    grouped.fillna(0, inplace=True)
    # Scale each geography (column) independently to [0, 1].
    min_max_scaler = preprocessing.MinMaxScaler()
    grouped = pd.DataFrame(min_max_scaler.fit_transform(grouped),
                           index=grouped.index, columns=grouped.columns)
    # Back to geography x day for the heatmap (one row per geography).
    grouped = grouped.transpose()
    rcParams['figure.figsize'] = 20, 14
    sns.heatmap(grouped, cmap="Blues", linewidth=.5)
# Heatmap over every country/aggregate, then the same view restricted to Asia.
group_and_chart(Geography='Country', Figure='Confirmed', Data=df)
asia_df = df.copy()
asia_df = asia_df.loc[(df.Continent == 'Asia')]
group_and_chart(Geography='Country', Figure='Confirmed', Data=asia_df)