# Data comes from https://data.gov.hk (retrieved through its historical-archive API).
import requests
import datetime
from itertools import product
from ipywidgets import widgets
from IPython.display import display
import numpy as np
import pandas as pd
import geopandas as gpd
from lets_plot import *
# Initialise Lets-Plot for notebook output. The former `load_lets_plot_js()`
# helper is not exported by the installed lets_plot (calling it raised
# "NameError: name 'load_lets_plot_js' is not defined"), so use the
# `LetsPlot.setup_html()` entry point instead.
LetsPlot.setup_html()
# Base endpoint of the historical-archive API used by load_data().
API_URL = 'https://api.data.gov.hk/v1/historical-archive'

# CSV files whose archived versions are requested through the API.
DATA_IN_CHINA_URL = (
    'http://www.chp.gov.hk/files/misc/latest_situation_of_reported_cases_mainland_china_eng.csv'
)
DATA_OUTSIDE_CHINA_URL = (
    'http://www.chp.gov.hk/files/misc/countries_areas_outside_mainland_china_have_reported_cases_eng.csv'
)

# Time window of the analysis: a fixed first day up to the moment of execution.
START_DATE = pd.Timestamp(year=2020, month=1, day=14)
END_DATE = pd.Timestamp.today()
def player_widget(plots, *, fps=1):
    """Display a play button and a slider that step through ``plots``.

    Parameters
    ----------
    plots : sequence
        Plots to show; the current control value is used as an index into it.
    fps : number, keyword-only
        Playback speed in frames per second. It is converted to a whole
        number of milliseconds per frame, never less than 1.

    Returns
    -------
    Whatever ``display`` returns for the control box and the output area.
    """
    frame_interval_ms = max(1, int(1000 / fps))
    index_range = dict(min=0, max=len(plots) - 1, step=1, value=0)

    play_button = widgets.Play(interval=frame_interval_ms, **index_range)
    frame_slider = widgets.IntSlider(**index_range)
    # Link the two controls so they always point at the same frame index.
    widgets.jslink((play_button, 'value'), (frame_slider, 'value'))

    def show_frame(n, m):
        # Only the slider index `n` selects the plot; `m` (the Play value)
        # is accepted because it is wired in as an input below.
        return display(plots[n])

    controls = widgets.HBox([play_button, frame_slider])
    rendered = widgets.interactive_output(
        show_frame,
        {'n': frame_slider, 'm': play_button},
    )
    return display(controls, rendered)
def load_data(data_url, *, timeout=30):
    """Load the latest archived version of a CSV file as a DataFrame.

    The archive API is first asked which versions of ``data_url`` exist
    between ``START_DATE`` and ``END_DATE``; the last listed timestamp is
    then used to download the file. If the listing does not succeed
    (non-200 status) or contains no timestamps, a fixed default snapshot
    timestamp is used instead.

    Parameters
    ----------
    data_url : str
        URL of the original CSV file tracked by the archive.
    timeout : number, keyword-only
        Seconds to wait for the version-listing request. Without a timeout
        ``requests`` may block indefinitely on an unresponsive server.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV content.

    Raises
    ------
    requests.RequestException
        If the version-listing request fails at the transport level
        (including a timeout).
    """
    DEFAULT_TIMESTAMP = '20200302-0916'

    response = requests.get(
        '%s/list-file-versions' % API_URL,
        params=dict(
            url=data_url,
            start=START_DATE.strftime('%Y%m%d'),
            end=END_DATE.strftime('%Y%m%d'),
        ),
        timeout=timeout,
    )

    # Fall back to the default snapshot both on an unsuccessful response and
    # on a successful one that lists no versions (indexing an empty list
    # with [-1] would otherwise raise IndexError).
    timestamps = response.json().get('timestamps') if response.status_code == 200 else None
    timestamp = timestamps[-1] if timestamps else DEFAULT_TIMESTAMP

    return pd.read_csv('{api_url}/get-file?url={data_url}&time={time}'.format(
        api_url=API_URL,
        data_url=data_url,
        time=timestamp,
    ))
def ffill_columns(df, *, columns=()):
    """Replace each listed column with its non-decreasing running maximum.

    For every row the new value is the largest non-missing value seen so far
    in that column, but never less than 0. Missing values and values lower
    than an earlier one therefore take over the previous maximum; rows before
    the first valid value become 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fix. It is modified in place.
    columns : iterable of str, keyword-only
        Names of the numeric columns to process. Defaults to none.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    for column in columns:
        # cummax() skips NaN but leaves NaN at those positions; ffill()
        # carries the running maximum across them.
        running_max = df[column].cummax().ffill()
        # Leading gaps have nothing to carry forward, and the running value
        # starts from 0, so it is floored at 0 as well.
        df[column] = running_max.fillna(0).clip(lower=0)
    return df
def fix_country_name(name):
    """Return the replacement spelling for ``name`` if one is listed.

    Names without an entry in the substitution table are returned unchanged.
    """
    replacements = {
        'Korea': 'South Korea',
        'United States': 'United States of America',
        'The Philippines': 'Philippines',
        'North Macedonia': 'Macedonia',
        'Geogia': 'Georgia',
        'Holland': 'Netherlands',
        'Germany ': 'Germany',
        'United Kingdom and Northern Ireland': 'United Kingdom',
    }
    return replacements.get(name, name)
def simplify_geoms(world_gdf, *, tolerance=.5):
    """Return a copy of ``world_gdf`` with simplified geometries.

    Rows whose ``name`` is listed in ``DANGEROUS_GEOMS`` keep their original
    geometry (presumably because simplification breaks them - TODO confirm);
    every other geometry is passed through ``simplify(tolerance)``. The
    untouched rows come first in the result, followed by the simplified ones.
    """
    DANGEROUS_GEOMS = ['South Africa']

    keep_as_is = world_gdf.name.isin(DANGEROUS_GEOMS)
    stable_gdf = world_gdf[keep_as_is].copy()
    changeable_gdf = world_gdf[~keep_as_is].copy()

    simplified = changeable_gdf.geometry.simplify(tolerance)
    changeable_gdf.geometry = simplified

    return pd.concat([stable_gdf, changeable_gdf])
# Prepare the gdf with simplified country polygons: read the bundled
# low-resolution world dataset, keep only the name and geometry columns,
# and hand a copy to simplify_geoms().
world_gdf = simplify_geoms(
    gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))[['name', 'geometry']].copy()
)
# Prepare the df with disease data, combined from two tables.

# Table 1: the mainland China file has no country column, so one is added.
columns_sub = {
    'As of date': 'date',
    'Number of cases': 'cases',
    'Number of death cases': 'deaths',
}
china_df = pd.DataFrame(load_data(DATA_IN_CHINA_URL)).rename(columns=columns_sub)
china_df = china_df[columns_sub.values()]
china_df['country'] = 'China'
# Dates are day-first; unparseable entries become NaT instead of raising.
china_df['date'] = pd.to_datetime(china_df['date'], dayfirst=True, errors='coerce')

# Table 2: all other countries/areas, with names passed through fix_country_name().
columns_sub = {
    'As of date': 'date',
    'Other countries/areas': 'country',
    'Number of cases/confirmed cases': 'cases',
    'Number of deaths among confirmed cases': 'deaths',
}
world_df = pd.DataFrame(load_data(DATA_OUTSIDE_CHINA_URL)).rename(columns=columns_sub)
world_df = world_df[columns_sub.values()]
world_df['date'] = pd.to_datetime(world_df['date'], dayfirst=True, errors='coerce')
world_df['country'] = world_df['country'].apply(fix_country_name)

# Stack both tables; column order follows the first table (sort=False).
df = pd.concat([china_df, world_df], sort=False)
# In disease data select only the countries that are in world
df = df[df.country.isin(world_gdf.name.unique())]

# Add missing pairs (date, country) to dataframe (for filling gaps in the time scale)
index_tuples = product(pd.date_range(START_DATE, END_DATE), world_gdf.name.unique())
multi_index = pd.MultiIndex.from_tuples(index_tuples, names=['date', 'country'])
# Duplicate (date, country) rows collapse to their maximum; reindexing then
# inserts an all-NaN row for every pair that has no data.
df = df.groupby(['date', 'country']).max().sort_index().reindex(multi_index).reset_index()

# Fix empty and incorrect 'cases' and 'deaths', one country at a time so that
# values never carry over from one country to another
df = pd.concat([
    ffill_columns(df[df.country == country].copy(), columns=['cases', 'deaths'])
    for country in df.country.unique()
]).reset_index(drop=True)
df.cases = df.cases.astype(int)
df.deaths = df.deaths.astype(int)

# Base-10 logarithm, matching the 'Cases Count(log10)' legend used when
# plotting (the natural log was used here before). The +1 keeps zero cases
# at 0 instead of -inf.
df['cases_log'] = np.log10(df.cases + 1)
# Upper bound of the colour scale shared by all frames.
cases_limit = df.cases_log.max()
# Base plot shared by every frame: no legend, no axis decorations, fixed size.
p = (
    ggplot()
    + theme(legend_position='none', axis_title='blank', axis_text='blank', axis_ticks='blank', axis_line='blank')
    + ggsize(800, 600)
)

# Build one map per day of the analysed period.
plots = []
for current_date in pd.date_range(START_DATE, END_DATE):
    # Attach that day's figures to the country polygons ...
    current_gdf = world_gdf.merge(
        df[df.date == current_date],
        left_on='name',
        right_on='country',
    )
    current_gdf = current_gdf[['country', 'cases_log', 'geometry']]
    # ... and keep only the countries that have a positive value.
    current_gdf = current_gdf[current_gdf.cases_log > 0]

    plots.append(
        p
        + geom_polygon(aes(fill='cases_log', color='country'), data=current_gdf, size=.01)
        + scale_color_gradient(name='Country')
        + scale_fill_gradient(name='Cases Count(log10)', low='white', high='red', limits=[0, cases_limit])
        # Outline of every country, drawn on top with a fully transparent fill.
        + geom_polygon(data=world_gdf, color='gray', fill='white', size=.5, alpha=0)
        + ggtitle('COVID-19 on %s' % current_date.strftime('%m/%d/%Y'))
    )

# Show the frames with play/slider controls.
player_widget(plots)