#!/usr/bin/env python
# coding: utf-8

# # COVID-19 by Country
#
# Data comes from https://data.gov.hk

# In[1]:


import requests
import datetime
from itertools import product

from ipywidgets import widgets
from IPython.display import display
import numpy as np
import pandas as pd
import geopandas as gpd

from lets_plot import *


# In[2]:


load_lets_plot_js()


# In[ ]:


API_URL = 'https://api.data.gov.hk/v1/historical-archive'
DATA_IN_CHINA_URL = 'http://www.chp.gov.hk/files/misc/latest_situation_of_reported_cases_mainland_china_eng.csv'
DATA_OUTSIDE_CHINA_URL = 'http://www.chp.gov.hk/files/misc/countries_areas_outside_mainland_china_have_reported_cases_eng.csv'
START_DATE = pd.Timestamp(2020, 1, 14)
END_DATE = pd.Timestamp.today()

# Country spellings found in the disease table mapped to the spelling that is
# matched against ``world_gdf.name``. Keys are reproduced verbatim (including
# the misspelled 'Geogia' and the trailing space in 'Germany ').
_COUNTRY_NAME_SUBSTITUTIONS = {
    'Korea': 'South Korea',
    'United States': 'United States of America',
    'The Philippines': 'Philippines',
    'North Macedonia': 'Macedonia',
    'Geogia': 'Georgia',
    'Holland': 'Netherlands',
    'Germany ': 'Germany',
    'United Kingdom and Northern Ireland': 'United Kingdom',
}


# In[ ]:


def player_widget(plots, *, fps=1):
    """Display a play button and a slider that step through ``plots``.

    Args:
        plots: Indexable collection of displayable objects, one per frame.
        fps: Playback speed in frames per second (keyword-only).

    Returns:
        The result of ``display(...)`` for the control box and output area.
    """
    # Play expects the delay between frames in milliseconds; never below 1.
    interval = max(1, int(1000 / fps))
    last_frame = len(plots) - 1

    player = widgets.Play(min=0, max=last_frame, step=1, value=0, interval=interval)
    slider = widgets.IntSlider(min=0, max=last_frame, step=1, value=0)
    widgets.jslink((player, 'value'), (slider, 'value'))
    widget = widgets.HBox([player, slider])

    # Only ``n`` (the slider value) selects the frame; ``m`` is unused by the
    # callback. Presumably the player is passed so its changes also trigger a
    # redraw - TODO confirm.
    iout = widgets.interactive_output(
        lambda n, m: display(plots[n]),
        {'n': slider, 'm': player},
    )
    return display(widget, iout)


# In[ ]:


def load_data(data_url, *, timeout=30):
    """Download the latest archived version of ``data_url`` as a DataFrame.

    The archive API is first asked for the versions available between
    ``START_DATE`` and ``END_DATE``; the last listed timestamp is then used to
    fetch the file. A fixed default timestamp is used when the listing request
    does not return HTTP 200 or lists no versions.

    Args:
        data_url: URL of the original CSV file tracked by the archive.
        timeout: Seconds to wait for the version-listing request (keyword-only).

    Returns:
        ``pandas.DataFrame`` parsed from the archived CSV.
    """
    DEFAULT_TIMESTAMP = '20200302-0916'

    response = requests.get(
        '%s/list-file-versions' % API_URL,
        params=dict(
            url=data_url,
            start=START_DATE.strftime('%Y%m%d'),
            end=END_DATE.strftime('%Y%m%d'),
        ),
        timeout=timeout,
    )

    timestamps = response.json().get('timestamps') if response.status_code == 200 else None
    # An empty (or missing) list would otherwise raise on ``[-1]``.
    version = timestamps[-1] if timestamps else DEFAULT_TIMESTAMP

    return pd.read_csv('{api_url}/get-file?url={data_url}&time={time}'.format(
        api_url=API_URL,
        data_url=data_url,
        time=version,
    ))


# In[ ]:


def ffill_columns(df, *, columns=()):
    """Replace each listed column with its running maximum, floored at 0.

    Missing values and values lower than the maximum seen so far are replaced
    by that maximum, so every column becomes non-decreasing and non-negative.

    Args:
        df: DataFrame to fix; it is modified in place.
        columns: Names of the numeric columns to process (keyword-only).

    Returns:
        The same ``df`` object, for chaining.
    """
    for column in columns:
        # NaN -> 0 keeps gaps from breaking the running maximum; the final clip
        # reproduces the initial maximum of 0 for leading negative values.
        df[column] = df[column].fillna(0).cummax().clip(lower=0)
    return df


# In[ ]:


def fix_country_name(name):
    """Return the substitute spelling for ``name``, or ``name`` unchanged.

    Args:
        name: Country name as it appears in the disease table.

    Returns:
        The mapped name when a substitution exists, otherwise ``name`` itself.
    """
    return _COUNTRY_NAME_SUBSTITUTIONS.get(name, name)


# In[ ]:


def simplify_geoms(world_gdf, *, tolerance=.5):
    """Simplify country geometries, leaving the listed exceptions untouched.

    Args:
        world_gdf: GeoDataFrame with ``name`` and ``geometry`` columns.
        tolerance: Tolerance passed to ``GeoSeries.simplify`` (keyword-only).

    Returns:
        Concatenation of the untouched rows followed by the simplified rows.
    """
    # Countries whose geometry is kept exactly as loaded.
    DANGEROUS_GEOMS = ['South Africa']

    is_dangerous = world_gdf.name.isin(DANGEROUS_GEOMS)
    stable_gdf = world_gdf[is_dangerous].copy()
    changeable_gdf = world_gdf[~is_dangerous].copy()
    changeable_gdf.geometry = changeable_gdf.geometry.simplify(tolerance)

    return pd.concat([stable_gdf, changeable_gdf])


# In[ ]:


# Prepare the gdf with simplified country polygons
# NOTE(review): `gpd.datasets` was deprecated in GeoPandas 0.14 and removed in
# 1.0 - confirm the pinned GeoPandas version still provides it.
world_gdf = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))[['name', 'geometry']]
world_gdf = simplify_geoms(world_gdf.copy())


# In[ ]:


# Prepare the df with disease data, combined from two tables
columns_sub = {
    'As of date': 'date',
    'Number of cases': 'cases',
    'Number of death cases': 'deaths',
}
china_df = load_data(DATA_IN_CHINA_URL)
china_df = china_df.rename(columns=columns_sub)[list(columns_sub.values())]
china_df['country'] = 'China'
china_df['date'] = pd.to_datetime(china_df['date'], dayfirst=True, errors='coerce')

columns_sub = {
    'As of date': 'date',
    'Other countries/areas': 'country',
    'Number of cases/confirmed cases': 'cases',
    'Number of deaths among confirmed cases': 'deaths',
}
world_df = load_data(DATA_OUTSIDE_CHINA_URL)
world_df = world_df.rename(columns=columns_sub)[list(columns_sub.values())]
world_df['date'] = pd.to_datetime(world_df['date'], dayfirst=True, errors='coerce')
world_df['country'] = world_df['country'].apply(fix_country_name)

df = pd.concat([china_df, world_df], sort=False)


# In[ ]:


# In disease data select only the countries that are in world
df = df[df.country.isin(world_gdf.name.unique())]


# In[ ]:


# Add missing pairs (date, country) to dataframe (for filling gaps in the time scale)
index_tuples = product(pd.date_range(START_DATE, END_DATE), world_gdf.name.unique())
multi_index = pd.MultiIndex.from_tuples(index_tuples, names=['date', 'country'])
df = (
    df.groupby(['date', 'country'])
    .max()
    .sort_index()
    .reindex(multi_index)
    .reset_index()
)


# In[ ]:


# Fix empty and incorrect 'cases' and 'deaths'
# One groupby pass replaces a full-frame boolean filter per country;
# sort=False keeps the countries in order of first appearance.
df = pd.concat([
    ffill_columns(country_df.copy(), columns=['cases', 'deaths'])
    for _, country_df in df.groupby('country', sort=False)
]).reset_index(drop=True)
df['cases'] = df['cases'].astype(int)
df['deaths'] = df['deaths'].astype(int)


# In[ ]:


# Base-10 logarithm, as the fill legend states; +1 keeps zero counts at 0.
df['cases_log'] = np.log10(df.cases + 1)
cases_limit = df.cases_log.max()

p = (
    ggplot()
    + theme(legend_position='none', axis_title='blank', axis_text='blank',
            axis_ticks='blank', axis_line='blank')
    + ggsize(800, 600)
)

plots = []
for current_date in pd.date_range(START_DATE, END_DATE):
    current_gdf = world_gdf.merge(
        df[df.date == current_date],
        left_on='name',
        right_on='country',
    )[['country', 'cases_log', 'geometry']]
    # Countries without cases are left to the plain outline layer below.
    current_gdf = current_gdf[current_gdf.cases_log > 0]
    plots.append(
        p
        + geom_polygon(aes(fill='cases_log', color='country'), data=current_gdf, size=.01)
        + scale_color_gradient(name='Country')
        + scale_fill_gradient(name='Cases Count(log10)', low='white', high='red',
                              limits=[0, cases_limit])
        + geom_polygon(data=world_gdf, color='gray', fill='white', size=.5, alpha=0)
        + ggtitle('COVID-19 on %s' % current_date.strftime('%m/%d/%Y'))
    )


# In[ ]:


player_widget(plots)