Data comes from https://tinyurl.com/s6gsq5y
import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn import cluster
from ipywidgets import widgets
from IPython.display import display
from lets_plot import *
# Load the lets-plot JavaScript library into the notebook front end.
# The original call here was `load_lets_plot_js()`, which failed with
# "NameError: name 'load_lets_plot_js' is not defined" because the name is not
# exported by the installed lets-plot; `LetsPlot.setup_html()` is the
# supported entry point and comes in through `from lets_plot import *`.
LetsPlot.setup_html()
%matplotlib notebook
# Google Sheets CSV export that the case records are downloaded from.
DATA_URL = (
    'https://docs.google.com/spreadsheets/d/'
    '1itaohdPiAeniCXNlntNztZ_oRvjh0HsGuJXUJWET008'
    '/export?format=csv'
    '&id=1itaohdPiAeniCXNlntNztZ_oRvjh0HsGuJXUJWET008'
    '&gid=0'
)

# Upper bound on the number of clusters requested from the clustering step.
MAX_CLUSTERS_COUNT = 20

# Outcome labels recognised by the analysis; any other value found in the
# data is treated as 'diseased' further down.
OUTCOMES = [
    'diseased',
    'died',
    'discharged',
    'stable',
]
def player_widget(plots, *, fps=1):
    """Show a play button and a slider that step through a sequence of plots.

    Args:
        plots: Indexable sequence of displayable objects; element ``n`` is
            shown when the controls are at position ``n``.
        fps: Frames per second for the play button. The resulting delay is
            ``int(1000 / fps)`` milliseconds, never less than 1.

    Returns:
        Whatever ``display`` returns for the assembled controls and output.
    """
    delay_ms = max(1, int(1000 / fps))
    # Both controls cover exactly the valid indices of `plots`.
    frame_range = dict(min=0, max=len(plots) - 1, step=1, value=0)
    play_button = widgets.Play(interval=delay_ms, **frame_range)
    frame_slider = widgets.IntSlider(**frame_range)
    # Keep the two controls in sync on the front end.
    widgets.jslink((play_button, 'value'), (frame_slider, 'value'))
    controls = widgets.HBox([play_button, frame_slider])

    def render_frame(n, m):
        # `m` (the play button value) is accepted only so that the output is
        # bound to both controls; the frame index comes from `n`.
        return display(plots[n])

    frame_output = widgets.interactive_output(
        render_frame, {'n': frame_slider, 'm': play_button}
    )
    return display(controls, frame_output)
def find_best_clustering(shapely_points, *, max_clusters_count=1, clusters_limit_proportion=.02,
                         attempts_count=1, method=cluster.KMeans):
    """Fit the largest clustering whose cluster sizes are balanced enough.

    Starting from ``max_clusters_count`` (capped by the number of distinct
    points) and counting down to 1, a model is fitted for each cluster count
    and the first one whose smallest-to-largest cluster size ratio is at
    least ``clusters_limit_proportion`` is returned.

    Args:
        shapely_points: Iterable of objects exposing ``x`` and ``y``.
        max_clusters_count: Largest number of clusters to try.
        clusters_limit_proportion: Minimal accepted ratio between the sizes
            of the smallest and the largest cluster.
        attempts_count: Forwarded to the estimator as ``n_jobs`` when the
            estimator accepts that argument; ignored otherwise.
            NOTE(review): the name suggests ``n_init`` may have been
            intended - confirm before relying on it.
        method: Callable invoked as ``method(n_clusters=n, ...)`` returning
            an object with ``fit`` whose result has ``labels_``.

    Returns:
        The fitted clustering, or ``None`` when there is nothing to cluster
        (no points, or ``max_clusters_count`` below 1).

    Raises:
        AssertionError: If no cluster count satisfies
            ``clusters_limit_proportion``.
    """
    points = [[p.x, p.y] for p in shapely_points]
    # There cannot be more meaningful clusters than distinct coordinates.
    distinct_points_count = len({(x, y) for x, y in points})
    max_clusters_count = min(max_clusters_count, distinct_points_count)
    if max_clusters_count < 1:
        return None
    for n in range(max_clusters_count, 0, -1):
        try:
            estimator = method(n_clusters=n, n_jobs=attempts_count)
        except TypeError:
            # Estimators without an `n_jobs` argument (e.g. KMeans in
            # scikit-learn >= 1.0) reject it at construction time.
            estimator = method(n_clusters=n)
        clustering = estimator.fit(points)
        _, counts = np.unique(clustering.labels_, return_counts=True)
        if clusters_limit_proportion <= np.min(counts) / np.max(counts):
            return clustering
    # Raised explicitly (not via `assert`) so it also fires under `python -O`.
    raise AssertionError(
        'no clustering with at most {} cluster(s) satisfies '
        'clusters_limit_proportion={}'.format(max_clusters_count, clusters_limit_proportion)
    )
def append_cluster_column(gdf, *, distinguished_columns=(), max_clusters_count=1, cluster_column_name='cluster'):
    """Add a column with cluster labels computed from the geometry column.

    When ``distinguished_columns`` is non-empty the frame is split by the
    distinct values of the first listed column and each part is processed
    recursively with the remaining columns, so clusters are computed
    independently inside every group. Rows whose grouping value is missing
    are not part of any group and are therefore absent from the result.

    Args:
        gdf: Frame with a ``geometry`` attribute accepted by
            ``find_best_clustering``.
        distinguished_columns: Sequence of column labels to group by.
        max_clusters_count: Passed through to ``find_best_clustering``.
        cluster_column_name: Label of the column that receives the labels.

    Returns:
        With grouping columns: a new frame, groups ordered as produced by
        ``value_counts``. Without them: ``gdf`` itself, modified in place.
        For an empty ``gdf``: an empty copy with the cluster column added.

    Raises:
        ValueError: If no clustering could be produced for a non-empty frame.
    """
    if len(gdf) == 0:
        # Nothing to cluster; still return the expected column layout.
        result = gdf.copy()
        result[cluster_column_name] = pd.Series(dtype='int64', index=result.index)
        return result

    # Test the length, not the truthiness of the labels: a column labelled
    # 0 or '' is a perfectly valid grouping column.
    if len(distinguished_columns) > 0:
        column = distinguished_columns[0]
        remaining_columns = distinguished_columns[1:]
        parts = [
            append_cluster_column(gdf[gdf[column] == value].copy(),
                                  distinguished_columns=remaining_columns,
                                  max_clusters_count=max_clusters_count,
                                  cluster_column_name=cluster_column_name)
            for value in gdf[column].value_counts().keys()
        ]
        if not parts:
            # Every row has a missing value in `column`, so no group exists.
            return append_cluster_column(gdf.iloc[:0],
                                         max_clusters_count=max_clusters_count,
                                         cluster_column_name=cluster_column_name)
        return pd.concat(parts)

    clustering = find_best_clustering(gdf.geometry, max_clusters_count=max_clusters_count)
    if clustering is None:
        raise ValueError('find_best_clustering produced no clustering; '
                         'max_clusters_count must be at least 1')
    gdf[cluster_column_name] = clustering.labels_
    return gdf
def get_clusters_gdf(gdf, outcome, current_date):
    """Summarise the clusters of one outcome as they stand on a given date.

    Args:
        gdf: Frame with ``outcome``, ``date_confirmation``,
            ``date_death_or_discharge``, ``cluster`` and geometry.
        outcome: Outcome label to select.
        current_date: Only cases confirmed on or before this date are
            counted; for 'discharged' and 'died' the death/discharge date
            must also be on or before it.

    Returns:
        A GeoDataFrame with one row per cluster that has at least one counted
        case: ``cluster``, ``count``, ``outcome``, ``date`` and a point
        ``geometry``.
    """
    outcome_gdf = gdf[gdf.outcome == outcome].copy()

    counted = outcome_gdf[outcome_gdf.date_confirmation <= current_date]
    if outcome in ['discharged', 'died']:
        counted = counted[counted.date_death_or_discharge <= current_date]

    sizes = counted.cluster.value_counts()
    labels = sizes.keys()

    # The marker position uses every case of the cluster, not only the ones
    # counted so far, so it does not depend on `current_date`.
    centroids = []
    for label in labels:
        members = outcome_gdf[outcome_gdf.cluster == label]
        centroids.append(members.unary_union.centroid)

    rows_count = len(labels)
    return gpd.GeoDataFrame(dict(
        cluster=labels,
        count=sizes.values,
        outcome=[outcome] * rows_count,
        date=[current_date] * rows_count,
        geometry=centroids,
    ))
# Read the map polygon
# NOTE(review): `gpd.datasets` is deprecated in recent GeoPandas releases;
# this line assumes a version that still ships 'naturalearth_lowres'.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
china = world[world.name == 'China']

# Prepare dataframe
columns = ['date_confirmation', 'outcome', 'date_death_or_discharge', 'country', 'latitude', 'longitude']
except_hubei_data = pd.read_csv(DATA_URL)
df = except_hubei_data[columns]

# Clean data: keep Chinese cases with a position and a confirmation date.
df = df.dropna(subset=['latitude', 'longitude', 'date_confirmation'])
# Explicit copy so the column assignments below never write into a slice.
df = df[df.country == 'China'].copy()

# Fix data
df['latitude'] = df.latitude.astype(float)
df['longitude'] = df.longitude.astype(float)
for date_column in ('date_confirmation', 'date_death_or_discharge'):
    # Unparseable values become NaT; every parsed date is forced into 2020.
    parsed_dates = pd.to_datetime(df[date_column], dayfirst=True, errors='coerce')
    df[date_column] = parsed_dates.apply(lambda dt: dt.replace(year=2020))
df['outcome'] = df.outcome.replace({'discharge': 'discharged'})
# Anything that is not a recognised outcome counts as 'diseased'.
df['outcome'] = df.outcome.where(df.outcome.isin(OUTCOMES), 'diseased')

# Clean again on the normalised values: parsing can turn a non-empty date
# into NaT and 'discharge' rows only become 'discharged' above, so the
# checks made on the raw values are not sufficient.
df = df[df.date_confirmation.notna()]
is_closed_case = df.outcome.isin(['discharged', 'died'])
df = df[~(is_closed_case & df.date_death_or_discharge.isna())]

# Prepare geodataframe
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude))
gdf = gdf[gdf.within(china.iloc[0].geometry)]

# Add clusters by geoposition
gdf = append_cluster_column(gdf, distinguished_columns=['outcome'], max_clusters_count=MAX_CLUSTERS_COUNT)
# Prepare clusterized geodataframe: for every outcome, one snapshot of its
# clusters per confirmation date, stacked outcome by outcome.
timeline = gdf.date_confirmation.sort_values().unique()
per_outcome_frames = [
    pd.concat([
        get_clusters_gdf(gdf, outcome=outcome, current_date=moment)
        for moment in timeline
    ])
    for outcome in OUTCOMES
]
clusters_gdf = pd.concat(per_outcome_frames)
# Prepare list of plots that would be frames of the animation
# Base layer shared by every frame: the country outline without axes.
p = (
    ggplot()
    + geom_polygon(data=china, fill='#d6d6d6')
    + theme(legend_position=(.15, .15), axis_title='blank', axis_text='blank',
            axis_ticks='blank', axis_line='blank')
    + ggsize(600, 600)
)
# Common upper bound for the size scale so marker sizes are comparable
# between frames.
limit = clusters_gdf['count'].max()
# One colour per outcome, listed in the same order as OUTCOMES.
outcome_colors = ['#cf3c38', 'black', '#90c73e', '#4cc5db']
# Missing dates cannot be shown as a frame, so they are skipped.
frame_dates = gdf.date_confirmation.dropna().sort_values().unique()
plots = []
for current_date in frame_dates:
    current_clusters_gdf = clusters_gdf[clusters_gdf.date == current_date]
    plots.append(
        p
        + geom_point(aes(size='count', color='outcome'), data=current_clusters_gdf)
        + scale_size(name='', limits=[0, limit])
        # `limits` pins every outcome to its colour even in frames where
        # some outcomes have no clusters yet.
        + scale_color_manual(values=outcome_colors, limits=OUTCOMES)
        + ggtitle('COVID-19 on ' + np.datetime_as_string(current_date, unit='D'))
    )
player_widget(plots)