#!/usr/bin/env python
# coding: utf-8

# # timeseries_us_data.ipynb
#
# Analyze COVID-19 statistics over time for all counties in the United States.
#
# Inputs:
# * `outputs/us_counties_clean.feather`: The contents of `data/us_counties.csv` after data cleaning by [clean_us_data.ipynb](./clean_us_data.ipynb)
# * `outputs/dates.feather`: Dates that go with the points in the time series in `outputs/us_counties_clean.feather`, produced by [clean_us_data.ipynb](./clean_us_data.ipynb).
#
# **Note:** You can redirect these input files by setting the environment variable `COVID_OUTPUTS_DIR` to a replacement for the prefix `outputs` in the above paths.

# In[1]:

# Initialization boilerplate
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Local file of utility functions
import util

# Allow environment variables to override data file locations.
_OUTPUTS_DIR = os.getenv("COVID_OUTPUTS_DIR", "outputs")
util.ensure_dir_exists(_OUTPUTS_DIR)  # create if necessary

# Size of the line charts in this notebook, in inches
_FIGSIZE = (13, 8)

# Globally adjust the font size for matplotlib.
plt.rcParams.update({'font.size': 16})

# In[2]:

# Read time series data from the binary files that clean_us_data.ipynb produces
dates_file = os.path.join(_OUTPUTS_DIR, "dates.feather")
cases_file = os.path.join(_OUTPUTS_DIR, "us_counties_clean.feather")
cases = pd.read_feather(cases_file).set_index("FIPS")
dates = pd.read_feather(dates_file)["date"].to_numpy()
cases.head()

# In[3]:

# Draw a graph of confirmed cases over time in the U.S.
plt.figure(figsize=_FIGSIZE)
plt.plot(dates, np.transpose(cases["Confirmed"].sum()))
plt.show()

# In[4]:

# Draw a graph of total confirmed cases over time in the U.S., excluding
# New York City and counties adjacent to New York City.

# List of the FIPS codes for counties inside New York City
nyc_fips = [
    36005,  # Bronx County
    36047,  # Kings County
    36061,  # New York County
    36081,  # Queens County
    36085,  # Richmond County
]

# List of FIPS codes for counties close to New York City
nyc_adjacent_fips = [
    # Immediately adjacent to NYC
    34023,  # Middlesex County, NJ
    34039,  # Union County, NJ
    34013,  # Essex County, NJ
    34017,  # Hudson County, NJ
    34003,  # Bergen County, NJ
    36119,  # Westchester County, NY
    36059,  # Nassau County, NY
    # Uncomment the following lines to also include counties that are
    # 1 county away from New York City.
    # 34025,  # Monmouth County, NJ
    # 34021,  # Mercer County, NJ
    # 34035,  # Somerset County, NJ
    # 34027,  # Morris County, NJ
    # 34031,  # Passaic County, NJ
    # 36087,  # Rockland County, NY
    # 36071,  # Orange County, NY
    # 36079,  # Putnam County, NY
    # 9001,   # Fairfield County, CT
    # 36103,  # Suffolk County, NY
]

plt.figure(figsize=_FIGSIZE)
mask = ~cases.index.isin(nyc_fips + nyc_adjacent_fips)
plt.plot(dates, np.transpose(cases["Confirmed"].sum()), label="Entire U.S.")
plt.plot(dates, np.transpose(cases[mask]["Confirmed"].sum()), label="Outside New York City")
plt.plot(dates, np.transpose(cases[~mask]["Confirmed"].sum()), label="New York City")
plt.legend()
plt.show()
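# A small added sanity check (a sketch, assuming the summed "Confirmed" tensors
# behave as NumPy arrays, as the plots above already rely on): the NYC and
# non-NYC subsets defined by `mask` partition the counties, so their per-date
# totals should add up to the national curve.
nyc_curve = np.asarray(cases[~mask]["Confirmed"].sum())
rest_curve = np.asarray(cases[mask]["Confirmed"].sum())
us_curve = np.asarray(cases["Confirmed"].sum())
assert np.allclose(nyc_curve + rest_curve, us_curve)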
# In[5]:

# Repeat the previous graph for the "Deaths" time series.
plt.figure(figsize=_FIGSIZE)
plt.plot(dates, np.transpose(cases["Deaths"].sum()), label="Entire U.S.")
plt.plot(dates, np.transpose(cases[mask]["Deaths"].sum()), label="Outside New York City")
plt.plot(dates, np.transpose(cases[~mask]["Deaths"].sum()), label="New York City")
plt.legend()
plt.show()

# In[6]:

# Plot all the "Confirmed" time series
plt.figure(figsize=_FIGSIZE)
plt.plot(dates, np.transpose(cases["Confirmed"].array))
plt.show()

# In[7]:

# Repeat the previous plot, but with a log scale
plt.figure(figsize=_FIGSIZE)
plt.yscale("log")
plt.plot(dates, np.transpose(np.maximum(1e-1, cases["Confirmed"].array)))
plt.show()

# In[8]:

# The time series in the above plot appear to have a very wide spread --
# multiple orders of magnitude. Much of this spread goes away, however,
# if we normalize the time series for each county to the county's
# population. Let's do that normalization for all our time series.
#
# The main dataframe is getting crowded at this point, so generate a
# second dataframe with the same index.
cases_per_100 = cases[["State", "County", "Population"]].copy()
cases_per_100["Confirmed_per_100"] = (
    100.0 * cases["Confirmed"].array / cases["Population"].values.reshape(-1, 1)
)
cases_per_100["Deaths_per_100"] = (
    100.0 * cases["Deaths"].array / cases["Population"].values.reshape(-1, 1)
)
cases_per_100["Recovered_per_100"] = (
    100.0 * cases["Recovered"].array / cases["Population"].values.reshape(-1, 1)
)

# (shallow) copy the outlier masks so our graphing function can use them
cases_per_100["Confirmed_per_100_Outlier"] = cases["Confirmed_Outlier"]
cases_per_100["Deaths_per_100_Outlier"] = cases["Deaths_Outlier"]
cases_per_100

# In[9]:

# Plot confirmed cases normalized to population, with a log scale on the Y axis
plt.figure(figsize=_FIGSIZE)
plt.yscale("log")
plt.plot(dates, np.transpose(np.maximum(1e-6, cases_per_100["Confirmed_per_100"].array)))
plt.show()

# In[10]:

# Normalizing to population makes the time series cluster together more
# closely. There is still a fair amount of spread due to the fact that
# the pandemic struck different counties at different points in time.
#
# We can correct for this second factor by plotting each time series against
# the number of days since it crossed a threshold value, rather than against
# the calendar date.

# Define some functions for redoing these plots with the time series aligned
# such that "day 0" is when each time series crosses a particular threshold.
def compute_offset(series, cut, log) -> float:
    """
    Subroutine of plot_aligned() below.

    Computes at what offset `series` reached the value `cut`, interpolating
    if the series skipped over the value.
    """
    first_above_ix = np.argmax(series >= cut)
    if first_above_ix == 0:
        # Either none or all of the values are above the cut
        if series[0] >= cut:
            return 0
        else:
            return len(series)
    first_above_val = series[first_above_ix]
    last_below_val = series[first_above_ix - 1]
    if log:
        last_below_log = 0 if last_below_val == 0 else np.log(last_below_val)
        first_above_log = np.log(first_above_val)
        fraction = (first_above_log - np.log(cut)) / (first_above_log - last_below_log)
    else:
        fraction = (first_above_val - cut) / (first_above_val - last_below_val)
    ret = first_above_ix - fraction
    return ret
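# A quick illustrative check of compute_offset() (the series below is made up
# for illustration; it is not part of the data set). For the series [0, 1, 4]
# and cut=2, the crossing falls halfway between index 1 and index 2 on a log
# scale, so the offset is 1.5; with linear interpolation it is about 1.33.
example_series = np.array([0.0, 1.0, 4.0])
print(compute_offset(example_series, 2.0, log=True))   # 1.5
print(compute_offset(example_series, 2.0, log=False))  # ~1.33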
def plot_aligned(series: pd.Series, name: str, cut: float, log: bool):
    """
    Plot time series, shifting them so that they all align at the specified
    cut value.

    :param series: Series of tensors to plot
    :param name: Human-readable name for the series
    :param cut: Cut value to align on
    :param log: If `True`, use a logarithmic Y axis.
    """
    data = series.array._tensor.astype(float)
    offsets = np.array([compute_offset(s.to_numpy(), cut, log) for s in series])
    min_off = np.min(offsets)
    max_off = np.max(offsets)
    norm_offsets = (offsets - min_off) / (max_off - min_off)
    num_points = len(data[0])
    x_vals = [np.linspace(-o, -o + num_points, num=num_points) for o in offsets]

    # Skip time series that never reach the cut.
    keep_mask = norm_offsets < 1.0
    num_kept = np.sum(keep_mask)
    print(f"{num_kept} time series passed threshold of {cut}")

    # Adjust opacity to the number of time series
    alpha = min(1.0, 0.05 + 5.0 / num_kept)
    color_map = plt.get_cmap("magma")

    plt.figure(figsize=_FIGSIZE)
    if log:
        plt.yscale("log")
    for i in range(len(data)):
        if not keep_mask[i]:
            continue
        # Use color to indicate how much we shifted each time series.
        color = color_map(norm_offsets[i])
        plt.scatter(x_vals[i], data[i], s=10, alpha=alpha, color=color)
    cut_str = f"{cut:.3f}" if cut < 10 else str(cut)
    plt.ylabel(name)
    plt.xlabel(f"Days after {name} Reached {cut_str}")
    plt.show()

# In[11]:

# Use the value of Manhattan's statistics on a particular date in the past
# to define the threshold that marks "day 0" in the plots below.
# Use a later date for the "Deaths" time series because deaths lag
# infections.
confirmed_day_zero_date = "2020-03-23"
deaths_day_zero_date = "2020-04-07"

confirmed_day_zero_ix = np.argwhere(dates == np.datetime64(confirmed_day_zero_date))[0, 0]
deaths_day_zero_ix = np.argwhere(dates == np.datetime64(deaths_day_zero_date))[0, 0]
manhattan_fips = 36061

day_zero_confirmed = cases.loc[manhattan_fips]["Confirmed"].to_numpy()[confirmed_day_zero_ix]
day_zero_deaths = cases.loc[manhattan_fips]["Deaths"].to_numpy()[deaths_day_zero_ix]
day_zero_confirmed_per_100 = (
    cases_per_100.loc[manhattan_fips]["Confirmed_per_100"].to_numpy()[confirmed_day_zero_ix]
)
day_zero_deaths_per_100 = (
    cases_per_100.loc[manhattan_fips]["Deaths_per_100"].to_numpy()[deaths_day_zero_ix]
)

print(f"On {confirmed_day_zero_date}, Manhattan had {day_zero_confirmed} cases.")
print(f"On {deaths_day_zero_date}, Manhattan had {day_zero_deaths} deaths.")

# In[12]:

# Shift all the "Confirmed" time series so that day 0 is when each
# time series crosses the point where Manhattan was on
# confirmed_day_zero_date.
# Linear plot.
# Color encodes how far the time series was shifted. Time series that
# were shifted by more days (i.e. more recent outbreaks) are in orange.
plot_aligned(cases["Confirmed"], "Confirmed Cases", day_zero_confirmed, log=False)

# In[13]:

# Same plot as above, but with a logarithmic Y axis.
# Color encodes how far the time series was shifted. Time series that
# were shifted by more days (i.e. more recent outbreaks) are in orange.
plot_aligned(cases["Confirmed"], "Confirmed Cases", day_zero_confirmed, log=True)

# In[14]:

# Shift all the "Confirmed_per_100" time series so that they align at a single
# Y value. Semilog plot.
# Color encodes how far the time series was shifted. Time series that
# were shifted by more days (i.e. more recent outbreaks) are in orange.
plot_aligned(cases_per_100["Confirmed_per_100"], "Confirmed Cases per 100 Residents",
             day_zero_confirmed_per_100, log=True)

# In[15]:

# Note the pronounced "rainbow" effect in the above plot. The more we
# shifted a county's "confirmed cases" time series forward -- that is,
# the later the outbreak reached the county -- the lower the slope of
# the curve.
#
# This trend towards a flatter curve is probably due to several factors.
# There were more public health measures in place at later dates, and
# later outbreaks struck more sparsely-populated counties where the
# average resident has contact with fewer other residents.

# Let's see if the time series of deaths produces the same result.
# Align all of the "Deaths" series. Log scale.
# Color encodes how far the time series was shifted. Time series that
# were shifted by more days (i.e. more recent outbreaks) are in orange.
plot_aligned(cases["Deaths"], "Deaths", day_zero_deaths, log=True)

# In[16]:

# Plot deaths per 100 residents, with day zero being when that value
# passed the value that Manhattan had on deaths_day_zero_date.
# Color encodes how far the time series was shifted. Time series that
# were shifted by more days (i.e. more recent outbreaks) are in orange.
plot_aligned(cases_per_100["Deaths_per_100"], "Deaths per 100 Residents",
             day_zero_deaths_per_100, log=True)

# In[17]:

# Now the dominant factor in the plot is noise due to aliasing.
# When the number of deaths goes from 1 to 2 in a sparsely-populated
# county, the normalized time series for that county experiences
# a very large jump.
#
# We can remove the aliasing-related noise from the above plot
# by restricting the analysis to counties with population above a
# threshold.

# Filter out the time series subject to aliasing by keeping only counties
# whose population is above the threshold.
pop_threshold = 50000
high_pop_cases = cases_per_100[cases_per_100["Population"] > pop_threshold]
plot_aligned(high_pop_cases["Deaths_per_100"], "Deaths per 100 Residents",
             day_zero_deaths_per_100, log=True)

# In[ ]:
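# A minimal added illustration (hypothetical populations, not data from the
# analysis above) of the aliasing effect described above: one additional death
# doubles the normalized value in a small county but barely moves it in a
# large one.
for population in (5_000, 500_000):
    one_death = 100.0 * 1 / population
    two_deaths = 100.0 * 2 / population
    print(f"Population {population}: {one_death:.4f} -> {two_deaths:.4f} deaths per 100 residents")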