timeseries_us_data.ipynb

Analyze COVID-19 statistics over time for all counties in the United States.

Inputs:

  • outputs/us_counties_clean.feather: The contents of data/us_counties.csv after data cleaning by clean_us_data.ipynb
  • outputs/dates.feather: Dates that go with the points in the time series in outputs/us_counties_clean.feather, produced by clean_us_data.ipynb.

Note: You can redirect these input files by setting the environment variable COVID_OUTPUTS_DIR to a replacement for the prefix outputs in the above paths.

In [1]:
# Initialization boilerplate
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from typing import *

import text_extensions_for_pandas as tp

# Local file of utility functions
import util

# Allow environment variables to override data file locations.
_OUTPUTS_DIR = os.getenv("COVID_OUTPUTS_DIR", "outputs")
util.ensure_dir_exists(_OUTPUTS_DIR)  # create if necessary

# Size of the line charts in this notebook, in inches
_FIGSIZE = (13, 8)

# Globally adjust the font size for matplotlib.
plt.rcParams.update({'font.size': 16})
In [2]:
# Read time series data from the binary file that clean_us_data.ipynb produces
dates_file = os.path.join(_OUTPUTS_DIR, "dates.feather")
cases_file = os.path.join(_OUTPUTS_DIR, "us_counties_clean.feather")
cases = pd.read_feather(cases_file).set_index("FIPS")
dates = pd.read_feather(dates_file)["date"].to_numpy()
cases.head()
Out[2]:
State County Population Confirmed Deaths Recovered Confirmed_Outlier Deaths_Outlier Recovered_Outlier Confirmed_7_Days Deaths_7_Days
FIPS
1001 Alabama Autauga 55869 [ 0 0 0 0 0 0 0 0 0 ... [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 153 3
1003 Alabama Baldwin 223234 [ 0 0 0 0 0 0 0 0 0 ... [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 573 0
1005 Alabama Barbour 24686 [ 0 0 0 0 0 0 0 0 0 ... [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 31 1
1007 Alabama Bibb 22394 [ 0 0 0 0 0 0 0 0 0 ... [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 149 0
1009 Alabama Blount 57826 [ 0 0 0 0 0 0 0 0 0 ... [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 189 2
In [3]:
# Draw a graph of confirmed cases over time in the U.S.
plt.figure(figsize=_FIGSIZE)
plt.plot(dates, np.transpose(cases["Confirmed"].sum()))
plt.show()
In [4]:
# Draw a graph of total cases over time in the U.S., excluding New York City
# and counties adjacent to New York City

# List of the FIPS codes for counties inside New York City
nyc_fips = [
    36005,  # Bronx County
    36047,  # Kings County
    36061,  # New York County
    36081,  # Queens County
    36085,  # Richmond County
]

# List of FIPS codes for counties close to New York City
nyc_adjacent_fips = [
    # Immediately adjacent to NYC
    34023,  # Middlesex County, NJ
    34039,  # Union County, NJ
    34013,  # Essex County, NJ
    34017,  # Hudson County, NJ
    34003,  # Bergen County, NJ
    36119,  # Westchester County, NY
    36059,  # Nassau County, NY
    
    # Uncomment the following lines to also include counties that are 
    # 1 county away from New York City.
#     34025,  # Monmouth County, NJ
#     34021,  # Mercer County, NJ
#     34035,  # Somerset County, NJ
#     34027,  # Morris County, NJ
#     34031,  # Passaic County, NJ
#     36087,  # Rockland County, NY
#     36071,  # Orange County, NY
#     36079,  # Putnam County, NY
#     9001,   # Fairfield County, CT
#     36103,  # Suffolk County, NY
]

plt.figure(figsize=_FIGSIZE)

mask = ~cases.index.isin(nyc_fips + nyc_adjacent_fips)

plt.plot(dates, np.transpose(cases["Confirmed"].sum()), label="Entire U.S.")
plt.plot(dates, np.transpose(cases[mask]["Confirmed"].sum()), label="Outside New York City")
plt.plot(dates, np.transpose(cases[~mask]["Confirmed"].sum()), label="New York City")
plt.legend()
plt.show()
In [5]:
# Repeat the previous graph for the "Deaths" time series.
plt.figure(figsize=_FIGSIZE)
plt.plot(dates, np.transpose(cases["Deaths"].sum()), label="Entire U.S.")
plt.plot(dates, np.transpose(cases[mask]["Deaths"].sum()), label="Outside New York City")
plt.plot(dates, np.transpose(cases[~mask]["Deaths"].sum()), label="New York City")
plt.legend()
plt.show()
In [6]:
# Plot all the "Confirmed" time series
plt.figure(figsize=_FIGSIZE)
plt.plot(dates, np.transpose(cases["Confirmed"].array))
plt.show()
In [7]:
# Repeat the previous plot, but with a log scale
plt.figure(figsize=_FIGSIZE)

plt.yscale("log")
plt.plot(dates, np.transpose(np.maximum(1e-1, cases["Confirmed"].array)))
plt.show()
In [8]:
# The time series in the above plot appear to have a very wide spread --
# multiple orders of magnitude. Much of this spread goes away, however,
# if we normalize the time series for each county to the county's 
# population. Let's do that normalization for all our time series.
#
# The main dataframe is getting crowded at this point, so generate a 
# second dataframe with the same index.
cases_per_100 = cases[["State", "County", "Population"]].copy()
cases_per_100["Confirmed_per_100"] = 100.0 * cases["Confirmed"].array / cases["Population"].values.reshape(-1,1)
cases_per_100["Deaths_per_100"] = 100.0 * cases["Deaths"].array / cases["Population"].values.reshape(-1,1)
cases_per_100["Recovered_per_100"] = 100.0 * cases["Recovered"].array / cases["Population"].values.reshape(-1,1)

# (shallow) copy the outlier masks so our graphing function can use them
cases_per_100["Confirmed_per_100_Outlier"] = cases["Confirmed_Outlier"]
cases_per_100["Deaths_per_100_Outlier"] = cases["Deaths_Outlier"]
cases_per_100["Confirmed_per_100_Outlier"] = cases["Confirmed_Outlier"]

cases_per_100
Out[8]:
State County Population Confirmed_per_100 Deaths_per_100 Recovered_per_100 Confirmed_per_100_Outlier Deaths_per_100_Outlier
FIPS
1001 Alabama Autauga 55869 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0... [0. 0. 0. 0. 0.... [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1003 Alabama Baldwin 223234 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0... [0. 0. 0. 0. 0.... [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1005 Alabama Barbour 24686 [0. 0. 0. 0. 0.... [0. 0. 0. 0. 0.... [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1007 Alabama Bibb 22394 [0. 0. 0. 0. 0.... [0. 0. 0. 0. 0.... [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1009 Alabama Blount 57826 [0. 0. 0. 0. 0.... [0. 0. 0. 0. 0.... [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
... ... ... ... ... ... ... ... ...
56037 Wyoming Sweetwater 42343 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0... [0. 0. 0. 0. 0.... [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
56039 Wyoming Teton 23464 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0... [0. 0. 0. 0. 0.... [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
56041 Wyoming Uinta 20226 [0. 0. 0. 0. 0.... [0. 0. 0. 0. 0.... [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
56043 Wyoming Washakie 7805 [0. 0. 0. 0. 0.... [0. 0. 0. 0. 0. ... [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
56045 Wyoming Weston 6927 [0. 0. 0. 0. 0.... [0. 0. 0. 0. 0.... [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...

3142 rows × 8 columns

In [9]:
# Plot confirmed cases normalized to population, with a log scale on the Y axis
plt.figure(figsize=_FIGSIZE)
plt.yscale("log")
plt.plot(dates, np.transpose(np.maximum(1e-6, cases_per_100["Confirmed_per_100"].array)), )
plt.show()