#!/usr/bin/env python # coding: utf-8 # # Explore the National Water Model Reanalysis v2.1 # Explore the NWM Reanalysis (1979-2020) NetCDF files (all 367,439 of them) on AWS as a single xarray dataset! # The only new file we created was a JSON file that points to data chunks in the original NetCDF files that is then read with the [fsspec](https://filesystem-spec.readthedocs.io/en/latest/) and [zarr](https://zarr.readthedocs.io/en/stable/) packages. # # See this [blog post](https://medium.com/pangeo/cloud-performant-netcdf4-hdf5-with-zarr-fsspec-and-intake-3d3a3e7cb935) for how this works. # # **Important note on performance**: The data in the original NetCDF files is chunked as the entire spatial domain and a single time step. Thus reading a time series will be very slow -- to extract a time series at a single location for the entire time period will require reading and uncompressing 8TB of data! But extraction of a few days or weeks of data will be relatively fast. # # In[1]: import intake # #### Use Intake to load the consolidated NWM dataset # The Intake catalog, the consolidated JSON file it accesses, and the NetCDF files the JSON file references are all on public S3 buckets that do not require an AWS account, so no credentials are required! # In[2]: get_ipython().run_cell_magic('time', '', "cat = intake.open_catalog('s3://esip-qhub-public/noaa/nwm/nwm_catalog.yml')\n") # In[3]: list(cat) # In[4]: get_ipython().run_cell_magic('time', '', "ds = cat['nwm-reanalysis'].to_dask()\n") # In[5]: ds # In[6]: ds.streamflow.nbytes/1e12 # How many terabytes is a single variable # #### Read and plot streamflow for a specific time # The local National Weather Service office in Houston observed all-time record daily rainfall accumulations on both August 26 and 27, measured at 14.4 in (370 mm) and 16.08 in (408 mm) respectively # In[7]: import hvplot.pandas import geoviews as gv from holoviews.operation.datashader import rasterize import cartopy.crs as ccrs import numpy as np import pandas as pd # In[8]: ds1 = ds.sel(time='2017-08-27 18:00:00', method='nearest') # In[9]: var = 'streamflow' # In[10]: df = ds1[var].to_pandas().to_frame() # In[14]: date_title = pd.to_datetime(ds1.time.values).strftime('%Y-%m-%d %H-%M-%D') date_title = f'{var}: {date_title}' date_title # In[12]: df = df.assign(latitude=ds.latitude) df = df.assign(longitude=ds.longitude) df.rename(columns={0: var}, inplace=True) # In[13]: p = df.hvplot.points(x='longitude', y='latitude', geo=True, c=var, colorbar=True, size=14, label=date_title) g = rasterize(p, aggregator='mean', x_sampling=0.02, y_sampling=0.02, width=500).opts(tools=['hover'], aspect='equal', logz=True, cmap='viridis', clim=(1e-2, np.nan)) g * gv.tile_sources.OSM