Create ReferenceFileSystem JSON file for a collection of NWM NetCDF files on S3
import os
import fsspec
import ujson # fast json
from fsspec_reference_maker.hdf import SingleHdf5ToZarr
from fsspec_reference_maker.combine import MultiZarrToZarr
import xarray as xr
import dask
import hvplot.xarray
import fsspec_reference_maker
fsspec_reference_maker.__version__
'0.0.1+6.gc3757ec'
fs = fsspec.filesystem('s3', anon=True, skip_instance_cache=True)
best_hour='f001'
var = 'channel_rt'
Globbing all the files takes a long time (> 5 minutes), so instead we just read the dates and generate 24 file names for each date. This of course assumes no files are missing.
#%%time
#flist = fs.glob(f'noaa-nwm-pds/nwm.*/short_range/nwm.*.short_range.{var}.{best_hour}.conus.nc')
days = fs.glob(f'noaa-nwm-pds/nwm.*')
print(days[0])
print(days[-1])
noaa-nwm-pds/nwm.20210803
noaa-nwm-pds/nwm.20210901
flist = []
for day in days[2:-2]:
    for i in range(24):
        flist.append(f'{day}/short_range/nwm.t{i:02d}z.short_range.{var}.{best_hour}.conus.nc')
flist.extend(fs.glob(f'{days[-1]}/short_range/nwm.*.short_range.{var}.{best_hour}.conus.nc'))
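Since the generated names assume no missing files, a quick spot check of the first day's paths can catch gaps early (a sketch; checking the whole list this way would be slow):
# Sketch: spot-check one day's worth of generated paths against S3
missing = [f for f in flist[:24] if not fs.exists(f)]
print(f'{len(missing)} missing of the first 24 generated paths')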
fs.size(flist[0])/1e6
ds = xr.open_dataset(fs.open(flist[0]))
ds.streamflow.encoding
ds.nbytes/1e6
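The streamflow data is compressed on disk; comparing the in-memory size to the on-disk size gives a rough compression ratio (a sketch using the values already computed above):
# Rough compression ratio: uncompressed bytes vs. compressed file size
print(ds.nbytes / fs.size(flist[0]))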
print(flist[0])
print(flist[-1])
last_dir = os.path.dirname(flist[-1])
last_dir
last_file = os.path.basename(flist[-1]).split('.')
last_file
last_files = fs.glob(f'{last_dir}/{last_file[0]}.{last_file[1]}.{last_file[2]}.{var}.*.conus.nc')
last_files
Skip the first of the last_files since it's a duplicate:
flist.extend(last_files[1:])
print(flist[0])
print(flist[-1])
We need to prepend the "s3://" protocol to the list of files so that fsspec will recognize that these NetCDF files are on S3:
urls = ["s3://" + f for f in flist]
so = dict(mode='rb', anon=True, default_fill_cache=False, default_cache_type='first')  # cache only the first block, where the HDF5 metadata lives
print(urls[0])
print(urls[-1])
fs.size(urls[10])
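The whole approach depends on S3 supporting byte-range reads; as a quick sanity check (a sketch), we can read just the first 8 bytes of one file and look for the HDF5 signature:
# HDF5 files begin with the 8-byte signature \x89HDF\r\n\x1a\n
with fsspec.open(urls[10], **so) as f:
    print(f.read(8))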
import os
import sys
sys.path.append(os.path.join(os.environ['HOME'],'shared','users','lib'))
import ebdpy as ebd
ebd.set_credentials(profile='esip-qhub')
profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)
worker_max = 30
client, cluster = ebd.start_dask_cluster(profile=profile, worker_max=worker_max,
                                         region=region, use_existing_cluster=True,
                                         adaptive_scaling=False, wait_for_cluster=False,
                                         environment='pangeo', worker_profile='Pangeo Worker',
                                         propagate_env=True)
We passed AWS credentials to the Dask workers via environment variables above. The workers don't have an AWS credentials file with profiles defined, so we don't specify a profile here; we just set anon=False and let the workers find the credentials via the environment variables:
fs2 = fsspec.filesystem('s3', anon=False)
If the directory already exists, remove it (and all the files it contains):
json_dir = 's3://esip-qhub/usgs/nwm_forecast/jsons/'
try:
    fs2.rm(json_dir, recursive=True)
except FileNotFoundError:
    pass  # nothing to remove
def gen_json(u):
    with fsspec.open(u, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        # name each output JSON <date>.<original filename>.json
        p = u.split('/')
        date = p[3]
        fname = p[5]
        outf = f'{json_dir}{date}.{fname}.json'
        print(outf)
        with fs2.open(outf, 'wb') as f:
            f.write(ujson.dumps(h5chunks.translate()).encode())
%%time
dask.compute(*[dask.delayed(gen_json)(u) for u in urls], retries=10);
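If a Dask cluster isn't available, the same function can be run serially as a smoke test (a sketch; processing all of the files this way would take much longer):
# Serial fallback (sketch): generate references for just a couple of files
for u in urls[:2]:
    gen_json(u)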
flist2 = fs2.ls(json_dir)
furls = sorted(['s3://'+f for f in flist2])
furls[0]
's3://esip-qhub/usgs/nwm_forecast/jsons/nwm.20210701.nwm.t00z.short_range.channel_rt.f001.conus.nc.json'
len(furls)
679
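Before combining, it's worth opening one of the single-file references to confirm it is valid (a sketch; the fs_single and ds_single names are just for illustration, and the options mirror the ones passed to MultiZarrToZarr below):
# Sketch: validate one single-file reference before combining
fs_single = fsspec.filesystem('reference', fo=furls[0],
                              ref_storage_args={'anon': False},  # JSONs live on esip-qhub
                              remote_protocol='s3',
                              remote_options={'anon': True})     # data lives on noaa-nwm-pds
ds_single = xr.open_dataset(fs_single.get_mapper(''), engine='zarr')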
from dask.distributed import Client
#client.close()
client = Client(n_workers=1)
client
mzz = MultiZarrToZarr(furls,
                      storage_options={'anon': False},  # for the JSON files on esip-qhub
                      remote_protocol='s3',
                      remote_options={'anon': True},    # for the NetCDF files on noaa-nwm-pds
                      xarray_open_kwargs={
                          'decode_cf': False,
                          'mask_and_scale': False,
                          'decode_times': False,
                          'use_cftime': False,
                          'drop_variables': ['reference_time', 'crs'],
                          'decode_coords': False
                      },
                      xarray_concat_args={
                          # "data_vars": "minimal",
                          # "coords": "minimal",
                          # "compat": "override",
                          "join": "override",
                          "combine_attrs": "override",
                          "dim": "time"
                      })
%%time
#%%prun -D multizarr_profile
mzz.translate('nwm.json')
CPU times: user 3.5 s, sys: 233 ms, total: 3.73 s
Wall time: 23.4 s
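The output is plain JSON mapping Zarr keys either to inlined data or to (url, offset, length) triples; here is a quick peek (a sketch; depending on the spec version the mapping is flat or nested under a 'refs' key):
# Sketch: inspect a few entries of the combined reference file
with open('nwm.json') as f:
    refs = ujson.load(f)
refs = refs.get('refs', refs)  # handle both flat and versioned layouts
print(list(refs)[:5])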
rpath = 's3://esip-qhub-public/noaa/nwm/nwm_forecast.json'
fs2.put_file(lpath='nwm.json', rpath=rpath)
s_opts = {'requester_pays': True, 'skip_instance_cache': True}
r_opts = {'anon': True}
fs = fsspec.filesystem("reference", fo=rpath, ref_storage_args=s_opts,
                       remote_protocol='s3', remote_options=r_opts)
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr")
ds
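A quick check that the concatenation produced the expected time axis (a sketch):
# Sketch: confirm the stitched time coordinate spans the full period
print(ds.time[0].values, ds.time[-1].values, len(ds.time))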
%%time
ds.streamflow[:,1000].hvplot(x='time', grid=True)
cluster.shutdown(); client.close()