Requires the development version of `fsspec_reference_maker`:
pip install --user git+https://github.com/intake/fsspec-reference-maker
import json
import fsspec
from fsspec_reference_maker.grib2 import scan_grib
from fsspec_reference_maker.combine import MultiZarrToZarr
import os
import re
import hvplot.xarray
import datetime as dt
import dask
import ujson
# Anonymous S3 filesystem for listing the public NOAA HRRR bucket;
# skip_instance_cache=True requests a fresh instance instead of a cached one.
fs = fsspec.filesystem('s3', anon=True, skip_instance_cache=True)

# The bucket prefix is built from the current UTC date. Use a timezone-aware
# "now" instead of the deprecated, naive datetime.utcnow(); the formatted
# date string is the same.
today = dt.datetime.now(dt.timezone.utc).strftime('%Y%m%d')

# Every f01 surface file found under today's prefix (one per model cycle in
# the listing).
files = fs.glob(f's3://noaa-hrrr-bdp-pds/hrrr.{today}/conus/*wrfsfcf01.grib2')
if not files:
    # Without this guard the failure would surface later as an opaque
    # IndexError on files[-1]; report the real cause here instead.
    raise FileNotFoundError(
        f'No *wrfsfcf01.grib2 files found under '
        f's3://noaa-hrrr-bdp-pds/hrrr.{today}/conus/'
    )
files
['noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t00z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t01z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t02z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t03z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t04z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t05z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t06z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t07z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t08z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t09z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t10z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t11z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t12z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t13z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t14z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t15z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t16z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t17z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t18z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t19z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t20z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t21z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t22z.wrfsfcf01.grib2']
# Pull the cycle tag (e.g. 't22z') out of the last listed file name.
# Keys look like '<bucket>/hrrr.<date>/conus/hrrr.<cycle>.<product>.grib2',
# so component 3 is the file name and its second dotted field is the cycle.
newest_key = files[-1]
newest_fname = newest_key.split('/')[3]
latest = newest_fname.split('.')[1]
print(latest)
t22z
# List all surface files of the latest cycle.
latest_pattern = f's3://noaa-hrrr-bdp-pds/hrrr.{today}/conus/hrrr.{latest}.wrfsfc*.grib2'
latest_files = fs.glob(latest_pattern)

# Drop the first two matches before appending (presumably f00 and f01; the
# f01 file of this cycle is already in `files`).
files += latest_files[2:]
files
['noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t00z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t01z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t02z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t03z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t04z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t05z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t06z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t07z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t08z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t09z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t10z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t11z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t12z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t13z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t14z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t15z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t16z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t17z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t18z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t19z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t20z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t21z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t22z.wrfsfcf01.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t22z.wrfsfcf02.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t22z.wrfsfcf03.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t22z.wrfsfcf04.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t22z.wrfsfcf05.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t22z.wrfsfcf06.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t22z.wrfsfcf07.grib2', 
'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t22z.wrfsfcf08.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t22z.wrfsfcf09.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t22z.wrfsfcf10.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t22z.wrfsfcf11.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t22z.wrfsfcf12.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t22z.wrfsfcf13.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t22z.wrfsfcf14.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t22z.wrfsfcf15.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t22z.wrfsfcf16.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t22z.wrfsfcf17.grib2', 'noaa-hrrr-bdp-pds/hrrr.20210901/conus/hrrr.t22z.wrfsfcf18.grib2']
The following credential and Dask cluster setup is specific to the ESIP qhub; you will need to modify it to work elsewhere.
import os
import sys
# Make the shared helper library under $HOME/shared/users/lib importable.
shared_lib = os.path.join(os.environ['HOME'], 'shared', 'users', 'lib')
sys.path.append(shared_lib)
import ebdpy as ebd

# NOTE(review): ebdpy is a site-local helper; presumably set_credentials
# configures AWS access for the given profile -- confirm against ebdpy.
ebd.set_credentials(profile='esip-qhub')

profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)

# Options for the Dask cluster: reuse an existing cluster if there is one,
# with adaptive scaling off and worker_max set to 30.
worker_max = 30
cluster_kwargs = dict(
    profile=profile,
    worker_max=worker_max,
    region=region,
    use_existing_cluster=True,
    adaptive_scaling=False,
    wait_for_cluster=False,
    environment='pangeo',
    worker_profile='Pangeo Worker',
    propagate_env=True,
)
client, cluster = ebd.start_dask_cluster(**cluster_kwargs)
/home/conda/store/3d745bdbbc77faf1b06381a24d6e593eeec3375ed3ddf4003aa2fc578a214eab-pangeo/lib/python3.9/site-packages/dask_gateway/client.py:21: FutureWarning: format_bytes is deprecated and will be removed in a future release. Please use dask.utils.format_bytes instead. from distributed.utils import LoopRunner, format_bytes
Existing Dask clusters: Cluster Index c_idx: 0 / Name: dev.121ab652553e4aa0a3ad2c05a8f601c3 ClusterStatus.RUNNING Using existing cluster [0]. Setting Fixed Scaling workers=30 Reconnect client to clear cache client.dashboard_link (for new browser tab/window or dashboard searchbar in Jupyterhub): https://jupyter.qhub.esipfed.org/gateway/clusters/dev.121ab652553e4aa0a3ad2c05a8f601c3/status Propagating environment variables to workers
# Passed as `filter=` to scan_grib: selects typeOfLevel 'heightAboveGround'
# with level 2.
afilter = dict(typeOfLevel='heightAboveGround', level=2)

# Storage options handed to scan_grib for reading the GRIB files.
so = dict(anon=True, default_cache_type='readahead')

# Variable names handed to scan_grib as its "common" variables.
common = [
    'time',
    'step',
    'latitude',
    'longitude',
    'valid_time',
]
def gen_json(u):
    """Scan one GRIB2 URL and write the result as JSON under ``json_dir``.

    ``u`` is expected to look like
    ``s3://<bucket>/hrrr.<YYYYMMDD>/conus/hrrr.<cycle>.<product>.grib2``.
    The output file is ``<json_dir><YYYYMMDD>.<cycle>.<product>.json`` and is
    written through the module-level ``fs2`` filesystem.
    """
    parts = u.split('/')
    # parts[3] is 'hrrr.<YYYYMMDD>'; parts[5] is the GRIB file name.
    date = parts[3].split('.')[1]
    tag = parts[5].split('.')[1:3]
    outfname = f'{json_dir}{date}.{tag[0]}.{tag[1]}.json'

    # NOTE(review): the keyword spelling `inline_threashold` is kept exactly
    # as in the original call -- verify against the installed scan_grib.
    out = scan_grib(u, common, so, inline_threashold=100, filter=afilter)

    with fs2.open(outfname, "w") as f:
        f.write(ujson.dumps(out))
# Destination prefix for the per-file reference JSONs, and an authenticated
# (anon=False) S3 filesystem used to write them.
json_dir = 's3://esip-qhub/noaa/hrrr/jsons/'
fs2 = fsspec.filesystem('s3', anon=False)

# Clear any JSONs left from a previous run. Only a missing prefix is ignored;
# other failures (credentials, permissions, interrupts) are no longer
# silently swallowed, since the writes that follow would fail anyway.
try:
    fs2.rm(json_dir, recursive=True)
except FileNotFoundError:
    pass

# glob() returned keys without a protocol; add it back for scan_grib.
urls = [f's3://{key}' for key in files]
#so = dict(mode='rb', anon=True, default_fill_cache=False, default_cache_type='first')
%%time
# One delayed gen_json task per URL, computed together with retries=10.
tasks = [dask.delayed(gen_json)(u) for u in urls]
_ = dask.compute(*tasks, retries=10)
CPU times: user 295 ms, sys: 106 ms, total: 401 ms Wall time: 3min 44s
## Use `MultiZarrToZarr()` to combine the per-file JSONs into a single reference
flist2 = fs2.ls(json_dir)
# Re-attach the protocol to each listed key and sort the URLs by name.
furls = ['s3://' + key for key in flist2]
furls.sort()

# Show the first and last URL as a quick sanity check of the range covered.
first_url = furls[0]
last_url = furls[-1]
print(first_url)
print(last_url)
s3://esip-qhub/noaa/hrrr/jsons/20210901.t00z.wrfsfcf01.json s3://esip-qhub/noaa/hrrr/jsons/20210901.t22z.wrfsfcf18.json
# mzz = MultiZarrToZarr(furls,
# storage_options={'anon':False},
# remote_protocol='s3',
# remote_options={'anon' : 'True'}, #JSON files
# xarray_open_kwargs={
# 'decode_cf' : False,
# 'mask_and_scale' : False,
# 'decode_times' : False,
# 'use_cftime' : False,
# 'drop_variables': ['reference_time', 'crs'],
# 'decode_coords' : False
# },
# xarray_concat_args={
# # "data_vars": "minimal",
# # "coords": "minimal",
# # "compat": "override",
# "join": "override",
# "combine_attrs": "override",
# "dim": "time"
# }
# )
# Combine the per-file references, concatenating along 'time'.
# NOTE(review): presumably storage_options apply to reading the JSON inputs
# and remote_options to the referenced GRIB data -- confirm against the
# MultiZarrToZarr documentation.
mzz_kwargs = dict(
    storage_options={'anon': False},
    remote_protocol='s3',
    remote_options={'anon': True},
    xarray_concat_args={'dim': 'time'},
)
mzz = MultiZarrToZarr(furls, **mzz_kwargs)
%%time
#%%prun -D multizarr_profile
# Produce the combined reference set as a local file named hrrr_best.json.
combined_json = 'hrrr_best.json'
mzz.translate(combined_json)
CPU times: user 1.45 s, sys: 329 ms, total: 1.78 s Wall time: 10.3 s
# Upload the local combined reference file to the public bucket location.
rpath = 's3://esip-qhub-public/noaa/hrrr/hrrr_best.json'
fs2.put_file('hrrr_best.json', rpath)
import xarray as xr
from fsspec_reference_maker.grib2 import GRIBCodec
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
import pandas as pd
# s_opts: options for fetching the reference JSON itself (ref_storage_args).
# r_opts: options for the remote 's3' filesystem holding the referenced data.
s_opts = {'requester_pays': True, 'skip_instance_cache': True}
r_opts = {'anon': True}

# NOTE: this rebinds `fs`, which earlier held the plain S3 filesystem.
fs = fsspec.filesystem("reference", fo=rpath, ref_storage_args=s_opts,
                       remote_protocol='s3', remote_options=r_opts)
m = fs.get_mapper("")

# The previous run logged a RuntimeWarning that opening with consolidated
# metadata failed. Request non-consolidated metadata explicitly so xarray
# skips that failed attempt and the warning.
ds = xr.open_dataset(m, engine="zarr",
                     backend_kwargs={'consolidated': False})
/tmp/ipykernel_2184/504031357.py:6: RuntimeWarning: Failed to open Zarr store with consolidated metadata, falling back to try reading non-consolidated metadata. This is typically much slower for opening a dataset. To silence this warning, consider: 1. Consolidating metadata in this existing store with zarr.consolidate_metadata(). 2. Explicitly setting consolidated=False, to avoid trying to read consolidate metadata, or 3. Explicitly setting consolidated=True, to raise an error in this case instead of falling back to try reading non-consolidated metadata. ds = xr.open_dataset(m, engine="zarr")
ds
<xarray.Dataset> Dimensions: (y: 1059, x: 1799, time: 40) Coordinates: heightAboveGround float64 1e+03 latitude (y, x) float64 ... longitude (y, x) float64 ... step timedelta64[ns] 01:00:00 * time (time) datetime64[us] 2021-09-01 ... 2021-09-01T22:00:00 valid_time (time) datetime64[ns] 2021-09-01T01:00:00 ... NaT Dimensions without coordinates: y, x Data variables: refd (time, y, x) float32 ... si10 (time, y, x) float32 ... u (time, y, x) float32 ... u10 (time, y, x) float32 ... unknown (time, y, x) float32 ... v (time, y, x) float32 ... v10 (time, y, x) float32 ... Attributes: Conventions: CF-1.7 GRIB_centre: kwbc GRIB_centreDescription: US National Weather Service - NCEP GRIB_edition: 2 GRIB_subCentre: 0 history: 2021-09-01T23:54 GRIB to CDM+CF via cfgrib-0.9.9... institution: US National Weather Service - NCEP
array(1000.)
[1905141 values with dtype=float64]
[1905141 values with dtype=float64]
array(3600000000000, dtype='timedelta64[ns]')
array(['2021-09-01T00:00:00.000000', '2021-09-01T01:00:00.000000', '2021-09-01T02:00:00.000000', '2021-09-01T03:00:00.000000', '2021-09-01T04:00:00.000000', '2021-09-01T05:00:00.000000', '2021-09-01T06:00:00.000000', '2021-09-01T07:00:00.000000', '2021-09-01T08:00:00.000000', '2021-09-01T09:00:00.000000', '2021-09-01T10:00:00.000000', '2021-09-01T11:00:00.000000', '2021-09-01T12:00:00.000000', '2021-09-01T13:00:00.000000', '2021-09-01T14:00:00.000000', '2021-09-01T15:00:00.000000', '2021-09-01T16:00:00.000000', '2021-09-01T17:00:00.000000', '2021-09-01T18:00:00.000000', '2021-09-01T19:00:00.000000', '2021-09-01T20:00:00.000000', '2021-09-01T21:00:00.000000', '2021-09-01T22:00:00.000000', '2021-09-01T22:00:00.000000', '2021-09-01T22:00:00.000000', '2021-09-01T22:00:00.000000', '2021-09-01T22:00:00.000000', '2021-09-01T22:00:00.000000', '2021-09-01T22:00:00.000000', '2021-09-01T22:00:00.000000', '2021-09-01T22:00:00.000000', '2021-09-01T22:00:00.000000', '2021-09-01T22:00:00.000000', '2021-09-01T22:00:00.000000', '2021-09-01T22:00:00.000000', '2021-09-01T22:00:00.000000', '2021-09-01T22:00:00.000000', '2021-09-01T22:00:00.000000', '2021-09-01T22:00:00.000000', '2021-09-01T22:00:00.000000'], dtype='datetime64[us]')
array(['2021-09-01T01:00:00.000000000', '2021-09-01T02:00:00.000000000', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT'], dtype='datetime64[ns]')
[76205640 values with dtype=float32]
[76205640 values with dtype=float32]
[76205640 values with dtype=float32]
[76205640 values with dtype=float32]
[76205640 values with dtype=float32]
[76205640 values with dtype=float32]
[76205640 values with dtype=float32]
# Quadmesh plot of the u10 variable on its 2-D longitude/latitude
# coordinates, rasterized, over OSM tiles.
plot_opts = dict(
    x='longitude',
    y='latitude',
    rasterize=True,
    geo=True,
    tiles='OSM',
    cmap='turbo',
)
ds.u10.hvplot.quadmesh(**plot_opts)