#!/usr/bin/env python
# coding: utf-8

# # HRRR Forecast Collection Time Series
# Read a collection of GRIB2 files on AWS as a single dataset using the Zarr library, via fsspec's ReferenceFileSystem.
#
# This notebook demonstrates how to generate the reference JSON files using [Kerchunk](https://github.com/fsspec/kerchunk).

# In[1]:


import xarray as xr
import datetime as dt
import fsspec
import ujson
from kerchunk.grib2 import scan_grib
from kerchunk.combine import MultiZarrToZarr


# `fsspec` file systems to read GRIB2 forecast files from AWS and to write reference JSONs (to an AWS bucket in this case).

# In[2]:


fs_read = fsspec.filesystem('s3', anon=True, skip_instance_cache=True)
fs_write = fsspec.filesystem('s3', anon=False)
#fs_write = fsspec.filesystem('')  # uncomment this if you intend to write the JSONs to a local folder


# Get the latest forecast.

# In[3]:


days_avail = fs_read.glob('s3://noaa-hrrr-bdp-pds/hrrr.*')
files = fs_read.glob(f's3://{days_avail[-1]}/conus/*wrfsfcf01.grib2')
files = sorted(['s3://' + f for f in files])
files


# `scan_grib` does not require a filter and will happily create a reference for every available GRIB message. However, when combining the messages with `MultiZarrToZarr` it is necessary for them to share a coordinate system. To make our lives easier and ensure that all reference outputs from `scan_grib` share a coordinate system, we pass a filter argument.

# In[4]:


afilter = {'typeOfLevel': 'heightAboveGround', 'level': [2, 10]}


# In[5]:


so = {"anon": True}


# In[6]:


json_dir = 's3://esip-qhub/noaa/hrrr/jsons/'


def make_json_name(file_url, message_number):  # create a unique name for each reference file
    date = file_url.split('/')[3].split('.')[1]
    name = file_url.split('/')[5].split('.')[1:3]
    return f'{json_dir}{date}_{name[0]}_{name[1]}_message{message_number}.json'


def gen_json(file_url):
    out = scan_grib(file_url, storage_options=so, filter=afilter)  # create the references using scan_grib
    for i, message in enumerate(out):  # scan_grib outputs a list containing one reference per grib message
        out_file_name = make_json_name(file_url, i)  # get name
        with fs_write.open(out_file_name, "w") as f:
            f.write(ujson.dumps(message))  # write to file


# In[7]:


fs_write.rm(json_dir)  # clear the json directory of old references


# In[8]:


get_ipython().run_cell_magic('time', '', '# this step is best run via a cluster\nfor f in files:\n    gen_json(f)\n')


# In[9]:


reference_jsons = fs_write.ls(json_dir)  # get the list of reference file names
reference_jsons = sorted(['s3://' + f for f in reference_jsons])  # prepend the s3 protocol (not necessary if writing to a local filesystem)


# In[10]:


# combine the individual references into a single consolidated reference
mzz = MultiZarrToZarr(reference_jsons,
                      concat_dims=['valid_time'],
                      identical_dims=['latitude', 'longitude', 'heightAboveGround', 'step'])


# In[11]:


get_ipython().run_cell_magic('time', '', 'd = mzz.translate()\n')


# In[12]:


# open the dataset as a zarr object using fsspec's reference file system and xarray
fs = fsspec.filesystem("reference", fo=d, remote_protocol='s3', remote_options={'anon': True})
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr", backend_kwargs=dict(consolidated=False),
                     chunks={'valid_time': 1})


# In[13]:


ds


# In[14]:


ds['2t'][-1].plot()  # map of 2 m temperature at the last valid time


# In[15]:


ds['2t'][:, 500, 500].plot()  # time series of 2 m temperature at a single grid cell
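
# The per-file `scan_grib` step above is embarrassingly parallel. The cell below is a minimal local sketch using only the standard library (the Dask cluster suggested above scales better for many files); the thread count of 8 is an arbitrary choice, not part of the original workflow.

# In[ ]:


from concurrent.futures import ThreadPoolExecutor

# run gen_json over all forecast files concurrently; max_workers is an assumed value
with ThreadPoolExecutor(max_workers=8) as pool:
    list(pool.map(gen_json, files))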
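
# The consolidated reference `d` is a plain Python dict, so it can be persisted and reused later without rerunning the kerchunk steps above. The cell below is a sketch; the local path `hrrr_combined.json` is a hypothetical name.

# In[ ]:


# write the combined reference to a local JSON file (path is illustrative)
with open('hrrr_combined.json', 'w') as f:
    f.write(ujson.dumps(d))

# reopen the dataset directly from the saved reference file
fs2 = fsspec.filesystem("reference", fo='hrrr_combined.json',
                        remote_protocol='s3', remote_options={'anon': True})
ds2 = xr.open_dataset(fs2.get_mapper(""), engine="zarr",
                      backend_kwargs=dict(consolidated=False),
                      chunks={'valid_time': 1})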
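
# The index-based selection `ds['2t'][:, 500, 500]` above picks an arbitrary grid cell. HRRR is on a 2-D curvilinear (Lambert conformal) grid, so picking the cell nearest a given latitude/longitude takes a small search over the 2-D coordinates. The cell below is a sketch: the point of interest is a hypothetical example, and it assumes longitude is stored in degrees east (0-360), as is typical for GRIB.

# In[ ]:


import numpy as np

lat_pt, lon_pt = 40.0, 360.0 - 105.27  # example point (near Boulder, CO) in degrees east

# crude nearest-neighbour search over the 2-D latitude/longitude coordinate arrays
dist2 = (ds.latitude.values - lat_pt) ** 2 + (ds.longitude.values - lon_pt) ** 2
jy, ix = np.unravel_index(dist2.argmin(), dist2.shape)
ds['2t'][:, jy, ix].plot()  # time series of 2 m temperature at the nearest grid cell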