#!/usr/bin/env python
# coding: utf-8

# # Accessing the latest HRRR GRIB2 forecast data using Kerchunk
# 
# This notebook demonstrates a method to open all the variables in the latest HRRR forecast as a single xarray-datatree object, using kerchunk to create and edit the necessary reference files.

# In[1]:


import fsspec
import ujson
import kerchunk
from kerchunk.grib2 import scan_grib
from kerchunk.combine import MultiZarrToZarr
import dask.bag as db


# Much of this notebook can be run in parallel, so we start a Dask client.

# In[2]:


from dask.distributed import Client

client = Client()


# First let's create two file systems using `fsspec`: one to read the HRRR data from its S3 bucket and another to save the reference files locally.

# In[3]:


fs_read = fsspec.filesystem('s3', anon=True, skip_instance_cache=True)
fs_write = fsspec.filesystem('')


# Then we generate a list containing the paths to the latest forecast data:

# In[4]:


days_avail = fs_read.glob('s3://noaa-hrrr-bdp-pds/hrrr.*')
runs = fs_read.glob(f's3://{days_avail[-1]}/conus/*wrfsfcf01.grib2')
# select the second-to-last run to ensure it is a complete forecast; note that only the
# 00, 06, 12 and 18z HRRR runs extend to 48 hours, the remainder stop at 18 hours
files = fs_read.glob(f's3://{runs[-2][:-8]}*.grib2')
files = sorted(['s3://' + f for f in files])
files


# Create a folder to save the reference files to.

# In[5]:


json_dir = 'jsons/'
fs_write.mkdir(json_dir)


# We use the two functions below to run `scan_grib` on each HRRR forecast file and save a reference file for each grib message within these files.

# In[6]:


def make_json_name(file_url, message_number):
    # create a unique name for each reference file
    date = file_url.split('/')[3].split('.')[1]
    name = file_url.split('/')[5].split('.')[1:3]
    return f'{json_dir}{date}_{name[0]}_{name[1]}_message{message_number}.json'

def gen_json(file_url):
    # scan_grib returns a list of references, one per grib message in the file
    out = scan_grib(file_url, storage_options={"anon": True})
    for i, message in enumerate(out):
        out_file_name = make_json_name(file_url, i)
        with fs_write.open(out_file_name, "w") as f:
            f.write(ujson.dumps(message))  # write reference to file


# Run the computation. Each file takes about 5-10 minutes to scan, so I recommend starting with just two files to check everything runs smoothly.

# In[7]:


get_ipython().run_cell_magic('time', '', 'bag = db.from_sequence(files[:2])\nbag_map = bag.map(gen_json)\n_ = bag_map.compute()\n')
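# Each reference file is plain JSON containing a `refs` mapping from zarr keys to either inline data or `[url, offset, length]` pointers into the original grib2 file on S3. As an optional sanity check (a minimal sketch, not part of the workflow above), we can peek at the first few entries of one of the files we just wrote:

# In[ ]:


# open the reference for message 0 of the first scanned file and show a few entries
with fs_write.open(sorted(fs_write.glob(f'{json_dir}*message0.json'))[0]) as f:
    ref = ujson.loads(f.read())
list(ref['refs'].items())[:5]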
# There are some variables within the HRRR dataset that will raise the error:
# 
# > MissingDimensionsError: 'x' has more than 1-dimension and the same name as one of its dimensions ('x', 'y'). xarray disallows such variables because they conflict with the coordinates used to label dimensions.
# 
# (see https://github.com/pydata/xarray/issues/2233). To avoid this issue I have created a dictionary mapping the affected grib messages to the variable names that cause it, and use the `preprocess` option of `MultiZarrToZarr` to drop those variables. They are primarily one-dimensional arrays describing the vertical coordinate of their grib message.

# In[8]:


to_drop = {'103.json': 'surface', '106.json': 'isobaricLayer', '107.json': 'surface',
           '108.json': 'surface', '109.json': 'atmosphereSingleLayer',
           '110.json': 'atmosphereSingleLayer', '111.json': 'atmosphereSingleLayer',
           '112.json': 'atmosphere', '113.json': 'atmosphere',
           '114.json': 'boundaryLayerCloudLayer', '115.json': 'lowCloudLayer',
           '116.json': 'middleCloudLayer', '117.json': 'highCloudLayer',
           '118.json': 'atmosphere', '119.json': 'cloudCeiling', '120.json': 'cloudBase',
           '121.json': 'cloudBase', '122.json': 'cloudTop', '123.json': 'cloudTop',
           '124.json': 'nominalTop', '129.json': 'surface', '130.json': 'surface',
           '131.json': 'surface', '132.json': 'nominalTop',
           '138.json': 'heightAboveGroundLayer', '139.json': 'heightAboveGroundLayer',
           '140.json': 'heightAboveGroundLayer', '141.json': 'isothermZero',
           '142.json': 'isothermZero', '143.json': 'isothermZero',
           '144.json': 'highestTroposphericFreezing', '145.json': 'highestTroposphericFreezing',
           '146.json': 'highestTroposphericFreezing', '147.json': 'isothermal',
           '148.json': 'isothermal', '149.json': 'pressureFromGroundLayer',
           '150.json': 'pressureFromGroundLayer', '152.json': 'surface',
           '153.json': 'adiabaticCondensation', '155.json': 'pressureFromGroundLayer',
           '157.json': 'pressureFromGroundLayer', '158.json': 'equilibrium',
           '159.json': 'pressureFromGroundLayer', '162.json': 'surface',
           '163.json': 'heightAboveGroundLayer', '164.json': 'unknown',
           '165.json': 'heightAboveGroundLayer', '166.json': 'atmosphere',
           '167.json': 'surface', '168.json': 'surface'}


# Next we combine the reference files for each grib message along the `valid_time` dimension using `MultiZarrToZarr`.

# In[9]:


fs_write.mkdir(f'{json_dir}combined/')


# In[10]:


def combine(message_number):
    jsons = fs_write.glob(f'{json_dir}*message{message_number}.json')
    file_name = f'{json_dir}combined/{message_number}.json'
    if file_name.split('/')[-1] in to_drop:
        # drop the variables that clash with the x/y dimension names
        preprocess = kerchunk.combine.drop(to_drop[file_name.split('/')[-1]])
        mzz = MultiZarrToZarr(jsons, concat_dims=['valid_time'], preprocess=preprocess)
    else:
        mzz = MultiZarrToZarr(jsons, concat_dims=['valid_time'])
    mzz.translate(filename=file_name)


# And run this combine computation across all 170 grib messages (numbered 0 to 169):

# In[11]:


get_ipython().run_cell_magic('time', '', 'bag = db.from_sequence(range(170))  # one task per grib message (0-169) in an HRRR wrfsfc grib2 file\nbag_map = bag.map(combine)\n_ = bag_map.compute()\n')
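# To verify that the concatenation worked, we can open one of the combined references with `fsspec`'s "reference" filesystem (the same pattern used below to open the final groups) and inspect its `valid_time` axis. A minimal sketch:

# In[ ]:


import xarray as xr

fs_chk = fsspec.filesystem('reference', fo=f'{json_dir}combined/0.json',
                           remote_protocol='s3', remote_options={'anon': True})
ds_chk = xr.open_dataset(fs_chk.get_mapper(''), engine='zarr',
                         backend_kwargs=dict(consolidated=False))
# the forecast hours of the two scanned files should appear as one time axis
print(ds_chk.valid_time.values)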
# The next step is to group the grib messages that can be opened together as a single xarray dataset. I have again created a dictionary that contains the compatible groupings:

# In[12]:


groups = {'atmosphere': [0, 2, 3, 52, 58, 109, 110, 115, 163],
          'cloudTop': [1, 119, 120],
          'surface': [4, 8, 54, 61, 62, 63, 64, 66, 67, 68, 69, 81, 82, 83, 84, 85, 86, 87, 88,
                      89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 104, 105, 122,
                      123, 124, 125, 126, 127, 128, 149, 159, 164, 165],
          'heightAboveGround': [5, 6, 42, 56, 57, 59, 60, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80],
          'isothermal': [7, 43, 144, 145],
          'isobaricInhPa': [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                            26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 41],
          'nominalTop': [129, 166, 167, 168, 169, 121],
          'heightAboveGroundLayer': [130, 131, 132, 133, 134, 135, 136, 137, 157, 160, 162,
                                     44, 45, 46, 47, 48, 49, 50, 51],
          'isothermZero': [138, 139, 140],
          'highestTroposphericFreezing': [141, 142, 143],
          'pressureFromGroundLayer': [146, 147, 148, 151, 152, 153, 154, 156, 37, 38],
          'adiabaticCondensation': [150],
          'equilibrium': [155],
          'depthBelowLand': [65],
          'unknown': [161, 158],
          'isobaricLayer': [103],
          'sigmaLayer': [39],
          'meanSea': [40],
          'atmosphereSingleLayer': [106, 107, 108, 55],
          'boundaryLayerCloudLayer': [111],
          'lowCloudLayer': [112],
          'middleCloudLayer': [113],
          'highCloudLayer': [114],
          'cloudBase': [117, 118],
          'cloudCeiling': [116],
          'sigma': [53]}


# For each group we create the list of combined reference files and use `MultiZarrToZarr` to merge them into a single reference file. We use `zstd` compression when saving to reduce the size of the reference files.

# In[13]:


def make_groups(group):
    jsons = [f'{json_dir}combined/{m}.json' for m in groups[group]]
    if group == 'unknown':
        # the 'unknown' variables also clash with the dimension names, so drop them here
        preprocess = kerchunk.combine.drop('unknown')
        mzz = MultiZarrToZarr(jsons, concat_dims=['valid_time'], preprocess=preprocess)
    else:
        mzz = MultiZarrToZarr(jsons, concat_dims=['valid_time'])
    mzz.translate(filename=f'{group}.json.zst', storage_options=dict(compression='zstd'))


# Run the computation:

# In[14]:


get_ipython().run_cell_magic('time', '', 'bag = db.from_sequence(list(groups))\nbag_map = bag.map(make_groups)\n_ = bag_map.compute()\n')


# # Opening the reference files as an `xarray-datatree`
# 
# A function to open the references we have just created as zarr stores, using fsspec's reference file system and xarray:

# In[15]:


import xarray as xr
from datatree import DataTree
import hvplot.xarray


# In[16]:


def return_ds(d):
    fs_ = fsspec.filesystem("reference", fo=d, remote_protocol='s3',
                            remote_options={'anon': True},
                            ref_storage_args={"compression": "zstd"})
    m = fs_.get_mapper("")
    return xr.open_dataset(m, engine="zarr", backend_kwargs=dict(consolidated=False))


# In[17]:


reference_files = fs_write.glob('*.json.zst')
datasets = {ref.split('/')[-1].split('.')[0]: return_ds(ref) for ref in reference_files}


# In[18]:


dt = DataTree.from_dict(datasets)


# Xarray-datatree provides a lot of functionality, such as performing `.sel()` across all datasets, but here we just use it as a convenient way to view all the variables available from HRRR.

# In[19]:


print(dt)


# In[20]:


ds = dt['surface'].to_dataset()
# wrap longitudes from [0, 360) to [-180, 180) for plotting
ds = ds.assign_coords(longitude=(((ds.longitude + 180) % 360) - 180))


# In[21]:


ds


# In[22]:


sel_time = ds['valid_time'][0]
ds['vis'].sel(valid_time=sel_time).hvplot.quadmesh(x='longitude', y='latitude', geo=True,
                                                   tiles='OSM', rasterize=True, cmap='viridis_r',
                                                   title=str(sel_time.values))
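# Because the HRRR grid is a Lambert conformal projection, `latitude` and `longitude` are two-dimensional coordinates and cannot be passed to `.sel` directly. As a final example, here is a minimal sketch of pulling a forecast time series at the grid point nearest an arbitrary target location; it assumes the datasets keep the usual `('valid_time', 'y', 'x')` dimension names, and the target coordinates are purely illustrative.

# In[ ]:


import numpy as np

target_lat, target_lon = 40.0, -105.3  # hypothetical point of interest
# squared distance in degrees is good enough to find the nearest grid cell
dist2 = (ds.latitude - target_lat) ** 2 + (ds.longitude - target_lon) ** 2
iy, ix = np.unravel_index(dist2.values.argmin(), dist2.shape)
ds['vis'].isel(y=iy, x=ix).hvplot(x='valid_time', title='Visibility at the nearest grid point')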