<![CDATA[#!/usr/bin/env python
# coding: utf-8

# # Efficient Cloud Access to ERA5
# 
# A full archive of ERA5 reanalysis data has been made available as a free-to-access public dataset on [AWS](https://registry.opendata.aws/ecmwf-era5/). To aid efficient cloud access to this data, the provider has reproduced the native NetCDF variables as Zarr stores. While this is convenient for the end user, the processing burden and the cost of duplicating the original data are significant.
# 
# ## Kerchunk
# 
# [Kerchunk](https://github.com/fsspec/kerchunk) provides a middle ground by creating a sidecar file to go along with the native NetCDF dataset. While this requires some preprocessing by the data provider, it does not require duplicating any data. Only a single 200 MB sidecar file is necessary to allow serverless, Zarr-like access to the native NetCDF datasets.

# In[1]:


import fsspec
import ujson
import xarray as xr
import hvplot.xarray
import panel as pn
import cartopy.crs as ccrs
import zstandard


# In[2]:


fs2 = fsspec.filesystem('s3', anon=True)


# In[3]:


get_ipython().run_cell_magic('time', '', 'fs = fsspec.filesystem("reference", fo=\'s3://esip-qhub-public/ecmwf/ERA5_2020_2022_multivar.json.zst\', \n ref_storage_args={"compression": "zstd"},\n remote_protocol=\'s3\', remote_options={\'anon\':True})\nm = fs.get_mapper("")\nds = xr.open_dataset(m, engine="zarr", backend_kwargs={\'consolidated\':False}, chunks={\'time0\':288})\nds\n')


# In[4]:


ds = ds.sel(time0='2021-01-01T00:00:00')


# In[5]:


variables = list(ds.data_vars)
variables.reverse()


# In[6]:


# If desired, this will load all the variables on this particular day into memory to increase the render speed of the map
#ds = ds.load()


# In[7]:


sel = pn.widgets.Select(options=variables, name='Data Variable')
pn.Column(sel, ds.hvplot.image(z=sel, cmap='viridis', coastline=True, projection=ccrs.Orthographic(45, -10),
                               global_extent=True, frame_height=500))


# # How was this sidecar file constructed?
# 
# Below is a step-by-step tutorial to reproduce this for a single variable.

# Kerchunk is a new package and changes frequently; it is best to install it directly from GitHub by cloning the repository or using:

# In[8]:


#!pip install git+https://github.com/fsspec/kerchunk


# In[9]:


from kerchunk.hdf import SingleHdf5ToZarr
from kerchunk.combine import MultiZarrToZarr


# First create a filesystem pointing to the location of the mean sea level air pressure files on the S3 bucket. ERA5 is a public dataset, so anonymous access works fine.

# In[10]:


fs = fsspec.filesystem('s3', anon=True)


# In[11]:


flist = fs.glob('s3://era5-pds/2020/*/data/air_pressure_at_mean_sea_level.nc')[:2]  # only consider two files to speed this up
flist  # check the paths are consistent


# Next create a local filesystem to save the sidecar files

# In[12]:


fs2 = fsspec.filesystem('')


# First it is necessary to create a sidecar file for each individual NetCDF file using kerchunk.hdf.SingleHdf5ToZarr. Later we will combine these into a single file.

# In[13]:


from pathlib import Path
import os


# In[ ]:


os.mkdir('ERA5_jsons')


# In[14]:


json_dir = 'ERA5_jsons/'


# In[15]:


so = dict(mode='rb', anon=True, default_fill_cache=False, default_cache_type='first')


# In[16]:


def gen_json(u):
    with fs.open(u, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        parts = u.split('/')  # split the file path to create a unique name for each JSON
        year = parts[1]
        month = parts[2]
        fstem = Path(u).stem
        outf = f'{json_dir}{year}{month}{fstem}.json'
        print(outf)
        with fs2.open(outf, 'wb') as f:
            f.write(ujson.dumps(h5chunks.translate()).encode())


# Create the individual sidecar files. This step is best run in parallel; a parallel sketch follows the simple serial loop below.

# In[15]:


get_ipython().run_cell_magic('time', '', 'for file in flist:\n gen_json(file)\n')]]>
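<![CDATA[

# The serial loop above is fine for two files, but for the full archive it is worth fanning the work out. Below is a minimal parallel sketch, not part of the original workflow: since `gen_json` is I/O bound, a thread pool is one simple option; a Dask bag or cluster would be a natural choice at larger scale.

# In[ ]:


from concurrent.futures import ThreadPoolExecutor

# Fan gen_json out over a small thread pool; each worker reads one NetCDF file
# from S3 and writes its sidecar JSON locally. max_workers=4 is an arbitrary choice.
with ThreadPoolExecutor(max_workers=4) as pool:
    list(pool.map(gen_json, flist))]]>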
<![CDATA[

# Check that these files have been successfully written by opening a single one of them

# In[17]:


json_list = fs2.glob(str(json_dir)+'*air_pressure_at_mean_sea_level.json')


# Open the first individual sidecar file to confirm the references work, before using kerchunk.combine.MultiZarrToZarr to combine them into a single sidecar file

# In[18]:


get_ipython().run_cell_magic('time', '', 'fs_ = fsspec.filesystem("reference", fo=json_list[0], ref_storage_args={\'skip_instance_cache\':True},\n remote_protocol=\'s3\', remote_options={\'anon\':True})\nm = fs_.get_mapper("")\nds = xr.open_dataset(m, engine="zarr", backend_kwargs={\'consolidated\':False}, chunks={})\nprint(ds)\n')


# All looks good, except that 0 lon has been translated to NaN. This is a result of no fill_value being assigned in the original NetCDF file and Zarr defaulting to 0 as the fill_value. We can use the postprocess method to modify this.

# In[19]:


import zarr

def modify_fill_value(out):
    out_ = zarr.open(out)
    out_.lon.fill_value = -999
    out_.lat.fill_value = -999
    return out

def postprocess(out):
    out = modify_fill_value(out)
    return out


# Now we can use MultiZarrToZarr to combine the individual JSONs along the 'time0' dimension

# In[20]:


mzz = MultiZarrToZarr(json_list,
                      remote_protocol='s3',
                      remote_options={'anon': True},
                      concat_dims=['time0'],  # the dimension along which the individual files will be merged
                      postprocess=postprocess)


# In[21]:


get_ipython().run_cell_magic('time', '', 'd = mzz.translate()\n')


# In[22]:


with fs2.open('combined.json', 'wb') as f:
    f.write(ujson.dumps(d).encode())


# In[23]:


get_ipython().run_cell_magic('time', '', 'fs = fsspec.filesystem("reference", fo=\'combined.json\', ref_storage_args={\'skip_instance_cache\':True},\n remote_protocol=\'s3\', remote_options={\'anon\':True})\nm = fs.get_mapper("")\nds = xr.open_dataset(m, engine="zarr", backend_kwargs={\'consolidated\':False}, chunks={})\nprint(ds)\n')


# In[24]:


get_ipython().run_cell_magic('time', '', "ds = ds.sel(time0 = '2020-01-01T12:00:00')\nds.air_pressure_at_mean_sea_level.hvplot.image(cmap = 'viridis', coastline=True,\n projection=ccrs.Orthographic(45, -10),global_extent=True)\n")]]>
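<![CDATA[

# The sidecar opened at the top of this notebook is stored as zstd-compressed JSON (`ERA5_2020_2022_multivar.json.zst`). As an optional final step, here is a minimal sketch of writing the combined references in that compressed form; the `combined.json.zst` filename is just an example.

# In[ ]:


import zstandard

# Compress the combined reference JSON with zstd to shrink the sidecar on disk.
# The compressed file can then be opened with ref_storage_args={"compression": "zstd"},
# as in the first section of this notebook.
cctx = zstandard.ZstdCompressor()
with open('combined.json.zst', 'wb') as f:
    f.write(cctx.compress(ujson.dumps(d).encode()))]]>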