#!/usr/bin/env python
# coding: utf-8

# # Create combined JSON for each year in parallel
# Read all individual referenceFileSystem JSON files and create a combined JSON for the entire dataset.

# In[1]:

import fsspec
import xarray as xr
import hvplot.xarray
import metpy

# In[2]:

import ujson   # fast json
from kerchunk.combine import MultiZarrToZarr
import kerchunk

# In[3]:

json_dir = 's3://esip-qhub/noaa/nwm/grid1km/json'

# In[4]:

kerchunk.__version__

# For file systems where files are changing, use `skip_instance_cache=True`, or else you won't see the changed files.

# In[5]:

fs_json = fsspec.filesystem('s3', anon=False, skip_instance_cache=True)

# List the per-year combined JSONs

# In[6]:

year_list = fs_json.glob('esip-qhub/noaa/nwm/grid1km/combined_????.json')
year_list = [f's3://{y}' for y in year_list]
year_list

# In[7]:

print(len(year_list))

# #### Create ten-year combined files

# In[8]:

year_dict = {}
year_dict['a'] = year_list[:10]
year_dict['b'] = year_list[10:20]
year_dict['c'] = year_list[20:30]
year_dict['d'] = year_list[30:40]

# In[9]:

def key_combine(key):
    """Combine the reference JSONs in year_dict[key] into a single JSON on S3."""
    combined_json = f's3://esip-qhub/noaa/nwm/grid1km/combined_{key}.json'
    mzz = MultiZarrToZarr(
        year_dict[key],
        remote_protocol='s3',
        remote_options=dict(anon=True),
        concat_dims=['time'],
        identical_dims=['x', 'y', 'crs'],
        preprocess=kerchunk.combine.drop('reference_time'),  # drop the redundant reference_time variable
    )
    d = mzz.translate()
    with fs_json.open(combined_json, 'wb') as f:
        f.write(ujson.dumps(d).encode())

# In[10]:

#year_dict['martin'] = year_list[:13]

# In[11]:

get_ipython().run_cell_magic('time', '', "#key_combine('martin')\n")

# In[12]:

get_ipython().run_cell_magic('time', '', "#key_combine('b')\n")

# In[13]:

get_ipython().run_cell_magic('time', '', "#key_combine('c')\n")

# In[14]:

get_ipython().run_cell_magic('time', '', "#key_combine('d')\n")

# #### Try to create a single combined JSON from the 10-year JSONs
# First list the decadal JSONs

# In[15]:

p_list = fs_json.glob('esip-qhub/noaa/nwm/grid1km/combined_?.json')
p_list = [f's3://{y}' for y in p_list]
p_list

# #### Try combining the four decades
# The combine step below required 90 GB of RAM, which is more than we have here on ESIP Nebari. We ran this step on Denali and passed the 9 GB output JSON file to Martin Durant, who created a Parquet version of the references.

# In[16]:

year_dict['zz'] = p_list

# In[ ]:

get_ipython().run_cell_magic('time', '', "key_combine('zz')\n")
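
# #### Sanity-check: open the combined references (sketch)
# A minimal sketch, not run here, for opening a combined reference JSON with xarray,
# assuming the zarr backend is installed and the NWM source files are readable
# anonymously; `combined_zz.json` follows the naming convention used above, and the
# target/remote `anon` settings mirror the credentials used earlier in this notebook.

# In[ ]:

# Reference filesystem: `target` is the JSON itself, `remote` is the original
# NetCDF/HDF5 data that the references point to.
fs_ref = fsspec.filesystem(
    'reference',
    fo='s3://esip-qhub/noaa/nwm/grid1km/combined_zz.json',
    target_protocol='s3',
    target_options=dict(anon=False),
    remote_protocol='s3',
    remote_options=dict(anon=True),
)

# Open lazily with the zarr engine; chunks={} keeps the data dask-backed.
ds = xr.open_dataset(
    fs_ref.get_mapper(''),
    engine='zarr',
    backend_kwargs=dict(consolidated=False),
    chunks={},
)
ds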