Read all individual ReferenceFileSystem JSON files and create a combined JSON for the entire dataset
import fsspec
import xarray as xr
import hvplot.xarray
import metpy
import ujson # fast json
from kerchunk.combine import MultiZarrToZarr
import kerchunk
json_dir = 's3://esip-qhub/noaa/nwm/grid1km/json'
kerchunk.__version__
'0.0.1+420.gca577c4.dirty'
For file systems where files are changing, you want skip_instance_cache=True,
or else you won't see the changed files.
# Authenticated S3 filesystem for reading/writing the reference JSONs.
# skip_instance_cache=True forces a fresh filesystem instance so recently
# changed files are seen (fsspec otherwise reuses a cached instance).
fs_json = fsspec.filesystem('s3', anon=False, skip_instance_cache=True)
Collect the yearly combined JSONs and group them into batches for the next combine step
# Find every yearly combined reference file and give each a full s3:// URL.
year_list = [
    f's3://{path}'
    for path in fs_json.glob('esip-qhub/noaa/nwm/grid1km/combined_????.json')
]
year_list
['s3://esip-qhub/noaa/nwm/grid1km/combined_1979.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1980.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1981.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1982.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1983.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1984.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1985.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1986.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1987.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1988.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1989.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1990.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1991.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1992.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1993.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1994.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1995.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1996.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1997.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1998.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_1999.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_2001.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_2002.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_2003.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_2004.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_2005.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_2006.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_2007.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_2008.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_2009.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_2010.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_2011.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_2012.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_2013.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_2014.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_2015.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_2016.json', 
's3://esip-qhub/noaa/nwm/grid1km/combined_2017.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_2019.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_2020.json']
# Sanity check: number of yearly JSON files found.
print(len(year_list))
40
# Partition the yearly files into four batches of ten ('a'..'d') so each
# batch can be combined separately.
year_dict = {
    label: year_list[i * 10:(i + 1) * 10]
    for i, label in enumerate('abcd')
}
def key_combine(key):
    """Combine the reference JSONs listed in ``year_dict[key]`` into one
    Kerchunk reference file and write it to S3.

    Parameters
    ----------
    key : str
        Key into the module-level ``year_dict`` selecting the batch of
        reference JSON files to combine; also used to name the output file
        ``combined_{key}.json``.
    """
    combined_json = f's3://esip-qhub/noaa/nwm/grid1km/combined_{key}.json'
    # Concatenate along 'time'; 'x', 'y' and 'crs' are declared identical
    # across the inputs. 'reference_time' is dropped before combining --
    # presumably because it varies between files; TODO confirm.
    mzz = MultiZarrToZarr(year_dict[key],
                          remote_protocol='s3',
                          remote_options=dict(anon=True),
                          concat_dims=['time'],
                          identical_dims=["x", "y", "crs"],
                          preprocess=kerchunk.combine.drop("reference_time"))
    d = mzz.translate()
    # Serialize the combined reference dict and write it back to S3.
    with fs_json.open(combined_json, 'wb') as f:
        f.write(ujson.dumps(d).encode())
#year_dict['martin'] = year_list[:13]
%%time
#key_combine('martin')
CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs Wall time: 5.72 µs
%%time
#key_combine('b')
CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs Wall time: 5.96 µs
%%time
#key_combine('c')
CPU times: user 2 µs, sys: 0 ns, total: 2 µs Wall time: 5.25 µs
%%time
#key_combine('d')
CPU times: user 2 µs, sys: 0 ns, total: 2 µs Wall time: 5.48 µs
First list the decadal JSONs
# Gather the four decadal batch files (combined_a.json .. combined_d.json)
# as full s3:// URLs.
p_list = [
    f's3://{path}'
    for path in fs_json.glob('esip-qhub/noaa/nwm/grid1km/combined_?.json')
]
p_list
['s3://esip-qhub/noaa/nwm/grid1km/combined_a.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_b.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_c.json', 's3://esip-qhub/noaa/nwm/grid1km/combined_d.json']
The combine step below required 90GB of RAM, which is more than we have here on ESIP Nebari. We ran this step on Denali and passed the 9GB output JSON file to Martin Durant, who created a Parquet version of the references.
# Register the four decadal JSONs as one final batch and combine them into
# the whole-dataset reference file (combined_zz.json).
year_dict['zz'] = p_list
%%time
# Memory-heavy: this combine required ~90 GB of RAM.
key_combine('zz')