Explore Kerchunking Harmonie data

Explore zipped test data from: https://stackoverflow.com/questions/70952226/how-to-merge-different-shaped-netcdf4-files

In [1]:
import fsspec
import xarray as xr
from kerchunk.hdf import SingleHdf5ToZarr 
from pathlib import Path
import ujson
In [2]:
fs = fsspec.filesystem('file')
In [3]:
flist = fs.glob('data/test_data_stackoverflow/*.nc')
flist[:5]
Out[3]:
['/shared/users/rsignell/notebooks/testing/data/test_data_stackoverflow/harmonie_knmi_2021_03_20_18_30_18.nc',
 '/shared/users/rsignell/notebooks/testing/data/test_data_stackoverflow/harmonie_knmi_2021_03_20_18_30_19.nc',
 '/shared/users/rsignell/notebooks/testing/data/test_data_stackoverflow/harmonie_knmi_2021_03_20_18_30_20.nc',
 '/shared/users/rsignell/notebooks/testing/data/test_data_stackoverflow/harmonie_knmi_2021_03_20_18_30_21.nc',
 '/shared/users/rsignell/notebooks/testing/data/test_data_stackoverflow/harmonie_knmi_2021_03_20_18_30_22.nc']
In [4]:
flist[0]
Out[4]:
'/shared/users/rsignell/notebooks/testing/data/test_data_stackoverflow/harmonie_knmi_2021_03_20_18_30_18.nc'
In [5]:
for f in flist:
    print(xr.open_dataset(flist[0]).x.values)
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
In [6]:
for f in flist:
    print(xr.open_dataset(flist[0]).dt_calc.values)
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
2021-03-20T18:00:00.000000000
In [7]:
for f in flist:
    print(xr.open_dataset(flist[0]).dt_fore[0].values)
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
1616263200000000000
In [8]:
json_dir = 'jsons'
In [9]:
#generate a json from a netCDF
def gen_json(u):
    #open the file (u) with desiganted parameters (so), call it infile
    with fs.open(u, **so) as infile:
        #inline_threshold: chunks smaller than 300 (I'm guessing mb?) are included in output
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        #Path().stem is from the pathlib library; returns the file name w/o the extension
        fstem = Path(u).stem
        #here we create the output file path
        outf = f'{json_dir}/{fstem}.json'
        #print(outf)
        #wb: write binary file
        with fs.open(outf, 'wb') as f:
            #create json file and write it to the path specified above (f)
            #dumps: dictionary to string
            #translate: translate contents of HDF5 to Zarr
            f.write(ujson.dumps(h5chunks.translate()).encode());
In [10]:
so = dict(mode='rb')
In [11]:
#loop through list of netCDFs and put them through our function for generating jsons
for f in flist:
    gen_json(f)
In [12]:
#create and view a sorted list of all jsons created in the previous step
json_list = fs.glob(f'{json_dir}/*.json')
json_list[:5]
Out[12]:
['/shared/users/rsignell/notebooks/testing/jsons/harmonie_knmi_2021_03_20_18_30_18.json',
 '/shared/users/rsignell/notebooks/testing/jsons/harmonie_knmi_2021_03_20_18_30_19.json',
 '/shared/users/rsignell/notebooks/testing/jsons/harmonie_knmi_2021_03_20_18_30_20.json',
 '/shared/users/rsignell/notebooks/testing/jsons/harmonie_knmi_2021_03_20_18_30_21.json',
 '/shared/users/rsignell/notebooks/testing/jsons/harmonie_knmi_2021_03_20_18_30_22.json']
In [13]:
rpath = json_list[0]
fs3 = fsspec.filesystem("reference", fo=rpath)

m = fs3.get_mapper("")
ds = xr.open_dataset(m, engine="zarr",chunks={}, backend_kwargs={'consolidated':False})

ds
Out[13]:
<xarray.Dataset>
Dimensions:                       (dt_fore: 49, x: 31, y: 54)
Coordinates:
    dt_calc                       datetime64[ns] ...
  * dt_fore                       (dt_fore) float64 1.616e+18 ... 1.616e+18
    lat                           (x, y) float64 dask.array<chunksize=(31, 54), meta=np.ndarray>
    lon                           (x, y) float64 dask.array<chunksize=(31, 54), meta=np.ndarray>
Dimensions without coordinates: x, y
Data variables: (12/27)
    air_temperature_2m            (dt_fore, x, y) float16 dask.array<chunksize=(13, 16, 27), meta=np.ndarray>
    air_temperature_500           (dt_fore, x, y) float16 dask.array<chunksize=(13, 16, 27), meta=np.ndarray>
    air_temperature_850           (dt_fore, x, y) float16 dask.array<chunksize=(13, 16, 27), meta=np.ndarray>
    dewpoint_2m                   (dt_fore, x, y) float16 dask.array<chunksize=(13, 16, 27), meta=np.ndarray>
    geopotential_height_500       (dt_fore, x, y) float64 dask.array<chunksize=(13, 8, 14), meta=np.ndarray>
    geopotential_height_850       (dt_fore, x, y) float64 dask.array<chunksize=(13, 8, 14), meta=np.ndarray>
    ...                            ...
    wind_direction_50             (dt_fore, x, y) float64 dask.array<chunksize=(13, 8, 14), meta=np.ndarray>
    wind_speed_10                 (dt_fore, x, y) float64 dask.array<chunksize=(13, 8, 14), meta=np.ndarray>
    wind_speed_100                (dt_fore, x, y) float64 dask.array<chunksize=(13, 8, 14), meta=np.ndarray>
    wind_speed_200                (dt_fore, x, y) float64 dask.array<chunksize=(13, 8, 14), meta=np.ndarray>
    wind_speed_300                (dt_fore, x, y) float64 dask.array<chunksize=(13, 8, 14), meta=np.ndarray>
    wind_speed_50                 (dt_fore, x, y) float64 dask.array<chunksize=(13, 8, 14), meta=np.ndarray>