import fsspec
import xarray as xr
import hvplot.xarray
This fsspec referenceFileSystem JSON file below was created with the kerchunk package, and can be updated by running this script. See this blog post for more about using Kerchunk to create these JSON objects from collections of NetCDF4 and GRIB2 files.
reference_file = 's3://esip-qhub-public/LiveOcean/LiveOcean_reference_20220912.json'
fs1 = fsspec.filesystem('s3', anon=True)
fs1.ls('s3://esip-qhub-public/LiveOcean')
['esip-qhub-public/LiveOcean/LiveOcean_reference.json', 'esip-qhub-public/LiveOcean/LiveOcean_reference_20220912.json', 'esip-qhub-public/LiveOcean/etags.pickle', 'esip-qhub-public/LiveOcean/individual']
When creating the fsspec referenceFileSystem object, we need to include access options for both the JSON references and the data files:
topts = dict(anon=True, skip_instance_cache=True) # json reference
ropts = dict(account_name='pm2', skip_instance_cache=True) # netcdf data files
fs = fsspec.filesystem("reference", fo=reference_file, target_options=topts,
remote_protocol='abfs', remote_options=ropts)
This virtual filesystem object looks just like a Zarr dataset:
fs.ls('')
[{'name': '.zgroup', 'type': 'file', 'size': 17}, {'name': 'ocean_time', 'type': 'directory', 'size': 0}, {'name': '.zattrs', 'type': 'file', 'size': 3980}, {'name': 'AKs', 'type': 'directory', 'size': 0}, {'name': 'AKv', 'type': 'directory', 'size': 0}, {'name': 'Akk_bak', 'type': 'directory', 'size': 0}, {'name': 'Akp_bak', 'type': 'directory', 'size': 0}, {'name': 'Akt_bak', 'type': 'directory', 'size': 0}, {'name': 'Akv_bak', 'type': 'directory', 'size': 0}, {'name': 'AttP', 'type': 'directory', 'size': 0}, {'name': 'AttS', 'type': 'directory', 'size': 0}, {'name': 'AttSW', 'type': 'directory', 'size': 0}, {'name': 'BioIter', 'type': 'directory', 'size': 0}, {'name': 'CaCO3', 'type': 'directory', 'size': 0}, {'name': 'Charnok_alpha', 'type': 'directory', 'size': 0}, {'name': 'CoagR', 'type': 'directory', 'size': 0}, {'name': 'CrgBan_cw', 'type': 'directory', 'size': 0}, {'name': 'Cs_r', 'type': 'directory', 'size': 0}, {'name': 'Cs_w', 'type': 'directory', 'size': 0}, {'name': 'FSobc_in', 'type': 'directory', 'size': 0}, {'name': 'FSobc_out', 'type': 'directory', 'size': 0}, {'name': 'Falpha', 'type': 'directory', 'size': 0}, {'name': 'Fbeta', 'type': 'directory', 'size': 0}, {'name': 'Fgamma', 'type': 'directory', 'size': 0}, {'name': 'LdetWsink', 'type': 'directory', 'size': 0}, {'name': 'Ldetritus', 'type': 'directory', 'size': 0}, {'name': 'Lm2CLM', 'type': 'directory', 'size': 0}, {'name': 'Lm3CLM', 'type': 'directory', 'size': 0}, {'name': 'LnudgeM2CLM', 'type': 'directory', 'size': 0}, {'name': 'LnudgeM3CLM', 'type': 'directory', 'size': 0}, {'name': 'LnudgeTCLM', 'type': 'directory', 'size': 0}, {'name': 'LsshCLM', 'type': 'directory', 'size': 0}, {'name': 'LtracerCLM', 'type': 'directory', 'size': 0}, {'name': 'LtracerSponge', 'type': 'directory', 'size': 0}, {'name': 'LtracerSrc', 'type': 'directory', 'size': 0}, {'name': 'LuvSrc', 'type': 'directory', 'size': 0}, {'name': 'LwSrc', 'type': 'directory', 'size': 0}, {'name': 'M2nudg', 'type': 'directory', 'size': 0}, {'name': 'M2obc_in', 'type': 'directory', 'size': 0}, {'name': 'M2obc_out', 'type': 'directory', 'size': 0}, {'name': 'M3nudg', 'type': 'directory', 'size': 0}, {'name': 'M3obc_in', 'type': 'directory', 'size': 0}, {'name': 'M3obc_out', 'type': 'directory', 'size': 0}, {'name': 'NO3', 'type': 'directory', 'size': 0}, {'name': 'PARfrac', 'type': 'directory', 'size': 0}, {'name': 'Pair', 'type': 'directory', 'size': 0}, {'name': 'TIC', 'type': 'directory', 'size': 0}, {'name': 'Tcline', 'type': 'directory', 'size': 0}, {'name': 'Tnudg', 'type': 'directory', 'size': 0}, {'name': 'Tobc_in', 'type': 'directory', 'size': 0}, {'name': 'Tobc_out', 'type': 'directory', 'size': 0}, {'name': 'Uwind', 'type': 'directory', 'size': 0}, {'name': 'Vstretching', 'type': 'directory', 'size': 0}, {'name': 'Vtransform', 'type': 'directory', 'size': 0}, {'name': 'Vwind', 'type': 'directory', 'size': 0}, {'name': 'Znudg', 'type': 'directory', 'size': 0}, {'name': 'Zob', 'type': 'directory', 'size': 0}, {'name': 'Zos', 'type': 'directory', 'size': 0}, {'name': 'Zos_hsig_alpha', 'type': 'directory', 'size': 0}, {'name': 'alkalinity', 'type': 'directory', 'size': 0}, {'name': 'bustr', 'type': 'directory', 'size': 0}, {'name': 'bvstr', 'type': 'directory', 'size': 0}, {'name': 'deflate', 'type': 'directory', 'size': 0}, {'name': 'deflate_level', 'type': 'directory', 'size': 0}, {'name': 'detRemin', 'type': 'directory', 'size': 0}, {'name': 'detWsink', 'type': 'directory', 'size': 0}, {'name': 'detritus', 'type': 'directory', 'size': 0}, {'name': 'dstart', 'type': 'directory', 'size': 0}, {'name': 'dt', 'type': 'directory', 'size': 0}, {'name': 'dtfast', 'type': 'directory', 'size': 0}, {'name': 'el', 'type': 'directory', 'size': 0}, {'name': 'f', 'type': 'directory', 'size': 0}, {'name': 'gamma2', 'type': 'directory', 'size': 0}, {'name': 'gls_Kmin', 'type': 'directory', 'size': 0}, {'name': 'gls_Pmin', 'type': 'directory', 'size': 0}, {'name': 'gls_c1', 'type': 'directory', 'size': 0}, {'name': 'gls_c2', 'type': 'directory', 'size': 0}, {'name': 'gls_c3m', 'type': 'directory', 'size': 0}, {'name': 'gls_c3p', 'type': 'directory', 'size': 0}, {'name': 'gls_cmu0', 'type': 'directory', 'size': 0}, {'name': 'gls_m', 'type': 'directory', 'size': 0}, {'name': 'gls_n', 'type': 'directory', 'size': 0}, {'name': 'gls_p', 'type': 'directory', 'size': 0}, {'name': 'gls_sigk', 'type': 'directory', 'size': 0}, {'name': 'gls_sigp', 'type': 'directory', 'size': 0}, {'name': 'grid', 'type': 'directory', 'size': 0}, {'name': 'h', 'type': 'directory', 'size': 0}, {'name': 'hc', 'type': 'directory', 'size': 0}, {'name': 'lat_psi', 'type': 'directory', 'size': 0}, {'name': 'lat_rho', 'type': 'directory', 'size': 0}, {'name': 'lat_u', 'type': 'directory', 'size': 0}, {'name': 'lat_v', 'type': 'directory', 'size': 0}, {'name': 'latent', 'type': 'directory', 'size': 0}, {'name': 'lon_psi', 'type': 'directory', 'size': 0}, {'name': 'lon_rho', 'type': 'directory', 'size': 0}, {'name': 'lon_u', 'type': 'directory', 'size': 0}, {'name': 'lon_v', 'type': 'directory', 'size': 0}, {'name': 'lwrad', 'type': 'directory', 'size': 0}, {'name': 'mask_psi', 'type': 'directory', 'size': 0}, {'name': 'mask_rho', 'type': 'directory', 'size': 0}, {'name': 'mask_u', 'type': 'directory', 'size': 0}, {'name': 'mask_v', 'type': 'directory', 'size': 0}, {'name': 'nHIS', 'type': 'directory', 'size': 0}, {'name': 'nRST', 'type': 'directory', 'size': 0}, {'name': 'ndefHIS', 'type': 'directory', 'size': 0}, {'name': 'ndtfast', 'type': 'directory', 'size': 0}, {'name': 'nl_tnu2', 'type': 'directory', 'size': 0}, {'name': 'ntimes', 'type': 'directory', 'size': 0}, {'name': 'oxygen', 'type': 'directory', 'size': 0}, {'name': 'pCO2air', 'type': 'directory', 'size': 0}, {'name': 'phyAlpha', 'type': 'directory', 'size': 0}, {'name': 'phyKs', 'type': 'directory', 'size': 0}, {'name': 'phyM', 'type': 'directory', 'size': 0}, {'name': 'phyMin', 'type': 'directory', 'size': 0}, {'name': 'phyMu0', 'type': 'directory', 'size': 0}, {'name': 'phytoplankton', 'type': 'directory', 'size': 0}, {'name': 'pm', 'type': 'directory', 'size': 0}, {'name': 'pn', 'type': 'directory', 'size': 0}, {'name': 'rdrg', 'type': 'directory', 'size': 0}, {'name': 'rdrg2', 'type': 'directory', 'size': 0}, {'name': 'rho', 'type': 'directory', 'size': 0}, {'name': 'rho0', 'type': 'directory', 'size': 0}, {'name': 's_rho', 'type': 'directory', 'size': 0}, {'name': 's_w', 'type': 'directory', 'size': 0}, {'name': 'salt', 'type': 'directory', 'size': 0}, {'name': 'sensible', 'type': 'directory', 'size': 0}, {'name': 'shflux', 'type': 'directory', 'size': 0}, {'name': 'shuffle', 'type': 'directory', 'size': 0}, {'name': 'spherical', 'type': 'directory', 'size': 0}, {'name': 'ssflux', 'type': 'directory', 'size': 0}, {'name': 'sustr', 'type': 'directory', 'size': 0}, {'name': 'svstr', 'type': 'directory', 'size': 0}, {'name': 'swrad', 'type': 'directory', 'size': 0}, {'name': 'sz_alpha', 'type': 'directory', 'size': 0}, {'name': 'temp', 'type': 'directory', 'size': 0}, {'name': 'theta_b', 'type': 'directory', 'size': 0}, {'name': 'theta_s', 'type': 'directory', 'size': 0}, {'name': 'u', 'type': 'directory', 'size': 0}, {'name': 'ubar', 'type': 'directory', 'size': 0}, {'name': 'v', 'type': 'directory', 'size': 0}, {'name': 'vbar', 'type': 'directory', 'size': 0}, {'name': 'w', 'type': 'directory', 'size': 0}, {'name': 'xl', 'type': 'directory', 'size': 0}, {'name': 'zeta', 'type': 'directory', 'size': 0}, {'name': 'zooEps', 'type': 'directory', 'size': 0}, {'name': 'zooFegest', 'type': 'directory', 'size': 0}, {'name': 'zooI0', 'type': 'directory', 'size': 0}, {'name': 'zooKs', 'type': 'directory', 'size': 0}, {'name': 'zooMin', 'type': 'directory', 'size': 0}, {'name': 'zooZeta', 'type': 'directory', 'size': 0}, {'name': 'zooplankton', 'type': 'directory', 'size': 0}]
Since it looks like a Zarr dataset, we can create a mapper and open the data with the Zarr library:
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr", consolidated=False, chunks={})
ds
<xarray.Dataset> Dimensions: (ocean_time: 1368, s_w: 31, eta_rho: 1302, xi_rho: 663, tracer: 11, s_rho: 30, boundary: 4, eta_u: 1302, xi_u: 662, eta_v: 1301, xi_v: 663, eta_psi: 1301, xi_psi: 662) Coordinates: lat_psi (eta_psi, xi_psi) float64 dask.array<chunksize=(651, 331), meta=np.ndarray> lat_rho (eta_rho, xi_rho) float64 dask.array<chunksize=(651, 332), meta=np.ndarray> lat_u (eta_u, xi_u) float64 dask.array<chunksize=(651, 331), meta=np.ndarray> lat_v (eta_v, xi_v) float64 dask.array<chunksize=(651, 332), meta=np.ndarray> lon_psi (eta_psi, xi_psi) float64 dask.array<chunksize=(651, 331), meta=np.ndarray> lon_rho (eta_rho, xi_rho) float64 dask.array<chunksize=(651, 332), meta=np.ndarray> lon_u (eta_u, xi_u) float64 dask.array<chunksize=(651, 331), meta=np.ndarray> lon_v (eta_v, xi_v) float64 dask.array<chunksize=(651, 332), meta=np.ndarray> * ocean_time (ocean_time) datetime64[ns] 2022-07-18T01:00:00 ... 2022-... * s_rho (s_rho) float64 -0.9833 -0.95 -0.9167 ... -0.05 -0.01667 * s_w (s_w) float64 -1.0 -0.9667 -0.9333 ... -0.06667 -0.03333 0.0 Dimensions without coordinates: eta_rho, xi_rho, tracer, boundary, eta_u, xi_u, eta_v, xi_v, eta_psi, xi_psi Data variables: (12/138) AKs (ocean_time, s_w, eta_rho, xi_rho) float32 dask.array<chunksize=(1, 8, 434, 221), meta=np.ndarray> AKv (ocean_time, s_w, eta_rho, xi_rho) float32 dask.array<chunksize=(1, 8, 434, 221), meta=np.ndarray> Akk_bak (ocean_time) float64 dask.array<chunksize=(1,), meta=np.ndarray> Akp_bak (ocean_time) float64 dask.array<chunksize=(1,), meta=np.ndarray> Akt_bak (ocean_time, tracer) float64 dask.array<chunksize=(1, 11), meta=np.ndarray> Akv_bak (ocean_time) float64 dask.array<chunksize=(1,), meta=np.ndarray> ... ... zooFegest (ocean_time) float64 dask.array<chunksize=(1,), meta=np.ndarray> zooI0 (ocean_time) float64 dask.array<chunksize=(1,), meta=np.ndarray> zooKs (ocean_time) float64 dask.array<chunksize=(1,), meta=np.ndarray> zooMin (ocean_time) float64 dask.array<chunksize=(1,), meta=np.ndarray> zooZeta (ocean_time) float64 dask.array<chunksize=(1,), meta=np.ndarray> zooplankton (ocean_time, s_rho, eta_rho, xi_rho) float32 dask.array<chunksize=(1, 8, 434, 221), meta=np.ndarray> Attributes: (12/40) CPP_options: U0KB, ADD_FSOBC, ADD_M2OBC, ANA_BPFLUX, ANA_BSFLUX, AN... Conventions: CF-1.4, SGRID-0.3 NLM_LBC: \nEDGE: WEST SOUTH EAST NORTH \nzeta: ... ana_file: ROMS/Functionals/ana_btflux.h, ROMS/Functionals/ana_st... bio_file: ROMS/Nonlinear/Biology/npzd2o_banas.h bpar_file: /gscratch/macc/parker/LO_roms/cas6_v0_u0kb/f2022.07.18... ... ... svn_rev: 824M svn_url: https://www.myroms.org/svn/src/trunk tiling: 020x020 title: First LiveOcean input file type: ROMS/TOMS history file var_info: /gscratch/macc/parker/LiveOcean_roms/LO_ROMS/ROMS/Exte...
|
|
|
|
|
|
|
|
array(['2022-07-18T01:00:00.000000000', '2022-07-18T02:00:00.000000000', '2022-07-18T03:00:00.000000000', ..., '2022-09-12T22:00:00.000000000', '2022-09-12T23:00:00.000000000', '2022-09-13T00:00:00.000000000'], dtype='datetime64[ns]')
array([-0.983333, -0.95 , -0.916667, -0.883333, -0.85 , -0.816667, -0.783333, -0.75 , -0.716667, -0.683333, -0.65 , -0.616667, -0.583333, -0.55 , -0.516667, -0.483333, -0.45 , -0.416667, -0.383333, -0.35 , -0.316667, -0.283333, -0.25 , -0.216667, -0.183333, -0.15 , -0.116667, -0.083333, -0.05 , -0.016667])
array([-1. , -0.966667, -0.933333, -0.9 , -0.866667, -0.833333, -0.8 , -0.766667, -0.733333, -0.7 , -0.666667, -0.633333, -0.6 , -0.566667, -0.533333, -0.5 , -0.466667, -0.433333, -0.4 , -0.366667, -0.333333, -0.3 , -0.266667, -0.233333, -0.2 , -0.166667, -0.133333, -0.1 , -0.066667, -0.033333, 0. ])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
If we examine a variable, we can see how the data in the dataset is chunked. Since we are just accessing the original NetCDF4 files, we are stuck with whatever chunking was used when the files were created:
ds.temp
<xarray.DataArray 'temp' (ocean_time: 1368, s_rho: 30, eta_rho: 1302, xi_rho: 663)> dask.array<open_dataset-33e277dd49a5b8dac7f5127d003f933etemp, shape=(1368, 30, 1302, 663), dtype=float32, chunksize=(1, 8, 434, 221), chunktype=numpy.ndarray> Coordinates: lat_rho (eta_rho, xi_rho) float64 dask.array<chunksize=(651, 332), meta=np.ndarray> lon_rho (eta_rho, xi_rho) float64 dask.array<chunksize=(651, 332), meta=np.ndarray> * ocean_time (ocean_time) datetime64[ns] 2022-07-18T01:00:00 ... 2022-09-13 * s_rho (s_rho) float64 -0.9833 -0.95 -0.9167 ... -0.05 -0.01667 Dimensions without coordinates: eta_rho, xi_rho Attributes: field: temperature, scalar, series grid: grid location: face long_name: potential temperature time: ocean_time units: Celsius
|
|
|
array(['2022-07-18T01:00:00.000000000', '2022-07-18T02:00:00.000000000', '2022-07-18T03:00:00.000000000', ..., '2022-09-12T22:00:00.000000000', '2022-09-12T23:00:00.000000000', '2022-09-13T00:00:00.000000000'], dtype='datetime64[ns]')
array([-0.983333, -0.95 , -0.916667, -0.883333, -0.85 , -0.816667, -0.783333, -0.75 , -0.716667, -0.683333, -0.65 , -0.616667, -0.583333, -0.55 , -0.516667, -0.483333, -0.45 , -0.416667, -0.383333, -0.35 , -0.316667, -0.283333, -0.25 , -0.216667, -0.183333, -0.15 , -0.116667, -0.083333, -0.05 , -0.016667])
Accessing data at a specific time and level will be relatively fast, as only nine chunks of data need to be loaded:
da = ds.temp.sel(ocean_time='2022-09-12 00:00').isel(s_rho=-1)
da
<xarray.DataArray 'temp' (eta_rho: 1302, xi_rho: 663)> dask.array<getitem, shape=(1302, 663), dtype=float32, chunksize=(434, 221), chunktype=numpy.ndarray> Coordinates: lat_rho (eta_rho, xi_rho) float64 dask.array<chunksize=(651, 332), meta=np.ndarray> lon_rho (eta_rho, xi_rho) float64 dask.array<chunksize=(651, 332), meta=np.ndarray> ocean_time datetime64[ns] 2022-09-12 s_rho float64 -0.01667 Dimensions without coordinates: eta_rho, xi_rho Attributes: field: temperature, scalar, series grid: grid location: face long_name: potential temperature time: ocean_time units: Celsius
|
|
|
array('2022-09-12T00:00:00.000000000', dtype='datetime64[ns]')
array(-0.01666667)
Data is only loaded when needed, here when we actually make a plot:
%%time
da = da.load()
da.unify_chunks().hvplot.quadmesh(x='lon_rho', y='lat_rho', geo=True,
frame_width=500, cmap='turbo', tiles='OSM',
rasterize=True, widget_location='bottom')
CPU times: user 3.25 s, sys: 253 ms, total: 3.5 s Wall time: 4.25 s
Loading a time series at a specific grid cell requires reading many more chunks of data (1344 in this example!). The NetCDF4 files are on Azure in the US Western Region, and access will be faster with a faster network between the data and where this notebook is being run.
Xarray uses Dask to parallelize data reads and other operations, so access will be faster with more Dask workers. The machine we are using here has 8 cpus. Larger clusters with Dask Distributed would speed things up even more.
da = ds.temp.sel(s_rho = 0, method = 'nearest')[:,100,50]
da
<xarray.DataArray 'temp' (ocean_time: 1368)> dask.array<getitem, shape=(1368,), dtype=float32, chunksize=(1,), chunktype=numpy.ndarray> Coordinates: lat_rho float64 43.24 lon_rho float64 -128.3 * ocean_time (ocean_time) datetime64[ns] 2022-07-18T01:00:00 ... 2022-09-13 s_rho float64 -0.01667 Attributes: field: temperature, scalar, series grid: grid location: face long_name: potential temperature time: ocean_time units: Celsius
|
array(43.24209959)
array(-128.30720085)
array(['2022-07-18T01:00:00.000000000', '2022-07-18T02:00:00.000000000', '2022-07-18T03:00:00.000000000', ..., '2022-09-12T22:00:00.000000000', '2022-09-12T23:00:00.000000000', '2022-09-13T00:00:00.000000000'], dtype='datetime64[ns]')
array(-0.01666667)
%%time
# 8 workers running in AWS us-west-2 region
da.hvplot(x = 'ocean_time', grid=True)
CPU times: user 20.1 s, sys: 1.59 s, total: 21.6 s Wall time: 19.9 s