#!/usr/bin/env python
# coding: utf-8

# ## Explore the IOOS LiveOcean Forecast Collection
#
# Access a collection of [LiveOcean](https://faculty.washington.edu/pmacc/LO/LiveOcean.html) Forecast NetCDF4 files on Azure as a single virtual Zarr dataset.
#
# This work was done during my Google Summer of Code project with the [US-IOOS](https://ioos.noaa.gov/) Organization.

# In[1]:

import fsspec
import xarray as xr
import hvplot.xarray

# The fsspec ReferenceFileSystem JSON file below was created with the [kerchunk](https://fsspec.github.io/kerchunk/) package and can be updated by running [this script](https://nbviewer.org/gist/345b97485752f9d3f5d435fcae1a1be5). See [this blog post](https://medium.com/pangeo/accessing-netcdf-and-grib-file-collections-as-cloud-native-virtual-datasets-using-kerchunk-625a2d0a9191) for more about using Kerchunk to create these JSON objects from collections of NetCDF4 and GRIB2 files. A rough sketch of that workflow is included at the end of this notebook.

# In[2]:

reference_file = 's3://esip-qhub-public/LiveOcean/LiveOcean_reference_20220912.json'

# In[3]:

fs1 = fsspec.filesystem('s3', anon=True)
fs1.ls('s3://esip-qhub-public/LiveOcean')

# When creating the fsspec ReferenceFileSystem object, we need to include access options for both the JSON references and the NetCDF4 data files:

# In[4]:

topts = dict(anon=True, skip_instance_cache=True)           # JSON reference file (on S3)
ropts = dict(account_name='pm2', skip_instance_cache=True)  # NetCDF4 data files (on Azure)

# In[5]:

fs = fsspec.filesystem("reference", fo=reference_file, target_options=topts,
                       remote_protocol='abfs', remote_options=ropts)

# This virtual filesystem object looks just like a Zarr store:

# In[6]:

fs.ls('')

# Since it looks like a Zarr store, we can create a mapper and open the data with Xarray's Zarr engine:

# In[7]:

m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr", consolidated=False, chunks={})
ds

# If we examine a variable, we can see how the data in the dataset is chunked. Since we are just accessing the original NetCDF4 files, we are stuck with whatever chunking was used when the files were created:

# In[8]:

ds.temp

# Accessing data at a specific time and vertical level is relatively fast, as only nine chunks of data need to be loaded:

# In[9]:

da = ds.temp.sel(ocean_time='2022-09-12 00:00').isel(s_rho=-1)
da

# Data is loaded only when it is needed, in this case when we actually make a plot:

# In[10]:

get_ipython().run_cell_magic('time', '', "da = da.load()\nda.unify_chunks().hvplot.quadmesh(x='lon_rho', y='lat_rho', geo=True,\n                                  frame_width=500, cmap='turbo', tiles='OSM',\n                                  rasterize=True, widget_location='bottom')\n")

# Loading a time series at a specific grid cell requires reading many more chunks of data (1344 in this example!). The NetCDF4 files are on Azure in the US Western Region, so access is faster the better the network connection between the data and the machine running this notebook.
#
# Xarray uses Dask to parallelize data reads and other operations, so access is also faster with more Dask workers. The machine used here has 8 CPUs. Larger clusters with [Dask Distributed](https://distributed.dask.org/en/stable/) would speed things up even more; a sketch of starting such a cluster follows at the end of this notebook.

# In[11]:

da = ds.temp.sel(s_rho=0, method='nearest')[:, 100, 50]
da

# In[12]:

get_ipython().run_cell_magic('time', '', "# 8 workers running in AWS us-west-2 region\nda.hvplot(x='ocean_time', grid=True)\n")
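# As noted above, more Dask workers speed up the time-series read. This is a minimal sketch, not part of the original workflow, of starting a local Dask cluster before loading data; the worker count is an assumption based on the 8-CPU machine described earlier, and a larger [Dask Distributed](https://distributed.dask.org/en/stable/) cluster could be substituted for bigger jobs.

# In[13]:

from dask.distributed import Client, LocalCluster

# Assumption: 8 workers to match the 8 CPUs mentioned above; adjust to
# whatever machine or cluster is actually available.
cluster = LocalCluster(n_workers=8, threads_per_worker=1)
client = Client(cluster)
client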
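# For reference, this is a rough sketch of how a combined reference JSON like the one used above can be built with kerchunk. The linked gist is the authoritative script; the Azure glob pattern and output filename below are hypothetical placeholders, not the real layout.

# In[14]:

import json

from kerchunk.hdf import SingleHdf5ToZarr
from kerchunk.combine import MultiZarrToZarr

# List the daily LiveOcean NetCDF4 files on Azure
# (the container/path pattern is a placeholder).
ropts = dict(account_name='pm2')
fs_data = fsspec.filesystem('abfs', **ropts)
flist = sorted(fs_data.glob('liveocean/f2022.09.12/*.nc'))

# Extract byte-range references from each NetCDF4 (HDF5) file
singles = []
for f in flist:
    url = f'abfs://{f}'
    with fs_data.open(f, 'rb') as infile:
        singles.append(SingleHdf5ToZarr(infile, url, inline_threshold=300).translate())

# Combine the per-file references along the time dimension
mzz = MultiZarrToZarr(singles, remote_protocol='abfs', remote_options=ropts,
                      concat_dims=['ocean_time'])
combined = mzz.translate()

# Write the combined reference JSON (output name is a placeholder)
with open('LiveOcean_reference.json', 'w') as outfile:
    json.dump(combined, outfile)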