#!/usr/bin/env python
# coding: utf-8

# # Lazy loading a big (25 GB) NetCDF4 file on S3 with Xarray
#
# Just as when opening from a local filesystem, when opening from S3 xarray loads:
# * metadata (including coordinate variables) eagerly
# * data variables lazily
#
# We demonstrate this here by opening a 25 GB NetCDF file on S3 in a matter of seconds, with memory usage staying below 1 GB.

# In[1]:


import fsspec
import xarray as xr


# ## Inspect the big NetCDF4 file on S3 using fsspec

# In[2]:


# Anonymous (unauthenticated) access to an S3-compatible endpoint
fs = fsspec.filesystem('s3', anon=True, client_kwargs=dict(endpoint_url='https://ncsa.osn.xsede.org'))


# In[3]:


ncfile_on_s3 = 's3://esip/examples/adcirc/adcirc_01.nc'


# In[4]:


fs.size(ncfile_on_s3)/1e9  # file size in GB


# ## Open the file with Xarray, loading only the metadata and coordinate vars

# In[5]:


# %%time cell magic, preserved by the notebook-to-script export
get_ipython().run_cell_magic('time', '', "ds = xr.open_dataset(fs.open(ncfile_on_s3), chunks={'time':10, 'node':141973})\n")


# In[6]:


ds.zeta


# ## Compute the mean of the first 30 time steps (about 2 GB of data)
#
# Data is loaded lazily in chunks and processed in parallel by Dask on the available cores.

# In[7]:


da = ds.zeta[:30, :]
da


# In[8]:


get_ipython().run_cell_magic('time', '', "da.mean(dim='time').compute()\n")
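

# ## Aside: confirm that the data variable is still lazy
#
# A minimal sketch, not part of the original walk-through, assuming the `ds`
# opened above: because `chunks=` was passed to `open_dataset`, `ds.zeta` wraps
# a Dask array, so only metadata and coordinates sit in memory at this point
# and no data chunks have been read from S3 yet.

# In[9]:


import dask.array as dsa

print(type(ds.zeta.data))                   # dask.array.core.Array, not numpy.ndarray
print(isinstance(ds.zeta.data, dsa.Array))  # True while the variable is still lazy
print(ds.zeta.data.chunksize)               # (10, 141973), matching the chunks= argument
print(f"{ds.zeta.nbytes / 1e9:.1f} GB represented, but not yet loaded")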
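

# ## Aside: making the backend engine explicit
#
# A hedged variant of the `open_dataset` call above, not in the original: when
# handed a file-like object from fsspec, xarray picks a backend automatically;
# passing `engine='h5netcdf'` (which requires the h5netcdf package) makes that
# choice explicit and reproducible for NetCDF4/HDF5 files.

# In[10]:


ds2 = xr.open_dataset(
    fs.open(ncfile_on_s3),
    engine='h5netcdf',                    # assumes h5netcdf is installed
    chunks={'time': 10, 'node': 141973},  # same chunking as above
)
ds2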
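

# ## Aside: optional explicit Dask cluster with a dashboard
#
# A sketch of an optional setup, not used in the original notebook: the
# computations above run on Dask's default threaded scheduler. If
# `dask.distributed` is installed, a LocalCluster provides the same parallelism
# plus a diagnostic dashboard (served if bokeh is available) for watching
# chunks stream in from S3. The worker counts below are illustrative.

# In[11]:


from dask.distributed import Client, LocalCluster

cluster = LocalCluster(n_workers=4, threads_per_worker=1)
client = Client(cluster)
print(client.dashboard_link)  # dashboard URL (needs bokeh)

# The same reduction as above, now executed on the local cluster
da.mean(dim='time').compute()

client.close()
cluster.close()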
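

# ## Aside: explicit index selection and saving the reduced result
#
# A minimal sketch under the same assumptions as above: `isel` is the explicit
# spelling of the positional slice `ds.zeta[:30, :]`, and the 30-step mean (one
# value per node, roughly a megabyte) is small enough to write to a local file.
# The output filename is illustrative.

# In[12]:


da30 = ds.zeta.isel(time=slice(0, 30))       # same subset as ds.zeta[:30, :]
zeta_mean = da30.mean(dim='time').compute()  # pulls ~2 GB from S3 in chunks
zeta_mean.to_netcdf('adcirc_01_zeta_mean_first30.nc')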