#!/usr/bin/env python
# coding: utf-8

# # Lazy loading a big (25 GB) NetCDF4 file on S3 with Xarray
#
# Just as when opening from a local filesystem, when opening from S3 xarray loads:
# * metadata (including coordinate variables) eagerly
# * data variables lazily
#
# We demonstrate this here by opening a 25 GB NetCDF file on S3 in a matter of seconds, with memory usage staying below 1 GB.

# In[1]:


import fsspec
import xarray as xr


# ## Inspect the big NetCDF4 file on S3 using fsspec

# In[2]:


# Anonymous (unauthenticated) access to an S3-compatible endpoint
fs = fsspec.filesystem('s3', anon=True, client_kwargs=dict(endpoint_url='https://ncsa.osn.xsede.org'))


# In[3]:


ncfile_on_s3 = 's3://esip/examples/adcirc/adcirc_01.nc'


# In[4]:


fs.size(ncfile_on_s3)/1e9  # file size in GB


# ## Open the file with Xarray, loading only the metadata and coordinate vars

# In[5]:


# %%time cell magic, preserved by the notebook-to-script export
get_ipython().run_cell_magic('time', '', "ds = xr.open_dataset(fs.open(ncfile_on_s3), chunks={'time':10, 'node':141973})\n")


# In[6]:


ds.zeta


# ## Compute the mean of the first 30 time steps (about 2 GB of data)
#
# Data is loaded lazily in chunks and processed in parallel by Dask on the available cores.

# In[7]:


da = ds.zeta[:30, :]
da


# In[8]:


get_ipython().run_cell_magic('time', '', "da.mean(dim='time').compute()\n")
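

# ## Aside: confirm that the data variable is still lazy
#
# A minimal sketch, not part of the original walk-through, assuming the `ds`
# opened above: because `chunks=` was passed to `open_dataset`, `ds.zeta` wraps
# a Dask array, so only metadata and coordinates sit in memory at this point
# and no data chunks have been read from S3 yet.

# In[9]:


import dask.array as dsa

print(type(ds.zeta.data))                   # dask.array.core.Array, not numpy.ndarray
print(isinstance(ds.zeta.data, dsa.Array))  # True while the variable is still lazy
print(ds.zeta.data.chunksize)               # (10, 141973), matching the chunks= argument
print(f"{ds.zeta.nbytes / 1e9:.1f} GB represented, but not yet loaded")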
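

# ## Aside: making the backend engine explicit
#
# A hedged variant of the `open_dataset` call above, not in the original: when
# handed a file-like object from fsspec, xarray picks a backend automatically;
# passing `engine='h5netcdf'` (which requires the h5netcdf package) makes that
# choice explicit and reproducible for NetCDF4/HDF5 files.

# In[10]:


ds2 = xr.open_dataset(
    fs.open(ncfile_on_s3),
    engine='h5netcdf',                    # assumes h5netcdf is installed
    chunks={'time': 10, 'node': 141973},  # same chunking as above
)
ds2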
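

# ## Aside: optional explicit Dask cluster with a dashboard
#
# A sketch of an optional setup, not used in the original notebook: the
# computations above run on Dask's default threaded scheduler. If
# `dask.distributed` is installed, a LocalCluster provides the same parallelism
# plus a diagnostic dashboard (served if bokeh is available) for watching
# chunks stream in from S3. The worker counts below are illustrative.

# In[11]:


from dask.distributed import Client, LocalCluster

cluster = LocalCluster(n_workers=4, threads_per_worker=1)
client = Client(cluster)
print(client.dashboard_link)  # dashboard URL (needs bokeh)

# The same reduction as above, now executed on the local cluster
da.mean(dim='time').compute()

client.close()
cluster.close()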
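

# ## Aside: explicit index selection and saving the reduced result
#
# A minimal sketch under the same assumptions as above: `isel` is the explicit
# spelling of the positional slice `ds.zeta[:30, :]`, and the 30-step mean (one
# value per node, roughly a megabyte) is small enough to write to a local file.
# The output filename is illustrative.

# In[12]:


da30 = ds.zeta.isel(time=slice(0, 30))       # same subset as ds.zeta[:30, :]
zeta_mean = da30.mean(dim='time').compute()  # pulls ~2 GB from S3 in chunks
zeta_mean.to_netcdf('adcirc_01_zeta_mean_first30.nc')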