#!/usr/bin/env python
# coding: utf-8

# ## Sparse-backed array example for EO data
# Based on discussions [here](https://discourse.pangeo.io/t/tables-x-arrays-and-rasters/1945) and [here](https://github.com/gjoseph92/stackstac/discussions/178): can DataArrays backed by `sparse.COO` improve performance and reduce dask graph size compared to `numpy.ndarray`-backed arrays when working with large EO time series data cubes?

# In[1]:


import xarray as xr
import pandas as pd
import numpy as np
from dask.distributed import Client
import dask
import dask.array  # explicit import so dask.array.from_array is available below
import sparse


# #### Generate a synthetic DataArray that mimics the sparse coverage pattern of an EO platform collecting data from a sun-synchronous orbit.

# In[2]:


# Daily timestamps for a 200-day time series.
times = pd.date_range(start='2000-01-01', freq='1D', periods=200)
time_da = xr.DataArray(times, [('time', times)])


# In[21]:


size = 2000
data_value = 100.0

# Dense (x, y) template of zeros, broadcast along the time dimension.
da = xr.DataArray(np.zeros((size, size)), dims=["x", "y"], coords=[range(size), range(size)])
da = da.expand_dims(time=time_da).copy()

# Fill a single row ("strip") per time step, shifting the strip each day so
# only a small fraction of each time slice is non-zero.
strip = 0
for idx, time in enumerate(da.time):
    da.data[idx, strip, 0:size] = data_value
    if strip == size - 1:
        strip = 0
    else:
        strip = strip + int(size / len(da.time))
da


# In[22]:


# Chunk spatially; keep the full time dimension within each chunk.
chunked = da.chunk(chunks={'x': 255, 'y': 255, 'time': -1})
chunked


# #### Apply an aggregation operation commonly used in our ML workflows.

# In[23]:


# Inspect the dask graph of the lazy monthly mean.
chunked.groupby('time.month').mean(dim='time').data.dask


# In[26]:


get_ipython().run_cell_magic('time', '', "mean = chunked.groupby('time.month').mean(dim='time').compute()\nmean\n")


# ### Create a `sparse.COO` representation of the sample DataArray.
# #### Use the non-zero values of the dense array to determine the coordinate locations.

# In[27]:


# Indices of the non-zero elements, transposed to the (ndim, nnz) layout expected by sparse.COO.
coords = np.transpose(np.argwhere(da.to_numpy()))
coords.shape


# In[28]:


# The scalar data_value is broadcast to every listed coordinate.
s = sparse.COO(coords, data_value, shape=(len(time_da), size, size))
s


# In[29]:


# Wrap the sparse array in dask with 255-element chunks along every dimension.
dasky_array = dask.array.from_array(s, chunks=255)
dasky_array


# In[30]:


da2 = xr.DataArray(dasky_array, dims=["time", "x", "y"], coords=[time_da, range(size), range(size)])
da2


# In[31]:


# Dask graph for the same aggregation on the sparse-backed array.
da2.groupby('time.month').mean(dim='time').data.dask


# In[32]:


get_ipython().run_cell_magic('time', '', "sparse_mean = da2.groupby('time.month').mean(dim='time').compute()\nsparse_mean\n")
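
# ### Added sketches (not part of the original notebook)
# #### A minimal check of the comparison, assuming the `chunked`, `da2`, `mean` and `sparse_mean` objects from the cells above: count the tasks in each lazy graph and confirm the two computed means agree. The variable names introduced here are illustrative only.

# In[ ]:


# Task counts of the two lazy groupby-mean graphs (a rough proxy for scheduler load).
dense_tasks = len(chunked.groupby('time.month').mean(dim='time').data.dask)
sparse_tasks = len(da2.groupby('time.month').mean(dim='time').data.dask)
dense_tasks, sparse_tasks


# In[ ]:


# The sparse-backed result is expected to hold a sparse.COO; densify it (if it is
# still sparse) before comparing against the numpy-backed result.
sparse_values = sparse_mean.data
if hasattr(sparse_values, 'todense'):
    sparse_values = sparse_values.todense()
np.allclose(mean.data, sparse_values)


# #### An alternative construction, sketched under the assumption that materialising the full dense cube first is undesirable: convert each dask block of `chunked` to `sparse.COO` with `map_blocks`, keeping the original chunk structure. `sparse_chunked` is an illustrative name.

# In[ ]:


# Convert every numpy block of the dask-backed DataArray into a sparse.COO block.
sparse_chunked = chunked.copy(data=chunked.data.map_blocks(sparse.COO.from_numpy))
sparse_chunked.groupby('time.month').mean(dim='time').data.dask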