#!/usr/bin/env python
# coding: utf-8

# ## Sparse-backed array example for EO data
# Based on discussions [here](https://discourse.pangeo.io/t/tables-x-arrays-and-rasters/1945) and [here](https://github.com/gjoseph92/stackstac/discussions/178): can DataArrays backed by `sparse.COO` improve performance and reduce dask graph size compared to `numpy.ndarray`-backed arrays when working with large EO time series data cubes?

# In[1]:


import xarray as xr
import pandas as pd
import numpy as np
from dask.distributed import Client
import dask
import dask.array  # explicit import so dask.array.from_array is available below
import sparse


# #### Generate a synthetic DataArray that mimics the sparse coverage pattern of an EO platform collecting data from a sun-synchronous orbit.

# In[2]:


# Daily timestamps for a 200-day time series.
times = pd.date_range(start='2000-01-01', freq='1D', periods=200)
time_da = xr.DataArray(times, [('time', times)])


# In[21]:


size = 2000
data_value = 100.0

# Dense (x, y) template of zeros, broadcast along the time dimension.
da = xr.DataArray(np.zeros((size, size)), dims=["x", "y"], coords=[range(size), range(size)])
da = da.expand_dims(time=time_da).copy()

# Fill a single row ("strip") per time step, shifting the strip each day so
# only a small fraction of each time slice is non-zero.
strip = 0
for idx, time in enumerate(da.time):
    da.data[idx, strip, 0:size] = data_value
    if strip == size - 1:
        strip = 0
    else:
        strip = strip + int(size / len(da.time))
da


# In[22]:


# Chunk spatially; keep the full time dimension within each chunk.
chunked = da.chunk(chunks={'x': 255, 'y': 255, 'time': -1})
chunked


# #### Apply an aggregation operation commonly used in our ML workflows.

# In[23]:


# Inspect the dask graph of the lazy monthly mean.
chunked.groupby('time.month').mean(dim='time').data.dask


# In[26]:


get_ipython().run_cell_magic('time', '', "mean = chunked.groupby('time.month').mean(dim='time').compute()\nmean\n")


# ### Create a `sparse.COO` representation of the sample DataArray.
# #### Use the non-zero values of the dense array to determine the coordinate locations.

# In[27]:


# Indices of the non-zero elements, transposed to the (ndim, nnz) layout expected by sparse.COO.
coords = np.transpose(np.argwhere(da.to_numpy()))
coords.shape


# In[28]:


# The scalar data_value is broadcast to every listed coordinate.
s = sparse.COO(coords, data_value, shape=(len(time_da), size, size))
s


# In[29]:


# Wrap the sparse array in dask with 255-element chunks along every dimension.
dasky_array = dask.array.from_array(s, chunks=255)
dasky_array


# In[30]:


da2 = xr.DataArray(dasky_array, dims=["time", "x", "y"], coords=[time_da, range(size), range(size)])
da2


# In[31]:


# Dask graph for the same aggregation on the sparse-backed array.
da2.groupby('time.month').mean(dim='time').data.dask


# In[32]:


get_ipython().run_cell_magic('time', '', "sparse_mean = da2.groupby('time.month').mean(dim='time').compute()\nsparse_mean\n")
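
# ### Added sketches (not part of the original notebook)
# #### A minimal check of the comparison, assuming the `chunked`, `da2`, `mean` and `sparse_mean` objects from the cells above: count the tasks in each lazy graph and confirm the two computed means agree. The variable names introduced here are illustrative only.

# In[ ]:


# Task counts of the two lazy groupby-mean graphs (a rough proxy for scheduler load).
dense_tasks = len(chunked.groupby('time.month').mean(dim='time').data.dask)
sparse_tasks = len(da2.groupby('time.month').mean(dim='time').data.dask)
dense_tasks, sparse_tasks


# In[ ]:


# The sparse-backed result is expected to hold a sparse.COO; densify it (if it is
# still sparse) before comparing against the numpy-backed result.
sparse_values = sparse_mean.data
if hasattr(sparse_values, 'todense'):
    sparse_values = sparse_values.todense()
np.allclose(mean.data, sparse_values)


# #### An alternative construction, sketched under the assumption that materialising the full dense cube first is undesirable: convert each dask block of `chunked` to `sparse.COO` with `map_blocks`, keeping the original chunk structure. `sparse_chunked` is an illustrative name.

# In[ ]:


# Convert every numpy block of the dask-backed DataArray into a sparse.COO block.
sparse_chunked = chunked.copy(data=chunked.data.map_blocks(sparse.COO.from_numpy))
sparse_chunked.groupby('time.month').mean(dim='time').data.dask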