#!/usr/bin/env python
# coding: utf-8

# # Gridding PFT variables to sparse arrays
# 
# This notebook explores regridding the PFT variables as sparse arrays using the `pydata/sparse` package.
# 
# Inspired by `PFT-Gridding.ipynb`

# ### Importing Libraries

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')

import numpy as np
import xarray as xr
from ctsm_py import utils


# ### Defining simulation information

# In[2]:


datadir = "/glade/p/cgd/tss/people/dll/TRENDY2019_History/"
sim = "S0_control/"
datadir = datadir + sim
simname = "TRENDY2019_S0_control_v2.clm2.h1."
var = "GPP"
years = "170001-201812"

maxval = "True"


# In[3]:


print(datadir + simname + var + "." + years + ".nc")


# In[4]:


# This is an example copied from Dan's script -- helps to read in multiple variables
# dir =
# sim =
# pref = 'lnd/proc/tseries/month_1'
# suff = ".clm2.h0."
# variables = [" "]
# pattern   = dir + sim + proc + pref + '{var}' + suff
# files = [pattern.format(var=var) for var in variables]

# for multiple files, use xr.open_mfdataset; dan also uses utils.time_set_mid to make the time dims work properly

# 365*utils.weighted_annual_mean --> weights by days/month
# timeslice: ix_time = (ds['time.year']>1963) & (ds['time.year']<2014) # note that dan's dataset is 'ds'
# plt.subplot(121) #-->1 row, 2 plots, plot 1
# signal.detrend (?)


# In[5]:


data1 = utils.time_set_mid(
    xr.open_dataset(
        datadir + simname + var + "." + years + ".nc",
        decode_times=True,
        chunks={"time": 100},
    ),
    "time",
)
data1


# ### Convert the 1D PFT dataarrays to nD sparse arrays
# 

# In[35]:


def to_sparse(data, vegtype, jxy, ixy, shape):
    """ Takes an input numpy array and converts it to a sparse array."""
    import itertools

    import sparse

    codes = zip(vegtype, jxy - 1, ixy - 1)

    # some magic from https://github.com/pydata/xarray/pull/5577
    # This constructs a list of coordinate locations at which data exists
    # it works for arbitrary number of dimensions but assumes that the last dimension
    # is the "stacked" dimension i.e. "pft"
    if data.ndim == 1:
        indexes = codes
    else:
        sizes = list(itertools.product(*[range(s) for s in data.shape[:-1]]))
        tuple_indexes = itertools.product(sizes, codes)
        indexes = map(lambda x: list(itertools.chain(*x)), tuple_indexes)

    return sparse.COO(
        coords=np.array(list(indexes)).T,
        data=data.ravel(),
        shape=data.shape[:-1] + shape,
    )


def convert_pft_variables_to_sparse(dataset):
    import sparse
    import xarray as xr

    # extract PFT variables
    pfts = xr.Dataset({k: v for k, v in dataset.items() if "pft" in v.dims})

    ixy = dataset.pfts1d_ixy.astype(int)
    jxy = dataset.pfts1d_jxy.astype(int)
    vegtype = dataset.pfts1d_itype_veg.astype(int)
    npft = vegtype.max().load().item()
    # expected shape of sparse arrays (excludes time)
    output_sizes = {
        "pft": npft + 1,
        "lat": dataset.sizes["lat"],
        "lon": dataset.sizes["lon"],
    }

    result = xr.Dataset()
    for var in pfts:
        result[var] = xr.apply_ufunc(
            to_sparse,
            pfts[var],
            vegtype,
            jxy,
            ixy,
            input_core_dims=[["pft"]] * 4,
            output_core_dims=[["pft", "lat", "lon"]],
            exclude_dims=set(("pft",)),  # changes size
            dask="parallelized",
            kwargs=dict(shape=tuple(output_sizes.values())),
            dask_gufunc_kwargs=dict(
                meta=sparse.COO([]),
                output_sizes=output_sizes,
            ),  # lets dask know that we are changing from numpy to sparse
            output_dtypes=[pfts[var].dtype],
        )

    # copy over coordinate variables lat, lon
    result = result.update(dataset[["lat", "lon"]])
    result["pft"] = np.arange(result.sizes["pft"])
    return result


# In[36]:


pfts = convert_pft_variables_to_sparse(data1)
pfts


# ### test out a plot
# 
# 
# xarray cannot handle directly plotting a sparse variable (yet). This  will be fixed soon; instead we manually "densify" to a numpy array using `.copy(data=gpp.data.todense())`

# In[31]:


gpp = pfts.GPP.isel(time=3606).compute()
gpp = gpp.copy(data=gpp.data.todense())


# In[32]:


gpp.isel(pft=1).plot()