#!/usr/bin/env python # coding: utf-8 # # Gridding PFT variables to sparse arrays # # This notebook explores regridding the PFT variables as sparse arrays using the `pydata/sparse` package. # # Inspired by `PFT-Gridding.ipynb` # ### Importing Libraries # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import numpy as np import xarray as xr from ctsm_py import utils # ### Defining simulation information # In[2]: datadir = "/glade/p/cgd/tss/people/dll/TRENDY2019_History/" sim = "S0_control/" datadir = datadir + sim simname = "TRENDY2019_S0_control_v2.clm2.h1." var = "GPP" years = "170001-201812" maxval = "True" # In[3]: print(datadir + simname + var + "." + years + ".nc") # In[4]: # This is an example copied from Dan's script -- helps to read in multiple variables # dir = # sim = # pref = 'lnd/proc/tseries/month_1' # suff = ".clm2.h0." # variables = [" "] # pattern = dir + sim + proc + pref + '{var}' + suff # files = [pattern.format(var=var) for var in variables] # for multiple files, use xr.open_mfdataset; dan also uses utils.time_set_mid to make the time dims work properly # 365*utils.weighted_annual_mean --> weights by days/month # timeslice: ix_time = (ds['time.year']>1963) & (ds['time.year']<2014) # note that dan's dataset is 'ds' # plt.subplot(121) #-->1 row, 2 plots, plot 1 # signal.detrend (?) # In[5]: data1 = utils.time_set_mid( xr.open_dataset( datadir + simname + var + "." + years + ".nc", decode_times=True, chunks={"time": 100}, ), "time", ) data1 # ### Convert the 1D PFT dataarrays to nD sparse arrays # # In[35]: def to_sparse(data, vegtype, jxy, ixy, shape): """ Takes an input numpy array and converts it to a sparse array.""" import itertools import sparse codes = zip(vegtype, jxy - 1, ixy - 1) # some magic from https://github.com/pydata/xarray/pull/5577 # This constructs a list of coordinate locations at which data exists # it works for arbitrary number of dimensions but assumes that the last dimension # is the "stacked" dimension i.e. "pft" if data.ndim == 1: indexes = codes else: sizes = list(itertools.product(*[range(s) for s in data.shape[:-1]])) tuple_indexes = itertools.product(sizes, codes) indexes = map(lambda x: list(itertools.chain(*x)), tuple_indexes) return sparse.COO( coords=np.array(list(indexes)).T, data=data.ravel(), shape=data.shape[:-1] + shape, ) def convert_pft_variables_to_sparse(dataset): import sparse import xarray as xr # extract PFT variables pfts = xr.Dataset({k: v for k, v in dataset.items() if "pft" in v.dims}) ixy = dataset.pfts1d_ixy.astype(int) jxy = dataset.pfts1d_jxy.astype(int) vegtype = dataset.pfts1d_itype_veg.astype(int) npft = vegtype.max().load().item() # expected shape of sparse arrays (excludes time) output_sizes = { "pft": npft + 1, "lat": dataset.sizes["lat"], "lon": dataset.sizes["lon"], } result = xr.Dataset() for var in pfts: result[var] = xr.apply_ufunc( to_sparse, pfts[var], vegtype, jxy, ixy, input_core_dims=[["pft"]] * 4, output_core_dims=[["pft", "lat", "lon"]], exclude_dims=set(("pft",)), # changes size dask="parallelized", kwargs=dict(shape=tuple(output_sizes.values())), dask_gufunc_kwargs=dict( meta=sparse.COO([]), output_sizes=output_sizes, ), # lets dask know that we are changing from numpy to sparse output_dtypes=[pfts[var].dtype], ) # copy over coordinate variables lat, lon result = result.update(dataset[["lat", "lon"]]) result["pft"] = np.arange(result.sizes["pft"]) return result # In[36]: pfts = convert_pft_variables_to_sparse(data1) pfts # ### test out a plot # # # xarray cannot handle directly plotting a sparse variable (yet). This will be fixed soon; instead we manually "densify" to a numpy array using `.copy(data=gpp.data.todense())` # In[31]: gpp = pfts.GPP.isel(time=3606).compute() gpp = gpp.copy(data=gpp.data.todense()) # In[32]: gpp.isel(pft=1).plot()