#!/usr/bin/env python
# coding: utf-8

# # Conservative Region Aggregation with Xarray, Geopandas and Sparse
#
# **Goal:** Regrid a global precipitation dataset into countries _conservatively_, i.e. by exactly partitioning each grid cell into the precise region boundaries.
#
# **Meta Goal:** Demonstrate that we don't necessarily need a dedicated package for this workflow and showcase some of the new capabilities of GeoPandas along the way.
#
# **Approach:** We take a three-step approach:
# - Represent both the original grid and the target regions as GeoSeries with Polygon geometry
# - Compute their area overlay and turn it into a sparse matrix
# - Perform matrix multiplication on the full Xarray dataset (with a time dimension)
#
# It is quite fast and transparent.

# In[1]:

import xarray as xr
import geopandas as gp
import pandas as pd
import sparse

get_ipython().run_line_magic('xmode', 'minimal')

import hvplot.pandas
import hvplot.xarray
import dask
import cf_xarray

# ## Load Region Data

# In[2]:

#! wget https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/50m/cultural/ne_50m_admin_0_countries.zip
#! unzip ne_50m_admin_0_countries.zip

# Load with geopandas:

# In[3]:

regions_df = gp.read_file("ne_50m_admin_0_countries.shp")

# In[4]:

regions_df.head(1)

# In[5]:

region_name = 'SOVEREIGNT'

# Check it:

# In[6]:

regions_df.plot(column=region_name, figsize=(10, 4))

# In[7]:

bbox = tuple(regions_df.total_bounds)
bbox

# All geodataframes should have a coordinate reference system. This is important (and sometimes unfamiliar to users coming from the global climate world).

# In[8]:

crs_orig = f'EPSG:{regions_df.crs.to_epsg()}'
crs_orig

# We will now transform to an area-preserving projection. This is important because we want to do area-weighted regridding.

# In[9]:

# use an area-preserving projection
crs_area = "ESRI:53034"

regions_df = regions_df.to_crs(crs_area)

# ## Load Precipitation Data
#
# This is the NASA / NOAA Global Precipitation Climatology Project dataset, processed and stored by Pangeo Forge: https://pangeo-forge.org/dashboard/feedstock/42. We want to aggregate the precipitation into the countries defined in the shapefile.
#
# Note that we have the entire dataset in one object. We convert the longitude from [0, 360] to [-180, 180] because that's what holoviz expects.

# In[10]:

store = 'https://ncsa.osn.xsede.org/Pangeo/pangeo-forge/gpcp-feedstock/gpcp.zarr'

with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ds = xr.open_dataset(store, engine='zarr', chunks={})

    lon = ds.cf['X'].name
    lat = ds.cf['Y'].name

    lon_attrs = ds.cf['X'].attrs  # assign_coords drops attrs, so copy...
    ds = ds.cf.assign_coords(X=(((ds[lon] + 180) % 360) - 180)).sortby(lon)
    ds[lon].attrs = lon_attrs  # ... and paste
    ds[lon].attrs['valid_range'] = [-180, 180]

# We subset the gridded data to the bounds of the spatial dataframe to save on memory and computation. This matters most when `regions_df` covers a much smaller footprint than the gridded data.

# In[11]:

ds = ds.cf.sel(X=slice(bbox[0], bbox[2]), Y=slice(bbox[1], bbox[3]))

# Now we extract just the horizontal grid information.
# The dataset has information about the lat and lon bounds of each cell, which we need to create the polygons.

# In[12]:

grid = ds.drop(['time', 'time_bounds', 'precip']).reset_coords().load()
grid

# In[13]:

ds

# Now we "stack" the data into a single 1D array. This is the first step towards transitioning to pandas.

# In[14]:

points = grid.stack(point=(lat, lon))
points
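# As an aside, the per-cell polygon construction below could probably be vectorized with
# shapely >= 2.0 (which absorbed pygeos). This is only a rough sketch under that assumption;
# the rest of the notebook sticks with the simple per-cell function.

# In[ ]:

import numpy as np
import shapely

lon_b = points.lon_bounds.transpose("point", "nv").values  # shape (npoints, 2)
lat_b = points.lat_bounds.transpose("point", "nv").values  # shape (npoints, 2)
# same wrap rule as bounds_to_poly below: shift whole cells whose left edge is >= 180
lon_b = np.where(lon_b[:, :1] >= 180, lon_b - 360, lon_b)
# shapely.box is vectorized in shapely 2.x and returns an object array of Polygons
boxes_vectorized = shapely.box(lon_b[:, 0], lat_b[:, 0], lon_b[:, 1], lat_b[:, 1])
boxes_vectorized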
# This function creates geometries for a single pair of bounds.
# It is not fast, but it is fast enough here.
# Perhaps it could be vectorized using pygeos / shapely 2, as sketched above.

# In[15]:

from shapely.geometry import Polygon

def bounds_to_poly(lon_bounds, lat_bounds):
    if lon_bounds[0] >= 180:  # geopandas needs this
        lon_bounds = lon_bounds - 360
    return Polygon([
        (lon_bounds[0], lat_bounds[0]),
        (lon_bounds[0], lat_bounds[1]),
        (lon_bounds[1], lat_bounds[1]),
        (lon_bounds[1], lat_bounds[0])
    ])

# We apply this function to each grid cell.

# In[16]:

get_ipython().run_cell_magic('time', '', 'import numpy as np\nboxes = xr.apply_ufunc(\n    bounds_to_poly,\n    points.lon_bounds,\n    points.lat_bounds,\n    input_core_dims=[("nv",), ("nv",)],\n    output_dtypes=[np.dtype(\'O\')],\n    vectorize=True\n)\nboxes\n')

# Finally, we convert to a GeoDataFrame, specifying the original CRS.

# In[17]:

get_ipython().run_cell_magic('time', '', 'grid_df = gp.GeoDataFrame(\n    data={"geometry": boxes.values, "latitude": boxes[lat], "longitude": boxes[lon]},\n    index=boxes.indexes["point"],\n    crs=crs_orig\n)\ngrid_df\n')

# In[18]:

grid_df.crs

# Now we transform to the area-preserving CRS.

# In[19]:

grid_df = grid_df.to_crs(crs_area)
grid_df.crs

# ## Key Step: Overlay the two geometries
#
# This is the magic of geopandas; it can calculate the overlap between the original grid and the regions.
# It is expensive because it has to compare 64800 grid boxes with 242 regions.
#
# In this dataframe, the `latitude` and `longitude` values are from the grid, while all the other columns are from the regions.

# In[20]:

get_ipython().run_line_magic('time', 'overlay = grid_df.overlay(regions_df, keep_geom_type=True)')

overlay

# This is essentially already a sparse matrix mapping one grid space to the other. How sparse?

# In[21]:

sparsity = len(overlay) / (len(grid_df) * len(regions_df))
sparsity

# In[22]:

overlay

# Let's explore these overlays a little bit.

# In[23]:

overlay[overlay[region_name] == "Italy"].geometry.plot(edgecolor='k')

# We can verify that each country's area is preserved by the overlay operation (both results below are in km2).

# In[24]:

overlay.geometry.area.groupby(overlay[region_name]).sum().nlargest(10) / 1e6  # km2

# In[25]:

regions_df.geometry.area.groupby(regions_df[region_name]).sum().nlargest(10) / 1e6  # km2

# ## Calculate the area fraction for each region
#
# This is another key step. This transform tells us _how much of a country's total area comes from each of the grid cells._
# It is accurate because we used an area-preserving CRS.

# In[26]:

grid_cell_fraction = overlay.geometry.area.groupby(overlay[region_name]).transform(lambda x: x / x.sum())
grid_cell_fraction

# We can verify that these all sum up to one.

# In[27]:

grid_cell_fraction.groupby(overlay[region_name]).sum()

# ## Turn this into a sparse Xarray DataArray
#
# The first step is making a MultiIndex.

# In[28]:

multi_index = overlay.set_index([lat, lon, region_name]).index
df_weights = pd.DataFrame({"weights": grid_cell_fraction.values}, index=multi_index)
df_weights

# We can bring this directly into xarray as a 1D Dataset.

# In[29]:

import xarray as xr
ds_weights = xr.Dataset(df_weights)
ds_weights

# Now we unstack it into a sparse array.

# In[30]:

weights_sparse = ds_weights.unstack(sparse=True, fill_value=0.).weights
weights_sparse

# Here we can clearly see that this is a sparse matrix, mapping the input space (lat, lon) to the output space (SOVEREIGNT).
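# As a quick, optional sanity check on the unstacked weights: the weights for each country
# should still sum to 1. The reduction below operates on the underlying sparse.COO array,
# so we densify the (small) result to look at it.

# In[ ]:

weights_sparse.sum(dim=[lat, lon]).data.todense()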
# ## Perform Matrix Multiplication
#
# To regrid the data, we just have to multiply the original precip dataset by this matrix.
# There are many different ways to do this.
# The simplest one would just be:

# In[31]:

regridded = xr.dot(ds.precip, weights_sparse)
regridded

# Unfortunately, that doesn't work out of the box, because sparse doesn't implement einsum (see https://github.com/pydata/sparse/issues/31).

# In[32]:

# regridded[0].compute()  # fails

# There are many ways to work around this, starting from just using a dense weights matrix instead (a sketch of that workaround is at the end of the notebook).
# Below is the fastest approach I came up with after some extensive experimentation.
#
# Sparse does implement matmul, so we can use that. But we have to do some reshaping to make it work with our data.

# In[33]:

def apply_weights_matmul_sparse(weights, data):
    assert isinstance(weights, sparse.SparseArray)
    assert isinstance(data, np.ndarray)
    data = sparse.COO.from_numpy(data)
    data_shape = data.shape
    # k = nlat * nlon
    n, k = data_shape[0], data_shape[1] * data_shape[2]
    data = data.reshape((n, k))
    weights_shape = weights.shape
    k_, m = weights_shape[0] * weights_shape[1], weights_shape[2]
    assert k == k_
    weights_data = weights.reshape((k, m))

    regridded = sparse.matmul(data, weights_data)
    assert regridded.shape == (n, m)
    return regridded.todense()

# Before applying this to the data, let's load it into memory and then chunk it again.
# This is not necessary (we could just stream the data from the cloud), but it makes for a cleaner benchmark.
# Chunking again allows us to leverage dask parallelism. We also eliminate some known corrupted values via a mask.

# In[34]:

get_ipython().run_cell_magic('time', '', 'mask = (ds.precip >= 0) & (ds.precip < 3000)\nprecip = ds.precip.where(mask)\nprecip_in_mem = precip.compute().chunk({"time": "10MB"})\nprecip_in_mem\n')

# Take a look at the mean precip over the whole domain.

# In[35]:

da = precip_in_mem.mean(dim='time')

# In[36]:

da.hvplot.image(x=ds.cf['X'].name, y=ds.cf['Y'].name, rasterize=True, geo=True, cmap='viridis', frame_width=600, tiles='OSM')

# In[37]:

with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    precip_regridded = xr.apply_ufunc(
        apply_weights_matmul_sparse,
        weights_sparse,
        precip_in_mem,
        join="left",
        input_core_dims=[[lat, lon, region_name], [lat, lon]],
        output_core_dims=[[region_name]],
        dask="parallelized",
        dask_gufunc_kwargs=dict(meta=[np.ndarray((0,))])
    )

precip_regridded

# Finally, we compute it!

# In[38]:

get_ipython().run_cell_magic('time', '', 'from dask.diagnostics import ProgressBar\n\nwith ProgressBar():\n    precip_regridded.load()\n')

# In[39]:

precip_regridded

# #### Explore timeseries data by region:

# In[40]:

ds_precip = precip_regridded.sel(SOVEREIGNT=['Italy', 'Morocco']).resample(time="MS").mean().to_dataset(region_name)

# In[41]:

ds_precip.hvplot(x='time', grid=True, frame_width=1000)

# #### Explore the mean precip by region

# In[42]:

df_mean = precip_regridded.to_pandas().mean()
df_mean.name = "precip"
df_mean = pd.DataFrame(df_mean).reset_index()

# In[43]:

merged = pd.merge(regions_df, df_mean)

# Convert back to geographic coordinates for plotting.

# In[44]:

merged_geo = merged.to_crs(crs_orig)

# Matplotlib:

# In[45]:

merged_geo.plot(column='precip', edgecolor='k', figsize=(12, 5), legend=True)

# Holoviz:

# In[46]:

merged_geo.hvplot(c='precip', geo=True, cmap='viridis', frame_width=600, tiles='OSM',
                  title='Mean Precip (mm/day): NASA/NOAA Global Precip Climatology [1996-10-01/2021-12-31]')
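# As an aside, here is a minimal sketch of the dense-weights workaround mentioned above,
# assuming the densified weights (64800 x 242 floats, roughly 100 MB) fit in memory.
# The result should broadly agree with precip_regridded.

# In[ ]:

weights_dense = weights_sparse.copy(data=weights_sparse.data.todense())
# xr.dot contracts over the dimensions shared by both arrays, here latitude and longitude
regridded_dense = xr.dot(precip_in_mem, weights_dense)
regridded_dense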