#!/usr/bin/env python
# coding: utf-8

# # Conservative Region Aggregation with Xarray, Geopandas and Sparse
#
# **Goal:** Regrid a global precipitation dataset into countries _conservatively_, i.e. by exactly partitioning each grid cell into the precise region boundaries.
#
# **Meta Goal:** Demonstrate that we don't necessarily need a dedicated package for this workflow and showcase some of the new capabilities of GeoPandas along the way.
#
# **Approach:** We take a three-step approach:
# - Represent both the original grid and the target regions as GeoSeries with Polygon geometry
# - Compute their area overlay and turn it into a sparse matrix
# - Perform matrix multiplication on the full Xarray dataset (with a time dimension)
#
# It is quite fast and transparent.

# In[1]:

import xarray as xr
import geopandas as gp
import pandas as pd
import sparse

get_ipython().run_line_magic('xmode', 'minimal')

import hvplot.pandas
import hvplot.xarray
import dask
import cf_xarray

# ## Load Region Data

# In[2]:

#! wget https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/50m/cultural/ne_50m_admin_0_countries.zip
#! unzip ne_50m_admin_0_countries.zip

# Load with geopandas:

# In[3]:

regions_df = gp.read_file("ne_50m_admin_0_countries.shp")

# In[4]:

regions_df.head(1)

# In[5]:

region_name = 'SOVEREIGNT'

# Check it:

# In[6]:

regions_df.plot(column=region_name, figsize=(10, 4))

# In[7]:

bbox = tuple(regions_df.total_bounds)
bbox

# All geodataframes should have a coordinate reference system. This is important (and sometimes unfamiliar to users coming from the global climate world).

# In[8]:

crs_orig = f'EPSG:{regions_df.crs.to_epsg()}'
crs_orig

# We will now transform to an area-preserving projection. This is important because we want to do area-weighted regridding.

# In[9]:

# use an area-preserving projection
crs_area = "ESRI:53034"

regions_df = regions_df.to_crs(crs_area)

# ## Load Precipitation Data
#
# This is the NASA / NOAA Global Precipitation Climatology Project dataset, processed and stored by Pangeo Forge: https://pangeo-forge.org/dashboard/feedstock/42. We want to aggregate the precipitation into the countries defined in the shapefile.
#
# Note that we have the entire dataset in one object. We convert the longitude from [0, 360] to [-180, 180] because that's what holoviz expects.

# In[10]:

store = 'https://ncsa.osn.xsede.org/Pangeo/pangeo-forge/gpcp-feedstock/gpcp.zarr'

with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ds = xr.open_dataset(store, engine='zarr', chunks={})

    lon = ds.cf['X'].name
    lat = ds.cf['Y'].name

    lon_attrs = ds.cf['X'].attrs  # assign_coords drops attrs, so copy...
    ds = ds.cf.assign_coords(X=(((ds[lon] + 180) % 360) - 180)).sortby(lon)
    ds[lon].attrs = lon_attrs  # ... and paste
    ds[lon].attrs['valid_range'] = [-180, 180]

# We subset the gridded data to the bounds of the spatial dataframe to save on memory and computation. This matters most when `regions_df` covers a much smaller footprint than the gridded data.

# In[11]:

ds = ds.cf.sel(X=slice(bbox[0], bbox[2]), Y=slice(bbox[1], bbox[3]))

# Now we extract just the horizontal grid information.
# The dataset has information about the lat and lon bounds of each cell, which we need to create the polygons.

# In[12]:

grid = ds.drop(['time', 'time_bounds', 'precip']).reset_coords().load()
grid

# In[13]:

ds

# Now we "stack" the data into a single 1D array. This is the first step towards transitioning to pandas.

# In[14]:

points = grid.stack(point=(lat, lon))
points
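# As an aside, the per-cell polygon construction below could probably be vectorized with
# shapely >= 2.0 (which absorbed pygeos). This is only a rough sketch under that assumption;
# the rest of the notebook sticks with the simple per-cell function.

# In[ ]:

import numpy as np
import shapely

lon_b = points.lon_bounds.transpose("point", "nv").values  # shape (npoints, 2)
lat_b = points.lat_bounds.transpose("point", "nv").values  # shape (npoints, 2)
# same wrap rule as bounds_to_poly below: shift whole cells whose left edge is >= 180
lon_b = np.where(lon_b[:, :1] >= 180, lon_b - 360, lon_b)
# shapely.box is vectorized in shapely 2.x and returns an object array of Polygons
boxes_vectorized = shapely.box(lon_b[:, 0], lat_b[:, 0], lon_b[:, 1], lat_b[:, 1])
boxes_vectorized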
# This function creates geometries for a single pair of bounds.
# It is not fast, but it is fast enough here.
# Perhaps it could be vectorized using pygeos / shapely 2, as sketched above.

# In[15]:

from shapely.geometry import Polygon

def bounds_to_poly(lon_bounds, lat_bounds):
    if lon_bounds[0] >= 180:  # geopandas needs this
        lon_bounds = lon_bounds - 360
    return Polygon([
        (lon_bounds[0], lat_bounds[0]),
        (lon_bounds[0], lat_bounds[1]),
        (lon_bounds[1], lat_bounds[1]),
        (lon_bounds[1], lat_bounds[0])
    ])

# We apply this function to each grid cell.

# In[16]:

get_ipython().run_cell_magic('time', '', 'import numpy as np\nboxes = xr.apply_ufunc(\n    bounds_to_poly,\n    points.lon_bounds,\n    points.lat_bounds,\n    input_core_dims=[("nv",), ("nv",)],\n    output_dtypes=[np.dtype(\'O\')],\n    vectorize=True\n)\nboxes\n')

# Finally, we convert to a GeoDataFrame, specifying the original CRS.

# In[17]:

get_ipython().run_cell_magic('time', '', 'grid_df = gp.GeoDataFrame(\n    data={"geometry": boxes.values, "latitude": boxes[lat], "longitude": boxes[lon]},\n    index=boxes.indexes["point"],\n    crs=crs_orig\n)\ngrid_df\n')

# In[18]:

grid_df.crs

# Now we transform to the area-preserving CRS.

# In[19]:

grid_df = grid_df.to_crs(crs_area)
grid_df.crs

# ## Key Step: Overlay the two geometries
#
# This is the magic of geopandas; it can calculate the overlap between the original grid and the regions.
# It is expensive because it has to compare 64800 grid boxes with 242 regions.
#
# In this dataframe, the `latitude` and `longitude` values are from the grid, while all the other columns are from the regions.

# In[20]:

get_ipython().run_line_magic('time', 'overlay = grid_df.overlay(regions_df, keep_geom_type=True)')

overlay

# This is essentially already a sparse matrix mapping one grid space to the other. How sparse?

# In[21]:

sparsity = len(overlay) / (len(grid_df) * len(regions_df))
sparsity

# In[22]:

overlay

# Let's explore these overlays a little bit.

# In[23]:

overlay[overlay[region_name] == "Italy"].geometry.plot(edgecolor='k')

# We can verify that each country's area is preserved by the overlay operation (both results below are in km2).

# In[24]:

overlay.geometry.area.groupby(overlay[region_name]).sum().nlargest(10) / 1e6  # km2

# In[25]:

regions_df.geometry.area.groupby(regions_df[region_name]).sum().nlargest(10) / 1e6  # km2

# ## Calculate the area fraction for each region
#
# This is another key step. This transform tells us _how much of a country's total area comes from each of the grid cells._
# It is accurate because we used an area-preserving CRS.

# In[26]:

grid_cell_fraction = overlay.geometry.area.groupby(overlay[region_name]).transform(lambda x: x / x.sum())
grid_cell_fraction

# We can verify that these all sum up to one.

# In[27]:

grid_cell_fraction.groupby(overlay[region_name]).sum()

# ## Turn this into a sparse Xarray DataArray
#
# The first step is making a MultiIndex.

# In[28]:

multi_index = overlay.set_index([lat, lon, region_name]).index
df_weights = pd.DataFrame({"weights": grid_cell_fraction.values}, index=multi_index)
df_weights

# We can bring this directly into xarray as a 1D Dataset.

# In[29]:

import xarray as xr
ds_weights = xr.Dataset(df_weights)
ds_weights

# Now we unstack it into a sparse array.

# In[30]:

weights_sparse = ds_weights.unstack(sparse=True, fill_value=0.).weights
weights_sparse

# Here we can clearly see that this is a sparse matrix, mapping the input space (lat, lon) to the output space (SOVEREIGNT).
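# As a quick, optional sanity check on the unstacked weights: the weights for each country
# should still sum to 1. The reduction below operates on the underlying sparse.COO array,
# so we densify the (small) result to look at it.

# In[ ]:

weights_sparse.sum(dim=[lat, lon]).data.todense()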
# ## Perform Matrix Multiplication
#
# To regrid the data, we just have to multiply the original precip dataset by this matrix.
# There are many different ways to do this.
# The simplest one would just be:

# In[31]:

regridded = xr.dot(ds.precip, weights_sparse)
regridded

# Unfortunately, that doesn't work out of the box, because sparse doesn't implement einsum (see https://github.com/pydata/sparse/issues/31).

# In[32]:

# regridded[0].compute()  # fails

# There are many ways to work around this, starting from just using a dense weights matrix instead (a sketch of that workaround is at the end of the notebook).
# Below is the fastest approach I came up with after some extensive experimentation.
#
# Sparse does implement matmul, so we can use that. But we have to do some reshaping to make it work with our data.

# In[33]:

def apply_weights_matmul_sparse(weights, data):
    assert isinstance(weights, sparse.SparseArray)
    assert isinstance(data, np.ndarray)
    data = sparse.COO.from_numpy(data)
    data_shape = data.shape
    # k = nlat * nlon
    n, k = data_shape[0], data_shape[1] * data_shape[2]
    data = data.reshape((n, k))
    weights_shape = weights.shape
    k_, m = weights_shape[0] * weights_shape[1], weights_shape[2]
    assert k == k_
    weights_data = weights.reshape((k, m))

    regridded = sparse.matmul(data, weights_data)
    assert regridded.shape == (n, m)
    return regridded.todense()

# Before applying this to the data, let's load it into memory and then chunk it again.
# This is not necessary (we could just stream the data from the cloud), but it makes for a cleaner benchmark.
# Chunking again allows us to leverage dask parallelism. We also eliminate some known corrupted values via a mask.

# In[34]:

get_ipython().run_cell_magic('time', '', 'mask = (ds.precip >= 0) & (ds.precip < 3000)\nprecip = ds.precip.where(mask)\nprecip_in_mem = precip.compute().chunk({"time": "10MB"})\nprecip_in_mem\n')

# Take a look at the mean precip over the whole domain.

# In[35]:

da = precip_in_mem.mean(dim='time')

# In[36]:

da.hvplot.image(x=ds.cf['X'].name, y=ds.cf['Y'].name, rasterize=True, geo=True, cmap='viridis', frame_width=600, tiles='OSM')

# In[37]:

with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    precip_regridded = xr.apply_ufunc(
        apply_weights_matmul_sparse,
        weights_sparse,
        precip_in_mem,
        join="left",
        input_core_dims=[[lat, lon, region_name], [lat, lon]],
        output_core_dims=[[region_name]],
        dask="parallelized",
        dask_gufunc_kwargs=dict(meta=[np.ndarray((0,))])
    )

precip_regridded

# Finally, we compute it!

# In[38]:

get_ipython().run_cell_magic('time', '', 'from dask.diagnostics import ProgressBar\n\nwith ProgressBar():\n    precip_regridded.load()\n')

# In[39]:

precip_regridded

# #### Explore timeseries data by region:

# In[40]:

ds_precip = precip_regridded.sel(SOVEREIGNT=['Italy', 'Morocco']).resample(time="MS").mean().to_dataset(region_name)

# In[41]:

ds_precip.hvplot(x='time', grid=True, frame_width=1000)

# #### Explore the mean precip by region

# In[42]:

df_mean = precip_regridded.to_pandas().mean()
df_mean.name = "precip"
df_mean = pd.DataFrame(df_mean).reset_index()

# In[43]:

merged = pd.merge(regions_df, df_mean)

# Convert back to geographic coordinates for plotting.

# In[44]:

merged_geo = merged.to_crs(crs_orig)

# Matplotlib:

# In[45]:

merged_geo.plot(column='precip', edgecolor='k', figsize=(12, 5), legend=True)

# Holoviz:

# In[46]:

merged_geo.hvplot(c='precip', geo=True, cmap='viridis', frame_width=600, tiles='OSM',
                  title='Mean Precip (mm/day): NASA/NOAA Global Precip Climatology [1996-10-01/2021-12-31]')
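# As an aside, here is a minimal sketch of the dense-weights workaround mentioned above,
# assuming the densified weights (64800 x 242 floats, roughly 100 MB) fit in memory.
# The result should broadly agree with precip_regridded.

# In[ ]:

weights_dense = weights_sparse.copy(data=weights_sparse.data.todense())
# xr.dot contracts over the dimensions shared by both arrays, here latitude and longitude
regridded_dense = xr.dot(precip_in_mem, weights_dense)
regridded_dense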