#grayscale");filter:gray;-webkit-filter:grayscale(100%);}.bk-root .bk-logo-small{width:20px;height:20px;background-image:url();}.bk-root .bk-logo-notebook{display:inline-block;vertical-align:middle;margin-right:5px;}.rendered_html .bk-root .bk-tooltip table,.rendered_html .bk-root .bk-tooltip tr,.rendered_html .bk-root .bk-tooltip th,.rendered_html .bk-root .bk-tooltip td{border:none;padding:1px;}.bk-root .bk-tile-attribution a{color:black;}.bk-root .bk-toolbar-hidden{visibility:hidden;opacity:0;transition:visibility 0.3s linear, opacity 0.3s linear;}.bk-root .bk-toolbar{display:flex;flex-wrap:nowrap;align-items:center;user-select:none;-ms-user-select:none;-moz-user-select:none;-webkit-user-select:none;}.bk-root .bk-toolbar .bk-logo{flex-shrink:0;}.bk-root .bk-toolbar.bk-above,.bk-root .bk-toolbar.bk-below{flex-direction:row;justify-content:flex-end;}.bk-root .bk-toolbar.bk-above .bk-logo,.bk-root .bk-toolbar.bk-below .bk-logo{order:1;margin-left:5px;margin-right:0px;}.bk-root .bk-toolbar.bk-left,.bk-root .bk-toolbar.bk-right{flex-direction:column;justify-content:flex-start;}.bk-root .bk-toolbar.bk-left .bk-logo,.bk-root .bk-toolbar.bk-right .bk-logo{order:0;margin-bottom:5px;margin-top:0px;}.bk-root .bk-toolbar-button{width:30px;height:30px;cursor:pointer;background-size:60% 60%;background-origin:border-box;background-color:transparent;background-repeat:no-repeat;background-position:center center;}.bk-root .bk-toolbar-button:hover,.bk-root .bk-tool-overflow:hover{background-color:rgba(192, 192, 192, 0.15);}.bk-root .bk-toolbar-button:focus,.bk-root .bk-tool-overflow:focus,.bk-root .bk-toolbar-button:focus-visible,.bk-root .bk-tool-overflow:focus-visible{outline:1px dotted #26aae1;outline-offset:-1px;}.bk-root .bk-toolbar-button::-moz-focus-inner,.bk-root .bk-tool-overflow::-moz-focus-inner{border:0;}.bk-root .bk-above .bk-toolbar-button{border-bottom:2px solid transparent;}.bk-root .bk-above .bk-toolbar-button.bk-active{border-bottom-color:#26aae1;}.bk-root .bk-below .bk-toolbar-button{border-top:2px solid transparent;}.bk-root .bk-below .bk-toolbar-button.bk-active{border-top-color:#26aae1;}.bk-root .bk-right .bk-toolbar-button{border-left:2px solid transparent;}.bk-root .bk-right .bk-toolbar-button.bk-active{border-left-color:#26aae1;}.bk-root .bk-left .bk-toolbar-button{border-right:2px solid transparent;}.bk-root .bk-left .bk-toolbar-button.bk-active{border-right-color:#26aae1;}.bk-root .bk-divider{content:" ";display:inline-block;background-color:lightgray;}.bk-root .bk-above .bk-divider,.bk-root .bk-below .bk-divider{height:10px;width:1px;}.bk-root .bk-left .bk-divider,.bk-root .bk-right .bk-divider{height:1px;width:10px;}.bk-root .bk-tool-overflow{color:gray;display:flex;align-items:center;}.bk-root .bk-above .bk-tool-overflow,.bk-root .bk-below .bk-tool-overflow,.bk-root .bk-horizontal .bk-tool-overflow{width:15px;height:30px;flex-direction:row;}.bk-root .bk-left .bk-tool-overflow,.bk-root .bk-right .bk-tool-overflow,.bk-root .bk-vertical .bk-tool-overflow{width:30px;height:15px;flex-direction:column;}.bk-root .bk-tool-icon-copy-to-clipboard{background-image:url("");}.bk-root .bk-tool-icon-replace-mode{background-image:url("");}.bk-root .bk-tool-icon-append-mode{background-image:url("");}.bk-root .bk-tool-icon-intersect-mode{background-image:url("");}.bk-root .bk-tool-icon-subtract-mode{background-image:url("");}.bk-root .bk-tool-icon-clear-selection{background-image:url("");}.bk-root .bk-tool-icon-box-select{background-image:url("");}.bk-root .bk-tool-icon-box-zoom{background-image:url("");}.bk-root .bk-tool-icon-zoom-in{background-image:url("");}.bk-root .bk-tool-icon-zoom-out{background-image:url("");}.bk-root .bk-tool-icon-help{background-image:url("");}.bk-root .bk-tool-icon-hover{background-image:url("");}.bk-root .bk-tool-icon-crosshair{background-image:url("");}.bk-root .bk-tool-icon-lasso-select{background-image:url("");}.bk-root .bk-tool-icon-pan{background-image:url("");}.bk-root .bk-tool-icon-xpan{background-image:url("");}.bk-root .bk-tool-icon-ypan{background-image:url("");}.bk-root .bk-tool-icon-range{background-image:url("");}.bk-root .bk-tool-icon-polygon-select{background-image:url("");}.bk-root .bk-tool-icon-redo{background-image:url("");}.bk-root .bk-tool-icon-reset{background-image:url("");}.bk-root .bk-tool-icon-save{background-image:url("");}.bk-root .bk-tool-icon-tap-select{background-image:url("");}.bk-root .bk-tool-icon-undo{background-image:url("");}.bk-root .bk-tool-icon-wheel-pan{background-image:url("");}.bk-root .bk-tool-icon-wheel-zoom{background-image:url("");}.bk-root .bk-tool-icon-box-edit{background-image:url("");}.bk-root .bk-tool-icon-freehand-draw{background-image:url("");}.bk-root .bk-tool-icon-poly-draw{background-image:url("");}.bk-root .bk-tool-icon-point-draw{background-image:url("");}.bk-root .bk-tool-icon-poly-edit{background-image:url("");}.bk-root .bk-tool-icon-line-edit{background-image:url("");}.bk-root .bk-menu-icon{width:28px;height:28px;background-size:60%;background-color:transparent;background-repeat:no-repeat;background-position:center center;}.bk-root .bk-context-menu{position:absolute;display:inline-flex;flex-wrap:nowrap;user-select:none;-ms-user-select:none;-moz-user-select:none;-webkit-user-select:none;width:auto;height:auto;z-index:100;cursor:pointer;font-size:12px;background-color:#fff;border:1px solid #ccc;border-radius:4px;box-shadow:0 6px 12px rgba(0, 0, 0, 0.175);}.bk-root .bk-context-menu.bk-horizontal{flex-direction:row;}.bk-root .bk-context-menu.bk-vertical{flex-direction:column;}.bk-root .bk-context-menu > .bk-divider{cursor:default;overflow:hidden;background-color:#e5e5e5;}.bk-root .bk-context-menu.bk-horizontal > .bk-divider{width:1px;margin:5px 0;}.bk-root .bk-context-menu.bk-vertical > .bk-divider{height:1px;margin:0 5px;}.bk-root .bk-context-menu > :not(.bk-divider){border:1px solid transparent;}.bk-root .bk-context-menu > :not(.bk-divider).bk-active{border-color:#26aae1;}.bk-root .bk-context-menu > :not(.bk-divider):hover{background-color:#f9f9f9;}.bk-root .bk-context-menu > :not(.bk-divider):focus,.bk-root .bk-context-menu > :not(.bk-divider):focus-visible{outline:1px dotted #26aae1;outline-offset:-1px;}.bk-root .bk-context-menu > :not(.bk-divider)::-moz-focus-inner{border:0;}.bk-root .bk-context-menu.bk-horizontal > :not(.bk-divider):first-child{border-top-left-radius:4px;border-bottom-left-radius:4px;}.bk-root .bk-context-menu.bk-horizontal > :not(.bk-divider):last-child{border-top-right-radius:4px;border-bottom-right-radius:4px;}.bk-root .bk-context-menu.bk-vertical > :not(.bk-divider):first-child{border-top-left-radius:4px;border-top-right-radius:4px;}.bk-root .bk-context-menu.bk-vertical > :not(.bk-divider):last-child{border-bottom-left-radius:4px;border-bottom-right-radius:4px;}.bk-root .bk-menu{position:absolute;left:0;width:100%;z-index:100;cursor:pointer;font-size:12px;background-color:#fff;border:1px solid #ccc;border-radius:4px;box-shadow:0 6px 12px rgba(0, 0, 0, 0.175);}.bk-root .bk-menu.bk-above{bottom:100%;}.bk-root .bk-menu.bk-below{top:100%;}.bk-root .bk-menu > .bk-divider{height:1px;margin:7.5px 0;overflow:hidden;background-color:#e5e5e5;}.bk-root .bk-menu > :not(.bk-divider){padding:6px 12px;}.bk-root .bk-menu > :not(.bk-divider):hover,.bk-root .bk-menu > :not(.bk-divider).bk-active{background-color:#e6e6e6;}.bk-root .bk-caret{display:inline-block;vertical-align:middle;width:0;height:0;margin:0 5px;}.bk-root .bk-caret.bk-down{border-top:4px solid;}.bk-root .bk-caret.bk-up{border-bottom:4px solid;}.bk-root .bk-caret.bk-down,.bk-root .bk-caret.bk-up{border-right:4px solid transparent;border-left:4px solid transparent;}.bk-root .bk-caret.bk-left{border-right:4px solid;}.bk-root .bk-caret.bk-right{border-left:4px solid;}.bk-root .bk-caret.bk-left,.bk-root .bk-caret.bk-right{border-top:4px solid transparent;border-bottom:4px solid transparent;}.bk-root{}.bk-root .bk-tooltip{font-weight:300;font-size:12px;position:absolute;padding:5px;border:1px solid #e5e5e5;color:#2f2f2f;background-color:white;pointer-events:none;opacity:0.95;z-index:100;}.bk-root .bk-tooltip > div:not(:first-child){margin-top:5px;border-top:#e5e5e5 1px dashed;}.bk-root .bk-tooltip.bk-left.bk-tooltip-arrow::before{position:absolute;margin:-7px 0 0 0;top:50%;width:0;height:0;border-style:solid;border-width:7px 0 7px 0;border-color:transparent;content:" ";display:block;left:-10px;border-right-width:10px;border-right-color:#909599;}.bk-root .bk-tooltip.bk-left::before{left:-10px;border-right-width:10px;border-right-color:#909599;}.bk-root .bk-tooltip.bk-right.bk-tooltip-arrow::after{position:absolute;margin:-7px 0 0 0;top:50%;width:0;height:0;border-style:solid;border-width:7px 0 7px 0;border-color:transparent;content:" ";display:block;right:-10px;border-left-width:10px;border-left-color:#909599;}.bk-root .bk-tooltip.bk-right::after{right:-10px;border-left-width:10px;border-left-color:#909599;}.bk-root .bk-tooltip.bk-above::before{position:absolute;margin:0 0 0 -7px;left:50%;width:0;height:0;border-style:solid;border-width:0 7px 0 7px;border-color:transparent;content:" ";display:block;top:-10px;border-bottom-width:10px;border-bottom-color:#909599;}.bk-root .bk-tooltip.bk-below::after{position:absolute;margin:0 0 0 -7px;left:50%;width:0;height:0;border-style:solid;border-width:0 7px 0 7px;border-color:transparent;content:" ";display:block;bottom:-10px;border-top-width:10px;border-top-color:#909599;}.bk-root .bk-tooltip-row-label{text-align:right;color:#26aae1;}.bk-root .bk-tooltip-row-value{color:default;}.bk-root .bk-tooltip-color-block{width:12px;height:12px;margin-left:5px;margin-right:5px;outline:#dddddd solid 1px;display:inline-block;}
Goal: Regrid a global precipitation dataset into countries conservatively, i.e. by exactly partitioning each grid cell into the precise region boundaries.
Meta Goal: Demonstrate that we don't necessarily need a package for this workflow and showcase some of the new capabilities of GeoPandas along the way.
Approach: We take a three step approach:
It is quite fast and transparent.
import xarray as xr
import geopandas as gp
import pandas as pd
import sparse
%xmode minimal
import hvplot.pandas
import hvplot.xarray
import dask
import cf_xarray
Exception reporting mode: Minimal
#! wget https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/50m/cultural/ne_50m_admin_0_countries.zip
#! unzip ne_50m_admin_0_countries.zip
Load with geopandas:
regions_df = gp.read_file("ne_50m_admin_0_countries.shp")
regions_df.head(1)
featurecla | scalerank | LABELRANK | SOVEREIGNT | SOV_A3 | ADM0_DIF | LEVEL | TYPE | TLC | ADMIN | ... | FCLASS_TR | FCLASS_ID | FCLASS_PL | FCLASS_GR | FCLASS_IT | FCLASS_NL | FCLASS_SE | FCLASS_BD | FCLASS_UA | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Admin-0 country | 1 | 3 | Zimbabwe | ZWE | 0 | 2 | Sovereign country | 1 | Zimbabwe | ... | None | None | None | None | None | None | None | None | None | POLYGON ((31.28789 -22.40205, 31.19727 -22.344... |
1 rows × 169 columns
region_name = 'SOVEREIGNT'
Check it:
regions_df.plot(column=region_name, figsize=(10,4))
<AxesSubplot:>
bbox = tuple(regions_df.total_bounds)
bbox
(-180.0, -89.99892578125002, 180.0, 83.599609375)
All geodataframes should have a coordinate reference system. This is important (and sometimes unfamiliar to users coming from the global climate world).
crs_orig = f'EPSG:{regions_df.crs.to_epsg()}'
crs_orig
'EPSG:4326'
We will now transform to an area preserving projection. This is imporant because we want to do area-weighted regridding.
# use an area preserving projections
crs_area = "ESRI:53034"
regions_df = regions_df.to_crs(crs_area)
This is the NASA / NOAA Global Precipitation Climatology Project processed and stored by Pangeo Forge: https://pangeo-forge.org/dashboard/feedstock/42. We want to aggregate the precipitation into countries defined in the shapefile.
Note that we have the entire dataset in one object. We convert the longitude from [0,360] to [-180,180] because that's what holoviz expects.
store = 'https://ncsa.osn.xsede.org/Pangeo/pangeo-forge/gpcp-feedstock/gpcp.zarr'
with dask.config.set(**{'array.slicing.split_large_chunks': False}):
ds = xr.open_dataset(store, engine='zarr', chunks={})
lon = ds.cf['X'].name
lat = ds.cf['Y'].name
lon_attrs = ds.cf['X'].attrs # assign_coords drops attrs, so copy...
ds = ds.cf.assign_coords(X=(((ds[lon] + 180) % 360) - 180)).sortby(lon)
ds[lon].attrs = lon_attrs # ... and paste
ds[lon].attrs['valid_range'] = [-180,180]
Subset gridded model results to bounds of spatial dataframe to save on memory and computation. More useful when the regions_df is much smaller than the footprint of the gridded model
ds = ds.cf.sel(X=slice(bbox[0],bbox[2]), Y=slice(bbox[1],bbox[3]))
Now we extract just the horizontal grid information. The dataset has information about the lat and lon bounds of each cell, which we need to create the polygons.
grid = ds.drop(['time', 'time_bounds', 'precip']).reset_coords().load()
grid
<xarray.Dataset> Dimensions: (latitude: 173, nv: 2, longitude: 360) Coordinates: * latitude (latitude) float32 -89.0 -88.0 -87.0 -86.0 ... 81.0 82.0 83.0 * longitude (longitude) float32 -180.0 -179.0 -178.0 ... 177.0 178.0 179.0 Dimensions without coordinates: nv Data variables: lat_bounds (latitude, nv) float32 -89.0 -88.0 -88.0 ... 83.0 83.0 84.0 lon_bounds (longitude, nv) float32 180.0 181.0 181.0 ... 179.0 179.0 180.0 Attributes: (12/45) Conventions: CF-1.6, ACDD 1.3 Metadata_Conventions: CF-1.6, Unidata Dataset Discovery v1.0, NOAA ... acknowledgment: This project was supported in part by a grant... cdm_data_type: Grid cdr_program: NOAA Climate Data Record Program for satellit... cdr_variable: precipitation ... ... standard_name_vocabulary: CF Standard Name Table (v41, 22 February 2017) summary: Global Precipitation Climatology Project (GPC... time_coverage_duration: P1D time_coverage_end: 1996-10-01T23:59:59Z time_coverage_start: 1996-10-01T00:00:00Z title: Global Precipitation Climatatology Project (G...
ds
<xarray.Dataset> Dimensions: (latitude: 173, nv: 2, longitude: 360, time: 9226) Coordinates: lat_bounds (latitude, nv) float32 dask.array<chunksize=(173, 2), meta=np.ndarray> * latitude (latitude) float32 -89.0 -88.0 -87.0 -86.0 ... 81.0 82.0 83.0 lon_bounds (longitude, nv) float32 dask.array<chunksize=(360, 2), meta=np.ndarray> * longitude (longitude) float32 -180.0 -179.0 -178.0 ... 177.0 178.0 179.0 * time (time) datetime64[ns] 1996-10-01 1996-10-02 ... 2021-12-31 time_bounds (time, nv) datetime64[ns] dask.array<chunksize=(200, 2), meta=np.ndarray> Dimensions without coordinates: nv Data variables: precip (time, latitude, longitude) float32 dask.array<chunksize=(200, 173, 360), meta=np.ndarray> Attributes: (12/45) Conventions: CF-1.6, ACDD 1.3 Metadata_Conventions: CF-1.6, Unidata Dataset Discovery v1.0, NOAA ... acknowledgment: This project was supported in part by a grant... cdm_data_type: Grid cdr_program: NOAA Climate Data Record Program for satellit... cdr_variable: precipitation ... ... standard_name_vocabulary: CF Standard Name Table (v41, 22 February 2017) summary: Global Precipitation Climatology Project (GPC... time_coverage_duration: P1D time_coverage_end: 1996-10-01T23:59:59Z time_coverage_start: 1996-10-01T00:00:00Z title: Global Precipitation Climatatology Project (G...
Now we "stack" the data into a single 1D array. This is the first step towards transitioning to pandas.
points = grid.stack(point=(lat, lon))
points
<xarray.Dataset> Dimensions: (nv: 2, point: 62280) Coordinates: * point (point) object MultiIndex * latitude (point) float32 -89.0 -89.0 -89.0 -89.0 ... 83.0 83.0 83.0 83.0 * longitude (point) float32 -180.0 -179.0 -178.0 ... 177.0 178.0 179.0 Dimensions without coordinates: nv Data variables: lat_bounds (nv, point) float32 -89.0 -89.0 -89.0 -89.0 ... 84.0 84.0 84.0 lon_bounds (nv, point) float32 180.0 181.0 182.0 ... 178.0 179.0 180.0 Attributes: (12/45) Conventions: CF-1.6, ACDD 1.3 Metadata_Conventions: CF-1.6, Unidata Dataset Discovery v1.0, NOAA ... acknowledgment: This project was supported in part by a grant... cdm_data_type: Grid cdr_program: NOAA Climate Data Record Program for satellit... cdr_variable: precipitation ... ... standard_name_vocabulary: CF Standard Name Table (v41, 22 February 2017) summary: Global Precipitation Climatology Project (GPC... time_coverage_duration: P1D time_coverage_end: 1996-10-01T23:59:59Z time_coverage_start: 1996-10-01T00:00:00Z title: Global Precipitation Climatatology Project (G...
This function creates geometries for a single pair of bounds. It is not fast, but it is fast enough here. Perhaps could be vectorized using pygeos...
from shapely.geometry import Polygon
def bounds_to_poly(lon_bounds, lat_bounds):
if lon_bounds[0] >= 180:
# geopandas needs this
lon_bounds = lon_bounds - 360
return Polygon([
(lon_bounds[0], lat_bounds[0]),
(lon_bounds[0], lat_bounds[1]),
(lon_bounds[1], lat_bounds[1]),
(lon_bounds[1], lat_bounds[0])
])
We apply this function to each grid cell.
%%time
import numpy as np
boxes = xr.apply_ufunc(
bounds_to_poly,
points.lon_bounds,
points.lat_bounds,
input_core_dims=[("nv",), ("nv",)],
output_dtypes=[np.dtype('O')],
vectorize=True
)
boxes
CPU times: user 1.05 s, sys: 60.7 ms, total: 1.11 s Wall time: 1.04 s
<xarray.DataArray (point: 62280)> array([<shapely.geometry.polygon.Polygon object at 0x7fb5f4356040>, <shapely.geometry.polygon.Polygon object at 0x7fb5f43a44c0>, <shapely.geometry.polygon.Polygon object at 0x7fb5f43a46a0>, ..., <shapely.geometry.polygon.Polygon object at 0x7fb5edc1dfa0>, <shapely.geometry.polygon.Polygon object at 0x7fb5edc1dfd0>, <shapely.geometry.polygon.Polygon object at 0x7fb5edc24040>], dtype=object) Coordinates: * point (point) object MultiIndex * latitude (point) float32 -89.0 -89.0 -89.0 -89.0 ... 83.0 83.0 83.0 83.0 * longitude (point) float32 -180.0 -179.0 -178.0 -177.0 ... 177.0 178.0 179.0
Finally, we convert to a GeoDataframe, specifying the original CRS
%%time
grid_df= gp.GeoDataFrame(
data={"geometry": boxes.values, "latitude": boxes[lat], "longitude": boxes[lon]},
index=boxes.indexes["point"],
crs=crs_orig
)
grid_df
CPU times: user 53 ms, sys: 8.07 ms, total: 61.1 ms Wall time: 59.7 ms
geometry | latitude | longitude | ||
---|---|---|---|---|
latitude | longitude | |||
-89.0 | -180.0 | POLYGON ((-180.00000 -89.00000, -180.00000 -88... | -89.0 | -180.0 |
-179.0 | POLYGON ((-179.00000 -89.00000, -179.00000 -88... | -89.0 | -179.0 | |
-178.0 | POLYGON ((-178.00000 -89.00000, -178.00000 -88... | -89.0 | -178.0 | |
-177.0 | POLYGON ((-177.00000 -89.00000, -177.00000 -88... | -89.0 | -177.0 | |
-176.0 | POLYGON ((-176.00000 -89.00000, -176.00000 -88... | -89.0 | -176.0 | |
... | ... | ... | ... | ... |
83.0 | 175.0 | POLYGON ((175.00000 83.00000, 175.00000 84.000... | 83.0 | 175.0 |
176.0 | POLYGON ((176.00000 83.00000, 176.00000 84.000... | 83.0 | 176.0 | |
177.0 | POLYGON ((177.00000 83.00000, 177.00000 84.000... | 83.0 | 177.0 | |
178.0 | POLYGON ((178.00000 83.00000, 178.00000 84.000... | 83.0 | 178.0 | |
179.0 | POLYGON ((179.00000 83.00000, 179.00000 84.000... | 83.0 | 179.0 |
62280 rows × 3 columns
grid_df.crs
<Geographic 2D CRS: EPSG:4326> Name: WGS 84 Axis Info [ellipsoidal]: - Lat[north]: Geodetic latitude (degree) - Lon[east]: Geodetic longitude (degree) Area of Use: - name: World. - bounds: (-180.0, -90.0, 180.0, 90.0) Datum: World Geodetic System 1984 ensemble - Ellipsoid: WGS 84 - Prime Meridian: Greenwich
Now we transform to the area-preserving CRS.
grid_df = grid_df.to_crs(crs_area)
grid_df.crs
<Derived Projected CRS: ESRI:53034> Name: Sphere_Cylindrical_Equal_Area Axis Info [cartesian]: - E[east]: Easting (metre) - N[north]: Northing (metre) Area of Use: - name: World. - bounds: (-180.0, -90.0, 180.0, 90.0) Coordinate Operation: - name: Sphere_Cylindrical_Equal_Area - method: Lambert Cylindrical Equal Area (Spherical) Datum: Not specified (based on Authalic Sphere) - Ellipsoid: Sphere - Prime Meridian: Greenwich
This is the magic of geopandas; it can calculate the overlap between the original grid and the regions. It is expensive because it has to compare 64800 grid boxes with 242 regions.
In this dataframe, the latitude
and longitude
values are from the grid, while all the other columns are from the regions.
%time overlay = grid_df.overlay(regions_df, keep_geom_type=True)
overlay
CPU times: user 12.7 s, sys: 93.1 ms, total: 12.8 s Wall time: 12.8 s
latitude | longitude | featurecla | scalerank | LABELRANK | SOVEREIGNT | SOV_A3 | ADM0_DIF | LEVEL | TYPE | ... | FCLASS_TR | FCLASS_ID | FCLASS_PL | FCLASS_GR | FCLASS_IT | FCLASS_NL | FCLASS_SE | FCLASS_BD | FCLASS_UA | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -89.0 | -180.0 | Admin-0 country | 3 | 4 | Antarctica | ATA | 0 | 2 | Indeterminate | ... | None | None | None | None | None | None | None | None | None | POLYGON ((-20015086.796 -6369062.742, -2001508... |
1 | -89.0 | -179.0 | Admin-0 country | 3 | 4 | Antarctica | ATA | 0 | 2 | Indeterminate | ... | None | None | None | None | None | None | None | None | None | POLYGON ((-19903891.869 -6367118.959, -1979269... |
2 | -89.0 | -178.0 | Admin-0 country | 3 | 4 | Antarctica | ATA | 0 | 2 | Indeterminate | ... | None | None | None | None | None | None | None | None | None | POLYGON ((-19792696.943 -6367118.959, -1968150... |
3 | -89.0 | -177.0 | Admin-0 country | 3 | 4 | Antarctica | ATA | 0 | 2 | Indeterminate | ... | None | None | None | None | None | None | None | None | None | POLYGON ((-19681502.016 -6367118.959, -1957030... |
4 | -89.0 | -176.0 | Admin-0 country | 3 | 4 | Antarctica | ATA | 0 | 2 | Indeterminate | ... | None | None | None | None | None | None | None | None | None | POLYGON ((-19570307.089 -6367118.959, -1945911... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
27274 | 66.0 | -19.0 | Admin-0 country | 1 | 3 | Iceland | ISL | 0 | 2 | Sovereign country | ... | None | None | None | None | None | None | None | None | None | MULTIPOLYGON (((-2001508.680 5820198.111, -202... |
27275 | 66.0 | -18.0 | Admin-0 country | 1 | 3 | Iceland | ISL | 0 | 2 | Sovereign country | ... | None | None | None | None | None | None | None | None | None | MULTIPOLYGON (((-1890313.753 5820198.111, -194... |
27276 | 66.0 | -17.0 | Admin-0 country | 1 | 3 | Iceland | ISL | 0 | 2 | Sovereign country | ... | None | None | None | None | None | None | None | None | None | POLYGON ((-1890313.753 5828182.772, -1886925.7... |
27277 | 66.0 | -16.0 | Admin-0 country | 1 | 3 | Iceland | ISL | 0 | 2 | Sovereign country | ... | None | None | None | None | None | None | None | None | None | POLYGON ((-1779118.826 5843385.413, -1777495.4... |
27278 | 66.0 | -15.0 | Admin-0 country | 1 | 3 | Iceland | ISL | 0 | 2 | Sovereign country | ... | None | None | None | None | None | None | None | None | None | MULTIPOLYGON (((-1667923.900 5835808.462, -166... |
27279 rows × 171 columns
This is essentially already a sparse matrix mapping one grid space to the other. How sparse?
sparsity = len(overlay) / (len(grid_df) * len(regions_df))
sparsity
0.0018099412411025652
overlay
latitude | longitude | featurecla | scalerank | LABELRANK | SOVEREIGNT | SOV_A3 | ADM0_DIF | LEVEL | TYPE | ... | FCLASS_TR | FCLASS_ID | FCLASS_PL | FCLASS_GR | FCLASS_IT | FCLASS_NL | FCLASS_SE | FCLASS_BD | FCLASS_UA | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -89.0 | -180.0 | Admin-0 country | 3 | 4 | Antarctica | ATA | 0 | 2 | Indeterminate | ... | None | None | None | None | None | None | None | None | None | POLYGON ((-20015086.796 -6369062.742, -2001508... |
1 | -89.0 | -179.0 | Admin-0 country | 3 | 4 | Antarctica | ATA | 0 | 2 | Indeterminate | ... | None | None | None | None | None | None | None | None | None | POLYGON ((-19903891.869 -6367118.959, -1979269... |
2 | -89.0 | -178.0 | Admin-0 country | 3 | 4 | Antarctica | ATA | 0 | 2 | Indeterminate | ... | None | None | None | None | None | None | None | None | None | POLYGON ((-19792696.943 -6367118.959, -1968150... |
3 | -89.0 | -177.0 | Admin-0 country | 3 | 4 | Antarctica | ATA | 0 | 2 | Indeterminate | ... | None | None | None | None | None | None | None | None | None | POLYGON ((-19681502.016 -6367118.959, -1957030... |
4 | -89.0 | -176.0 | Admin-0 country | 3 | 4 | Antarctica | ATA | 0 | 2 | Indeterminate | ... | None | None | None | None | None | None | None | None | None | POLYGON ((-19570307.089 -6367118.959, -1945911... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
27274 | 66.0 | -19.0 | Admin-0 country | 1 | 3 | Iceland | ISL | 0 | 2 | Sovereign country | ... | None | None | None | None | None | None | None | None | None | MULTIPOLYGON (((-2001508.680 5820198.111, -202... |
27275 | 66.0 | -18.0 | Admin-0 country | 1 | 3 | Iceland | ISL | 0 | 2 | Sovereign country | ... | None | None | None | None | None | None | None | None | None | MULTIPOLYGON (((-1890313.753 5820198.111, -194... |
27276 | 66.0 | -17.0 | Admin-0 country | 1 | 3 | Iceland | ISL | 0 | 2 | Sovereign country | ... | None | None | None | None | None | None | None | None | None | POLYGON ((-1890313.753 5828182.772, -1886925.7... |
27277 | 66.0 | -16.0 | Admin-0 country | 1 | 3 | Iceland | ISL | 0 | 2 | Sovereign country | ... | None | None | None | None | None | None | None | None | None | POLYGON ((-1779118.826 5843385.413, -1777495.4... |
27278 | 66.0 | -15.0 | Admin-0 country | 1 | 3 | Iceland | ISL | 0 | 2 | Sovereign country | ... | None | None | None | None | None | None | None | None | None | MULTIPOLYGON (((-1667923.900 5835808.462, -166... |
27279 rows × 171 columns
Let's explore these overlays a little bit
overlay[overlay[region_name] == "Italy"].geometry.plot(edgecolor='k')
<AxesSubplot:>
We can verify that each basin's area is preserved in the overlay operation.
overlay.geometry.area.groupby(overlay[region_name]).sum().nlargest(10)/1e6 # km2
SOVEREIGNT Russia 1.687963e+07 Antarctica 1.221779e+07 Canada 9.872058e+06 United States of America 9.449645e+06 China 9.372241e+06 Brazil 8.499337e+06 Australia 7.706575e+06 India 3.155821e+06 Argentina 2.782670e+06 Kazakhstan 2.712770e+06 dtype: float64
regions_df.geometry.area.groupby(regions_df[region_name]).sum().nlargest(10)
SOVEREIGNT Russia 1.687963e+13 Antarctica 1.225664e+13 Canada 9.872058e+12 United States of America 9.449645e+12 China 9.372241e+12 Brazil 8.499337e+12 Australia 7.706575e+12 India 3.155821e+12 Argentina 2.782670e+12 Kazakhstan 2.712770e+12 dtype: float64
This is another key step. This transform tells us how much of a country's total area comes from each of the grid cells. This is accurate because we used an area-preserving CRS.
grid_cell_fraction = overlay.geometry.area.groupby(overlay[region_name]).transform(lambda x: x / x.sum())
grid_cell_fraction
0 0.000026 1 0.000026 2 0.000026 3 0.000026 4 0.000026 ... 27274 0.005459 27275 0.004817 27276 0.016131 27277 0.015022 27278 0.001937 Length: 27279, dtype: float64
We can verify that these all sum up to one.
grid_cell_fraction.groupby(overlay[region_name]).sum()
SOVEREIGNT Afghanistan 1.0 Albania 1.0 Algeria 1.0 Andorra 1.0 Angola 1.0 ... Western Sahara 1.0 Yemen 1.0 Zambia 1.0 Zimbabwe 1.0 eSwatini 1.0 Length: 201, dtype: float64
The first step is making a MultiIndex
multi_index = overlay.set_index([lat, lon, region_name]).index
df_weights = pd.DataFrame({"weights": grid_cell_fraction.values}, index=multi_index)
df_weights
weights | |||
---|---|---|---|
latitude | longitude | SOVEREIGNT | |
-89.0 | -180.0 | Antarctica | 0.000026 |
-179.0 | Antarctica | 0.000026 | |
-178.0 | Antarctica | 0.000026 | |
-177.0 | Antarctica | 0.000026 | |
-176.0 | Antarctica | 0.000026 | |
... | ... | ... | ... |
66.0 | -19.0 | Iceland | 0.005459 |
-18.0 | Iceland | 0.004817 | |
-17.0 | Iceland | 0.016131 | |
-16.0 | Iceland | 0.015022 | |
-15.0 | Iceland | 0.001937 |
27279 rows × 1 columns
We can bring this directly into xarray as a 1D Dataset.
import xarray as xr
ds_weights = xr.Dataset(df_weights)
ds_weights
<xarray.Dataset> Dimensions: (dim_0: 27279) Coordinates: * dim_0 (dim_0) object MultiIndex * latitude (dim_0) float64 -89.0 -89.0 -89.0 -89.0 ... 66.0 66.0 66.0 66.0 * longitude (dim_0) float64 -180.0 -179.0 -178.0 ... -17.0 -16.0 -15.0 * SOVEREIGNT (dim_0) object 'Antarctica' 'Antarctica' ... 'Iceland' 'Iceland' Data variables: weights (dim_0) float64 2.649e-05 2.649e-05 ... 0.01502 0.001937
Now we unstack it into a sparse array.
weights_sparse = ds_weights.unstack(sparse=True, fill_value=0.).weights
weights_sparse
<xarray.DataArray 'weights' (latitude: 170, longitude: 360, SOVEREIGNT: 201)> <COO: shape=(170, 360, 201), dtype=float64, nnz=27267, fill_value=0.0> Coordinates: * latitude (latitude) float64 -89.0 -88.0 -87.0 -86.0 ... 81.0 82.0 83.0 * longitude (longitude) float64 -180.0 -179.0 -178.0 ... 177.0 178.0 179.0 * SOVEREIGNT (SOVEREIGNT) object 'Afghanistan' 'Albania' ... 'eSwatini'
Here we can clearly see that this is a sparse matrix, mapping the input space (lat, lon) to the output space (SOVEREIGNT).
To regrid the data, we just have to multiply the original precip dataset by this matrix. There are many different ways to do this. The simplest one would just be:
regridded = xr.dot(ds.precip, weights_sparse)
regridded
/home/conda/users/266bc1f046057293eb778c8b95277aae62235b8cac1c12e1334002c7a7656436-20221021-195048-630919-211-pangeo/lib/python3.9/site-packages/xarray/core/indexing.py:1374: PerformanceWarning: Slicing is producing a large chunk. To accept the large chunk and silence this warning, set the option >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}): ... array[indexer] To avoid creating the large chunks, set the option >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}): ... array[indexer] return self.array[key]
<xarray.DataArray (time: 9226, SOVEREIGNT: 201)> dask.array<sum-aggregate, shape=(9226, 201), dtype=float64, chunksize=(200, 201), chunktype=numpy.ndarray> Coordinates: * time (time) datetime64[ns] 1996-10-01 1996-10-02 ... 2021-12-31 * SOVEREIGNT (SOVEREIGNT) object 'Afghanistan' 'Albania' ... 'eSwatini'
Unfortunately, that doesn't work out of the box, because sparse doesn't implement einsum (see https://github.com/pydata/sparse/issues/31).
# regridded[0].compute() # fails
There are many ways to work around this, starting from just using a dense matrix instead. Below is the fastest thing I came up with after some extensive experimentation.
Sparse does implement matmul, so we can use that. But we have to do some reshaping to make it work with our data.
def apply_weights_matmul_sparse(weights, data):
assert isinstance(weights, sparse.SparseArray)
assert isinstance(data, np.ndarray)
data = sparse.COO.from_numpy(data)
data_shape = data.shape
# k = nlat * nlon
n, k = data_shape[0], data_shape[1] * data_shape[2]
data = data.reshape((n, k))
weights_shape = weights.shape
k_, m = weights_shape[0] * weights_shape[1], weights_shape[2]
assert k == k_
weights_data = weights.reshape((k, m))
regridded = sparse.matmul(data, weights_data)
assert regridded.shape == (n, m)
return regridded.todense()
Before applying this to the data, let's load it into memory and then chunk it again. This is not necessary (we could just stream the data from the cloud), but it is a cleaner benchmark. Chunking again allows us to leverage dask parallelism. We also eliminate some know corrupted values via a mask.
%%time
mask = (ds.precip >= 0) & (ds.precip < 3000)
precip = ds.precip.where(mask)
precip_in_mem = precip.compute().chunk({"time": "10MB"})
precip_in_mem
CPU times: user 12.3 s, sys: 3.14 s, total: 15.4 s Wall time: 23.4 s
<xarray.DataArray 'precip' (time: 9226, latitude: 173, longitude: 360)> dask.array<xarray-<this-array>, shape=(9226, 173, 360), dtype=float32, chunksize=(40, 173, 360), chunktype=numpy.ndarray> Coordinates: * latitude (latitude) float32 -89.0 -88.0 -87.0 -86.0 ... 81.0 82.0 83.0 * longitude (longitude) float32 -180.0 -179.0 -178.0 ... 177.0 178.0 179.0 * time (time) datetime64[ns] 1996-10-01 1996-10-02 ... 2021-12-31 Attributes: cell_methods: area: mean time: mean long_name: NOAA Climate Data Record (CDR) of Daily GPCP Satellite-Ga... standard_name: lwe_precipitation_rate units: mm/day valid_range: [0.0, 100.0]
Take a look at mean precip over CONUS
da = precip_in_mem.mean(dim='time')
da.hvplot.image(x=ds.cf['X'].name, y=ds.cf['Y'].name, rasterize=True, geo=True, cmap='viridis', frame_width=600, tiles='OSM')
with dask.config.set(**{'array.slicing.split_large_chunks': False}):
precip_regridded = xr.apply_ufunc(
apply_weights_matmul_sparse,
weights_sparse,
precip_in_mem,
join="left",
input_core_dims=[[lat, lon, region_name], [lat, lon]],
output_core_dims=[[region_name]],
dask="parallelized",
dask_gufunc_kwargs=dict(meta=[np.ndarray((0,))])
)
precip_regridded
Finally, we compute it!
%%time
from dask.diagnostics import ProgressBar
with ProgressBar():
precip_regridded.load()
[# ] | 3% Completed | 3.0s
/home/conda/users/266bc1f046057293eb778c8b95277aae62235b8cac1c12e1334002c7a7656436-20221021-195048-630919-211-pangeo/lib/python3.9/site-packages/sparse/_common.py:232: RuntimeWarning: Nan will not be propagated in matrix multiplication warnings.warn(
[########################################] | 100% Completed | 22.2s CPU times: user 41.1 s, sys: 556 ms, total: 41.6 s Wall time: 22.5 s
precip_regridded
<xarray.DataArray (time: 9226, SOVEREIGNT: 201)> array([[3.82323997e-05, 0.00000000e+00, 6.48175116e-06, ..., 1.74498871e-02, 0.00000000e+00, 0.00000000e+00], [4.26562427e-01, 1.12849841e-02, 8.86701773e-02, ..., 4.27312567e-02, 0.00000000e+00, 0.00000000e+00], [8.62065596e-01, 9.50921967e+00, 1.33509326e-02, ..., 7.96630231e-04, 0.00000000e+00, 0.00000000e+00], ..., [2.08564464e-01, 1.53430027e+00, 0.00000000e+00, ..., 5.72802686e-01, 8.95664055e+00, 1.99760773e+00], [7.82498343e-02, 2.20751937e-02, 4.20740792e-03, ..., 1.99925202e+00, 1.33976102e+00, 1.72203755e+01], [8.46889840e+00, 0.00000000e+00, 0.00000000e+00, ..., 4.77679192e+00, 3.91162785e+00, 1.33407013e+01]]) Coordinates: * SOVEREIGNT (SOVEREIGNT) object 'Afghanistan' 'Albania' ... 'eSwatini' * time (time) datetime64[ns] 1996-10-01 1996-10-02 ... 2021-12-31
ds_precip = precip_regridded.sel(SOVEREIGNT=['Italy', 'Morocco']).resample(time="MS").mean().to_dataset(region_name)
ds_precip.hvplot(x='time', grid=True, frame_width=1000)
df_mean = precip_regridded.to_pandas().mean()
df_mean.name = "precip"
df_mean = pd.DataFrame(df_mean).reset_index()
merged = pd.merge(regions_df, df_mean)
Convert back to geographic coordinates for plotting
merged_geo = merged.to_crs(crs_orig)
Matplotlib:
merged_geo.plot(column='precip', edgecolor='k', figsize=(12,5), legend=True)
<AxesSubplot:>
Holoviz:
merged_geo.hvplot(c='precip', geo=True, cmap='viridis', frame_width=600, tiles='OSM',
title='Mean Precip (mm/day): NASA/NOAA Global Precip Climatology [1996-10-01/2021-12-31]')
/home/conda/users/266bc1f046057293eb778c8b95277aae62235b8cac1c12e1334002c7a7656436-20221021-195048-630919-211-pangeo/lib/python3.9/site-packages/geoviews/operation/projection.py:79: ShapelyDeprecationWarning: Iteration over multi-part geometries is deprecated and will be removed in Shapely 2.0. Use the `geoms` property to access the constituent parts of a multi-part geometry. polys = [g for g in geom if g.area > 1e-15]