#!/usr/bin/env python # coding: utf-8 # # Spatial Joins # # A *spatial join* uses [binary predicates](http://shapely.readthedocs.io/en/latest/manual.html#binary-predicates) # such as `intersects` and `crosses` to combine two `GeoDataFrames` based on the spatial relationship # between their geometries. # # A common use case might be a spatial join between a point layer and a polygon layer where you want to retain the point geometries and grab the attributes of the intersecting polygons. # # ![illustration](https://web.natur.cuni.cz/~langhamr/lectures/vtfg1/mapinfo_1/about_gis/Image23.gif) # # ## Types of spatial joins # # We currently support the following methods of spatial joins. We refer to the *left_df* and *right_df* which are the correspond to the two dataframes passed in as args. # # ### Left outer join # # In a LEFT OUTER JOIN (`how='left'`), we keep *all* rows from the left and duplicate them if necessary to represent multiple hits between the two dataframes. We retain attributes of the right if they intersect and lose right rows that don't intersect. A left outer join implies that we are interested in retaining the geometries of the left. # # This is equivalent to the PostGIS query: # ``` # SELECT pts.geom, pts.id as ptid, polys.id as polyid # FROM pts # LEFT OUTER JOIN polys # ON ST_Intersects(pts.geom, polys.geom); # # geom | ptid | polyid # --------------------------------------------+------+-------- # 010100000040A9FBF2D88AD03F349CD47D796CE9BF | 4 | 10 # 010100000048EABE3CB622D8BFA8FBF2D88AA0E9BF | 3 | 10 # 010100000048EABE3CB622D8BFA8FBF2D88AA0E9BF | 3 | 20 # 0101000000F0D88AA0E1A4EEBF7052F7E5B115E9BF | 2 | 20 # 0101000000818693BA2F8FF7BF4ADD97C75604E9BF | 1 | # (5 rows) # ``` # # ### Right outer join # # In a RIGHT OUTER JOIN (`how='right'`), we keep *all* rows from the right and duplicate them if necessary to represent multiple hits between the two dataframes. We retain attributes of the left if they intersect and lose left rows that don't intersect. A right outer join implies that we are interested in retaining the geometries of the right. # # This is equivalent to the PostGIS query: # ``` # SELECT polys.geom, pts.id as ptid, polys.id as polyid # FROM pts # RIGHT OUTER JOIN polys # ON ST_Intersects(pts.geom, polys.geom); # # geom | ptid | polyid # ----------+------+-------- # 01...9BF | 4 | 10 # 01...9BF | 3 | 10 # 02...7BF | 3 | 20 # 02...7BF | 2 | 20 # 00...5BF | | 30 # (5 rows) # ``` # # ### Inner join # # In an INNER JOIN (`how='inner'`), we keep rows from the right and left only where their binary predicate is `True`. We duplicate them if necessary to represent multiple hits between the two dataframes. We retain attributes of the right and left only if they intersect and lose all rows that do not. An inner join implies that we are interested in retaining the geometries of the left. # # This is equivalent to the PostGIS query: # ``` # SELECT pts.geom, pts.id as ptid, polys.id as polyid # FROM pts # INNER JOIN polys # ON ST_Intersects(pts.geom, polys.geom); # # geom | ptid | polyid # --------------------------------------------+------+-------- # 010100000040A9FBF2D88AD03F349CD47D796CE9BF | 4 | 10 # 010100000048EABE3CB622D8BFA8FBF2D88AA0E9BF | 3 | 10 # 010100000048EABE3CB622D8BFA8FBF2D88AA0E9BF | 3 | 20 # 0101000000F0D88AA0E1A4EEBF7052F7E5B115E9BF | 2 | 20 # (4 rows) # ``` # ## Spatial Joins between two GeoDataFrames # # Let's take a look at how we'd implement these using `GeoPandas`. First, load up the NYC test data into `GeoDataFrames`: # In[ ]: get_ipython().run_line_magic('matplotlib', 'inline') from shapely.geometry import Point from geopandas import GeoDataFrame, read_file import geodatasets # NYC Boros zippath = geodatasets.get_path("nybb") polydf = read_file(zippath) # Generate some points b = [int(x) for x in polydf.total_bounds] N = 8 pointdf = GeoDataFrame( [ {"geometry": Point(x, y), "value1": x + y, "value2": x - y} for x, y in zip( range(b[0], b[2], int((b[2] - b[0]) / N)), range(b[1], b[3], int((b[3] - b[1]) / N)), ) ] ) # Make sure they're using the same projection reference pointdf.crs = polydf.crs # In[ ]: pointdf # In[ ]: polydf # In[ ]: pointdf.plot() # In[ ]: polydf.plot() # ## Joins # In[ ]: join_left_df = pointdf.sjoin(polydf, how="left") join_left_df # Note the NaNs where the point did not intersect a boro # In[ ]: join_right_df = pointdf.sjoin(polydf, how="right") join_right_df # Note Staten Island is repeated # In[ ]: join_inner_df = pointdf.sjoin(polydf, how="inner") join_inner_df # Note the lack of NaNs; dropped anything that didn't intersect # We're not limited to using the `intersection` binary predicate. Any of the `Shapely` geometry methods that return a Boolean can be used by specifying the `predicate` kwarg. # In[ ]: pointdf.sjoin(polydf, how="left", predicate="within") # We can also conduct a nearest neighbour join with `sjoin_nearest`. # In[ ]: pointdf.sjoin_nearest(polydf, how="left", distance_col="Distances") # Note the optional Distances column with computed distances between each point # and the nearest polydf geometry.