Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
from pyspark.sql import SparkSession
from pyspark import StorageLevel
import geopandas as gpd
import pandas as pd
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType
from pyspark.sql.types import LongType
from shapely.geometry import Point
from shapely.geometry import Polygon
from sedona.spark import *
from sedona.core.geom.envelope import Envelope
# Build the Spark session: request the Sedona, GeoTools-wrapper and
# spark-extension jars by Maven coordinate, and add the Unidata repository
# as an extra place to resolve them from.
# NOTE(review): the dependency log recorded after this cell lists
# sedona-spark-shaded-3.4_2.12, not the sedona-spark-3.4_2.12 coordinate
# requested here -- confirm which artifact is intended.
config = (
    SedonaContext.builder()
    .config(
        "spark.jars.packages",
        "org.apache.sedona:sedona-spark-3.4_2.12:1.5.1,"
        "org.datasyslab:geotools-wrapper:1.5.1-28.2,"
        "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.4",
    )
    .config(
        "spark.jars.repositories",
        "https://artifacts.unidata.ucar.edu/repository/unidata-all",
    )
    .getOrCreate()
)
# Wrap the configured session in a Sedona context; `sedona` is the session
# handle used by the rest of this file.
sedona = SedonaContext.create(config)
https://artifacts.unidata.ucar.edu/repository/unidata-all added as a remote repository with the name: repo-1 Ivy Default Cache set to: /root/.ivy2/cache The jars for the packages stored in: /root/.ivy2/jars org.apache.sedona#sedona-spark-shaded-3.4_2.12 added as a dependency org.datasyslab#geotools-wrapper added as a dependency uk.co.gresearch.spark#spark-extension_2.12 added as a dependency :: resolving dependencies :: org.apache.spark#spark-submit-parent-7eb0c6dc-19b8-477f-b252-512428246ef7;1.0 confs: [default] found org.apache.sedona#sedona-spark-shaded-3.4_2.12;1.5.1 in central found edu.ucar#cdm-core;5.4.2 in repo-1 found edu.ucar#udunits;5.4.2 in repo-1 found edu.ucar#httpservices;5.4.2 in repo-1 found com.google.guava#guava;30.1-jre in central found com.google.guava#failureaccess;1.0.1 in central found com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in central
:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml
found com.google.code.findbugs#jsr305;3.0.2 in central found org.checkerframework#checker-qual;3.5.0 in central found com.google.errorprone#error_prone_annotations;2.3.4 in central found com.google.j2objc#j2objc-annotations;1.3 in central found org.apache.httpcomponents#httpclient;4.5.13 in central found org.apache.httpcomponents#httpcore;4.4.13 in central found commons-logging#commons-logging;1.2 in central found commons-codec#commons-codec;1.11 in central found com.beust#jcommander;1.78 in central found com.google.protobuf#protobuf-java;3.12.4 in central found com.google.re2j#re2j;1.3 in central found joda-time#joda-time;2.10.3 in central found org.jdom#jdom2;2.0.6 in central found org.slf4j#slf4j-api;1.7.36 in central found org.apache.httpcomponents#httpmime;4.5.13 in central found org.datasyslab#geotools-wrapper;1.5.1-28.2 in central found uk.co.gresearch.spark#spark-extension_2.12;2.11.0-3.4 in central found com.github.scopt#scopt_2.12;4.1.0 in central :: resolution report :: resolve 292ms :: artifacts dl 10ms :: modules in use: com.beust#jcommander;1.78 from central in [default] com.github.scopt#scopt_2.12;4.1.0 from central in [default] com.google.code.findbugs#jsr305;3.0.2 from central in [default] com.google.errorprone#error_prone_annotations;2.3.4 from central in [default] com.google.guava#failureaccess;1.0.1 from central in [default] com.google.guava#guava;30.1-jre from central in [default] com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava from central in [default] com.google.j2objc#j2objc-annotations;1.3 from central in [default] com.google.protobuf#protobuf-java;3.12.4 from central in [default] com.google.re2j#re2j;1.3 from central in [default] commons-codec#commons-codec;1.11 from central in [default] commons-logging#commons-logging;1.2 from central in [default] edu.ucar#cdm-core;5.4.2 from repo-1 in [default] edu.ucar#httpservices;5.4.2 from repo-1 in [default] edu.ucar#udunits;5.4.2 from repo-1 in [default] 
joda-time#joda-time;2.10.3 from central in [default] org.apache.httpcomponents#httpclient;4.5.13 from central in [default] org.apache.httpcomponents#httpcore;4.4.13 from central in [default] org.apache.httpcomponents#httpmime;4.5.13 from central in [default] org.apache.sedona#sedona-spark-shaded-3.4_2.12;1.5.1 from central in [default] org.checkerframework#checker-qual;3.5.0 from central in [default] org.datasyslab#geotools-wrapper;1.5.1-28.2 from central in [default] org.jdom#jdom2;2.0.6 from central in [default] org.slf4j#slf4j-api;1.7.36 from central in [default] uk.co.gresearch.spark#spark-extension_2.12;2.11.0-3.4 from central in [default] --------------------------------------------------------------------- | | modules || artifacts | | conf | number| search|dwnlded|evicted|| number|dwnlded| --------------------------------------------------------------------- | default | 25 | 0 | 0 | 0 || 25 | 0 | --------------------------------------------------------------------- :: retrieving :: org.apache.spark#spark-submit-parent-7eb0c6dc-19b8-477f-b252-512428246ef7 confs: [default] 0 artifacts copied, 25 already retrieved (0kB/9ms) 24/01/20 22:54:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
# SparkContext of the Sedona session; it is passed as the first argument to
# the RDD constructors and file readers used later in this file.
sc = sedona.sparkContext
Suppose we want to load the following CSV file into an Apache Sedona PointRDD:
testattribute0,-88.331492,32.324142,testattribute1,testattribute2
testattribute0,-88.175933,32.360763,testattribute1,testattribute2
testattribute0,-88.388954,32.357073,testattribute1,testattribute2
testattribute0,-88.221102,32.35078,testattribute1,testattribute2
testattribute0,-88.323995,32.950671,testattribute1,testattribute2
testattribute0,-88.231077,32.700812,testattribute1,testattribute2
# Load the CSV into a PointRDD. The sample rows show the coordinates starting
# at column index 1, which matches the offset argument of 1.
# NOTE(review): the trailing arguments look like (splitter, carry the other
# columns as user data, number of partitions) -- confirm against the PointRDD docs.
point_rdd = PointRDD(sc, "data/arealm-small.csv", 1, FileDataSplitter.CSV, True, 10)
# Getting the approximate total count (read as an attribute, not a method call)
point_rdd.approximateTotalCount
3000
# Get the boundary of the PointRDD (works for any other SpatialRDD as well); it returns an
# Envelope object, which inherits from shapely.geometry.Polygon
point_rdd.boundary()
/usr/local/lib/python3.10/dist-packages/sedona/core/geom/envelope.py:27: ShapelyDeprecationWarning: Setting custom attributes on geometry objects is deprecated, and will raise an AttributeError in Shapely 2.0 self.minx = minx /usr/local/lib/python3.10/dist-packages/sedona/core/geom/envelope.py:28: ShapelyDeprecationWarning: Setting custom attributes on geometry objects is deprecated, and will raise an AttributeError in Shapely 2.0 self.maxx = maxx /usr/local/lib/python3.10/dist-packages/sedona/core/geom/envelope.py:29: ShapelyDeprecationWarning: Setting custom attributes on geometry objects is deprecated, and will raise an AttributeError in Shapely 2.0 self.miny = miny /usr/local/lib/python3.10/dist-packages/sedona/core/geom/envelope.py:30: ShapelyDeprecationWarning: Setting custom attributes on geometry objects is deprecated, and will raise an AttributeError in Shapely 2.0 self.maxy = maxy
# Run analyze() on the RDD; the call returns True here.
# NOTE(review): presumably this computes the dataset boundary and count
# statistics -- confirm in the Sedona docs.
point_rdd.analyze()
True
# Boundary envelope of the PointRDD (or any other SpatialRDD), read as an attribute; it is an
# Envelope object, which inherits from shapely.geometry.Polygon
point_rdd.boundaryEnvelope
# Count the records after removing duplicates (2996 here, versus an approximate total of 3000)
point_rdd.countWithoutDuplicates()
2996
# Get the source EPSG code (an empty string for this RDD)
point_rdd.getSourceEpsgCode()
''
# Get the target EPSG code (an empty string for this RDD)
point_rdd.getTargetEpsgCode()
''
# Spatially partition the data using a KDB-tree grid
point_rdd.spatialPartitioning(GridType.KDBTREE)
True
The rawSpatialRDD property returns an RDD of GeoData objects, each of which has 2 attributes: the geometry and its user data.
You can apply any RDD operation to those objects, and the work is distributed across the machines of the cluster.
# take the first element
point_rdd.rawSpatialRDD.take(1)
[Geometry: Point userData: testattribute0 testattribute1 testattribute2]
# collect the whole RDD into a Python list, then keep the first five items
point_rdd.rawSpatialRDD.collect()[:5]
[Geometry: Point userData: testattribute0 testattribute1 testattribute2, Geometry: Point userData: testattribute0 testattribute1 testattribute2, Geometry: Point userData: testattribute0 testattribute1 testattribute2, Geometry: Point userData: testattribute0 testattribute1 testattribute2, Geometry: Point userData: testattribute0 testattribute1 testattribute2]
# apply map functions, for example the distance from each geometry to Point(21 52)
point_rdd.rawSpatialRDD.map(lambda x: x.geom.distance(Point(21, 52))).take(5)
[111.08786851399313, 110.92828303170774, 111.1385974283527, 110.97450594034112, 110.97122518072091]
# Flatten each GeoData record into [geometry, attr, attr, attr]: the user data
# is a single tab-separated string, so split it into its individual values.
point_rdd_to_geo = point_rdd.rawSpatialRDD.map(
    lambda geo_data: [geo_data.geom, *geo_data.getUserData().split("\t")]
)
# Collect the rows and wrap them in a GeoDataFrame whose geometry column is "geom".
point_gdf = gpd.GeoDataFrame(
    point_rdd_to_geo.collect(),
    columns=["geom", "attr1", "attr2", "attr3"],
    geometry="geom",
)
point_gdf[:5]
geom | attr1 | attr2 | attr3 | |
---|---|---|---|---|
0 | POINT (-88.33149 32.32414) | testattribute0 | testattribute1 | testattribute2 |
1 | POINT (-88.17593 32.36076) | testattribute0 | testattribute1 | testattribute2 |
2 | POINT (-88.38895 32.35707) | testattribute0 | testattribute1 | testattribute2 |
3 | POINT (-88.22110 32.35078) | testattribute0 | testattribute1 | testattribute2 |
4 | POINT (-88.32399 32.95067) | testattribute0 | testattribute1 | testattribute2 |
# Adapter converts the geospatial data types introduced with Sedona (here a
# SpatialRDD) into other ones (here a Spark DataFrame).
spatial_df = Adapter.toDf(point_rdd, ["attr1", "attr2", "attr3"], sedona)
# createOrReplaceTempView() returns None, so it is called as its own statement;
# chaining it onto the assignment would leave spatial_df bound to None.
spatial_df.createOrReplaceTempView("spatial_df")
# Query the registered view with Spark SQL, renaming the geometry column to "geom".
spatial_gdf = sedona.sql("Select attr1, attr2, attr3, geometry as geom from spatial_df")
# Show the first 5 rows without truncating the column values.
spatial_gdf.show(5, False)
+--------------+--------------+--------------+----------------------------+ |attr1 |attr2 |attr3 |geom | +--------------+--------------+--------------+----------------------------+ |testattribute0|testattribute1|testattribute2|POINT (-88.331492 32.324142)| |testattribute0|testattribute1|testattribute2|POINT (-88.175933 32.360763)| |testattribute0|testattribute1|testattribute2|POINT (-88.388954 32.357073)| |testattribute0|testattribute1|testattribute2|POINT (-88.221102 32.35078) | |testattribute0|testattribute1|testattribute2|POINT (-88.323995 32.950671)| +--------------+--------------+--------------+----------------------------+ only showing top 5 rows
# Convert the Spark DataFrame to pandas and wrap it in a GeoDataFrame, using "geom" as the geometry column
gpd.GeoDataFrame(spatial_gdf.toPandas(), geometry="geom")[:5]
attr1 | attr2 | attr3 | geom | |
---|---|---|---|---|
0 | testattribute0 | testattribute1 | testattribute2 | POINT (-88.33149 32.32414) |
1 | testattribute0 | testattribute1 | testattribute2 | POINT (-88.17593 32.36076) |
2 | testattribute0 | testattribute1 | testattribute2 | POINT (-88.38895 32.35707) |
3 | testattribute0 | testattribute1 | testattribute2 | POINT (-88.22110 32.35078) |
4 | testattribute0 | testattribute1 | testattribute2 | POINT (-88.32399 32.95067) |
# Schema for the flattened rows: one geometry column followed by three
# non-nullable string attribute columns.
schema = StructType(
    [StructField("geometry", GeometryType(), False)]
    + [StructField(column, StringType(), False) for column in ("attr1", "attr2", "attr3")]
)
# Build the Spark DataFrame directly from the RDD of rows, with schema
# verification switched off.
geo_df = sedona.createDataFrame(point_rdd_to_geo, schema, verifySchema=False)
# Bring it to pandas and wrap it in a GeoDataFrame to preview the first rows.
gpd.GeoDataFrame(geo_df.toPandas(), geometry="geometry")[:5]
geometry | attr1 | attr2 | attr3 | |
---|---|---|---|---|
0 | POINT (-88.33149 32.32414) | testattribute0 | testattribute1 | testattribute2 |
1 | POINT (-88.17593 32.36076) | testattribute0 | testattribute1 | testattribute2 |
2 | POINT (-88.38895 32.35707) | testattribute0 | testattribute1 | testattribute2 |
3 | POINT (-88.22110 32.35078) | testattribute0 | testattribute1 | testattribute2 |
4 | POINT (-88.32399 32.95067) | testattribute0 | testattribute1 | testattribute2 |
Currently the library supports 5 typed SpatialRDDs: RectangleRDD, PointRDD, PolygonRDD, LineStringRDD and CircleRDD.
# Build one typed SpatialRDD per input file.
# NOTE(review): point_rdd is re-created here with its fifth argument set to
# False (it was True earlier), which presumably is why the later query outputs
# show empty userData -- confirm against the PointRDD constructor.
rectangle_rdd = RectangleRDD(sc, "data/zcta510-small.csv", FileDataSplitter.CSV, True, 11)
point_rdd = PointRDD(sc, "data/arealm-small.csv", 1, FileDataSplitter.CSV, False, 11)
polygon_rdd = PolygonRDD(sc, "data/primaryroads-polygon.csv", FileDataSplitter.CSV, True, 11)
linestring_rdd = LineStringRDD(sc, "data/primaryroads-linestring.csv", FileDataSplitter.CSV, True)
# Run analyze() on every RDD before it is partitioned and queried further down.
rectangle_rdd.analyze()
point_rdd.analyze()
polygon_rdd.analyze()
linestring_rdd.analyze()
True
Apache Sedona's spatial partitioning can significantly speed up join queries. Three spatial partitioning methods are available: KDB-Tree, Quad-Tree and R-Tree. The two SpatialRDDs being joined must be partitioned in the same way.
# Partition the points with a KDB-tree grid
point_rdd.spatialPartitioning(GridType.KDBTREE)
True
Apache Sedona provides two types of spatial indexes, Quad-Tree and R-Tree. Once you specify an index type, Apache Sedona will build a local tree index on each of the SpatialRDD's partitions.
# Build an R-tree index on point_rdd.
# NOTE(review): the True flag presumably requests the index on the spatially
# partitioned RDD -- confirm against the buildIndex docs.
point_rdd.buildIndex(IndexType.RTREE, True)
A spatial join is an operation that combines data based on spatial relations such as:
To use a spatial join in the GeoPySpark library, use the JoinQuery object, which implements the methods below:
SpatialJoinQuery(spatialRDD: SpatialRDD, queryRDD: SpatialRDD, useIndex: bool, considerBoundaryIntersection: bool) -> RDD
DistanceJoinQuery(spatialRDD: SpatialRDD, queryRDD: SpatialRDD, useIndex: bool, considerBoundaryIntersection: bool) -> RDD
spatialJoin(queryWindowRDD: SpatialRDD, objectRDD: SpatialRDD, joinParams: JoinParams) -> RDD
DistanceJoinQueryFlat(spatialRDD: SpatialRDD, queryRDD: SpatialRDD, useIndex: bool, considerBoundaryIntersection: bool) -> RDD
SpatialJoinQueryFlat(spatialRDD: SpatialRDD, queryRDD: SpatialRDD, useIndex: bool, considerBoundaryIntersection: bool) -> RDD
# partitioning the data: the rectangles reuse the points' partitioner so that
# both RDDs are partitioned in the same way
point_rdd.spatialPartitioning(GridType.KDBTREE)
rectangle_rdd.spatialPartitioning(point_rdd.getPartitioner())
# building an index
point_rdd.buildIndex(IndexType.RTREE, True)
# Perform Spatial Join Query; per the signature listed above, the two flags are
# useIndex=False and considerBoundaryIntersection=True.
# NOTE(review): useIndex is False even though an index was just built --
# confirm this is intended.
result = JoinQuery.SpatialJoinQueryFlat(point_rdd, rectangle_rdd, False, True)
As a result we get an RDD[GeoData, GeoData]. It can be used like any other Python RDD: you can use map, take, collect and other functions.
# Display the RDD handle returned by the flat join
result
MapPartitionsRDD[63] at map at FlatPairRddConverter.scala:30
# First two [polygon, point] pairs of the join result
result.take(2)
[[Geometry: Polygon userData: , Geometry: Point userData: ], [Geometry: Polygon userData: , Geometry: Point userData: ]]
# Collect every pair into a Python list and keep the first three
result.collect()[:3]
[[Geometry: Polygon userData: , Geometry: Point userData: ], [Geometry: Polygon userData: , Geometry: Point userData: ], [Geometry: Polygon userData: , Geometry: Point userData: ]]
# getting the distance between the two geometries of each pair, via the
# geometry object held in each GeoData's .geom attribute
result.map(lambda x: x[0].geom.distance(x[1].geom)).take(5)
[0.0, 0.0, 0.0, 0.0, 0.0]
# getting the area of the polygon, i.e. the first element of each pair
result.map(lambda x: x[0].geom.area).take(5)
[0.051572544132000575, 0.051572544132000575, 0.05189354027999942, 0.057069904940998895, 0.057069904940998895]
# Based on the join result you can build a DataFrame: map every pair to a row
# and create the DataFrame from the resulting RDD.
schema = StructType(
    [StructField(column, GeometryType(), False) for column in ("geom_left", "geom_right")]
)
# Each row is [left geometry, right geometry].
spatial_join_result = result.map(lambda pair: [pair[0].geom, pair[1].geom])
# Set verifySchema to False
sedona.createDataFrame(spatial_join_result, schema, verifySchema=False).show(5, True)
+--------------------+--------------------+ | geom_left| geom_right| +--------------------+--------------------+ |POLYGON ((-87.082...|POINT (-87.075409...| |POLYGON ((-87.082...|POINT (-87.08084 ...| |POLYGON ((-87.092...|POINT (-87.08084 ...| |POLYGON ((-87.285...|POINT (-87.095533...| |POLYGON ((-87.285...|POINT (-87.124441...| +--------------------+--------------------+ only showing top 5 rows
# The code above produces a DataFrame whose columns have the geometry data type
sedona.createDataFrame(spatial_join_result, schema, verifySchema=False).printSchema()
root |-- geom_left: geometry (nullable = false) |-- geom_right: geometry (nullable = false)
We can also create a DataFrame from a spatial pair RDD using the Adapter object, as follows:
# Convert the pair RDD to a DataFrame, naming the user-data column of each side
# ("attr1" for the left geometry, "attr2" for the right one)
Adapter.toDf(result, ["attr1"], ["attr2"], sedona).show(5, True)
+--------------------+-----+--------------------+-----+ | geom_1|attr1| geom_2|attr2| +--------------------+-----+--------------------+-----+ |POLYGON ((-87.082...| |POINT (-87.075409...| | |POLYGON ((-87.082...| |POINT (-87.08084 ...| | |POLYGON ((-87.092...| |POINT (-87.08084 ...| | |POLYGON ((-87.285...| |POINT (-87.095533...| | |POLYGON ((-87.285...| |POINT (-87.124441...| | +--------------------+-----+--------------------+-----+ only showing top 5 rows
This also produces a DataFrame with the geometry data type:
# Print the schema of the DataFrame converted from the pair RDD
Adapter.toDf(result, ["attr1"], ["attr2"], sedona).printSchema()
root |-- geom_1: geometry (nullable = true) |-- attr1: string (nullable = true) |-- geom_2: geometry (nullable = true) |-- attr2: string (nullable = true)
We can also create an RDD of type RDD[GeoData, List[GeoData]]. With it we can, for example, count the number of points within each polygon.
To do that, we can use the code below:
# Both RDDs must be partitioned in the same way before they are joined, so the
# rectangles reuse the partitioner built for the points.
point_rdd.spatialPartitioning(GridType.KDBTREE)
rectangle_rdd.spatialPartitioning(point_rdd.getPartitioner())
# Non-flat join: each record pairs one geometry with the collection of
# geometries matched to it.
spatial_join_result_non_flat = JoinQuery.SpatialJoinQuery(point_rdd, rectangle_rdd, False, True)
# number of points for each polygon; len() is used instead of calling the
# __len__ dunder directly
number_of_points = spatial_join_result_non_flat.map(lambda x: [x[0].geom, len(x[1])])
# One geometry column plus the per-polygon point count.
schema = StructType([
    StructField("geometry", GeometryType(), False),
    StructField("number_of_points", LongType(), False)
])
sedona.createDataFrame(number_of_points, schema, verifySchema=False).show()
+--------------------+----------------+ | geometry|number_of_points| +--------------------+----------------+ |POLYGON ((-87.114...| 15| |POLYGON ((-87.082...| 12| |POLYGON ((-86.697...| 1| |POLYGON ((-87.285...| 26| |POLYGON ((-87.105...| 15| |POLYGON ((-86.816...| 6| |POLYGON ((-87.229...| 7| |POLYGON ((-87.092...| 5| |POLYGON ((-86.749...| 4| |POLYGON ((-86.860...| 12| +--------------------+----------------+
A spatial KNN query is an operation that finds the k geometries that lie closest to a given geometry.
For example: the 5 closest shops to your home. To run a spatial KNN query, use the KNNQuery object, which has one method:
SpatialKnnQuery(spatialRDD: SpatialRDD, originalQueryPoint: BaseGeometry, k: int, useIndex: bool)-> List[GeoData]
# Ask for the 5 points of point_rdd closest to Point(-84.01 34.01); per the
# signature listed above, the last argument is useIndex (False here).
# Note that this rebinds `result` from the earlier join RDD to a Python list.
result = KNNQuery.SpatialKnnQuery(point_rdd, Point(-84.01, 34.01), 5, False)
result
[Geometry: Point userData: , Geometry: Point userData: , Geometry: Point userData: , Geometry: Point userData: , Geometry: Point userData: ]
As the reference geometry you can also use a Polygon or LineString object:
# Reference geometry for the KNN query: a closed ring whose first vertex is
# repeated as the last one.
polygon = Polygon(
    [
        (-84.237756, 33.904859),
        (-84.237756, 34.090426),
        (-83.833011, 34.090426),
        (-83.833011, 33.904859),
        (-84.237756, 33.904859),
    ]
)
# Ask for the 5 polygons of polygon_rdd closest to the reference polygon,
# without using an index.
polygons_nearby = KNNQuery.SpatialKnnQuery(polygon_rdd, polygon, 5, False)
polygons_nearby
[Geometry: Polygon userData: , Geometry: Polygon userData: , Geometry: Polygon userData: , Geometry: Polygon userData: , Geometry: Polygon userData: ]
# WKT text of the first polygon returned by the KNN query
polygons_nearby[0].geom.wkt
'POLYGON ((-83.993559 34.087259, -83.993559 34.131247, -83.959903 34.131247, -83.959903 34.087259, -83.993559 34.087259))'
A spatial range query takes as input a range query window and a SpatialRDD, and returns all geometries that intersect / are fully covered by the query window. RangeQuery has one method:
SpatialRangeQuery(self, spatialRDD: SpatialRDD, rangeQueryWindow: BaseGeometry, considerBoundaryIntersection: bool, usingIndex: bool) -> RDD
# Envelope is already imported at the top of the file; the import is repeated here.
from sedona.core.geom.envelope import Envelope
# NOTE(review): the arguments look like (minx, maxx, miny, maxy) -- confirm
# against the Envelope constructor.
query_envelope = Envelope(-85.01, -60.01, 34.01, 50.01)
# Range query over the linestrings; per the signature listed above, the two
# flags are considerBoundaryIntersection and usingIndex, both False here.
result_range_query = RangeQuery.SpatialRangeQuery(linestring_rdd, query_envelope, False, False)
/usr/local/lib/python3.10/dist-packages/sedona/core/geom/envelope.py:27: ShapelyDeprecationWarning: Setting custom attributes on geometry objects is deprecated, and will raise an AttributeError in Shapely 2.0 self.minx = minx /usr/local/lib/python3.10/dist-packages/sedona/core/geom/envelope.py:28: ShapelyDeprecationWarning: Setting custom attributes on geometry objects is deprecated, and will raise an AttributeError in Shapely 2.0 self.maxx = maxx /usr/local/lib/python3.10/dist-packages/sedona/core/geom/envelope.py:29: ShapelyDeprecationWarning: Setting custom attributes on geometry objects is deprecated, and will raise an AttributeError in Shapely 2.0 self.miny = miny /usr/local/lib/python3.10/dist-packages/sedona/core/geom/envelope.py:30: ShapelyDeprecationWarning: Setting custom attributes on geometry objects is deprecated, and will raise an AttributeError in Shapely 2.0 self.maxy = maxy
# Display the RDD handle returned by the range query
result_range_query
MapPartitionsRDD[127] at map at GeometryRddConverter.scala:30
# First six LineString records of the range query result
result_range_query.take(6)
[Geometry: LineString userData: , Geometry: LineString userData: , Geometry: LineString userData: , Geometry: LineString userData: , Geometry: LineString userData: , Geometry: LineString userData: ]
# Creating a DataFrame from the result: every geometry becomes a one-column row
schema = StructType([StructField("geometry", GeometryType(), False)])
sedona.createDataFrame(
    result_range_query.map(lambda geo_data: [geo_data.geom]), schema, verifySchema=False
).show(5, True)
+--------------------+ | geometry| +--------------------+ |LINESTRING (-72.1...| |LINESTRING (-72.4...| |LINESTRING (-72.4...| |LINESTRING (-73.4...| |LINESTRING (-73.6...| +--------------------+ only showing top 5 rows
GeoPySpark can also load data from other data formats, such as:
# ShapeFile - load to SpatialRDD (the reader is pointed at the "data/polygon" path)
shape_rdd = ShapefileReader.readToGeometryRDD(sc, "data/polygon")
shape_rdd
<sedona.core.SpatialRDD.spatial_rdd.SpatialRDD at 0xffff725e64a0>
# Convert the shapefile RDD to a DataFrame and show the first 5 rows
Adapter.toDf(shape_rdd, sedona).show(5, True)
+--------------------+ | geometry| +--------------------+ |MULTIPOLYGON (((1...| |MULTIPOLYGON (((-...| |MULTIPOLYGON (((1...| |POLYGON ((118.362...| |MULTIPOLYGON (((-...| +--------------------+ only showing top 5 rows
## GeoJSON - load to SpatialRDD
{ "type": "Feature", "properties": { "STATEFP": "01", "COUNTYFP": "077", "TRACTCE": "011501", "BLKGRPCE": "5", "AFFGEOID": "1500000US010770115015", "GEOID": "010770115015", "NAME": "5", "LSAD": "BG", "ALAND": 6844991, "AWATER": 32636 }, "geometry": { "type": "Polygon", "coordinates": [ [ [ -87.621765, 34.873444 ], [ -87.617535, 34.873369 ], [ -87.6123, 34.873337 ], [ -87.604049, 34.873303 ], [ -87.604033, 34.872316 ], [ -87.60415, 34.867502 ], [ -87.604218, 34.865687 ], [ -87.604409, 34.858537 ], [ -87.604018, 34.851336 ], [ -87.603716, 34.844829 ], [ -87.603696, 34.844307 ], [ -87.603673, 34.841884 ], [ -87.60372, 34.841003 ], [ -87.603879, 34.838423 ], [ -87.603888, 34.837682 ], [ -87.603889, 34.83763 ], [ -87.613127, 34.833938 ], [ -87.616451, 34.832699 ], [ -87.621041, 34.831431 ], [ -87.621056, 34.831526 ], [ -87.62112, 34.831925 ], [ -87.621603, 34.8352 ], [ -87.62158, 34.836087 ], [ -87.621383, 34.84329 ], [ -87.621359, 34.844438 ], [ -87.62129, 34.846387 ], [ -87.62119, 34.85053 ], [ -87.62144, 34.865379 ], [ -87.621765, 34.873444 ] ] ] } },
# Read the GeoJSON features (one is shown in the sample above) into a SpatialRDD
geo_json_rdd = GeoJsonReader.readToGeometryRDD(sc, "data/testPolygon.json")
geo_json_rdd
<sedona.core.SpatialRDD.spatial_rdd.SpatialRDD at 0xffff7260cdc0>
# Convert to a DataFrame (the GeoJSON properties become columns), drop the
# AWATER column and show the first 5 rows
Adapter.toDf(geo_json_rdd, sedona).drop("AWATER").show(5, True)
+--------------------+-------+--------+-------+--------+--------------------+------------+----+----+--------+ | geometry|STATEFP|COUNTYFP|TRACTCE|BLKGRPCE| AFFGEOID| GEOID|NAME|LSAD| ALAND| +--------------------+-------+--------+-------+--------+--------------------+------------+----+----+--------+ |POLYGON ((-87.621...| 01| 077| 011501| 5|1500000US01077011...|010770115015| 5| BG| 6844991| |POLYGON ((-85.719...| 01| 045| 021102| 4|1500000US01045021...|010450211024| 4| BG|11360854| |POLYGON ((-86.000...| 01| 055| 001300| 3|1500000US01055001...|010550013003| 3| BG| 1378742| |POLYGON ((-86.574...| 01| 089| 001700| 2|1500000US01089001...|010890017002| 2| BG| 1040641| |POLYGON ((-85.382...| 01| 069| 041400| 1|1500000US01069041...|010690414001| 1| BG| 8243574| +--------------------+-------+--------+-------+--------+--------------------+------------+----+----+--------+ only showing top 5 rows
# WKT - loading to SpatialRDD
# NOTE(review): the trailing arguments look like (index of the WKT column,
# allow invalid geometries, skip syntactically invalid geometries) -- confirm
# against the WktReader docs.
wkt_rdd = WktReader.readToGeometryRDD(sc, "data/county_small.tsv", 0, True, False)
wkt_rdd
<sedona.core.SpatialRDD.spatial_rdd.SpatialRDD at 0xffff7260efe0>
# Schema of the converted DataFrame: a single geometry column
Adapter.toDf(wkt_rdd, sedona).printSchema()
root |-- geometry: geometry (nullable = true)
# Show the first 5 geometries read from the WKT file
Adapter.toDf(wkt_rdd, sedona).show(5, True)
+--------------------+ | geometry| +--------------------+ |POLYGON ((-97.019...| |POLYGON ((-123.43...| |POLYGON ((-104.56...| |POLYGON ((-96.910...| |POLYGON ((-98.273...| +--------------------+ only showing top 5 rows
# WKB - load to SpatialRDD
# NOTE(review): the trailing arguments presumably mirror the WKT reader's
# (geometry column index, then two validity flags) -- confirm against the WkbReader docs.
wkb_rdd = WkbReader.readToGeometryRDD(sc, "data/county_small_wkb.tsv", 0, True, False)
# Convert to a DataFrame and show the first 5 rows
Adapter.toDf(wkb_rdd, sedona).show(5, True)
+--------------------+ | geometry| +--------------------+ |POLYGON ((-97.019...| |POLYGON ((-123.43...| |POLYGON ((-104.56...| |POLYGON ((-96.910...| |POLYGON ((-98.273...| +--------------------+ only showing top 5 rows
# Partition both RDDs in the same way: the rectangles reuse the points' partitioner
point_rdd.spatialPartitioning(GridType.KDBTREE)
rectangle_rdd.spatialPartitioning(point_rdd.getPartitioner())
# building an index
point_rdd.buildIndex(IndexType.RTREE, True)
# Perform Spatial Join Query with the JoinQueryRaw variant; its result is
# handed straight to Adapter.toDf below
result = JoinQueryRaw.SpatialJoinQueryFlat(point_rdd, rectangle_rdd, False, True)
# without passing column names, the result will contain only the two geometry columns
geometry_df = Adapter.toDf(result, sedona)
geometry_df.printSchema()
root |-- leftgeometry: geometry (nullable = true) |-- rightgeometry: geometry (nullable = true)
# Show the first 5 (leftgeometry, rightgeometry) rows
geometry_df.show(5)
+--------------------+--------------------+ | leftgeometry| rightgeometry| +--------------------+--------------------+ |POLYGON ((-87.285...|POINT (-87.28468 ...| |POLYGON ((-87.285...|POINT (-87.278485...| |POLYGON ((-87.285...|POINT (-87.280556...| |POLYGON ((-87.285...|POINT (-87.270187...| |POLYGON ((-87.285...|POINT (-87.268766...| +--------------------+--------------------+ only showing top 5 rows
# Collect the rows and look at the first one: each geometry column comes back as a shapely object
geometry_df.collect()[0]
Row(leftgeometry=<shapely.geometry.polygon.Polygon object at 0xffff72618520>, rightgeometry=<shapely.geometry.point.Point object at 0xffff7261a0e0>)
# Passing one list of column names per side adds a user-data column next to each geometry column
geometry_df = Adapter.toDf(result, ["left_user_data"], ["right_user_data"], sedona)
geometry_df.show(5)
+--------------------+--------------+--------------------+---------------+ | leftgeometry|left_user_data| rightgeometry|right_user_data| +--------------------+--------------+--------------------+---------------+ |POLYGON ((-87.285...| |POINT (-87.28468 ...| null| |POLYGON ((-87.285...| |POINT (-87.278485...| null| |POLYGON ((-87.285...| |POINT (-87.280556...| null| |POLYGON ((-87.285...| |POINT (-87.270187...| null| |POLYGON ((-87.285...| |POINT (-87.268766...| null| +--------------------+--------------+--------------------+---------------+ only showing top 5 rows
# Same query window as in the earlier range query
query_envelope = Envelope(-85.01, -60.01, 34.01, 50.01)
# Range query with the RangeQueryRaw variant; its result is handed straight to Adapter.toDf
result_range_query = RangeQueryRaw.SpatialRangeQuery(linestring_rdd, query_envelope, False, False)
# converting to df
gdf = Adapter.toDf(result_range_query, sedona)
gdf.show(5)
+--------------------+ | geometry| +--------------------+ |LINESTRING (-72.1...| |LINESTRING (-72.4...| |LINESTRING (-72.4...| |LINESTRING (-73.4...| |LINESTRING (-73.6...| +--------------------+ only showing top 5 rows
# Schema of the converted DataFrame: a single geometry column
gdf.printSchema()
root |-- geometry: geometry (nullable = true)
# Passing column names: here the list of names comes after the session argument
# converting to df
gdf_with_columns = Adapter.toDf(result_range_query, sedona, ["_c1"])
gdf_with_columns.show(5)
+--------------------+---+ | geometry|_c1| +--------------------+---+ |LINESTRING (-72.1...| | |LINESTRING (-72.4...| | |LINESTRING (-72.4...| | |LINESTRING (-73.4...| | |LINESTRING (-73.6...| | +--------------------+---+ only showing top 5 rows
# Schema now has the geometry column plus the named string column "_c1"
gdf_with_columns.printSchema()
root |-- geometry: geometry (nullable = true) |-- _c1: string (nullable = true)