Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
import os
import geopandas as gpd
from pyspark.sql import SparkSession
from sedona.spark import *
# Build a Spark session with the Sedona dependencies on the classpath;
# SedonaContext.create then registers the Sedona SQL functions and types.
config = (
    SedonaContext.builder()
    .config(
        "spark.jars.packages",
        "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.4",
    )
    .master("spark://localhost:7077")
    .getOrCreate()
)
sedona = SedonaContext.create(config)
point_csv_df = (
    sedona.read.format("csv")
    .option("delimiter", ",")
    .option("header", "false")
    .load("data/testpoint.csv")
)
point_csv_df.createOrReplaceTempView("pointtable")

point_df = sedona.sql(
    "select ST_Point(cast(pointtable._c0 as Decimal(24,20)), cast(pointtable._c1 as Decimal(24,20))) as arealandmark from pointtable"
)
point_df.show(5)
+-----------------+
|     arealandmark|
+-----------------+
|POINT (1.1 101.1)|
|POINT (2.1 102.1)|
|POINT (3.1 103.1)|
|POINT (4.1 104.1)|
|POINT (5.1 105.1)|
+-----------------+
only showing top 5 rows
polygon_wkt_df = (
    sedona.read.format("csv")
    .option("delimiter", "\t")
    .option("header", "false")
    .load("data/county_small.tsv")
)
polygon_wkt_df.createOrReplaceTempView("polygontable")

polygon_df = sedona.sql(
    "select polygontable._c6 as name, ST_GeomFromText(polygontable._c0) as countyshape from polygontable"
)
polygon_df.show(5)
+----------------+--------------------+
|            name|         countyshape|
+----------------+--------------------+
|   Cuming County|POLYGON ((-97.019...|
|Wahkiakum County|POLYGON ((-123.43...|
|  De Baca County|POLYGON ((-104.56...|
|Lancaster County|POLYGON ((-96.910...|
| Nuckolls County|POLYGON ((-98.273...|
+----------------+--------------------+
only showing top 5 rows
polygon_wkb_df = (
    sedona.read.format("csv")
    .option("delimiter", "\t")
    .option("header", "false")
    .load("data/county_small_wkb.tsv")
)
polygon_wkb_df.createOrReplaceTempView("polygontable")

polygon_df = sedona.sql(
    "select polygontable._c6 as name, ST_GeomFromWKB(polygontable._c0) as countyshape from polygontable"
)
polygon_df.show(5)
+----------------+--------------------+
|            name|         countyshape|
+----------------+--------------------+
|   Cuming County|POLYGON ((-97.019...|
|Wahkiakum County|POLYGON ((-123.43...|
|  De Baca County|POLYGON ((-104.56...|
|Lancaster County|POLYGON ((-96.910...|
| Nuckolls County|POLYGON ((-98.273...|
+----------------+--------------------+
only showing top 5 rows
polygon_json_df = (
    sedona.read.format("csv")
    .option("delimiter", "\t")
    .option("header", "false")
    .load("data/testPolygon.json")
)
polygon_json_df.createOrReplaceTempView("polygontable")

polygon_df = sedona.sql(
    "select ST_GeomFromGeoJSON(polygontable._c0) as countyshape from polygontable"
)
polygon_df.show(5)
+--------------------+
|         countyshape|
+--------------------+
|POLYGON ((-87.621...|
|POLYGON ((-85.719...|
|POLYGON ((-86.000...|
|POLYGON ((-86.574...|
|POLYGON ((-85.382...|
+--------------------+
only showing top 5 rows
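Geometry columns can also be loaded from columnar formats. A minimal, hedged sketch of reading a GeoParquet file with Sedona's "geoparquet" data source is shown below; the file path is hypothetical and not part of this notebook's data set.

polygon_parquet_df = sedona.read.format("geoparquet").load(
    "data/example_polygons.parquet"  # hypothetical path, not shipped with this notebook
)
polygon_parquet_df.printSchema()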
point_csv_df1 = (
    sedona.read.format("csv")
    .option("delimiter", ",")
    .option("header", "false")
    .load("data/testpoint.csv")
)
point_csv_df1.createOrReplaceTempView("pointtable")
point_df1 = sedona.sql(
    "SELECT ST_Point(cast(pointtable._c0 as Decimal(24,20)), cast(pointtable._c1 as Decimal(24,20))) as pointshape1, 'abc' as name1 from pointtable"
)
point_df1.createOrReplaceTempView("pointdf1")

point_csv_df2 = (
    sedona.read.format("csv")
    .option("delimiter", ",")
    .option("header", "false")
    .load("data/testpoint.csv")
)
point_csv_df2.createOrReplaceTempView("pointtable")
point_df2 = sedona.sql(
    "SELECT ST_Point(cast(pointtable._c0 as Decimal(24,20)), cast(pointtable._c1 as Decimal(24,20))) as pointshape2, 'def' as name2 from pointtable"
)
point_df2.createOrReplaceTempView("pointdf2")

# Distance join: Sedona rewrites the ST_Distance predicate into an indexed spatial join
# (see the explain output below).
distance_join_df = sedona.sql(
    "SELECT * FROM pointdf1, pointdf2 WHERE ST_Distance(pointdf1.pointshape1, pointdf2.pointshape2) < 2"
)
distance_join_df.explain()
distance_join_df.show(5)
== Physical Plan ==
BroadcastIndexJoin pointshape2#257: geometry, LeftSide, LeftSide, Inner, INTERSECTS, ( **org.apache.spark.sql.sedona_sql.expressions.ST_Distance** < 2.0) ST_INTERSECTS(pointshape1#232, pointshape2#257)
:- SpatialIndex pointshape1#232: geometry, RTREE, false, false, 2.0
:  +- Project [ **org.apache.spark.sql.sedona_sql.expressions.ST_Point** AS pointshape1#232, abc AS name1#233]
:     +- FileScan csv [_c0#228,_c1#229] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/docs/usecases/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>
+- Project [ **org.apache.spark.sql.sedona_sql.expressions.ST_Point** AS pointshape2#257, def AS name2#258]
   +- FileScan csv [_c0#253,_c1#254] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/docs/usecases/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>
+-----------------+-----+-----------------+-----+
|      pointshape1|name1|      pointshape2|name2|
+-----------------+-----+-----------------+-----+
|POINT (1.1 101.1)|  abc|POINT (1.1 101.1)|  def|
|POINT (2.1 102.1)|  abc|POINT (1.1 101.1)|  def|
|POINT (1.1 101.1)|  abc|POINT (2.1 102.1)|  def|
|POINT (2.1 102.1)|  abc|POINT (2.1 102.1)|  def|
|POINT (3.1 103.1)|  abc|POINT (2.1 102.1)|  def|
+-----------------+-----+-----------------+-----+
only showing top 5 rows
For a complete spatial join example (airports per country), see https://github.com/apache/sedona/blob/master/docs/usecases/ApacheSedonaSQL_SpatialJoin_AirportsPerCountry.ipynb.
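A minimal, hedged sketch of that join pattern, assuming two hypothetical temp views have already been registered: `countries` with a `name` column and a polygon column `geom`, and `airports` with a point column `geom`.

# Hypothetical views: `countries` (name, geom: polygon) and `airports` (geom: point).
airports_per_country_df = sedona.sql(
    """
    SELECT c.name AS country, count(*) AS airport_count
    FROM countries AS c
    JOIN airports AS a ON ST_Contains(c.geom, a.geom)
    GROUP BY c.name
    """
)
airports_per_country_df.show(5)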
import pandas as pd
gdf = gpd.read_file("data/gis_osm_pois_free_1.shp")
gdf = gdf.replace(pd.NA, "")
osm_points = sedona.createDataFrame(gdf)
osm_points.printSchema()
root
 |-- osm_id: string (nullable = true)
 |-- code: long (nullable = true)
 |-- fclass: string (nullable = true)
 |-- name: string (nullable = true)
 |-- geometry: geometry (nullable = true)
osm_points.show(5)
+--------+----+---------+--------------+--------------------+
|  osm_id|code|   fclass|          name|            geometry|
+--------+----+---------+--------------+--------------------+
|26860257|2422|camp_site|      de Kroon|POINT (15.3393145...|
|26860294|2406|   chalet|Leśne Ustronie|POINT (14.8709625...|
|29947493|2402|    motel|              |POINT (15.0946636...|
|29947498|2602|      atm|              |POINT (15.0732014...|
|29947499|2401|    hotel|              |POINT (15.0696777...|
+--------+----+---------+--------------+--------------------+
only showing top 5 rows
osm_points.createOrReplaceTempView("points")
transformed_df = sedona.sql(
    """
    SELECT osm_id,
           code,
           fclass,
           name,
           ST_Transform(geometry, 'epsg:4326', 'epsg:2180') as geom
    FROM points
    """
)
transformed_df.show(5)
+--------+----+---------+--------------+--------------------+
|  osm_id|code|   fclass|          name|                geom|
+--------+----+---------+--------------+--------------------+
|26860257|2422|camp_site|      de Kroon|POINT (250776.778...|
|26860294|2406|   chalet|Leśne Ustronie|POINT (221076.709...|
|29947493|2402|    motel|              |POINT (233902.541...|
|29947498|2602|      atm|              |POINT (232447.203...|
|29947499|2401|    hotel|              |POINT (232208.377...|
+--------+----+---------+--------------+--------------------+
only showing top 5 rows
transformed_df.createOrReplaceTempView("points_2180")
# Self-join: find pairs of points that lie within 50 meters of each other
# (EPSG:2180 coordinates are expressed in meters).
neighbours_within_50m = sedona.sql(
    """
    SELECT a.osm_id AS id_1,
           b.osm_id AS id_2,
           a.geom
    FROM points_2180 AS a, points_2180 AS b
    WHERE ST_Distance(a.geom, b.geom) < 50
    """
)
neighbours_within_50m.show()
24/05/22 18:07:13 WARN JoinQuery: UseIndex is true, but no index exists. Will build index on the fly.
+----------+---------+--------------------+
|      id_1|     id_2|                geom|
+----------+---------+--------------------+
|  26860257| 26860257|POINT (250776.778...|
|  26860294| 26860294|POINT (221076.709...|
|  29947493| 29947493|POINT (233902.541...|
|3241834852| 29947493|POINT (233866.098...|
|5964811085| 29947493|POINT (233861.172...|
|5818905324| 29947498|POINT (232446.535...|
|4165181885| 29947498|POINT (232449.441...|
|5846858758| 29947498|POINT (232407.167...|
|  29947498| 29947498|POINT (232447.203...|
|  29947499| 29947499|POINT (232208.377...|
|  30077461| 29947499|POINT (232185.872...|
|  29947505| 29947505|POINT (228595.321...|
|  29947499| 30077461|POINT (232208.377...|
|  30077461| 30077461|POINT (232185.872...|
|  30079117| 30079117|POINT (273599.241...|
| 197624402|197624402|POINT (203703.035...|
| 197663196|197663196|POINT (203936.327...|
| 197953474|197953474|POINT (203724.746...|
|1074233127|262310516|POINT (203524.110...|
| 262310516|262310516|POINT (203507.730...|
+----------+---------+--------------------+
only showing top 20 rows
df = neighbours_within_50m.toPandas()
24/05/22 18:07:17 WARN JoinQuery: UseIndex is true, but no index exists. Will build index on the fly.
gdf = gpd.GeoDataFrame(df, geometry="geom")
gdf
|       | id_1       | id_2       | geom                          |
|-------|------------|------------|-------------------------------|
| 0     | 26860257   | 26860257   | POINT (250776.778 504581.332) |
| 1     | 26860294   | 26860294   | POINT (221076.710 544222.650) |
| 2     | 29947493   | 29947493   | POINT (233902.541 501298.382) |
| 3     | 3241834852 | 29947493   | POINT (233866.099 501323.801) |
| 4     | 5964811085 | 29947493   | POINT (233861.173 501326.441) |
| ...   | ...        | ...        | ...                           |
| 65670 | 6823696220 | 6823696220 | POINT (234310.474 465790.364) |
| 65671 | 6823721834 | 6823721834 | POINT (234313.567 465869.023) |
| 65672 | 6823721838 | 6823721838 | POINT (234425.224 465829.994) |
| 65673 | 6823772928 | 6823772928 | POINT (234582.864 465875.142) |
| 65674 | 6823823445 | 6823823445 | POINT (236859.419 428547.388) |

65675 rows × 3 columns
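One follow-up worth noting: the GeoDataFrame built above carries no CRS, because toPandas() only transfers the geometries. A minimal sketch, assuming geopandas' set_crs/to_file API and a hypothetical output path, of attaching EPSG:2180 (the target of the earlier ST_Transform) and exporting the result:

# Attach the CRS used in the ST_Transform step (EPSG:2180); toPandas() does not carry it over.
gdf = gdf.set_crs(epsg=2180)

# Export for inspection in desktop GIS tools; the output path is hypothetical.
gdf.to_file("data/neighbours_within_50m.gpkg", driver="GPKG")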