#!/usr/bin/env python
# coding: utf-8

# # STAC Geoparquet with DuckDB
#
# ## A short word on STAC
#
# STAC is the central way of accessing any spatio-temporal data on terrabyte. See here for an introduction and the detailed specification:
# * https://stacspec.org/en
# * https://github.com/radiantearth/stac-spec
#
# In principle, data is offered through a *catalog* containing data from various sources. This catalog is further sub-divided into *collections*.
# A collection could for example contain a certain satellite data product like Sentinel-1 GRD, SLC or Sentinel-2 L2A.
# Each collection consists of multiple items, which might represent individual satellite scenes or product tiles (e.g. the MGRS tiles of Sentinel-2).
# Each item consists of one or many *assets*, which contain links to the actual data. For example individual GeoTIFF files for each band.
#
# ## STAC Items as Geoparquet files
#
# Besides the STAC API (https://stac.terrabyte.lrz.de/public/api) we have made available an export of the STAC Items for each collection as Geoparquet files. You can query those Geoparquet files with the in-memory database [DuckDB](https://duckdb.org/). Without interacting with the terrabyte STAC API you can query and filter STAC items with your own computing resources.
#
# ## Requirements
#
# For this example, you need to have the Python libraries `duckdb` (we still need version 1.0.0!), `pygeofilter-duckdb` (https://github.com/DLR-terrabyte/pygeofilter-duckdb), and `stac-geoparquet` installed.
# In[1]:

#!pip install duckdb==1.0.0 stac-geoparquet git+https://github.com/DLR-terrabyte/pygeofilter-duckdb.git


# ## Import of libraries

# In[2]:

import json

import pystac
import duckdb
from stac_geoparquet.arrow._api import stac_table_to_items

# Install and load DuckDB spatial extension (provides ST_* functions such as
# ST_Intersects, which the generated SQL WHERE clause relies on)
duckdb.install_extension('spatial')
duckdb.load_extension('spatial')

# Use pygeofilter library to convert between CQL2-JSON/Text and a SQL query
from pygeofilter.parsers.cql2_json import parse as json_parse
from pygeofilter.backends.duckdb import to_sql_where
from pygeofilter.util import IdempotentDict


# ## Define STAC Item export as Geoparquet files

# In[3]:

# Collection id -> glob of Geoparquet files holding that collection's STAC Items
# (paths are on the terrabyte DSS filesystem).
geoparquet_files = {
    'sentinel-1-grd': '/dss/dsstbyfs03/pn56su/pn56su-dss-0022/Sentinel-1/GRD/geoparquet/*.parquet',
    'sentinel-1-slc': '/dss/dsstbyfs03/pn56su/pn56su-dss-0022/Sentinel-1/SLC/geoparquet/*.parquet',
    'sentinel-2-c1-l1c': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/Sentinel-2-Col-1/L1C/geoparquet/*.parquet',
    'sentinel-2-c1-l2a': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/Sentinel-2-Col-1/L2A/geoparquet/*.parquet',
    'sentinel-3-olci-l1-efr': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/Sentinel-3/OLCI/OL_1_EFR___/geoparquet/*.parquet',
    'landsat-tm-c2-l2': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/Landsat/collection-2/level-2/standard/tm/geoparquet/*.parquet',
    'landsat-etm-c2-l2': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/Landsat/collection-2/level-2/standard/etm/geoparquet/*.parquet',
    'landsat-ot-c2-l2': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/Landsat/collection-2/level-2/standard/oli-tirs/geoparquet/*.parquet',
}


# ## Define query as CQL2-JSON

# In[4]:

# Time range, collection, and filter for the example query:
# cloud cover in [0, 21] AND datetime in [start, end] AND geometry intersecting
# the bounding polygon of Baden-Württemberg.
start = '2023-02-01T00:00:00Z'
end = '2023-02-28T23:59:59Z'
collection = 'sentinel-2-c1-l2a'
cql2_filter = {
    "op": "and",
    "args": [
        {
            "op": "between",
            "args": [{"property": "eo:cloud_cover"}, [0, 21]],
        },
        {
            "op": "between",
            "args": [{"property": "datetime"}, [start, end]],
        },
        {
            "op": "s_intersects",
            "args": [
                {"property": "geometry"},
                {
                    "type": "Polygon",
                    # Baden-Württemberg
                    "coordinates": [[
                        [7.5113934084, 47.5338000528],
                        [10.4918239143, 47.5338000528],
                        [10.4918239143, 49.7913749328],
                        [7.5113934084, 49.7913749328],
                        [7.5113934084, 47.5338000528],
                    ]],
                },
            ],
        },
    ],
}

# Translate the CQL2-JSON filter into a SQL WHERE clause for DuckDB.
# IdempotentDict maps every property name to itself (no field renaming).
sql_where = to_sql_where(json_parse(cql2_filter), IdempotentDict())


# ## Query data with DuckDB

# In[5]:

# %%time cell: run the DuckDB query, fetch the result as an Arrow table and
# convert it into a list of pystac.Item objects (kept as an IPython cell magic
# so the notebook's timing output is preserved).
get_ipython().run_cell_magic('time', '', '# Define geoparquet files\ngeoparquet = geoparquet_files[collection]\n\n# Define and execute query\n# Note: union_by_name slows down the query process, but is necessary when there are properties not available in all STAC items\nsql_query = f"SELECT * FROM read_parquet(\'{geoparquet}\', union_by_name=False) WHERE {sql_where}"\nprint(f"DuckDB Query:\\n{sql_query}\\n")\ndb = duckdb.query(sql_query)\n\n## Convert DuckDB result to Arrow table\ntable = db.fetch_arrow_table()\n\n## Convert Arrow table to List of PyStac-Items\nitems = []\nfor item in stac_table_to_items(table): \n item[\'assets\'] = json.loads(item[\'assets\'])\n items.append(pystac.Item.from_dict(item))\n\nprint("%s items found\\n" % len(items))\n')


# ## Show first item

# In[6]:

items[0]


# ## List items as Pandas GeoDataFrame

# In[7]:

import geopandas as gpd

dataframe = gpd.GeoDataFrame.from_features(items)
dataframe


# ## Visualize the covered area

# In[8]:

import folium
import folium.plugins as folium_plugins

# Interactive map showing the footprint of every matched STAC Item.
# Named `fmap` rather than `map` to avoid shadowing the builtin.
fmap = folium.Map()
layer_control = folium.LayerControl(position='topright', collapsed=True)
fullscreen = folium_plugins.Fullscreen()
# Transparent fill, thin blue outline for the footprints layer.
style = {'fillColor': '#00000000', "color": "#0000ff", "weight": 1}
footprints = folium.GeoJson(
    dataframe.to_json(),
    name='Stac Item footprints',
    style_function=lambda x: style,
    control=True
)
footprints.add_to(fmap)
layer_control.add_to(fmap)
fullscreen.add_to(fmap)
# Zoom the view to the union of all footprints.
fmap.fit_bounds(fmap.get_bounds())
fmap