#!/usr/bin/env python
# coding: utf-8

# # STAC Geoparquet with DuckDB
#
# ## A short word on STAC
#
# STAC is the central way of accessing any spatio-temporal data on terrabyte. See here for an introduction and the detailed specification:
# * https://stacspec.org/en
# * https://github.com/radiantearth/stac-spec
#
# In principle, data is offered through a *catalog* containing data from various sources. This catalog is further sub-divided into *collections*.
# A collection could for example contain a certain satellite data product like Sentinel-1 GRD, SLC or Sentinel-2 L2A.
# Each collection consists of multiple items, which might represent individual satellite scenes or product tiles (e.g. the MGRS tiles of Sentinel-2).
# Each item consists of one or many *assets*, which contain links to the actual data. For example individual GeoTIFF files for each band.
#
# ## STAC Items as Geoparquet files
#
# Besides the STAC API (https://stac.terrabyte.lrz.de/public/api) we have made available an export of the STAC Items for each collection as Geoparquet files. You can query those Geoparquet files with the in-memory database [DuckDB](https://duckdb.org/). Without interacting with the terrabyte STAC API you can query and filter STAC items with your own computing resources.
#
# ## Requirements
#
# For this example, you need to have the Python libraries `duckdb` (we still need version 1.0.0!), `pygeofilter-duckdb` (https://github.com/DLR-terrabyte/pygeofilter-duckdb), and `stac-geoparquet` installed.
# In[1]:

#!pip install duckdb==1.0.0 stac-geoparquet git+https://github.com/DLR-terrabyte/pygeofilter-duckdb.git


# ## Import of libraries

# In[2]:

import json

import pystac
import duckdb
from stac_geoparquet.arrow._api import stac_table_to_items

# Install and load DuckDB spatial extension (provides ST_* functions such as
# ST_Intersects, which the generated SQL WHERE clause relies on)
duckdb.install_extension('spatial')
duckdb.load_extension('spatial')

# Use pygeofilter library to convert between CQL2-JSON/Text and a SQL query
from pygeofilter.parsers.cql2_json import parse as json_parse
from pygeofilter.backends.duckdb import to_sql_where
from pygeofilter.util import IdempotentDict


# ## Define STAC Item export as Geoparquet files

# In[3]:

# Collection id -> glob of Geoparquet files holding that collection's STAC Items
# (paths are on the terrabyte DSS filesystem).
geoparquet_files = {
    'sentinel-1-grd': '/dss/dsstbyfs03/pn56su/pn56su-dss-0022/Sentinel-1/GRD/geoparquet/*.parquet',
    'sentinel-1-slc': '/dss/dsstbyfs03/pn56su/pn56su-dss-0022/Sentinel-1/SLC/geoparquet/*.parquet',
    'sentinel-2-c1-l1c': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/Sentinel-2-Col-1/L1C/geoparquet/*.parquet',
    'sentinel-2-c1-l2a': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/Sentinel-2-Col-1/L2A/geoparquet/*.parquet',
    'sentinel-3-olci-l1-efr': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/Sentinel-3/OLCI/OL_1_EFR___/geoparquet/*.parquet',
    'landsat-tm-c2-l2': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/Landsat/collection-2/level-2/standard/tm/geoparquet/*.parquet',
    'landsat-etm-c2-l2': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/Landsat/collection-2/level-2/standard/etm/geoparquet/*.parquet',
    'landsat-ot-c2-l2': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/Landsat/collection-2/level-2/standard/oli-tirs/geoparquet/*.parquet',
}


# ## Define query as CQL2-JSON

# In[4]:

# Time range, collection, and filter for the example query:
# cloud cover in [0, 21] AND datetime in [start, end] AND geometry intersecting
# the bounding polygon of Baden-Württemberg.
start = '2023-02-01T00:00:00Z'
end = '2023-02-28T23:59:59Z'
collection = 'sentinel-2-c1-l2a'
cql2_filter = {
    "op": "and",
    "args": [
        {
            "op": "between",
            "args": [{"property": "eo:cloud_cover"}, [0, 21]],
        },
        {
            "op": "between",
            "args": [{"property": "datetime"}, [start, end]],
        },
        {
            "op": "s_intersects",
            "args": [
                {"property": "geometry"},
                {
                    "type": "Polygon",
                    # Baden-Württemberg
                    "coordinates": [[
                        [7.5113934084, 47.5338000528],
                        [10.4918239143, 47.5338000528],
                        [10.4918239143, 49.7913749328],
                        [7.5113934084, 49.7913749328],
                        [7.5113934084, 47.5338000528],
                    ]],
                },
            ],
        },
    ],
}

# Translate the CQL2-JSON filter into a SQL WHERE clause for DuckDB.
# IdempotentDict maps every property name to itself (no field renaming).
sql_where = to_sql_where(json_parse(cql2_filter), IdempotentDict())


# ## Query data with DuckDB

# In[5]:

# %%time cell: run the DuckDB query, fetch the result as an Arrow table and
# convert it into a list of pystac.Item objects (kept as an IPython cell magic
# so the notebook's timing output is preserved).
get_ipython().run_cell_magic('time', '', '# Define geoparquet files\ngeoparquet = geoparquet_files[collection]\n\n# Define and execute query\n# Note: union_by_name slows down the query process, but is necessary when there are properties not available in all STAC items\nsql_query = f"SELECT * FROM read_parquet(\'{geoparquet}\', union_by_name=False) WHERE {sql_where}"\nprint(f"DuckDB Query:\\n{sql_query}\\n")\ndb = duckdb.query(sql_query)\n\n## Convert DuckDB result to Arrow table\ntable = db.fetch_arrow_table()\n\n## Convert Arrow table to List of PyStac-Items\nitems = []\nfor item in stac_table_to_items(table): \n item[\'assets\'] = json.loads(item[\'assets\'])\n items.append(pystac.Item.from_dict(item))\n\nprint("%s items found\\n" % len(items))\n')


# ## Show first item

# In[6]:

items[0]


# ## List items as Pandas GeoDataFrame

# In[7]:

import geopandas as gpd

dataframe = gpd.GeoDataFrame.from_features(items)
dataframe


# ## Visualize the covered area

# In[8]:

import folium
import folium.plugins as folium_plugins

# Interactive map showing the footprint of every matched STAC Item.
# Named `fmap` rather than `map` to avoid shadowing the builtin.
fmap = folium.Map()
layer_control = folium.LayerControl(position='topright', collapsed=True)
fullscreen = folium_plugins.Fullscreen()
# Transparent fill, thin blue outline for the footprints layer.
style = {'fillColor': '#00000000', "color": "#0000ff", "weight": 1}
footprints = folium.GeoJson(
    dataframe.to_json(),
    name='Stac Item footprints',
    style_function=lambda x: style,
    control=True
)
footprints.add_to(fmap)
layer_control.add_to(fmap)
fullscreen.add_to(fmap)
# Zoom the view to the union of all footprints.
fmap.fit_bounds(fmap.get_bounds())
fmap