In this Jupyter Notebook we demonstrate how to use our monthly generated inventory files to get insights about which data is available on terrabyte.
You can query these inventory files by collection, acquisition time, geometry, and inventory status.
For this tutorial you need to install the Python interfaces of duckdb
. You can use conda, micromamba or pip to install duckdb.
#!micromamba install python-duckdb -c conda-forge
#!pip install duckdb
import duckdb
duckdb.install_extension('spatial')
duckdb.load_extension('spatial')
def query_geoparquet(collection, wkt_geometry=None, datetime=None, status=None):
# We only query scenes that are not deprecated
where_query = [
"deprecated = 'false'"
]
# Filter by order status
if status:
where_query.append(f"status = '{status}'")
# Filter by geometry
if wkt_geometry:
where_query.append(f"ST_Intersects(ST_GeomFromText(geometry), ST_GeomFromText('{wkt_geometry}'))")
# Filter by datetime
if datetime:
where_query.append(f"datetime >= '{datetime[0]}'")
where_query.append(f"datetime < '{datetime[1]}'")
if len(where_query) > 0:
where_query = ' and '.join(where_query)
where_query = 'WHERE ' + where_query
else:
where_query = ''
# SQL statement
query = f"set TimeZone = 'UTC'; SELECT DATE_TRUNC('year', datetime) as year, count(id) as count FROM '{inventory_geoparquet[collection]}' {where_query} GROUP BY year"
print(f"SQL Query: {query}")
# Define duckdb query
return duckdb.query(query)
The following files are updated monthly after a synchronization with the data provider's (ESA, USGS, NASA) inventory.
All of these files have the same columns:
id
contains the official scene identifierdatetime
contains the acquisition timegeometry
contains the bounding box of the scene (as character)status
contains the terrabyte status, one oforderable
for scenes that still needs to be ingested into terrabyteordered
for scenes that are in an ingestion queue of terrabyteremoved
for scenes removed from the data provider's inventorysucceeded
for scenes available on terrabytedeprecated
contains true
if the scene has been replaced by another scene.inventory_geoparquet = {
'sentinel-1-grd': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/sentinel-1-grd.parquet',
'sentinel-1-slc': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/sentinel-1-slc.parquet',
'sentinel-2-c1-l1c': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/sentinel-2-c1-l1c.parquet',
'sentinel-2-c1-l2a': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/sentinel-2-c1-l2a.parquet',
'sentinel-3-olci-l1-efr': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/sentinel-3-olci-l1-efr.parquet',
'sentinel-5p-l1b': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/sentinel-5p-l1b.parquet',
'sentinel-5p-l2': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/sentinel-5p-l2.parquet',
'landsat-tm-c2-l2': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/landsat-tm-c2-l2.parquet',
'landsat-etm-c2-l2': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/landsat-etm-c2-l2.parquet',
'landsat-ot-c2-l2': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/landsat-ot-c2-l2.parquet',
'modis-mod09ga-061': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/modis-mod09ga-061.parquet',
'modis-myd09ga-061': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/modis-myd09ga-061.parquet',
'modis-mod09gq-061': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/modis-mod09gq-061.parquet',
'modis-myd09gq-061': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/modis-myd09gq-061.parquet',
'modis-mod10a1-061': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/modis-mod10a1-061.parquet',
'modis-myd10a1-061': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/modis-myd10a1-061.parquet',
'modis-mod13a2-061': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/modis-mod13a2-061.parquet',
'modis-myd13a2-061': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/modis-myd13a2-061.parquet',
'modis-mod13a3-061': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/modis-mod13a3-061.parquet',
'modis-myd13a3-061': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/modis-myd13a3-061.parquet',
'modis-mod13q1-061': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/modis-mod13q1-061.parquet',
'modis-myd13q1-061': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/modis-myd13q1-061.parquet',
'viirs-vnp09ga-001': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/viirs-vnp09ga-001.parquet',
'viirs-vnp13a1-001': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/viirs-vnp13a1-001.parquet',
'viirs-vnp15a2h-001': '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/viirs-vnp15a2h-001.parquet',
}
duckdb.query(f"DESCRIBE SELECT * FROM '{inventory_geoparquet['sentinel-1-grd']}';")
┌─────────────┬──────────────────────────┬─────────┬─────────┬─────────┬─────────┐ │ column_name │ column_type │ null │ key │ default │ extra │ │ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │ ├─────────────┼──────────────────────────┼─────────┼─────────┼─────────┼─────────┤ │ id │ VARCHAR │ YES │ NULL │ NULL │ NULL │ │ datetime │ TIMESTAMP WITH TIME ZONE │ YES │ NULL │ NULL │ NULL │ │ geometry │ VARCHAR │ YES │ NULL │ NULL │ NULL │ │ status │ VARCHAR │ YES │ NULL │ NULL │ NULL │ │ deprecated │ VARCHAR │ YES │ NULL │ NULL │ NULL │ └─────────────┴──────────────────────────┴─────────┴─────────┴─────────┴─────────┘
You need at least need define the name of the collection (this specifies the parquet file to use).
In addition, you can query by geometry (in WKT format) and datetime range.
# Area of Interest (aoi_wkt)
germany_wkt = "POLYGON ((6.589640121864647 55.22103843385409, 6.589640121864647 47.35331569919995, 15.156481237091782 47.35331569919995, 15.156481237091782 55.22103843385409, 6.589640121864647 55.22103843385409))"
europe_wkt = "POLYGON ((48.10789885600357 72.72018652036792, -21.028592953785534 72.72018652036792, -21.028592953785534 32.15348338807499, 48.10789885600357 32.15348338807499, 48.10789885600357 72.72018652036792))"
aoi_wkt = None
# Collection (for a list, see dictionary keys of the inventory cell above)
collection = 'sentinel-1-grd'
# Time range
datetime = None
#datetime = ['2024-01-01', '2024-07-01']
Query available data at the data provider regardless of the terrabyte order status
inventory = query_geoparquet(collection, wkt_geometry=aoi_wkt, datetime=datetime)
SQL Query: set TimeZone = 'UTC'; SELECT DATE_TRUNC('year', datetime) as year, count(id) as count FROM '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/sentinel-1-grd.parquet' WHERE deprecated = 'false' GROUP BY year
To query available data on terrabyte, we need to filter by order status succeeded
.
online = query_geoparquet(collection, wkt_geometry=aoi_wkt, datetime=datetime, status='succeeded')
SQL Query: set TimeZone = 'UTC'; SELECT DATE_TRUNC('year', datetime) as year, count(id) as count FROM '/dss/dsstbyfs01/pn56su/pn56su-dss-0008/inventory/terrabyte/sentinel-1-grd.parquet' WHERE deprecated = 'false' and status = 'succeeded' GROUP BY year
We use the duckdb results before to calculate the missing scenes on terrabyte
query = "SELECT strftime(inventory.year, '%Y')::int as year, inventory.count as count_inventory, online.count as count_online, (count_inventory - count_online) as missing FROM inventory, online WHERE inventory.year = online.year ORDER BY inventory.year"
print(f"SQL Query: {query}")
report = duckdb.query(query)
report
SQL Query: SELECT strftime(inventory.year, '%Y')::int as year, inventory.count as count_inventory, online.count as count_online, (count_inventory - count_online) as missing FROM inventory, online WHERE inventory.year = online.year ORDER BY inventory.year
┌───────┬─────────────────┬──────────────┬─────────┐ │ year │ count_inventory │ count_online │ missing │ │ int32 │ int64 │ int64 │ int64 │ ├───────┼─────────────────┼──────────────┼─────────┤ │ 2014 │ 24506 │ 24395 │ 111 │ │ 2015 │ 134592 │ 134202 │ 390 │ │ 2016 │ 197614 │ 195794 │ 1820 │ │ 2017 │ 365838 │ 365627 │ 211 │ │ 2018 │ 398323 │ 387218 │ 11105 │ │ 2019 │ 422219 │ 381755 │ 40464 │ │ 2020 │ 432719 │ 387879 │ 44840 │ │ 2021 │ 407121 │ 399455 │ 7666 │ │ 2022 │ 221861 │ 221346 │ 515 │ │ 2023 │ 226346 │ 198139 │ 28207 │ │ 2024 │ 168594 │ 105164 │ 63430 │ ├───────┴─────────────────┴──────────────┴─────────┤ │ 11 rows 4 columns │ └──────────────────────────────────────────────────┘
duckdb.query("SELECT sum(missing), sum(count_inventory), sum(count_online) FROM report")
┌──────────────┬──────────────────────┬───────────────────┐ │ sum(missing) │ sum(count_inventory) │ sum(count_online) │ │ int128 │ int128 │ int128 │ ├──────────────┼──────────────────────┼───────────────────┤ │ 198759 │ 2999733 │ 2800974 │ └──────────────┴──────────────────────┴───────────────────┘