Open In Colab

A quick intro to accessing Open Cities AI Challenge data STACs with pystac

In [0]:
!apt-get install python3-rtree
!pip install pystac
!pip install geopandas
!pip install rasterio
In [0]:
import os
os.environ["CURL_CA_BUNDLE"] = "/etc/ssl/certs/ca-certificates.crt"

from matplotlib import pyplot as plt
import numpy as np
from pprint import pprint

import rasterio
from rasterio.windows import Window
import geopandas as gpd
from pyproj import CRS

from pystac import (Catalog, CatalogType, Item, Asset, LabelItem, Collection)
In [0]:
# overwriting STAC_IO read method to handle http/s as per https://pystac.readthedocs.io/en/latest/concepts.html#using-stac-io

from urllib.parse import urlparse
import requests
from pystac import STAC_IO

def my_read_method(uri):
    parsed = urlparse(uri)
    if parsed.scheme.startswith('http'):
        return requests.get(uri).text
    else:
        return STAC_IO.default_read_text_method(uri)

STAC_IO.read_text_method = my_read_method
In [0]:
# load our training and test catalogs
train1_cat = Catalog.from_file('https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_1/catalog.json')
train2_cat = Catalog.from_file('https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_2/catalog.json')
test_cat = Catalog.from_file('https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/test/catalog.json')
In [5]:
# look at what's in train1_cat (aka train_tier_1)
train1_cat.describe()
* <Catalog id=train_tier_1>
    * <Collection id=acc>
      * <Item id=665946>
      * <LabelItem id=665946-labels>
      * <Item id=a42435>
      * <LabelItem id=a42435-labels>
      * <Item id=ca041a>
      * <LabelItem id=ca041a-labels>
      * <Item id=d41d81>
      * <LabelItem id=d41d81-labels>
    * <Collection id=mon>
      * <Item id=401175>
      * <LabelItem id=401175-labels>
      * <Item id=493701>
      * <LabelItem id=493701-labels>
      * <Item id=207cc7>
      * <LabelItem id=207cc7-labels>
      * <Item id=f15272>
      * <LabelItem id=f15272-labels>
    * <Collection id=ptn>
      * <Item id=abe1a3>
      * <LabelItem id=abe1a3-labels>
      * <Item id=f49f31>
      * <LabelItem id=f49f31-labels>
    * <Collection id=kam>
      * <Item id=4e7c7f>
      * <LabelItem id=4e7c7f-labels>
    * <Collection id=dar>
      * <Item id=a017f9>
      * <LabelItem id=a017f9-labels>
      * <Item id=b15fce>
      * <LabelItem id=b15fce-labels>
      * <Item id=353093>
      * <LabelItem id=353093-labels>
      * <Item id=f883a0>
      * <LabelItem id=f883a0-labels>
      * <Item id=42f235>
      * <LabelItem id=42f235-labels>
      * <Item id=0a4c40>
      * <LabelItem id=0a4c40-labels>
    * <Collection id=znz>
      * <Item id=33cae6>
      * <LabelItem id=33cae6-labels>
      * <Item id=3b20d4>
      * <LabelItem id=3b20d4-labels>
      * <Item id=076995>
      * <LabelItem id=076995-labels>
      * <Item id=75cdfa>
      * <LabelItem id=75cdfa-labels>
      * <Item id=9b8638>
      * <LabelItem id=9b8638-labels>
      * <Item id=06f252>
      * <LabelItem id=06f252-labels>
      * <Item id=c7415c>
      * <LabelItem id=c7415c-labels>
      * <Item id=aee7fd>
      * <LabelItem id=aee7fd-labels>
      * <Item id=3f8360>
      * <LabelItem id=3f8360-labels>
      * <Item id=425403>
      * <LabelItem id=425403-labels>
      * <Item id=bd5c14>
      * <LabelItem id=bd5c14-labels>
      * <Item id=e52478>
      * <LabelItem id=e52478-labels>
      * <Item id=bc32f1>
      * <LabelItem id=bc32f1-labels>
    * <Collection id=nia>
      * <Item id=825a50>
      * <LabelItem id=825a50-labels>
In [6]:
# make a dict of all collections witinin train1 catalog
 cols = {cols.id:cols for cols in train1_cat.get_children()}
 cols
Out[6]:
{'acc': <Collection id=acc>,
 'dar': <Collection id=dar>,
 'kam': <Collection id=kam>,
 'mon': <Collection id=mon>,
 'nia': <Collection id=nia>,
 'ptn': <Collection id=ptn>,
 'znz': <Collection id=znz>}
In [7]:
# look at the acc collection, as a dict
cols['acc'].to_dict()
Out[7]:
{'description': 'Tier 1 training data from acc',
 'extent': {'spatial': {'bbox': [[-0.20863145179911316,
     5.573262528211078,
     -0.18948660187120017,
     5.593203677296213]]},
  'temporal': {'interval': [['2019-10-29T00:00:00Z', None]]}},
 'id': 'acc',
 'license': 'various',
 'links': [{'href': './665946/665946.json',
   'rel': 'item',
   'type': 'application/json'},
  {'href': './665946-labels/665946-labels.json',
   'rel': 'item',
   'type': 'application/json'},
  {'href': './a42435/a42435.json', 'rel': 'item', 'type': 'application/json'},
  {'href': './a42435-labels/a42435-labels.json',
   'rel': 'item',
   'type': 'application/json'},
  {'href': './ca041a/ca041a.json', 'rel': 'item', 'type': 'application/json'},
  {'href': './ca041a-labels/ca041a-labels.json',
   'rel': 'item',
   'type': 'application/json'},
  {'href': './d41d81/d41d81.json', 'rel': 'item', 'type': 'application/json'},
  {'href': './d41d81-labels/d41d81-labels.json',
   'rel': 'item',
   'type': 'application/json'},
  {'href': 'https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_1/acc/collection.json',
   'rel': 'self',
   'type': 'application/json'},
  {'href': '../catalog.json', 'rel': 'root', 'type': 'application/json'},
  {'href': '../catalog.json', 'rel': 'parent', 'type': 'application/json'}],
 'stac_extensions': ['label'],
 'stac_version': '0.8.1'}
In [8]:
# iterate through all the items within acc collection and print their ids
for i in cols['acc'].get_all_items():
  print(i.id)
665946
665946-labels
a42435
a42435-labels
ca041a
ca041a-labels
d41d81
d41d81-labels
In [9]:
# for all items within acc col, either load and display label geojson with geopandas or raster metadata with rasterio

for i in cols['acc'].get_all_items():
  print(i.id,'\n----------------------------')
  pprint(i.properties)
  if 'label' in i.id:
    
    gdf = gpd.read_file(i.make_asset_hrefs_absolute().assets['labels'].href)
    gdf.plot()
    plt.show()
  else: 
    print('raster metadata:')
    pprint(rasterio.open(i.make_asset_hrefs_absolute().assets['image'].href).meta)
  print('\n----------------------------')
665946 
----------------------------
{'area': 'acc', 'datetime': '2018-08-05 00:00:00Z', 'license': 'CC BY 4.0'}
raster metadata:
{'count': 4,
 'crs': CRS.from_epsg(32630),
 'driver': 'GTiff',
 'dtype': 'uint8',
 'height': 150147,
 'nodata': None,
 'transform': Affine(0.02001518707102818, 0.0, 805429.9166958937,
       0.0, -0.02001518707102818, 624939.1898949385),
 'width': 84466}

----------------------------
665946-labels 
----------------------------
{'area': 'acc',
 'datetime': '2018-08-05 00:00:00Z',
 'label:description': 'Geojson building labels for scene 665946',
 'label:overviews': [{'counts': [{'count': 7308, 'name': 'yes'}],
                      'property_key': ['building']}],
 'label:properties': ['building'],
 'label:type': 'vector',
 'license': 'ODbL-1.0'}
----------------------------
a42435 
----------------------------
{'area': 'acc', 'datetime': '2018-10-06 00:00:00Z', 'license': 'CC BY 4.0'}
raster metadata:
{'count': 4,
 'crs': CRS.from_epsg(32630),
 'driver': 'GTiff',
 'dtype': 'uint8',
 'height': 39162,
 'nodata': None,
 'transform': Affine(0.032029411960186015, 0.0, 804676.2688712641,
       0.0, -0.03202926727370731, 621829.9693785439),
 'width': 57540}

----------------------------
a42435-labels 
----------------------------
{'area': 'acc',
 'datetime': '2018-10-06 00:00:00Z',
 'label:description': 'Geojson building labels for scene a42435',
 'label:overviews': [{'counts': [{'count': 6647, 'name': 'yes'}],
                      'property_key': ['building']}],
 'label:properties': ['building'],
 'label:type': 'vector',
 'license': 'ODbL-1.0'}
----------------------------
ca041a 
----------------------------
{'area': 'acc', 'datetime': '2018-11-12 00:00:00Z', 'license': 'CC BY 4.0'}
raster metadata:
{'count': 4,
 'crs': CRS.from_epsg(32630),
 'driver': 'GTiff',
 'dtype': 'uint8',
 'height': 77778,
 'nodata': None,
 'transform': Affine(0.035820209694930036, 0.0, 807207.717115721,
       0.0, -0.035820613560994544, 620903.4357975163),
 'width': 65882}

----------------------------
ca041a-labels 
----------------------------
{'area': 'acc',
 'datetime': '2018-11-12 00:00:00Z',
 'label:description': 'Geojson building labels for scene ca041a',
 'label:overviews': [{'counts': [{'count': 10194, 'name': 'yes'}],
                      'property_key': ['building']}],
 'label:properties': ['building'],
 'label:type': 'vector',
 'license': 'ODbL-1.0'}
----------------------------
d41d81 
----------------------------
{'area': 'acc', 'datetime': '2019-07-07 00:00:00Z', 'license': 'CC BY 4.0'}
raster metadata:
{'count': 4,
 'crs': CRS.from_epsg(32630),
 'driver': 'GTiff',
 'dtype': 'uint8',
 'height': 42719,
 'nodata': None,
 'transform': Affine(0.05179965064903244, 0.0, 809267.1099320438,
       0.0, -0.05180039261972288, 618981.5459276063),
 'width': 40868}

----------------------------
d41d81-labels 
----------------------------
{'area': 'acc',
 'datetime': '2019-07-07 00:00:00Z',
 'label:description': 'Geojson building labels for scene d41d81',
 'label:overviews': [{'counts': [{'count': 9436, 'name': 'yes'}],
                      'property_key': ['building']}],
 'label:properties': ['building'],
 'label:type': 'vector',
 'license': 'ODbL-1.0'}
----------------------------

Load one image scene and create one training chip (pair of image and label) with windowed reads

In [10]:
# open one image item
SCENE_ID = 'ca041a'

one_item = cols['acc'].get_item(id=SCENE_ID)
one_item.to_dict()
Out[10]:
{'assets': {'image': {'href': 'https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_1/acc/ca041a/ca041a.tif',
   'title': 'GeoTIFF',
   'type': 'image/tiff; application=geotiff; profile=cloud-optimized'}},
 'bbox': [-0.22707525357332697,
  5.585527399115482,
  -0.20581415249279408,
  5.610742610987594],
 'collection': 'acc',
 'geometry': {'coordinates': [[[-0.2260939759101167, 5.607821019807083],
    [-0.22707525357332697, 5.609567361411101],
    [-0.2257626190986551, 5.610742610987594],
    [-0.2209214783972656, 5.60396659440964],
    [-0.2209297943096631, 5.603475955578037],
    [-0.21938590601191368, 5.601711342600872],
    [-0.21863322644066166, 5.601370284670147],
    [-0.2171984310079642, 5.60126443910544],
    [-0.2150344772406196, 5.602172946869177],
    [-0.21221192884842804, 5.603687126475405],
    [-0.2071666235973881, 5.60628622311988],
    [-0.20581415249279408, 5.604666197948947],
    [-0.2074253572000044, 5.603584221065274],
    [-0.2083544460457665, 5.6019965375946645],
    [-0.20906008314381597, 5.600996885039098],
    [-0.21070656970592522, 5.599526807751497],
    [-0.2114122068039735, 5.597897962116837],
    [-0.212416616071534, 5.595984932725826],
    [-0.2138692108714978, 5.593295072530488],
    [-0.21395275670010877, 5.593002662130355],
    [-0.21397782044869207, 5.590391854986293],
    [-0.2141449121059085, 5.590032607923272],
    [-0.21550670911225214, 5.588150738133833],
    [-0.21721104401589492, 5.585527399115482],
    [-0.21795907856051402, 5.590627248068187],
    [-0.21855973661963968, 5.59108215821591],
    [-0.2203440443835102, 5.593219794249854],
    [-0.22225201704190803, 5.59588742268891],
    [-0.2234356667466541, 5.598325387752418],
    [-0.22523764092403067, 5.601536258406711],
    [-0.22571463408862905, 5.6031880680693025],
    [-0.2259010426101557, 5.606526622234949],
    [-0.2260939759101167, 5.607821019807083]]],
  'type': 'Polygon'},
 'id': 'ca041a',
 'links': [{'href': '../collection.json',
   'rel': 'collection',
   'type': 'application/json'},
  {'href': 'https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_1/acc/ca041a/ca041a.json',
   'rel': 'self',
   'type': 'application/json'},
  {'href': '../../catalog.json', 'rel': 'root', 'type': 'application/json'},
  {'href': '../collection.json', 'rel': 'parent', 'type': 'application/json'}],
 'properties': {'area': 'acc',
  'datetime': '2018-11-12T00:00:00Z',
  'license': 'CC BY 4.0'},
 'stac_version': '0.8.1',
 'type': 'Feature'}
In [11]:
# load raster for this item
rst = rasterio.open(one_item.assets['image'].href)
rst.meta
Out[11]:
{'count': 4,
 'crs': CRS.from_epsg(32630),
 'driver': 'GTiff',
 'dtype': 'uint8',
 'height': 77778,
 'nodata': None,
 'transform': Affine(0.035820209694930036, 0.0, 807207.717115721,
       0.0, -0.035820613560994544, 620903.4357975163),
 'width': 65882}
In [12]:
# check raster resolution
rst.res
Out[12]:
(0.035820209694930036, 0.035820613560994544)
In [13]:
# make a windowed read of this raster and reshape into a displayable 4-d array (RGB+alpha channel)
# more on windowed reads with rasterio: https://rasterio.readthedocs.io/en/stable/topics/windowed-rw.html#windows

win_sz = 1024

window = Window(rst.meta['width']//2,rst.meta['height']//2,win_sz,win_sz) # 1024x1024 window starting at center of raster
win_arr = rst.read(window=window)
win_arr = np.moveaxis(win_arr,0,2)
plt.figure(figsize=(10,10))
plt.imshow(win_arr)
Out[13]:
<matplotlib.image.AxesImage at 0x7f11336ace80>