#!/usr/bin/env python
# coding: utf-8

# ## How to use the Radiant MLHub API to browse and download the LandCoverNet dataset
#
# Radiant MLHub Logo
#
# This Jupyter notebook, which you may copy and adapt for any use, shows basic examples of how to use the API to download labels and source imagery for the LandCoverNet dataset. Full documentation for the API is available at [docs.mlhub.earth](http://docs.mlhub.earth).
#
# We'll show you how to set up your authorization, list collection properties, and retrieve the items (the data contained within them) from those collections.
#
# Each item in our collection is described in JSON format, compliant with the STAC label extension definition.

# ### Dataset Citation
#
# Alemohammad S.H., Ballantyne A., Bromberg Gaber Y., Booth K., Nakanuku-Diggs L., & Miglarese A.H. (2020) "LandCoverNet: A Global Land Cover Classification Training Dataset", Version 1.0, Radiant MLHub. \[Date Accessed\] [https://doi.org/10.34911/rdnt.d2ce8i](https://doi.org/10.34911/rdnt.d2ce8i)

# ### Authentication
#
# Access to the Radiant MLHub API requires an API key. To get your API key, go to [mlhub.earth](https://mlhub.earth/) and click the "Sign in / Register" button in the top right to log in. If you have not used Radiant MLHub before, you will need to sign up and create a new account; otherwise, just sign in. Once you have signed in, click on your user avatar in the top right and select "Settings & API keys" from the dropdown menu.
#
# In the **API Keys** section of this page, you will be able to create new API key(s). *Do not share* your API key with others as this may pose a security risk.
#
# Next, we will create a `MLHUB_API_KEY` variable that `pystac-client` will later use to add our API key to all requests:

# In[1]:

import getpass

MLHUB_API_KEY = getpass.getpass(prompt="MLHub API Key: ")
MLHUB_ROOT_URL = "https://api.radiant.earth/mlhub/v1"

# In[2]:

import os
import requests
import shutil
import tempfile
from pathlib import Path
import itertools as it
from urllib.parse import urljoin

from pystac import Item
from pystac.extensions.eo import EOExtension
from pystac.extensions.label import LabelRelType
from pystac.extensions.scientific import ScientificExtension
from pystac_client import Client

client = Client.open(
    MLHUB_ROOT_URL, parameters={"key": MLHUB_API_KEY}, ignore_conformance=True
)

# Next, we will create a custom `requests.Session` instance that will automatically include our API key in requests and resolve relative URLs against the API root.

# In[3]:

class MLHubSession(requests.Session):
    def __init__(self, *args, api_key=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.params.update({"key": api_key})

    def request(self, method, url, *args, **kwargs):
        url_prefix = MLHUB_ROOT_URL.rstrip("/") + "/"
        url = urljoin(url_prefix, url)
        return super().request(method, url, *args, **kwargs)

session = MLHubSession(api_key=MLHUB_API_KEY)
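# If you want to confirm that the session is configured correctly, a minimal check like the following lists how many collections the API exposes. This is a sketch assuming the standard STAC API `/collections` endpoint, which is not used elsewhere in this notebook.

# In[ ]:

# Sanity check: the session resolves the relative path against MLHUB_ROOT_URL
# and appends our API key automatically. The standard STAC API "collections"
# endpoint returns a JSON object with a "collections" array.
response = session.get("collections")
print(f"Collections available: {len(response.json()['collections'])}")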
# ### Listing Collection Properties
#
# The following cell makes a request to the API for the metadata of the LandCoverNet labels collection and prints out a few important properties.

# In[4]:

collection_id = "ref_landcovernet_af_v1_labels"
collection = client.get_collection(collection_id)
collection_sci_ext = ScientificExtension.ext(collection)

print(f"Description: {collection.description}")
print(f"License: {collection.license}")
print(f"DOI: {collection_sci_ext.doi}")
print(f"Citation: {collection_sci_ext.citation}")

# ### Finding Possible Land Cover Labels
#
# Each item has a property which lists all of the possible land cover types in the dataset and which ones are present in the current item. The code below prints out the land cover types present in the dataset; we will reference these later in the notebook when we filter downloads.

# In[5]:

item_search = client.search(collections=[collection_id])
first_item = next(item_search.get_items())
labels_asset = first_item.get_assets()["labels"]
classification_labels = labels_asset.to_dict()["file:values"]

for cl in classification_labels:
    classification_id = cl["value"][0]
    classification_label = cl["summary"]
    print(f"{classification_id}: {classification_label}")

# ### Downloading Assets
#
# For this exercise, we will find the first Item that contains labels with the `"Woody Vegetation"` class and download the label asset for that Item. We will then follow the link to the source imagery for these labels and download the RGB band assets for that imagery.
#
# First, we create a temporary directory into which we will download the assets.

# In[6]:

tmp_dir = tempfile.mkdtemp()

# #### Downloading Labels
#
# Next, we search for Items in our collection and inspect the `"label:overviews"` property to find an item that contains `"Woody Vegetation"`.

# In[7]:

for item in item_search.get_items():
    labels_count = item.properties["label:overviews"][0]["counts"]
    item_labels = [lc["name"] for lc in labels_count]
    if "Woody Vegetation" in item_labels:
        break

print(f"Item ID: {item.id}")
print("Classification labels:")
for label in item_labels:
    print(f"- {label}")
print("Assets:")
for asset_key in item.assets.keys():
    print(f"- {asset_key}")

# We can see that this Item has a `"labels"` asset, which contains the segmentation labels for this dataset. We can download these labels using the `"href"` property of the asset.

# In[8]:

labels_path = os.path.join(tmp_dir, "labels.tif")
labels_href = item.assets["labels"].href

print(f"Downloading labels from {labels_href}")
response = requests.get(labels_href, allow_redirects=True)
with open(labels_path, "wb") as dst:
    dst.write(response.content)
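# Before moving on, you may want to verify the contents of the downloaded label raster. The sketch below assumes `rasterio` and `numpy` are installed (neither is required elsewhere in this notebook); it reads the raster and counts the pixels in each class, which should include the `"Woody Vegetation"` class we searched for.

# In[ ]:

# A minimal sketch for inspecting the downloaded labels. rasterio and numpy
# are assumptions here: they are not used elsewhere in this notebook and
# must be installed separately.
import numpy as np
import rasterio

with rasterio.open(labels_path) as src:
    label_array = src.read(1)  # the class values are stored in the first band

values, counts = np.unique(label_array, return_counts=True)
for value, count in zip(values, counts):
    print(f"Class {value}: {count} pixels")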
# #### Downloading Source Imagery
#
# Let's find the source imagery associated with those labels by examining the Item links (source imagery links will have a `"rel"` type of `"source"`).

# In[9]:

source_imagery_links = item.get_links(rel=LabelRelType.SOURCE)

links_limit = 10
print(f"Source Imagery Links: {len(source_imagery_links)}")
for link in it.islice(source_imagery_links, links_limit):
    print(f"- {link.href}")
if len(source_imagery_links) > links_limit:
    print("...")

# We can see that there are 167 different source images that can be associated with these labels. Let's grab the STAC Item for the first one so we can download the RGB band assets for that image. Because the Radiant MLHub API requires authentication to retrieve Items, we cannot use the standard PySTAC methods for resolving STAC Objects. Instead, we will use the custom `requests.Session` instance we created above.

# In[10]:

image_link = source_imagery_links[0]
response = session.get(image_link.href)
image_item = Item.from_dict(response.json())

print(f"Item ID: {image_item.id}")
print("Assets:")
for asset_key, asset in image_item.assets.items():
    print(f"- Asset Key: {asset_key}")
    asset_eo_ext = EOExtension.ext(asset)
    if asset_eo_ext.bands is not None:
        band_names = ", ".join(band.common_name for band in asset_eo_ext.bands)
        print(f"  Bands: {band_names}")

# Since we are interested in the RGB bands for this image, we will download the `"B04"`, `"B03"`, and `"B02"` assets.

# In[11]:

for asset_key in {"B04", "B03", "B02"}:
    file_path = os.path.join(tmp_dir, f"image-{asset_key}")
    asset = image_item.assets[asset_key]
    response = session.get(asset.href, allow_redirects=True)
    with open(file_path, "wb") as dst:
        dst.write(response.content)

# Let's confirm that our downloads are all in the temporary directory.

# In[12]:

os.listdir(tmp_dir)

# ### Download Collection Archives
#
# If you are interested in downloading all label and/or source imagery assets for this dataset, you can do so using the `/archive/{collection_id}` endpoint documented [here](https://docs.mlhub.earth/#operation/Download_Archive_archive__collection_id__get). You can see an example of using a custom `requests.Session` instance to download this archive in the ["Using the Radiant MLHub API" tutorial](./using-radiant-mlhub-api.ipynb#Download-Data-Archives). Before downloading the archive, you can also use the `/archive/{collection_id}/info` endpoint to determine the size of the archive file.

# In[13]:

response = session.get(f"/archive/{collection_id}/info")
response.json()

# In[14]:

archive_path = os.path.join(tmp_dir, f"{collection_id}.tar.gz")
response = session.get(f"/archive/{collection_id}", allow_redirects=True)
with open(archive_path, "wb") as dst:
    dst.write(response.content)

# Let's check that our archive file was successfully downloaded.

# In[15]:

archive_path_obj = Path(archive_path)
print(f"Archive Exists?: {archive_path_obj.exists()}")
print(f"Archive Size: {archive_path_obj.stat().st_size}")

# It does, and the size matches what we expected from the info endpoint!

# ### Clean Up
#
# Finally, we remove the temporary directory and its contents.

# In[16]:

shutil.rmtree(tmp_dir)
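# ### Appendix: Extracting a Collection Archive
#
# If you plan to work with the full collection archive rather than individual assets, you will probably want to download it to a persistent location and extract it. The sketch below downloads the archive again to a `data` directory (a hypothetical destination chosen for illustration) and extracts it with Python's built-in `tarfile` module; the response is streamed in chunks so the full archive is never held in memory.

# In[ ]:

import tarfile

# Hypothetical persistent destination for the archive (illustration only).
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
archive_file = data_dir / f"{collection_id}.tar.gz"

# Stream the archive to disk in 1 MB chunks instead of buffering it in memory.
with session.get(f"/archive/{collection_id}", allow_redirects=True, stream=True) as response:
    with open(archive_file, "wb") as dst:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            dst.write(chunk)

# tarfile auto-detects the gzip compression when opened in the default mode.
with tarfile.open(archive_file) as archive:
    archive.extractall(path=data_dir)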