#!/usr/bin/env python # coding: utf-8 # ## Accessing NRCAN Landcover data with the Planetary Computer STAC API # # [National Resources Canada (NRCAN) Landcover](https://www.nrcan.gc.ca/maps-tools-publications/satellite-imagery-air-photos/application-development/land-cover/21755) is derived from cloud-free mosaics from thousands of satellite images collected in the short summer months when snow and ice cover is lowest. Advanced machine learning techniques and complex algorithms developed and verified through fieldwork are used to identify land cover types within the images. The result is a collection of continental, national scale and regional land cover products encompassing different time periods and frequencies, and at different spatial resolutions. # # In this notebook, we'll demonstrate how to access and work with this data through the Planetary Computer. # # Complete documentation for this dataset is available on the [Planetary Computer data catalog](https://planetarycomputer.microsoft.com/dataset/nrcan-landcover). # ### Environment setup # # This notebook works with or without an API key, but you will be given more permissive access to the data with an API key. # The [Planetary Computer Hub](https://planetarycomputer.microsoft.com/compute) is pre-configured to use your API key. # In[1]: from matplotlib.colors import ListedColormap import pystac_client from pystac.extensions.projection import ProjectionExtension as proj import cartopy.crs as ccrs import matplotlib.pyplot as plt import numpy as np import pandas as pd import planetary_computer import rasterio import rasterio.features import stackstac # Set the environment variable PC_SDK_SUBSCRIPTION_KEY, or set it here. # The Hub sets PC_SDK_SUBSCRIPTION_KEY automatically. # pc.settings.set_subscription_key() # ### Data access # # The datasets hosted by the Planetary Computer are available from [Azure Blob Storage](https://docs.microsoft.com/en-us/azure/storage/blobs/). 
# ### Data access
#
# We'll use pystac-client to search the Planetary Computer's STAC API for the
# subset of the data we care about, then load the data directly from Azure
# Blob Storage. The `modifier` signs asset HREFs so assets in private Blob
# Storage containers can be read. See the Planetary Computer docs on
# "Reading from the STAC API" and "Using tokens for data access" for more.

catalog = pystac_client.Client.open(
    "https://planetarycomputer.microsoft.com/api/stac/v1",
    modifier=planetary_computer.sign_inplace,
)

# ### Select a region and find data items
#
# An area surrounding Vancouver, expressed as a closed GeoJSON polygon ring
# built from the bounding-box corners (west, south, east, north).

west, south = -124.134521484375, 49.00724918431423
east, north = -121.75872802734375, 50.07300647938297
area_of_interest = {
    "type": "Polygon",
    "coordinates": [
        [
            [west, south],
            [east, south],
            [east, north],
            [west, north],
            [west, south],
        ]
    ],
}

bounds_latlon = rasterio.features.bounds(area_of_interest)

# Query the STAC API for items in the collection that intersect our polygon,
# then fetch them and report how many were found.
search = catalog.search(collections=["nrcan-landcover"], intersects=area_of_interest)
items = search.item_collection()
print(f"Returned {len(items)} Items")

# The returned items are non-overlapping GeoTIFF tiles. Merging them by hand
# would mean opening each one, clipping to the area of interest, mosaicking
# with rasterio, and reprojecting across possibly multiple UTM zones.
# Instead of clipping, mosaicking, and reprojecting each tile by hand, use
# stackstac (https://stackstac.readthedocs.io/) to read, merge, and reproject
# in a single lazy step — without loading any file data we don't need.

# The STAC metadata carries what we need to build the merged dataset:
# the EPSG code of the first item and the asset's nodata sentinel.
item = items[0]
epsg = proj.ext(item).epsg
nodata = item.assets["landcover"].extra_fields["raster:bands"][0]["nodata"]

# Build one lazy DataArray from all results, with every raster projected to a
# single CRS and clipped to our lat/lon bounds, then mask out nodata pixels.
stack = stackstac.stack(items, epsg=epsg, bounds_latlon=bounds_latlon)
stack = stack.where(stack != nodata)
stack

# ### Mosaic and clip the raster
#
# Nothing has been read yet — stackstac only used STAC metadata to construct
# the DataArray. Mosaic across the `time` dimension (all items share a single
# synthesized 2020 "time"), drop the singleton `band` dimension, and have Dask
# read the actual pixels via `.compute()`.
merged = stackstac.mosaic(stack, dim="time", axis=None).squeeze().compute()
merged

# Quick sanity plot to confirm we have the data we want. It won't look like a
# land-cover map yet: the source GeoTIFFs carry a colormap and the STAC
# metadata carries class names, which we apply next.
merged.plot()
plt.show()
# ### Class names and colors
#
# Combine the class names from the STAC metadata with the colormap embedded in
# one of the source GeoTIFFs so the plot shows proper land-cover colors and a
# labeled legend.

# Map landcover pixel value -> human-readable class summary.
file_values = {
    d["values"][0]: d["summary"]
    for d in items[0].assets["landcover"].extra_fields["file:values"]
}
min_value = int(min(file_values.keys()))
max_value = int(max(file_values.keys()))
# One label per integer value in [min_value, max_value]; values without a
# documented class are labeled "Empty class". (The original used
# enumerate(range(...), start=min_value), but index and value were always
# equal, so a single loop variable suffices.)
class_names = [
    f"{value}: {file_values.get(value, 'Empty class')}"
    for value in range(min_value, max_value + 1)
]

# Read the embedded colormap from one source file, keep only entries in the
# named-class range, and convert to matplotlib's 0-1 RGBA format.
with rasterio.open(item.assets["landcover"].href) as src:
    colormap = src.colormap(1)  # band-1 colormap: {pixel value: (R, G, B, A)}
colors_rgb = [c for i, c in colormap.items() if min_value <= i <= max_value]
colors_mpl = [np.array(c) / 255 for c in colors_rgb]
cmap = ListedColormap(colors_mpl)

# Plot the mosaic with the class colormap and a colorbar labeled by class.
fig, ax = plt.subplots(
    figsize=(18, 9), dpi=125, subplot_kw=dict(projection=ccrs.epsg(epsg)), frameon=False
)
p = merged.plot(
    ax=ax,
    transform=ccrs.epsg(epsg),
    cmap=cmap,
    add_colorbar=False,
    vmin=min_value,
    vmax=max_value + 1,
)
ax.set_title("Land Cover Around Vancouver")
cbar = plt.colorbar(p)
# Fix: tick positions must align one-to-one with class_names. The original
# range(1, max_value + 1) only has the right length when min_value == 1.
cbar.set_ticks(range(min_value, max_value + 1))
cbar.set_ticklabels(class_names)

# Percentage of the scene covered by each class. NaN (masked nodata) pixels
# are dropped from the counts but still included in the total-pixel
# denominator, so percentages may not sum to 100.
ax = plt.figure(figsize=(8, 6), dpi=100).gca()
colors = list(cmap.colors)
df = (
    # Fix: the top-level pd.value_counts() is deprecated (removed in pandas
    # 3.0); the Series method produces the identical result.
    pd.Series(merged.data.ravel())
    .value_counts(sort=False)
    .divide(merged.shape[0] * merged.shape[1] / 100)
    .sort_index()
    .reindex(range(min_value, max_value + 1), fill_value=0)
    .rename(dict(enumerate(class_names, start=min_value)))
)
df.plot.barh(ax=ax, color=colors, rot=0, width=0.9, linewidth=1, edgecolor="black")
ax.set(
    title="Distribution of Land Cover Classes",
    ylabel="Landcover Class",
    xlabel="Percentage Cover",
    facecolor="white",
)