#!/usr/bin/env python
# coding: utf-8

# ## Creating a cross-model ensemble using STAC
#
# This tutorial builds a cross-collection ensemble of GDPCIR bias-corrected and downscaled data, and plots a single-variable time series for the ensemble.

# In[1]:


# required to locate and authenticate with the STAC collection
import planetary_computer
import pystac_client

# required to load a Zarr array using xarray
import xarray as xr

# optional imports used in this notebook
import pandas as pd
from dask.diagnostics import ProgressBar
from tqdm.auto import tqdm


# ### Understanding the GDPCIR collections
#
# The [CIL-GDPCIR datasets](https://planetarycomputer.microsoft.com/dataset/group/cil-gdpcir) are grouped into two collections, depending on the license the data are provided under:
#
# - [CIL-GDPCIR-CC0](https://planetarycomputer.microsoft.com/dataset/cil-gdpcir-cc0) - provided in the public domain under a [CC0 1.0 Universal Public Domain Dedication](https://creativecommons.org/publicdomain/zero/1.0/)
# - [CIL-GDPCIR-CC-BY](https://planetarycomputer.microsoft.com/dataset/cil-gdpcir-cc-by) - provided under a [CC Attribution 4.0 License](https://creativecommons.org/licenses/by/4.0/)
#
# Note that the first group, CC0, places no restrictions on the data. CC-BY 4.0 requires citation of the climate models these datasets are derived from. See the [ClimateImpactLab/downscaleCMIP6 README](https://github.com/ClimateImpactLab/downscaleCMIP6) for the citation information for each GCM.
#
# Also note that none of the descriptions of these licenses on this page, in this repository, or associated with this repository constitute legal advice. We are highlighting some of the key terms of these licenses, but this information should not be considered a replacement for the actual license terms, which are provided on the Creative Commons website at the links above.
#
# ### Structure of the STAC collection
#
# The data assets in this collection are a set of [Zarr](https://zarr.readthedocs.io/) groups which can be opened by tools like [xarray](https://xarray.pydata.org/). Each Zarr group contains a single data variable (either `pr`, `tasmax`, or `tasmin`). The Planetary Computer provides a single STAC item per experiment, and each STAC item has one asset per data variable.
#
# Altogether, the collection is just over 21 TB, with 247,997 individual files. The STAC collection is here to help you search and make sense of this huge archive!
#
# For example, let's take a look at the CC0 collection:

# In[2]:


catalog = pystac_client.Client.open(
    "https://planetarycomputer.microsoft.com/api/stac/v1/",
    modifier=planetary_computer.sign_inplace,
)
collection_cc0 = catalog.get_collection("cil-gdpcir-cc0")
collection_cc0


# In[3]:


collection_cc0.summaries.to_dict()


# The CC0 (Public Domain) collection has three models, from which a historical scenario and four future simulations are available.
#
# *Note that not all models provide all simulations. See the [ClimateImpactLab/downscaleCMIP6 README](https://github.com/ClimateImpactLab/downscaleCMIP6) for a list of the available model/scenario/variable combinations.*
#
# ### Querying the STAC API
#
# Use the Planetary Computer STAC API to find the exact data you want. You'll most likely want to query on the controlled-vocabulary fields under the `cmip6:` prefix. See the collection summary for the set of allowed values for each of those.
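# As a minimal sketch of such a query (the `cmip6:source_id` value below is just
# an illustrative choice; check the collection summary above for the full set of
# allowed values), you can filter a single collection on any of the `cmip6:` fields:

# In[4]:


search = catalog.search(
    collections=["cil-gdpcir-cc0"],
    query={"cmip6:source_id": {"eq": "FGOALS-g3"}},
)
items = search.item_collection()
len(items)


# In[5]:


# each matching item is a single model/experiment combination
[item.id for item in items]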
# ### Combining collections to form a custom ensemble
#
# As an example, if you would like to use both the CC0 and CC-BY collections, you can combine them as follows:

# In[6]:


search = catalog.search(
    collections=["cil-gdpcir-cc0", "cil-gdpcir-cc-by"],
    query={"cmip6:experiment_id": {"eq": "ssp370"}},
)
ensemble = search.item_collection()
len(ensemble)


# In[7]:


import collections

collections.Counter(x.collection_id for x in ensemble)


# ### Reading a single variable across models into xarray

# In[8]:


# select this variable ID for all models in a collection
variable_id = "tasmax"

datasets_by_model = []
for item in tqdm(ensemble):
    asset = item.assets[variable_id]
    datasets_by_model.append(
        xr.open_dataset(asset.href, **asset.extra_fields["xarray:open_kwargs"])
    )

all_datasets = xr.concat(
    datasets_by_model,
    dim=pd.Index([ds.attrs["source_id"] for ds in datasets_by_model], name="model"),
    combine_attrs="drop_conflicts",
)


# In[9]:


all_datasets


# ### Subsetting the data
#
# Now that the metadata has been loaded into xarray, you can use xarray's methods for [Indexing and Selecting Data](https://xarray.pydata.org/en/latest/user-guide/indexing.html) to subset the arrays to the portions meaningful to your analysis.
#
# Note that the data has not been read yet; so far we are only working with the coordinates to schedule the task graph using [dask](https://docs.xarray.dev/en/latest/user-guide/dask.html).

# In[10]:


# let's select a subset of the data for the first five days of 2020 over Japan.
# Thanks to https://gist.github.com/graydon/11198540 for the bounding box!
subset = all_datasets.tasmax.sel(
    lon=slice(129.408463169, 145.543137242),
    lat=slice(31.0295791692, 45.5514834662),
    time=slice("2020-01-01", "2020-01-05"),
)


# In[11]:


# trigger the read, with a progress bar for the dask computation
with ProgressBar():
    subset = subset.compute()


# In[12]:


subset


# At this point, you could do anything you like with the data. See the great [xarray getting started guide](https://xarray.pydata.org/en/latest/getting-started-guide/quick-overview.html) for more information. For now, we'll plot it all!

# In[13]:


subset.plot(row="model", col="time");
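# As one example of a next step (a minimal sketch, not part of the workflow above),
# you can collapse the `model` dimension with xarray's `.mean` to plot the
# cross-model ensemble mean for each day:

# In[14]:


# average over the model dimension, keeping the variable's metadata
ensemble_mean = subset.mean(dim="model", keep_attrs=True)
ensemble_mean.plot(col="time");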