If you run into any issues or have questions or concerns about the data catalog API, usage patterns, or anything else, please do not hesitate to email danf@usc.edu.
Thanks, Dan Feldman
# Prerequisites: python 3.6 or later
import requests
import json
import uuid
import pprint
import datetime
# Pretty-printer used throughout the script to display API responses readably.
pp = pprint.PrettyPrinter(indent=2)
# Data Catalog api endpoint url
url = "https://api.mint-data-catalog.org"
# Step 1: Get session token to use the API
# NOTE(review): no error handling here — if the endpoint is unreachable or
# returns a non-JSON body, .json() raises; presumably acceptable for a tutorial.
resp = requests.get(f"{url}/get_session_token").json()
print(resp)
# The token comes back under the 'X-Api-Key' key and must be echoed back
# verbatim in the headers of every subsequent request.
api_key = resp['X-Api-Key']
request_headers = {
    'Content-Type': "application/json",
    'X-Api-Key': api_key
}
# This is a convenience method to handle api responses. The main portion of the notebook starts in the the next cell
def handle_api_response(response, print_response=True):
    """Parse a Data Catalog API response and fail loudly on error statuses.

    Parameters
    ----------
    response : requests.Response
        Raw response from one of the data catalog endpoints.
    print_response : bool
        When True, pretty-print the parsed JSON body before status handling.

    Returns
    -------
    dict
        The parsed JSON body, when the status is 200.

    Raises
    ------
    Exception
        For 400 (bad request), 403 (missing/invalid X-Api-Key), and any
        other non-200 status (unexpected server-side error).
    """
    parsed_response = response.json()
    if print_response:
        pp.pprint(parsed_response)

    if response.status_code == 200:
        return parsed_response
    elif response.status_code == 400:
        raise Exception("Bad request ^")
    elif response.status_code == 403:
        msg = "Please make sure your request headers include X-Api-Key and that you are using correct url"
        raise Exception(msg)
    else:
        # Bug fix: this message used to be built but never surfaced — the
        # function silently returned the body for 5xx statuses. Raise it,
        # consistent with the other error branches.
        msg = """It seems our server encountered an error which it doesn't know how to handle yet.
This sometimes happens with unexpected input(s). In order to help us diagnose and resolve the issue,
please notify Dan Feldman (danf@usc.edu) of this error."""
        raise Exception(msg)
# 1. Search for datasets whose names start with "CHIRPS" (or "chirps"; the
# match is case-insensitive). Note the wildcard character '*' in the query —
# without it, the query searches for exact matches only.
search_query = {
    "dataset_names__in": ["CHIRPS*"]
}
resp = requests.post(f"{url}/find_datasets",
                     headers=request_headers,
                     json=search_query)
# The endpoint returns a list of matching datasets, each with its record_id,
# description, name, and metadata.
response = handle_api_response(resp, print_response=True)
# Suppose we are interested in "CHIRPS_accumulated_precipitation_one_month".
# We pull it out of the response with next() over a generator expression; any
# other way of iterating/filtering works too, since the response is a
# standard dict/list combo.
# NOTE(review): next() raises StopIteration if no dataset has that exact name.
desired_dataset = next(dataset for dataset in response["datasets"] if dataset["dataset_name"] == "CHIRPS_accumulated_precipitation_one_month")
pp.pprint(desired_dataset)
# 2. Once we've chosen our dataset, we can use the data catalog API to find
# more information about it. First, note the dataset's record_id — the other
# data catalog API endpoints key off it.
dataset_id = desired_dataset["dataset_id"]
print(dataset_id)
# 2.1 View dataset's variables
search_query = {
    "dataset_id": dataset_id
}
resp = requests.post(f"{url}/datasets/dataset_variables",
                     headers=request_headers,
                     json=search_query)
# Returns the list of variables (along with each variable's metadata) that
# the chosen dataset contains.
dataset_variables = handle_api_response(resp, print_response=True)
# 2.2 Search dataset's resources based on temporal and spatial coverage
# ----- WARNING -----------------------------------------------------------------------------
# The API caps the number of records returned per call: 20 by default, up to
# 2000 when an explicit limit is supplied. Some datasets (like CHIRPS) hold
# ~100k resources, so spatial and temporal filters are important to keep the
# result set manageable.
# -------------------------------------------------------------------------------------------
# 2.2.1 Spatial coverage is a lat/lon bounding box in the WGS84 coordinate
# system, given as a 4-element numeric array [xmin, ymin, xmax, ymax]
# (x = longitude, y = latitude). For example, the bounding box for
# Ethiopia+SNNPR+KAT (adm level 2) is
# {"xmax": 38.062137603759766, "xmin": 37.3511962890625, "ymax": 7.4791812896728525, "ymin": 7.147633552551269}
# Exact coordinates are not required — the data catalog supports "within"
# and "intersects" geospatial queries.
bounding_box = [37.0, 7.0, 39.0, 8.0]
# 2.2.2 Temporal coverage is given as start/end times in ISO8601 format.
# Supported comparison operators:
#   gt (greater than), gte (greater than or equal),
#   lt (less than),    lte (less than or equal)
# The pair below covers the entirety of 2018.
start_time = "2018-01-01T00:00:00"
end_time = "2018-12-31T23:59:59"
# Assemble the complete search query (with the 2000-record ceiling).
search_query = {
    "spatial_coverage__within": bounding_box,
    "start_time__gte": start_time,
    "end_time__lte": end_time,
    "limit": 2000,
}
# Execute the resource search.
# NOTE(review): this query carries no dataset filter (e.g. the dataset_id
# from step 2), so it appears to search resources across all datasets within
# the given bbox/time window — confirm whether a dataset filter should be
# added here.
resp = requests.post(f"{url}/datasets/find",
                     headers=request_headers,
                     json=search_query)
response = handle_api_response(resp, print_response=True)
resources = response["resources"]
# Each returned resource record contains the parent dataset's record_id and
# metadata, plus the resource's own record_id, name, metadata, and data_url.
# NOTE(review): resources[0] raises IndexError if the search matched nothing.
resource_record = resources[0]
pp.pprint(resource_record)
# To download the actual data, use the url in the "resource_data_url" field:
data_url = resource_record["resource_data_url"]
print(data_url)