If a given dataset is in the cloud we have a set of libraries to access them. If the data is in a cloud friendly format we can efficiently load only what we need. If not we may need to read entire files.
Dependencies:
.netrc
file in home directoryGlossary
Workflow
import requests
from pprint import pprint
from pathlib import Path
import s3fs
import fsspec
# added this two for the h5 example
import numpy
import h5py
import xarray as xr
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
# setting the endpoints and credentials
# Here we know we want a certain dataset and know the concept ID
# This endpoint is specific to daac, if we want cloud data from a different DAAC
# we need to change it. See: https://raw.githubusercontent.com/betolink/earthdata/main/earthdata/daac.py
s3_cred_endpoint = 'https://data.ornldaac.earthdata.nasa.gov/s3credentials'
cmr_search_url = 'https://cmr.earthdata.nasa.gov/search'
cmr_granule_url = f'{cmr_search_url}/{"granules"}'
def get_temp_creds():
temp_creds_url = s3_cred_endpoint
return requests.get(temp_creds_url).json()
temp_creds_req = get_temp_creds()
s3_fs = s3fs.S3FileSystem(
key=temp_creds_req['accessKeyId'],
secret=temp_creds_req['secretAccessKey'],
token=temp_creds_req['sessionToken'],
client_kwargs={'region_name':'us-west-2'},
)
# Search for granule with specified parameters
# In this case we know the dataset id in advance but we could use CMR to look for one
concept_id = 'C2114031882-ORNL_CLOUD'
# Bounding Box spatial parameter in decimal degree 'W,S,E,N' format.
# If the dataset is global the bbox is just ignored because it will match anything.
bounding_box = '-105,21,-125,32'
# Each date in yyyy-MM-ddTHH:mm:ssZ format; date range in start,end format
temporal = '2019-06-22T00:00:00Z,2019-06-23T23:59:59Z'
# setting up query to CMR and explore meta data to find direct links
response = requests.get(cmr_granule_url,
params={
'concept_id': concept_id,
'temporal': temporal,
'bounding_box': bounding_box,
'page_size': 200,
},
headers={
'Accept': 'application/json'
}
)
# These are the metadata records for each granule, this is where we get our links to the data
# If the data is in the cloud the link prefix will start with s3://
granules = response.json()['feed']['entry']
#urls = []
#can uncomment to find links you need that points to the s3 data
#for granule in granules:
# print(granule['links'])
# we knew we need the files that end with s3# from looking at the links
# Get urls of the data
urls = []
for granule in granules:
for link in granule['links']:
if link['rel'].endswith('/s3#'):
urls.append(link['href'])
break
pprint(urls)
['s3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019173011100_O02969_T02656_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019173024347_O02970_T01081_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019173041633_O02971_T01082_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019173054919_O02972_T05352_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019173072205_O02973_T05353_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019173085451_O02974_T02508_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019173102737_O02975_T03779_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019173120023_O02976_T00934_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019173133309_O02977_T00935_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019173150556_O02978_T05205_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019173181128_O02980_T02361_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019173194414_O02981_T00786_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019173211700_O02982_T00787_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019173224946_O02983_T05057_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019174002232_O02984_T02212_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019174015518_O02985_T03483_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019174032805_O02986_T03484_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019174050051_O02987_T00639_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019174063337_O02988_T04909_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019174080623_O02989_T02064_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019174093909_O02990_T02065_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019174111155_O02991_T03336_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019174124441_O02992_T03337_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019174141727_O02993_T00492_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019174172300_O02995_T03187_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019174185546_O02996_T03188_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019174202832_O02997_T00343_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019174220118_O02998_T00344_02_001_01.h5', 's3://ornl-cumulus-prod-protected/gedi/GEDI_L4A_AGB_Density/data/GEDI04_A_2019174233404_O02999_T04614_02_001_01.h5']
# Open 1 granule (file) on S3 to explore the data
%time
file = s3_fs.open(urls[0])
ds = xr.open_dataset(file)
ds
CPU times: user 2 µs, sys: 0 ns, total: 2 µs Wall time: 6.91 µs
<xarray.Dataset> Dimensions: () Data variables: *empty* Attributes: short_name: GEDI_L4A
From looking at ds, we can see it is a GEDI level4 dataset. We will look into the dataset more to find what data is in it.
# Explored the data file hierarchy and chose a variable of interest
dataset=h5py.File(file,'r')
dataset.keys()
<KeysViewHDF5 ['ANCILLARY', 'BEAM0000', 'BEAM0001', 'BEAM0010', 'BEAM0011', 'BEAM0101', 'BEAM0110', 'BEAM1000', 'BEAM1011', 'METADATA']>
#now lets see what is in beam0001 by doing the same thing
beam1=dataset['BEAM0001']
beam1.keys()
<KeysViewHDF5 ['agbd', 'agbd_pi_lower', 'agbd_pi_upper', 'agbd_prediction', 'agbd_se', 'agbd_t', 'agbd_t_se', 'algorithm_run_flag', 'beam', 'channel', 'degrade_flag', 'delta_time', 'elev_lowestmode', 'geolocation', 'l2_quality_flag', 'l4_quality_flag', 'land_cover_data', 'lat_lowestmode', 'lon_lowestmode', 'master_frac', 'master_int', 'predict_stratum', 'predictor_limit_flag', 'response_limit_flag', 'selected_algorithm', 'selected_mode', 'selected_mode_flag', 'sensitivity', 'shot_number', 'solar_elevation', 'surface_flag', 'xvar']>
#look at a agbd to understand more about the data
print("agbd data: {}".format(beam1['agbd']))
print("agbd data attributes: {}".format(list(beam1['agbd'].attrs)))
agbd data: <HDF5 dataset "agbd": shape (364708,), type "<f4"> agbd data attributes: ['coordinates', 'description', 'long_name', 'units']
# look at a geolocation to understand more about the data (didn't look promising)
print("geolocation data: {}".format(beam1['geolocation']))
print("geolocation data attributes: {}".format(list(beam1['geolocation'].attrs)))
geolocation data: <HDF5 group "/BEAM0001/geolocation" (30 members)> geolocation data attributes: []
# look at a lat_lowest mode to understand more about the data
print("lat data: {}".format(beam1['lat_lowestmode']))
print("lat data attributes: {}".format(list(beam1['lat_lowestmode'].attrs)))
lat data: <HDF5 dataset "lat_lowestmode": shape (364708,), type "<f8"> lat data attributes: ['coordinates', 'description', 'long_name', 'source', 'units', 'valid_range']
# now check out its units
print("Unit: {}".format(beam1['lat_lowestmode'].attrs['units']))
Unit: degrees
Now we are going to get the variables we want out of the h5 file
variable_names = [
'/BEAM0001/lat_lowestmode',
'/BEAM0001/lon_lowestmode',
'/BEAM0001/agbd'
]
with h5py.File(file, 'r') as h5:
data_vars = {}
for varname in variable_names:
var = h5[varname]
name = varname.split('/')[-1]
# Convert attributes
attrs = {}
for k, v in var.attrs.items():
if k != 'DIMENSION_LIST':
if isinstance(v, bytes):
attrs[k] = v.decode('utf-8')
else:
attrs[k] = v
data = var[:]
if '_FillValue' in attrs:
data = np.where(data < attrs['_FillValue'], data, np.nan)
data_vars[name] = (['segment'], data, attrs)
gedi_ds = xr.Dataset(data_vars)
gedi_ds
<xarray.Dataset> Dimensions: (segment: 364708) Dimensions without coordinates: segment Data variables: lat_lowestmode (segment) float64 -43.12 -43.12 -43.12 ... -51.81 -51.81 lon_lowestmode (segment) float64 0.1851 0.1857 0.1863 ... -63.85 -63.85 agbd (segment) float32 -9.999e+03 -9.999e+03 ... -9.999e+03
array([-43.11873616, -43.11845488, -43.11817365, ..., -51.805011 , -51.80500994, -51.80500883])
array([ 0.18511495, 0.18569463, 0.18627412, ..., -63.84809997, -63.84727722, -63.84645444])
array([-9999., -9999., -9999., ..., -9999., -9999., -9999.], dtype=float32)
#plot agbd from the gedi dataset
gedi_ds.agbd.plot() ;
#plot location of data and values
map_proj = ccrs.PlateCarree()
fig = plt.figure(figsize=(10,5))
ax = fig.add_subplot(projection=map_proj)
ax.coastlines()
# Plot IS2 surface height
gedi_img = ax.scatter(gedi_ds.lon_lowestmode, gedi_ds.lat_lowestmode,
c=gedi_ds.agbd,
vmax=2000, # Set max height to plot
cmap='Reds', alpha=0.6, s=2
)
# Add colorbars
fig.colorbar(gedi_img, label='GEDI agbd (m)')
<matplotlib.colorbar.Colorbar at 0x7f5aeb2dd8e0>