#!/usr/bin/env python
# coding: utf-8

# # 🌍 Analyzing Sea Level Rise Using Earth Data in the Cloud

# ### This notebook is entirely based on Jinbo Wang's [tutorial](https://github.com/betolink/the-coding-club/blob/main/notebooks/Earthdata_webinar_20220727.ipynb)
#
# **We are going to plot the orange line.**

# In[ ]:


# We only need to run this once; restart the kernel (and clear the output) after installing the library
get_ipython().system('pip install git+https://github.com/nsidc/earthdata.git@main')


# In[1]:


from earthdata import Auth, Store, DataGranules, DataCollections
from pprint import pprint

auth = Auth().login(strategy="netrc")
# are we authenticated?
if not auth.authenticated:
    # ask for credentials and persist them in a .netrc file
    auth.login(strategy="interactive", persist=True)

# The Store class will let us download data from NASA directly
store = Store(auth)


# # Authentication in the cloud
#
# ### If the collection is cloud-hosted, we can print its `summary()` and get the S3 credentials endpoint. These S3 credentials are temporary, and we can use them with third-party libraries such as s3fs, boto3, or awscli.

# In[2]:


# We'll get 4 collections that match our keyword
collections = DataCollections(auth).keyword("SEA SURFACE HEIGHT").cloud_hosted(True).get(4)

# Let's print the first 2 collections
for collection in collections[0:2]:
    # pprint() returns None, so print the summary and the abstract in separate statements
    pprint(collection.summary())
    print(collection.abstract(), "\n")


# ## A year of data
#
# Things to keep in mind:
#
# * this particular dataset has data up to 2019
# * this is a global dataset; each granule covers the whole world
# * temporal coverage is one granule every 5 days

# In[3]:


granules = DataGranules().concept_id("C2036880672-POCLOUD").temporal("2017-01", "2018-01").get()
print(len(granules))


# ## Working with the URLs directly
#
# If we choose, we can use `earthdata` to grab the files' URLs and then download them with another library (provided we have a `.netrc` file configured with NASA's EDL credentials).
# Getting the links to our data is quite simple with the `data_links()` method on each of the results:

# In[4]:


# "onprem" is a bit misleading: the collection is cloud-hosted; in this case it means the access can be done out of region
granules[0].data_links(access="onprem")


# In[5]:


granules[0].data_links(access="direct")


# ## POC: streaming into xarray
#
# We can use `earthdata` to stream files directly into xarray. This will work well for cases where:
#
# * Data is in NetCDF/HDF5 format.
#   * xarray can read bytes directly from remote datasets only with the **`h5netcdf`** and **`scipy`** back-ends; if we deal with a format that these 2 back-ends won't recognize, xarray will raise an exception.
# * Data is not big data (multi-TB).
#   * Not fully tested with Dask distributed.
# * Data is gridded.
#   * xarray works better with homogeneous coordinates; working with swath data will be cumbersome.
# * Data is chunked using reasonably large sizes (1 MB or more); see the chunk-inspection sketch after this list.
#   * If our files are chunked into small pieces, the access time will be orders of magnitude longer than just downloading the data and accessing it locally.
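# A minimal sketch, not part of the original tutorial: one way to check the "reasonably large chunks"
# criterion above is to inspect a granule we have already downloaded with `h5py`.
# The file name below is hypothetical; replace it with the path to a real NetCDF-4/HDF5 granule.

# In[ ]:


import h5py

with h5py.File("ssh_granule.nc", "r") as f:
    # iterate over the datasets at the file's root group and report their chunk layout
    for name, dset in f.items():
        if isinstance(dset, h5py.Dataset) and dset.chunks:
            chunk_bytes = dset.dtype.itemsize
            for c in dset.chunks:
                chunk_bytes *= c
            print(f"{name}: chunks={dset.chunks} (~{chunk_bytes / 1e6:.2f} MB per chunk)")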
# # Opening a year of SSH data (SEA_SURFACE_HEIGHT_ALT_GRIDS_L4_2SATS_5DAY_6THDEG_V_JPL1812, approx. 1.1 GB) takes about 5 minutes using streaming when we access it out of region (not in AWS)

# In[6]:


get_ipython().run_cell_magic('time', '', 'import xarray as xr\n\ngranules = []\n\n# we just grab 1 granule from May for each year of the dataset\nfor year in range(1990, 2019):\n    granule = DataGranules().concept_id("C2036880672-POCLOUD").temporal(f"{year}-05", f"{year}-06").get(1)\n    if len(granule) > 0:\n        granules.append(granule[0])\n\nds = xr.open_mfdataset(store.open(granules), chunks={})\nds\n')


# ## Plotting

# In[7]:


ds.SLA.where((ds.SLA >= 0) & (ds.SLA < 10)).mean('Time').plot(figsize=(14, 6), x='Longitude', y='Latitude')


# In[8]:


from pyproj import Geod
import numpy as np


def ssl_area(lats):
    """
    Calculate the area associated with a 1/6 by 1/6 degree box at the latitudes specified in 'lats'.

    Parameters
    ==========
    lats: a list or numpy array of size N
        the latitudes of interest.

    Returns
    =======
    out: numpy array of size N
        area values (unit: m^2)
    """
    # Define WGS84 as the reference ellipsoid
    geod = Geod(ellps='WGS84')
    # dx is half the box width: a 1/6 degree box extends 1/12 degree on each side of its center
    dx = 1 / 12.0
    # area of the quadrilateral centered at (0, lat) with corners +/- dx in longitude and latitude
    c_area = lambda lat: geod.polygon_area_perimeter(np.r_[-dx, dx, dx, -dx], lat + np.r_[-dx, -dx, dx, dx])[0]
    out = []
    for lat in lats:
        out.append(c_area(lat))
    return np.array(out)


ssh_area = ssl_area(ds.Latitude.data).reshape(1, -1)
print(ssh_area.shape)


# In[9]:


get_ipython().run_cell_magic('time', '', '\nimport numpy as np\n\ndef global_mean(SLA, **kwargs):\n    # area-weighted mean: SLA/SLA is 1 over valid (ocean) cells and NaN elsewhere, so invalid cells do not contribute to the weights\n    dout = (SLA * ssh_area).sum() / ((SLA / SLA) * ssh_area).sum()\n    return dout\n\nresult = ds.SLA.groupby(\'Time\').apply(global_mean)\n\nresult.plot(color="red", marker="o", figsize=(8, 4))\n')
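# A minimal sketch, not part of the original tutorial: a rough linear trend of the global-mean
# series plotted above, assuming `result` has a datetime64 `Time` coordinate and SLA values in meters.

# In[ ]:


import numpy as np

# elapsed time in (fractional) years since the first granule
t_years = (result.Time - result.Time[0]) / np.timedelta64(1, "D") / 365.25
# slope of a degree-1 least-squares fit, in meters per year
slope_m_per_yr = np.polyfit(np.asarray(t_years), np.asarray(result), 1)[0]
print(f"Approximate global-mean SLA trend: {slope_m_per_yr * 1000:.2f} mm/year")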