%matplotlib inline
import matplotlib.pyplot as plt
from datetime import datetime
import os
import subprocess
import requests
import boto3
import s3fs
import pandas as pd
import numpy as np
import xarray as xr
import rasterio as rio
from rasterio.session import AWSSession
from rasterio.plot import show
import rioxarray
import geopandas
import pyproj
from pyproj import Proj
from shapely.ops import transform
import geoviews as gv
from cartopy import crs
import hvplot.xarray
import holoviews as hv
gv.extension('bokeh', 'matplotlib')
from pystac_client import Client
# Get credentials
# NASA Earthdata endpoints that hand out temporary S3 credentials for
# direct in-region (us-west-2) access to each DAAC's cloud archive.
s3_cred_endpoint = {
    'podaac':'https://archive.podaac.earthdata.nasa.gov/s3credentials',
    'lpdaac':'https://data.lpdaac.earthdatacloud.nasa.gov/s3credentials'
}
def get_temp_creds(provider='podaac'):
    """Fetch temporary S3 credentials from the given DAAC.

    Parameters
    ----------
    provider : str, default 'podaac'
        Key into ``s3_cred_endpoint`` ('podaac' or 'lpdaac').

    Returns
    -------
    dict with 'accessKeyId', 'secretAccessKey', 'sessionToken' (and expiry),
    as returned by the credentials endpoint.

    Raises
    ------
    KeyError if *provider* is unknown; requests.HTTPError on a failed request.
    """
    temp_creds_url = s3_cred_endpoint[provider]
    resp = requests.get(temp_creds_url)
    # Fail fast on an auth/network error instead of trying to JSON-decode
    # an HTML error page.
    resp.raise_for_status()
    return resp.json()
temp_creds_req = get_temp_creds()
Search the STAC catalog for files in the desired collection, using the collection id and a date range.
# Open the CMR-STAC catalog for the POCLOUD (PO.DAAC cloud) provider.
podaac_cat = Client.open('https://cmr.earthdata.nasa.gov/stac/POCLOUD/')
# Find every granule of the AVHRR OI L4 SST collection for 2016-2020 (inclusive).
search = podaac_cat.search(
collections=['AVHRR_OI-NCEI-L4-GLOB-v2.1'],
datetime='2016/2020'
)
Here is the number of files matched. The range spans 5 years and contains two leap years (2016 and 2020), so there should be 365*5 + 2 = 1827 daily files.
# Expected count: 5 years of daily files plus 2 leap days (2016 and 2020).
365*5+2
1827
# Granules matched by the STAC search -- should agree with the count above.
search.matched()
1827
These lines get the HTTPS urls and convert them to S3 urls.
# Materialize all matched STAC items and inspect one granule's data-asset URL.
items = search.get_all_items()
sst_https = items[1].get_assets()['data'].href
sst_https
'https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/AVHRR_OI-NCEI-L4-GLOB-v2.1/20160102120000-NCEI-L4_GHRSST-SSTblend-AVHRR_OI-GLOB-v02.0-fv02.1.nc'
# Direct in-region access needs the s3:// form of the same object path.
sst_s3 = sst_https.replace('https://archive.podaac.earthdata.nasa.gov/', 's3://')
sst_s3
's3://podaac-ops-cumulus-protected/AVHRR_OI-NCEI-L4-GLOB-v2.1/20160102120000-NCEI-L4_GHRSST-SSTblend-AVHRR_OI-GLOB-v02.0-fv02.1.nc'
Create the https urls from the items.
# Collect the HTTPS data-asset URL of every matched granule.
sst_https_urls = [x.get_assets()['data'].href for x in items]
sst_https_urls[0]
'https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/AVHRR_OI-NCEI-L4-GLOB-v2.1/20160101120000-NCEI-L4_GHRSST-SSTblend-AVHRR_OI-GLOB-v02.0-fv02.1.nc'
# Rewrite each URL to its s3:// equivalent for direct S3 reads.
sst_s3_urls = [x.replace('https://archive.podaac.earthdata.nasa.gov/', 's3://') for x in sst_https_urls]
sst_s3_urls[0]
's3://podaac-ops-cumulus-protected/AVHRR_OI-NCEI-L4-GLOB-v2.1/20160101120000-NCEI-L4_GHRSST-SSTblend-AVHRR_OI-GLOB-v02.0-fv02.1.nc'
# Should be one URL per day, 2016-2020.
len(sst_s3_urls)
1827
fs_s3 = s3fs.S3FileSystem(anon=False, key=temp_creds_req['accessKeyId'], secret=temp_creds_req['secretAccessKey'], token=temp_creds_req['sessionToken'])
%%time
# Open the first granule directly from S3 and load it lazily with xarray.
s3_file_obj = fs_s3.open(sst_s3_urls[0], mode='rb')
sst_xr = xr.open_dataset(s3_file_obj, engine='h5netcdf')
CPU times: user 112 ms, sys: 8.17 ms, total: 120 ms Wall time: 240 ms
sst_xr
<xarray.Dataset> Dimensions: (lat: 720, lon: 1440, time: 1, nv: 2) Coordinates: * lat (lat) float32 -89.88 -89.62 -89.38 ... 89.38 89.62 89.88 * lon (lon) float32 -179.9 -179.6 -179.4 ... 179.4 179.6 179.9 * time (time) datetime64[ns] 2016-01-01 Dimensions without coordinates: nv Data variables: lat_bnds (lat, nv) float32 -90.0 -89.75 -89.75 ... 89.75 89.75 90.0 lon_bnds (lon, nv) float32 -180.0 -179.8 -179.8 ... 179.8 180.0 analysed_sst (time, lat, lon) float32 ... analysis_error (time, lat, lon) float32 ... mask (time, lat, lon) float32 ... sea_ice_fraction (time, lat, lon) float32 ... Attributes: (12/47) Conventions: CF-1.6, ACDD-1.3 title: NOAA/NCEI 1/4 Degree Daily Optimum Interpolat... id: NCEI-L4LRblend-GLOB-AVHRR_OI references: Reynolds, et al.(2009) What is New in Version... institution: NOAA/NESDIS/NCEI creator_name: NCEI Products and Services ... ... Metadata_Link.: http://doi.org/10.7289/V5SQ8XB5 keywords: Oceans>Ocean Temperature>Sea Surface Temperature keywords_vocabulary: NASA Global Change Master Directory (GCMD) Sc... standard_name_vocabulary: CF Standard Name Table v29 processing_level: L4 cdm_data_type: Grid
array([-89.875, -89.625, -89.375, ..., 89.375, 89.625, 89.875], dtype=float32)
array([-179.875, -179.625, -179.375, ..., 179.375, 179.625, 179.875], dtype=float32)
array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')
array([[-90. , -89.75], [-89.75, -89.5 ], [-89.5 , -89.25], ..., [ 89.25, 89.5 ], [ 89.5 , 89.75], [ 89.75, 90. ]], dtype=float32)
array([[-180. , -179.75], [-179.75, -179.5 ], [-179.5 , -179.25], ..., [ 179.25, 179.5 ], [ 179.5 , 179.75], [ 179.75, 180. ]], dtype=float32)
[1036800 values with dtype=float32]
[1036800 values with dtype=float32]
[1036800 values with dtype=float32]
[1036800 values with dtype=float32]
At roughly 54 ms per file, reading one year of daily files should take about 20 seconds.
# ~54 ms per granule x 365 daily files = seconds needed per year of data.
54*0.001*365
19.71
%%time
# Iterate through remote_files to create a fileset
fileset = [fs_s3.open(file) for file in sst_s3_urls]
# This works
# Concatenate all 1827 daily granules into one dataset along time.
sst_xr_ts = xr.open_mfdataset(fileset, engine='h5netcdf')
# chunks doesn't seem to make it faster
#sst_xr_ts = xr.open_mfdataset(fileset, engine='h5netcdf', chunks= {'time':1096, 'lat':100, 'lon':100})
CPU times: user 2min 7s, sys: 4.95 s, total: 2min 11s Wall time: 5min 45s
#sst_xr_ts.analysed_sst.hvplot.image()
Read in point locations. Has the lat/lon values of the samples along the coast and the corresponding offshore point to compare that point to: basically a point 280km from the nearshore point and perpendicular to the coast. The lat/lon columns are the ones we want.
# Paired sample points: nearshore (.ns) and offshore (.os) lon/lat columns,
# one row per coastal sample location.
df = pd.read_csv('../data/sample_point_pairs_trim.csv')
df.head(1)
x.km.ns | y.km.ns | x.km.os | y.km.os | lon.ns | lat.ns | lon.os | lat.os | |
---|---|---|---|---|---|---|---|---|
0 | -14655.998881 | 6958.759684 | -14782.212703 | 7186.052086 | -165.064195 | 54.796769 | -169.532303 | 56.432301 |
Create the column names by appending the location suffix ('.ns' or '.os') to the base name:
# Quick check of the string concatenation used to build column names below.
'abc' + '.ns'
'abc.ns'
def getdf2(ras, pts, loc="ns"):
    """Sample ``ras.analysed_sst`` at the point set given in ``pts``.

    Parameters
    ----------
    ras : xarray Dataset with an ``analysed_sst`` variable on (time, lat, lon)
    pts : DataFrame with 'lon.<loc>' and 'lat.<loc>' columns
          (e.g. the sample-point-pairs table read above)
    loc : str, default "ns"
        Which point set to sample: "ns" (nearshore) or "os" (offshore).

    Returns
    -------
    DataArray of nearest-neighbour SST time series, with a new 'i'
    dimension holding one entry per row of ``pts``.
    """
    # Bug fix: the original read the module-level `df` instead of the `pts`
    # argument, silently ignoring whatever table the caller passed in.
    ind_x = xr.DataArray(pts["lon." + loc], dims=['i'])
    ind_y = xr.DataArray(pts["lat." + loc], dims=['i'])
    # Sharing the 'i' dimension makes .sel pair ind_x[i] with ind_y[i]
    # (vectorized pointwise selection rather than an outer product).
    xr_new = ras.analysed_sst.sel(lon=ind_x, lat=ind_y, method='nearest')
    return xr_new
# Sample the SST time series at the nearshore and offshore point sets.
sst_ns = getdf2(sst_xr_ts, df, loc="ns")
sst_os = getdf2(sst_xr_ts, df, loc="os")
# Offshore minus nearshore: positive where the offshore point is warmer.
sst_dif = sst_os - sst_ns
# Cache the DataArrays in IPython's store so other notebooks can reuse them.
%store sst_ns
%store sst_os
%store sst_dif
Stored 'sst_ns' (DataArray) Stored 'sst_os' (DataArray) Stored 'sst_dif' (DataArray)
%%time
# Binary upwelling index: 1 where offshore is more than 2 degrees warmer
# than nearshore, else 0. .compute() forces evaluation of the lazy dask graph.
upwelling_index = xr.where(sst_dif > 2, 1, 0).compute()
CPU times: user 49.9 s, sys: 830 ms, total: 50.7 s Wall time: 41.5 s
upwelling_index.hvplot(colorbar=False)
sst_dif.hvplot(cmap="Spectral")
import geopandas
%matplotlib inline
# Build point geometries from the nearshore lon/lat columns.
gdf = geopandas.GeoDataFrame(df, geometry=geopandas.points_from_xy(df['lon.ns'], df['lat.ns']))
# Declare the coordinates as plain WGS84 lon/lat.
gdf = gdf.set_crs("+proj=longlat +datum=WGS84 +no_defs", allow_override=True)
gdf.plot(color='red')
<AxesSubplot:>
Step 1. Create a GeoDataFrame with a geometry column that holds the points in the Winkel Tripel CRS.
# Use the row index as a label for each point.
gdf['name']=gdf.index
# Label every 10th point so the plot stays readable.
# Generalized from the hard-coded np.arange(0, 567, 10): derive the stop from
# the data so this works whatever the number of sample points.
gdf_sub = gdf.iloc[np.arange(0, len(gdf), 10)]
ax = gdf.plot(figsize=(10,8), markersize=1)
# Annotate each subsampled point, nudged 3pt up-right of its marker.
for x, y, label in zip(gdf_sub.geometry.x, gdf_sub.geometry.y, gdf_sub.name):
    ax.annotate(label, xy=(x, y), xytext=(3, 3), textcoords="offset points")