See http://sentinel-1-global-coherence-earthbigdata.s3-website-us-west-2.amazonaws.com/ for a description of the data, its format and layout. The dataset is made of millions of GeoTIFF files.
# Data access (fsspec/intake), codec registration (imagecodecs), array handling
# (numpy/xarray) and visualisation (holoviews/geoviews/hvplot/panel) imports.
import fsspec
import geoviews as gv
import imagecodecs.numcodecs
import hvplot.xarray
import holoviews as hv
import numpy as np
import panel as pn
import param
import intake
from tqdm import tqdm
import xarray as xr
import itertools
import math
imagecodecs.numcodecs.register_codecs()  # register the TIFF codec so zarr can decode the GeoTIFF chunks
pn.extension()  # initialise Panel's notebook/visualisation machinery
import warnings
# Silence library warnings for a cleaner notebook display.
warnings.filterwarnings("ignore")
# Kerchunk reference file: a JSON index that presents the millions of remote
# GeoTIFFs as a single virtual zarr store (no data is downloaded here).
zarr_all_url='https://sentinel-1-global-coherence-earthbigdata.s3.us-west-2.amazonaws.com/data/wrappers/zarr-all.json'
mapper = fsspec.get_mapper(
    'reference://',
    fo=zarr_all_url,
    target_protocol='http',
)
# Open lazily through the zarr engine; the reference store has no
# consolidated metadata, hence consolidated=False.
dataset = xr.open_dataset(
    mapper, engine='zarr', backend_kwargs={'consolidated': False}
)
dataset  # notebook cell output: display the dataset repr
<xarray.Dataset> Dimensions: (season: 4, polarization: 4, latitude: 193200, longitude: 432000, coherence: 6, flightdirection: 2, orbit: 175) Coordinates: * coherence (coherence) float32 6.0 12.0 18.0 24.0 36.0 48.0 * flightdirection (flightdirection) object 'A' 'D' * latitude (latitude) float32 82.0 82.0 82.0 ... -79.0 -79.0 -79.0 * longitude (longitude) float32 -180.0 -180.0 -180.0 ... 180.0 180.0 * orbit (orbit) float64 1.0 2.0 3.0 4.0 ... 172.0 173.0 174.0 175.0 * polarization (polarization) object 'vv' 'vh' 'hv' 'hh' * season (season) object 'winter' 'spring' 'summer' 'fall' Data variables: AMP (season, polarization, latitude, longitude) float32 ... COH (season, polarization, coherence, latitude, longitude) float32 ... inc (orbit, flightdirection, latitude, longitude) float32 ... lsmap (orbit, flightdirection, latitude, longitude) float32 ... rho (season, polarization, latitude, longitude) float32 ... rmse (season, polarization, latitude, longitude) float32 ... tau (season, polarization, latitude, longitude) float32 ...
array([ 6., 12., 18., 24., 36., 48.], dtype=float32)
array(['A', 'D'], dtype=object)
array([ 81.99958, 81.99875, 81.99792, ..., -78.99792, -78.99875, -78.99958], dtype=float32)
array([-179.99959, -179.99875, -179.99791, ..., 179.99791, 179.99875, 179.99959], dtype=float32)
array([ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., 66., 67., 68., 69., 70., 71., 72., 73., 74., 75., 76., 77., 78., 79., 80., 81., 82., 83., 84., 85., 86., 87., 88., 89., 90., 91., 92., 93., 94., 95., 96., 97., 98., 99., 100., 101., 102., 103., 104., 105., 106., 107., 108., 109., 110., 111., 112., 113., 114., 115., 116., 117., 118., 119., 120., 121., 122., 123., 124., 125., 126., 127., 128., 129., 130., 131., 132., 133., 134., 135., 136., 137., 138., 139., 140., 141., 142., 143., 144., 145., 146., 147., 148., 149., 150., 151., 152., 153., 154., 155., 156., 157., 158., 159., 160., 161., 162., 163., 164., 165., 166., 167., 168., 169., 170., 171., 172., 173., 174., 175.])
array(['vv', 'vh', 'hv', 'hh'], dtype=object)
array(['winter', 'spring', 'summer', 'fall'], dtype=object)
[1335398400000 values with dtype=float32]
[8012390400000 values with dtype=float32]
[29211840000000 values with dtype=float32]
[29211840000000 values with dtype=float32]
[1335398400000 values with dtype=float32]
[1335398400000 values with dtype=float32]
[1335398400000 values with dtype=float32]
dataset.nbytes / 2**40 # effective in-memory size, in TB
261.125622730382
Much of the dimension space is empty: it contains no data files. These areas would return nothing but NaNs if we tried to extract the data. To make exploration possible, we create a view of the whole dataset showing where data files do exist. We do this at a much-downsampled resolution, because the process is quite slow.
# Build a heavily downsampled "availability mask" skeleton: one NaN-filled
# DataArray per variable, sharing the variable's non-spatial coordinates but
# replacing latitude/longitude with a coarse STEP-degree grid. The values are
# filled in by the scan below.
STEP = 5
das = {}
new_coords = {}
coarse_lat = np.arange(89.5, -90.5, -STEP)   # descending, like the dataset
coarse_lon = np.arange(-179.5, 180.5, STEP)
for var in dataset.data_vars:
    coords = {name: values.values for name, values in dataset[var].coords.items()}
    coords['latitude'] = coarse_lat
    coords['longitude'] = coarse_lon
    new_coords[var] = coords
    das[var] = xr.DataArray(data=np.nan, dims=list(coords), coords=coords)
mask_ds = xr.Dataset(das)
# Set of every chunk key actually present in the reference store; membership
# tests against this tell us whether a data file exists for a given chunk.
zkeys = set(mapper)
def find_nearest_idx(array, value):
    """Return the index of the element of ``array`` nearest to ``value``.

    ``array`` must be a 1-D, monotonically sorted numpy array; both ascending
    and descending orders are supported (the latitudes here are descending).
    Out-of-range values clamp to the first/last index.
    """
    if array[1] > array[0]:  # ascending order
        idx = np.searchsorted(array, value, side="left")
        # searchsorted gives the insertion point; the nearest element is either
        # idx or idx-1 — keep whichever is closer (clamping when idx == len).
        if idx > 0 and (idx == len(array) or math.fabs(value - array[idx - 1]) < math.fabs(value - array[idx])):
            idx = idx - 1
    else:  # descending order: search the reversed (ascending) view
        idx = array.size - np.searchsorted(array[::-1], value, side="right")
        # Same neighbour test as the ascending branch. The previous code
        # compared array[idx] with array[idx+1], which both selected the wrong
        # neighbour and could index one past the end of the array.
        if idx > 0 and (idx == len(array) or math.fabs(value - array[idx - 1]) < math.fabs(value - array[idx])):
            idx = idx - 1
    return idx
def zarr_key(variable: xr.DataArray, coords: dict, chunks: dict, indexes: dict) -> str:
    """Return the zarr store key ("var/i.j.k...") of the chunk holding ``coords``.

    For object-dtype dimensions (e.g. season, polarization) the coordinate is
    matched exactly; for numeric dimensions the nearest index is used. Each
    position is then divided by the dimension's chunk size.
    """
    chunk_ids = []
    for dim in variable.dims:
        vals = indexes[dim]
        if vals.dtype == "O":
            pos = list(vals).index(coords[dim])
        else:
            pos = find_nearest_idx(vals.values, coords[dim])
        chunk_ids.append(pos // chunks[dim])
    return f"{variable.name}/" + ".".join(map(str, chunk_ids))
#%%time
# Chunk sizes of the remote store for the spatial dimensions; all other
# dimensions are chunked with size 1.
chunks = {"latitude": 1200, "longitude": 1200}
for var in mask_ds.data_vars:
    print(f'Processing {var}...')
    # NOTE(review): `chunks` is rebuilt from itself on every iteration. This
    # works because latitude/longitude appear in every variable and so are
    # carried forward, but a separate per-variable name would be clearer.
    chunks = {dim: chunks.get(dim, 1) for i, dim in enumerate(dataset[var].dims)}
    # chunks = {dim:dataset[var].chunks[i][0] for i, dim in enumerate(dataset[var].dims)}
    indexes = {dim: dataset[var].indexes[dim] for dim in dataset[var].dims}
    total = mask_ds[var].size
    # Flat float32 buffer; filled with 0/1 existence flags below, then reshaped.
    mask = np.full(total, np.nan, dtype=np.float32)
    # Walk every point of the coarse grid and test whether the zarr chunk that
    # would contain it actually exists in the reference store (zkeys).
    for i, coords in enumerate(itertools.product(*(new_coords[var].values()))):
        coords = dict(zip(new_coords[var].keys(), coords))
        zkey = zarr_key(dataset[var], coords, chunks, indexes)
        mask[i] = zkey in zkeys
    mask = mask.reshape(mask_ds[var].shape)
    # 1 where a data file exists, NaN elsewhere (NaN renders transparent).
    mask = np.where(mask == 0, np.nan, 1)
    mask_ds[var].values = mask
print("done")
Processing AMP... Processing COH... Processing inc... Processing lsmap... Processing rho... Processing rmse... Processing tau... done
We use a custom viz tool to be able to navigate the data space. As coded here, this will open in a separate browser tab. We may generalise this tool for other datasets in the future.
# NOTE(review): raising image_rtol looks intended to make HoloViews accept the
# slightly non-uniform lat/lon spacing of the extracted tiles — confirm.
hv.config.image_rtol = 0.01
class ZarrExplorer(param.Parameterized):
    """Interactive explorer: a downsampled global availability map (click to
    pick a location) plus a full-resolution local map of the clicked area."""

    # Width/height, in degrees, of the local window extracted around a click.
    local_map_extent = param.Number(default=0.2)
    variable = param.Selector(doc='Dataset Variable', default='COH', objects=list(mask_ds.data_vars))
    # Single-tap stream on the global map; initial position x=-40, y=70.
    stream_tap_global = param.ClassSelector(hv.streams.SingleTap, hv.streams.SingleTap(x=-40, y=70), precedence=-1)
    update_localmap = param.Action(lambda x: x.param.trigger('update_localmap'), label='Click to load data after panning/zooming')

    def __init__(self, **params):
        super().__init__(**params)
        self.global_map()  # build the global map (sets self.img_dm)
        self.lm = pn.pane.HoloViews(None, linked_axes=False)
        self.stream_rng = hv.streams.RangeXY()  # tracks pan/zoom of the local map
        self.x_range, self.y_range = None, None
        self.update_local_map_after_map_click()  # initialise ranges from the default tap

    @param.depends('variable')
    def global_map(self):
        """Render the availability mask for the selected variable, with coastlines
        and the click marker overlaid."""
        ds = hv.Dataset(mask_ds[self.variable])
        self.img_dm = ds.to(gv.QuadMesh, kdims=['longitude', 'latitude'], dynamic=True).opts()
        self.img_dm.cache_size = 1  # No cache so that last_key returns the current widgets state
        overlay = self.img_dm * gv.feature.coastline.opts(scale='50m')
        self.stream_tap_global.source = self.img_dm  # Attach the tap stream to this map
        overlay = overlay * self.withregion()
        return pn.panel(overlay.opts(width=600, height=500), widget_location='left')

    def withregion(self):
        """Red '+' marker that follows the last tap position on the global map."""
        def make_point(x, y):
            return gv.Points([(x, y)]).opts(color='red', marker='+', size=20)
        return hv.DynamicMap(make_point, streams=dict(x=self.stream_tap_global.param.x, y=self.stream_tap_global.param.y))

    @param.depends('stream_tap_global.x', 'stream_tap_global.y', watch=True)
    def update_local_map_after_map_click(self):
        # Centre a local_map_extent-sized window on the clicked point.
        x, y = self.stream_tap_global.x, self.stream_tap_global.y
        half_lme = self.local_map_extent / 2
        self.x_range = (x-half_lme, x+half_lme)
        self.y_range = (y+half_lme, y-half_lme)  # latitude is stored descending, so slice high-to-low

    @param.depends('update_localmap', watch=True)
    def update_local_map_after_refresh(self):
        # Reuse the local map's current pan/zoom ranges when the button is pressed.
        y0, y1 = self.stream_rng.y_range
        self.x_range = self.stream_rng.x_range
        self.y_range = (y1, y0)  # latitude is stored descending, so slice high-to-low

    @param.depends('update_local_map_after_map_click', 'update_local_map_after_refresh')
    def local_map(self):
        """Extract full-resolution data for the current window and rasterize it."""
        # last_key reflects the global map's dimension-widget state
        # (season, polarization, ...); fall back to first values before any render.
        if self.img_dm.last_key:
            state = {kdim.name: val for kdim, val in zip(self.img_dm.kdims, self.img_dm.last_key)}
        else:
            state = {kdim.name: kdim.values[0] for kdim in self.img_dm.kdims}
        dssub = dataset[self.variable].sel(latitude=slice(*self.y_range), longitude=slice(*self.x_range), **state)
        title = f'{self.variable} @' + ', '.join(f'{dim}: {val}' for dim, val in state.items())
        img = dssub.hvplot.image(
            x="longitude", y="latitude",
            cmap='spectral_r', frame_width=400, geo=True,
            rasterize=True,
            title=title,
            shared_axes=False,
            fields={'season': {'default': 'fall'}}
        )
        self.stream_rng.source = img  # re-attach range stream to the fresh plot
        return img
ze = ZarrExplorer()
# Lay out the variable selector, the global availability map, and the
# full-resolution local map with its refresh button.
app = pn.Column(
    pn.Param(ze.param.variable, width=150),
    pn.Row(
        ze.global_map,
        pn.Column(
            pn.panel(ze.local_map, loading_indicator=True),
            ze.param.update_localmap
        ),
    ),
)
#app.show()
app.servable('Global Coherence Explorer')