from a collection of 2496 six-day zarr datasets, using .to_zarr(zarr_whole, region = {'time': slice(start, stop)})
%%time
import xarray as xr
import fsspec
import pandas as pd
import hvplot.xarray
import numpy as np
CPU times: user 3.53 s, sys: 3.28 s, total: 6.81 s Wall time: 3.62 s
# Destination: the single combined Zarr store that everything is written to.
zarr_whole = '/caldera/projects/usgs/hazards/cmgp/woodshole/rsignell/conus404/conus404_whole.zarr'

# Inputs: the individual Zarr stores, listed through an fsspec 'file'
# filesystem and ordered by path so list position follows the numeric
# suffix of each target_0* directory.
fs = fsspec.filesystem('file')
zlist = list(fs.glob('/caldera/projects/usgs/water/wbeep/conus404_work/test1/target_0*'))
zlist.sort()
len(zlist)
2496
# Open the first input store as dask-backed arrays (chunks={}) and report
# its path and its first timestamp.
first_path = zlist[0]
ds0 = xr.open_dataset(first_path, engine='zarr', chunks={})
print(first_path)
print(ds0.time[0].values)
/caldera/projects/usgs/water/wbeep/conus404_work/test1/target_00000 1979-10-01T00:00:00.000000000
# Open the last input store as dask-backed arrays (chunks={}) and report
# its path and its final timestamp.
last_path = zlist[-1]
ds1 = xr.open_dataset(last_path, engine='zarr', chunks={})
print(last_path)
print(ds1.time[-1].values)
/caldera/projects/usgs/water/wbeep/conus404_work/test1/target_02495 2020-09-30T23:00:00.000000000
# Variables handled as constants: they are excluded from the template and
# from the per-region writes, and are written to the store once on their own.
constant_vars = [
    'DZS', 'HGT', 'ISLTYP', 'IVGTYP', 'LAKEMASK', 'LANDMASK', 'LU_INDEX',
    'MUB', 'P00', 'PB', 'PHB', 'P_TOP', 'SHDMAX', 'SHDMIN', 'SNOALB', 'T00',
    'TISO', 'TLP', 'TSK_FORCE', 'TSK_FORCE_TEND', 'VAR', 'VAR_SSO', 'XLAND',
    'ZETATOP', 'ZS', 'lat', 'lat_u', 'lat_v', 'lon', 'lon_u', 'lon_v',
]

# First dataset without the constant variables; this is the basis for the
# template. `drop_vars` is the replacement for the deprecated `Dataset.drop`.
source_dataset = ds0.drop_vars(constant_vars)

# Hourly timestamps from the first time of the first dataset through the
# last time of the last dataset (both endpoints included).
dates = pd.date_range(start=ds0.time[0].values, end=ds1.time[-1].values, freq='1h')
Use the first dataset as a template, but then extend the time dimension to the full range of dates.
This bit of xarray wizardry is courtesy of Stephan Hoyer:
# Build an all-zeros template that has the variables of `source_dataset`
# but a time dimension covering every timestamp in `dates`.
template = (
source_dataset
# Convert every variable to a dask array (presumably so the steps below
# stay lazy and never load or allocate the real data).
.chunk()
# Same variables, filled with zeros.
.pipe(xr.zeros_like)
# Keep a single time step and drop the now-scalar time coordinate ...
.isel(time=0, drop=True)
# ... then add back a time dimension with one entry per timestamp.
.expand_dims(time=len(dates))
)
# Label the new time dimension with the full date range.
template['time'] = dates
# Chunk length of the time variable as recorded in the first dataset's
# encoding; used below as the template's time chunk size and as the step
# between write regions.
tchunk = ds0.time.encoding['chunks'][0]
tchunk
144
# Re-chunk the template along time with the same chunk length as the inputs.
template = template.chunk({'time':tchunk})
# Start a Dask distributed client with default arguments (this creates a
# local cluster) and display its summary.
from dask.distributed import Client
client = Client()
client
Client-c8edef21-84ff-11ec-8124-000101000011
Connection method: Cluster object | Cluster type: distributed.LocalCluster |
Dashboard: /proxy/8787/status |
d8c0aac0
Dashboard: /proxy/8787/status | Workers: 10 |
Total threads: 80 | Total memory: 187.58 GiB |
Status: running | Using processes: True |
Scheduler-bb656ab7-f699-40f7-8a08-3bf3f41242e9
Comm: tcp://127.0.0.1:39723 | Workers: 10 |
Dashboard: /proxy/8787/status | Total threads: 80 |
Started: Just now | Total memory: 187.58 GiB |
Comm: tcp://127.0.0.1:35755 | Total threads: 8 |
Dashboard: /proxy/8787/status | Memory: 18.76 GiB |
Nanny: tcp://127.0.0.1:38977 | |
Local directory: /home/rsignell/EarthMap/Projects/esip-qhub-notebooks/hytest/dask-worker-space/worker-dqj3u7bp |
Comm: tcp://127.0.0.1:41199 | Total threads: 8 |
Dashboard: /proxy/8787/status | Memory: 18.76 GiB |
Nanny: tcp://127.0.0.1:42135 | |
Local directory: /home/rsignell/EarthMap/Projects/esip-qhub-notebooks/hytest/dask-worker-space/worker-anbpzzwc |
Comm: tcp://127.0.0.1:45795 | Total threads: 8 |
Dashboard: /proxy/8787/status | Memory: 18.76 GiB |
Nanny: tcp://127.0.0.1:41029 | |
Local directory: /home/rsignell/EarthMap/Projects/esip-qhub-notebooks/hytest/dask-worker-space/worker-rp5crlcv |
Comm: tcp://127.0.0.1:38063 | Total threads: 8 |
Dashboard: /proxy/8787/status | Memory: 18.76 GiB |
Nanny: tcp://127.0.0.1:37475 | |
Local directory: /home/rsignell/EarthMap/Projects/esip-qhub-notebooks/hytest/dask-worker-space/worker-9eyta703 |
Comm: tcp://127.0.0.1:33243 | Total threads: 8 |
Dashboard: /proxy/8787/status | Memory: 18.76 GiB |
Nanny: tcp://127.0.0.1:33537 | |
Local directory: /home/rsignell/EarthMap/Projects/esip-qhub-notebooks/hytest/dask-worker-space/worker-kqlovkb_ |
Comm: tcp://127.0.0.1:39229 | Total threads: 8 |
Dashboard: /proxy/8787/status | Memory: 18.76 GiB |
Nanny: tcp://127.0.0.1:33927 | |
Local directory: /home/rsignell/EarthMap/Projects/esip-qhub-notebooks/hytest/dask-worker-space/worker-ialh6scb |
Comm: tcp://127.0.0.1:43829 | Total threads: 8 |
Dashboard: /proxy/8787/status | Memory: 18.76 GiB |
Nanny: tcp://127.0.0.1:40363 | |
Local directory: /home/rsignell/EarthMap/Projects/esip-qhub-notebooks/hytest/dask-worker-space/worker-ndns94p1 |
Comm: tcp://127.0.0.1:32921 | Total threads: 8 |
Dashboard: /proxy/8787/status | Memory: 18.76 GiB |
Nanny: tcp://127.0.0.1:42505 | |
Local directory: /home/rsignell/EarthMap/Projects/esip-qhub-notebooks/hytest/dask-worker-space/worker-2n59tqu1 |
Comm: tcp://127.0.0.1:33935 | Total threads: 8 |
Dashboard: /proxy/8787/status | Memory: 18.76 GiB |
Nanny: tcp://127.0.0.1:42159 | |
Local directory: /home/rsignell/EarthMap/Projects/esip-qhub-notebooks/hytest/dask-worker-space/worker-x74ghjce |
Comm: tcp://127.0.0.1:42341 | Total threads: 8 |
Dashboard: /proxy/8787/status | Memory: 18.76 GiB |
Nanny: tcp://127.0.0.1:40579 | |
Local directory: /home/rsignell/EarthMap/Projects/esip-qhub-notebooks/hytest/dask-worker-space/worker-cyrm4hwt |
%%time
# Create the target store from the template. mode='w' replaces any existing
# store at this path. With compute=False the call returns a dask Delayed
# rather than writing the template's (all-zero) array values; that Delayed
# is not computed here, so this step only lays out the store and metadata.
template.to_zarr(zarr_whole, compute=False, consolidated=True, mode='w')
CPU times: user 1min 14s, sys: 9.45 s, total: 1min 24s Wall time: 1min 25s
Delayed('_finalize_store-37130370-6aa3-4ce3-9d7c-0368a86f54cd')
%%time
# Write the first dataset into the first `tchunk` time steps of the store.
# The constant variables are removed first, so only the variables present in
# the template are written to the region. `drop_vars` is the replacement for
# the deprecated `Dataset.drop`.
ds0.drop_vars(constant_vars).to_zarr(zarr_whole, region={'time': slice(0, tchunk)})
CPU times: user 3.21 s, sys: 1.08 s, total: 4.29 s Wall time: 7.2 s
<xarray.backends.zarr.ZarrStore at 0x7ffb823eeb30>
# Add the constant variables, taken from the first dataset, to the existing
# store (mode='a' appends instead of overwriting).
ds0[constant_vars].to_zarr(zarr_whole, mode='a')
<xarray.backends.zarr.ZarrStore at 0x7ffcf56b5190>
(Start the loop at 1, not 0: the first dataset was already written to the store above.)
%%time
# Write each remaining dataset into its own slot along the time axis:
# dataset i fills time indices [i*tchunk, (i+1)*tchunk).
# Only datasets 1-4 are processed here as a trial run; the commented-out
# line below iterates over the complete list instead.
#for i in range(1, len(zlist)):
for i in range(1, 5):
    start = i * tchunk
    stop = (i + 1) * tchunk
    print(zlist[i])
    dsi = xr.open_dataset(zlist[i], engine='zarr', chunks={})
    # NOTE(review): constant_vars are dropped before the region write of the
    # first dataset but not here -- confirm that these datasets contain only
    # variables that have a time dimension.
    dsi.to_zarr(zarr_whole, region={'time': slice(start, stop)})
/caldera/projects/usgs/water/wbeep/conus404_work/test1/target_00001 /caldera/projects/usgs/water/wbeep/conus404_work/test1/target_00002 /caldera/projects/usgs/water/wbeep/conus404_work/test1/target_00003 /caldera/projects/usgs/water/wbeep/conus404_work/test1/target_00004 CPU times: user 15 s, sys: 1.43 s, total: 16.4 s Wall time: 26.7 s
# Re-open the combined store, reading its consolidated metadata, to inspect
# what was written.
ds = xr.open_dataset(zarr_whole, engine='zarr', consolidated=True)
# Time series of U10 for the first 1000 time steps at index [500, 500] of
# the two trailing dimensions.
ds.U10[:1000,500,500].hvplot(x='time', grid=True)
# Map of U10 at the first time step, drawn against the lon/lat coordinates
# over OSM tiles.
ds.U10[0,:,:].hvplot.quadmesh(x='lon', y='lat', geo=True, tiles='OSM',
cmap='turbo', rasterize=True)