Compare Parquet and Zarr read performance using streamflow data from the National Water Model 2.1
import xarray as xr
import pandas as pd
import fsspec
import hvplot.xarray
import hvplot.pandas
import fsspec
# Anonymous (unsigned) S3 filesystem that talks to a custom S3-compatible
# endpoint instead of the default one. skip_instance_cache=True requests a new
# filesystem instance rather than reusing one fsspec may have cached.
fs = fsspec.filesystem(
    's3',
    anon=True,
    skip_instance_cache=True,
    client_kwargs=dict(endpoint_url='https://mghp.osn.xsede.org'),
)
# Gage identifiers selected for the comparison. The same list is used below as
# the Parquet column subset and as the `gage_id` labels for the Zarr selection,
# so the order here determines the order of the computed means.
sta = [
    'USGS-06430800', 'USGS-06430770', 'USGS-08083230', 'USGS-08083100',
    'USGS-08084000', 'USGS-08083240', 'USGS-08084200', 'USGS-08083480',
    'USGS-08083420', 'USGS-08088000', 'USGS-08089000', 'USGS-08090800',
    'USGS-08090905', 'USGS-08084800', 'USGS-08091500', 'USGS-08092000',
    'USGS-08091000', 'USGS-08093100', 'USGS-08096500', 'USGS-08085500',
    'USGS-08095300', 'USGS-08095400', 'USGS-08086290', 'USGS-08086212',
    'USGS-08086050', 'USGS-08094800', 'USGS-08095200', 'USGS-08095000',
    'USGS-08082500', 'USGS-08082700', 'USGS-08098300', 'USGS-08098290',
    'USGS-08108700', 'USGS-08110200', 'USGS-08110100', 'USGS-08109800',
    'USGS-08109700', 'USGS-08110000', 'USGS-08110430', 'USGS-08110325',
    'USGS-08110500', 'USGS-08110800', 'USGS-08111052', 'USGS-08111054',
    'USGS-08103800', 'USGS-08104100', 'USGS-08103900', 'USGS-08103940',
    'USGS-08104300', 'USGS-08107950', 'USGS-08104500', 'USGS-08106350',
    'USGS-08108250', 'USGS-08106500', 'USGS-08117995', 'USGS-08119500',
    'USGS-08120500', 'USGS-08121000', 'USGS-05503800', 'USGS-05503100',
]
# Location of the Parquet copy of the data on the object store.
parquet_dataset = 's3://rsignellbucket1/testing/parquet/pred.parquet'

# Object size in bytes scaled by 1e6 (i.e. megabytes); shown as the cell result.
fs.size(parquet_dataset) / 1e6
381.264719
%%time
# Read only the selected station columns from the remote Parquet file.
# The file object is opened in a `with` block so the remote handle is closed
# as soon as the read finishes instead of being left open for the session.
with fs.open(parquet_dataset) as parquet_file:
    df_parquet = pd.read_parquet(parquet_file, columns=sta)

# Per-station mean over all rows (one value per selected column).
smean_parquet = df_parquet.mean()
CPU times: user 381 ms, sys: 195 ms, total: 576 ms Wall time: 1.25 s
# Location of the Zarr copy of the data on the object store.
zarr_dataset = 's3://rsignellbucket1/testing/parquet/pred.zarr'

# Total size of everything under the Zarr prefix, in bytes scaled by 1e6
# (i.e. megabytes); shown as the cell result.
fs.du(zarr_dataset) / 1e6
396.731735
%%time
# Open the Zarr store through the fsspec mapper using consolidated metadata;
# chunks={} makes the variables dask-backed so nothing is read until requested.
ds_zarr = xr.open_dataset(
    fs.get_mapper(zarr_dataset),
    engine='zarr',
    chunks={},
    backend_kwargs={'consolidated': True},
)

# Pull only the selected gages into memory, then average each one over time.
da_s = ds_zarr['streamflow'].sel(gage_id=sta).load()
smean_zarr = da_s.mean(dim='time').data  # underlying array of per-gage means
CPU times: user 1.14 s, sys: 182 ms, total: 1.33 s Wall time: 2.89 s
# Display the dataset summary (dimensions, coordinates, data variables, chunking).
ds_zarr
<xarray.Dataset> Dimensions: (gage_id: 1000, time: 367439) Coordinates: elevation (gage_id) float32 dask.array<chunksize=(1000,), meta=np.ndarray> * gage_id (gage_id) <U20 'USGS-06730160' ... 'USGS-05503100' latitude (gage_id) float32 dask.array<chunksize=(1000,), meta=np.ndarray> longitude (gage_id) float32 dask.array<chunksize=(1000,), meta=np.ndarray> order (gage_id) int32 dask.array<chunksize=(1000,), meta=np.ndarray> * time (time) datetime64[ns] 1979-02-01T01:00:00 ... 2020-12-31T23:0... Data variables: streamflow (time, gage_id) float32 dask.array<chunksize=(367439, 1), meta=np.ndarray>
# Inspect the storage encoding carried over from the Zarr store for the
# selected variable (chunk shape, compressor, fill value, dtype).
da_s.encoding
{'chunks': (367439, 1), 'preferred_chunks': {'time': 367439, 'gage_id': 1}, 'compressor': Blosc(cname='zstd', clevel=5, shuffle=NOSHUFFLE, blocksize=0), 'filters': None, '_FillValue': nan, 'dtype': dtype('float32'), 'coordinates': 'elevation order latitude longitude'}
# Spot-check two of the Parquet-derived means by position (entries 5 and 10).
# .iloc is used because the Series is labelled by gage-id strings: plain []
# with integer keys on a non-integer index relies on a positional fallback
# that pandas has deprecated in favour of explicit .iloc.
smean_parquet.iloc[[5, 10]]
gage_id USGS-08083240 2.818946 USGS-08089000 47.232895 dtype: float32
# Same two positions (5 and 10) from the Zarr-derived means, for comparison
# with the Parquet-derived values.
# NOTE(review): both results are float32, so small differences between the two
# are presumably accumulation/rounding effects — confirm if exact agreement matters.
smean_zarr[[5,10]]
array([ 2.8189127, 47.223545 ], dtype=float32)
# Plot the time series for one gage from each source: first the Zarr-backed
# DataArray, then the matching column of the Parquet DataFrame.
# NOTE(review): if these two calls share a single notebook cell, only the last
# expression's plot is rendered — confirm they live in separate cells.
da_s.sel(gage_id='USGS-05503100').hvplot(x='time')
df_parquet['USGS-05503100'].hvplot(x='time')