# Name of the dataset explored in this notebook; it is interpolated into the
# S3 path of the parquet store further down.
dataset_name = "mooring_hourly_timeseries_delayed_qc"
# only run once, then restart session if needed
!pip install uv
import os
import sys
def is_colab():
    """Report whether the ``google.colab`` package can be imported.

    A successful import is taken as the sign that the notebook is running on
    Google Colab.

    Returns:
        bool: True if ``google.colab`` is importable, False otherwise.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        return True
# Install the notebook's dependencies with uv. A requirements.txt sitting next
# to the notebook takes precedence over the copy published on GitHub.
import subprocess


def _run_uv(*args):
    """Run ``uv`` with the given arguments and return its exit status.

    The command is passed as an argument list rather than a shell string, so a
    requirements path containing spaces or shell metacharacters reaches uv
    unchanged. Failures are reported but never raised, so the installation
    stays best effort and the remaining cells can still run.
    """
    command = ['uv', *args]
    try:
        status = subprocess.run(command, check=False).returncode
    except OSError as err:  # e.g. the uv executable is not on PATH
        print(f"Could not run {' '.join(command)}: {err}")
        return 1
    if status != 0:
        print(f"{' '.join(command)} exited with status {status}")
    return status


# Get the current directory of the notebook
current_dir = os.getcwd()

# Check if requirements.txt exists in the current directory
local_requirements = os.path.join(current_dir, 'requirements.txt')
if os.path.exists(local_requirements):
    requirements_path = local_requirements
else:
    # Fall back to the online requirements.txt file
    requirements_path = 'https://raw.githubusercontent.com/aodn/aodn_cloud_optimised/main/notebooks/requirements.txt'

# Install packages using uv and the determined requirements file
if is_colab():
    # On Colab, install with uv's --system flag instead of creating a venv.
    _run_uv('pip', 'install', '--system', '-r', requirements_path)
else:
    _run_uv('venv')
    _run_uv('pip', 'install', '-r', requirements_path)
Requirement already satisfied: uv in /home/lbesnard/miniforge3/envs/AodnCloudOptimised/lib/python3.12/site-packages (0.4.18)
Using CPython 3.12.6 interpreter at: /home/lbesnard/miniforge3/envs/AodnCloudOptimised/bin/python Creating virtual environment at: .venv Activate with: source .venv/bin/activate Audited 230 packages in 45ms
import requests
# Fetch the query helper module once; later runs reuse the local copy.
if not os.path.exists('parquet_queries.py'):
    print('Downloading parquet_queries.py')
    url = 'https://raw.githubusercontent.com/aodn/aodn_cloud_optimised/main/aodn_cloud_optimised/lib/ParquetDataQuery.py'
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    # Stop on an HTTP error: otherwise the error page would be saved as
    # parquet_queries.py and, because the file then exists, never replaced.
    response.raise_for_status()
    # Explicit encoding so the write does not depend on the platform default.
    with open('parquet_queries.py', 'w', encoding='utf-8') as f:
        f.write(response.text)
from parquet_queries import create_time_filter, create_bbox_filter, query_unique_value, plot_spatial_extent, \
get_temporal_extent, get_schema_metadata, plot_ts_diagram
import pyarrow.parquet as pq
import pyarrow.dataset as pds
import pandas as pd
import pyarrow.compute as pc
# Bucket name used to build the dataset's S3 location
BUCKET_OPTIMISED_DEFAULT = "aodn-cloud-optimised"

# S3 URI of the parquet store for the selected dataset
dname = f"s3://anonymous@{BUCKET_OPTIMISED_DEFAULT}/{dataset_name}.parquet/"

# Open the store with hive-style partitioning
parquet_ds = pq.ParquetDataset(dname, partitioning="hive")
Partitioning in Parquet involves organising data files based on the values of one or more columns, known as partition keys. When data is written to Parquet files with partitioning enabled, the files are physically stored in a directory structure that reflects the partition keys. This directory structure makes it easier to retrieve and process specific subsets of data based on the partition keys.
# Open the same location as a pyarrow dataset and print the schema of its
# hive partitioning, i.e. the partition keys and their types.
dataset = pds.dataset(dname, format="parquet", partitioning="hive")
partition_keys = dataset.partitioning.schema
print(partition_keys)
site_code: string timestamp: int32 polygon: string
%%time
# Distinct values of the site_code partition key
unique_partition_value = query_unique_value(parquet_ds, 'site_code')
print(list(unique_partition_value)[:2])  # showing a subset only
['PIL100', 'ITFTIS'] CPU times: user 6.83 ms, sys: 0 ns, total: 6.83 ms Wall time: 6.34 ms
In this section, we plot the polygons where data exists. This makes it easier to choose a bounding box that actually contains data.
# Helper imported from parquet_queries; presumably draws the dataset's polygon
# partitions on a map -- see that module for details.
plot_spatial_extent(parquet_ds)
/home/lbesnard/miniforge3/envs/AodnCloudOptimised/lib/python3.12/site-packages/cartopy/mpl/feature_artist.py:144: UserWarning: facecolor will have no effect as it has been defined as "never". warnings.warn('facecolor will have no effect as it has been ' /home/lbesnard/github_repo/aodn_cloud_optimised/notebooks/parquet_queries.py:449: UserWarning: Legend does not support handles for PatchCollection instances. See: https://matplotlib.org/stable/tutorials/intermediate/legend_guide.html#implementing-a-custom-legend-handler ax.legend() /home/lbesnard/github_repo/aodn_cloud_optimised/notebooks/parquet_queries.py:449: UserWarning: No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. ax.legend()
Similarly to the spatial extent, we retrieve the minimum and maximum timestamp partition values of the dataset. These are not necessarily an accurate representation of the TIME values, as the timestamp partition can be yearly, monthly, etc., but they give an idea of the period covered.
# Helper imported from parquet_queries; the bare call lets the notebook display
# the returned value.
get_temporal_extent(parquet_ds)
(datetime.datetime(2007, 7, 1, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 1, 0, 0, tzinfo=datetime.timezone.utc))
For every parquet dataset, we create a sidecar file named _common_metadata in the root of the dataset. It contains the variable attributes.
# Alternative kept for reference: read the sidecar schema file directly.
# parquet_meta = pa.parquet.read_schema(os.path.join(dname + '_common_metadata')) # parquet metadata
metadata = get_schema_metadata(dname) # schema metadata
# Bare expression so the notebook displays the metadata
metadata
{'instrument_index': {'type': 'int32', 'long_name': 'which instrument this obs is for', 'instance_dimension': 'INSTRUMENT'}, 'instrument_id': {'type': 'string', 'long_name': 'source deployment code, instrument make, model, serial_number'}, 'source_file': {'type': 'string', 'long_name': 'source file for this instrument', 'comment': 'This variable lists the relative path of each input file. To obtain a download URL for a file, append its path to the download_url_prefix attribute. To interact with the file remotely via the OPENDAP protocol, append its path to the opendap_url_prefix attribute.', 'download_url_prefix': 'https://s3-ap-southeast-2.amazonaws.com/imos-data/', 'opendap_url_prefix': 'http://thredds.aodn.org.au/thredds/dodsC/'}, 'TIME': {'type': 'timestamp[ns]', 'axis': 'T', 'comment': 'time stamp corresponds to the hour and represents binned data [30,30) minutes before and after the hour', 'long_name': 'time', 'standard_name': 'time', 'valid_max': 90000.0, 'valid_min': 0.0}, 'LONGITUDE': {'type': 'double', 'axis': 'X', 'long_name': 'longitude', 'reference_datum': 'WGS84 geographic coordinate system', 'standard_name': 'longitude', 'units': 'degrees_east', 'valid_max': 180.0, 'valid_min': -180.0}, 'LATITUDE': {'type': 'double', 'axis': 'Y', 'long_name': 'latitude', 'reference_datum': 'WGS84 geographic coordinate system', 'standard_name': 'latitude', 'units': 'degrees_north', 'valid_max': 90.0, 'valid_min': -90.0}, 'NOMINAL_DEPTH': {'type': 'float', 'axis': 'Z', 'long_name': 'nominal depth', 'positive': 'down', 'reference_datum': 'sea surface', 'standard_name': 'depth', 'units': 'm', 'valid_max': 12000.0, 'valid_min': -5.0}, 'DEPTH': {'type': 'float', 'ancillary_variables': 'DEPTH_min DEPTH_max DEPTH_std DEPTH_count', 'long_name': 'mean actual depth', 'positive': 'down', 'reference_datum': 'sea surface', 'standard_name': 'depth', 'units': 'm', 'valid_max': 12000.0, 'valid_min': -5.0, 'cell_methods': 'TIME:mean (interval: 1 hr comment: time mid point)'}, 
'DEPTH_count': {'type': 'float', 'standard_name': 'depth number_of_observations', 'units': '1', 'long_name': 'count data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:count'}, 'DEPTH_min': {'type': 'float', 'units': 'm', 'standard_name': 'depth', 'long_name': 'min data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:min'}, 'DEPTH_max': {'type': 'float', 'units': 'm', 'standard_name': 'depth', 'long_name': 'max data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:max'}, 'DEPTH_std': {'type': 'float', 'units': 'm', 'standard_name': 'depth', 'long_name': 'std data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:std'}, 'PRES': {'type': 'float', 'ancillary_variables': 'PRES_min PRES_max PRES_std PRES_count', 'long_name': 'mean sea_water_pressure_due_to_sea_water', 'standard_name': 'sea_water_pressure_due_to_sea_water', 'units': 'dbar', 'valid_max': 12000.0, 'valid_min': -15.0, 'cell_methods': 'TIME:mean (interval: 1 hr comment: time mid point)'}, 'PRES_REL': {'type': 'float', 'ancillary_variables': 'PRES_REL_min PRES_REL_max PRES_REL_std PRES_REL_count', 'long_name': 'mean sea_water_pressure_due_to_sea_water', 'standard_name': 'sea_water_pressure_due_to_sea_water', 'units': 'dbar', 'valid_max': 12000.0, 'valid_min': -15.0, 'applied_offset_by_instrument': [-10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219, -10.132499694824219], 'cell_methods': 'TIME:mean (interval: 1 hr comment: time mid point)'}, 
'PRES_REL_count': {'type': 'float', 'standard_name': 'sea_water_pressure_due_to_sea_water number_of_observations', 'units': '1', 'long_name': 'count data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:count'}, 'PRES_REL_max': {'type': 'float', 'units': 'dbar', 'standard_name': 'sea_water_pressure_due_to_sea_water', 'long_name': 'max data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:max'}, 'PRES_REL_min': {'type': 'float', 'units': 'dbar', 'standard_name': 'sea_water_pressure_due_to_sea_water', 'long_name': 'min data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:min'}, 'PRES_REL_std': {'type': 'float', 'units': 'dbar', 'standard_name': 'sea_water_pressure_due_to_sea_water', 'long_name': 'std data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:std'}, 'PRES_count': {'type': 'float', 'standard_name': 'sea_water_pressure_due_to_sea_water number_of_observations', 'units': '1', 'long_name': 'count data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:count'}, 'PRES_max': {'type': 'float', 'units': 'dbar', 'standard_name': 'sea_water_pressure_due_to_sea_water', 'long_name': 'max data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:max'}, 'PRES_min': {'type': 'float', 'units': 'dbar', 'standard_name': 'sea_water_pressure_due_to_sea_water', 'long_name': 'min data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:min'}, 'PRES_std': {'type': 'float', 'units': 'dbar', 'standard_name': 'sea_water_pressure_due_to_sea_water', 'long_name': 'std data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:std'}, 'TEMP': {'type': 'float', 'ancillary_variables': 'TEMP_min TEMP_max TEMP_std TEMP_count', 'long_name': 'mean sea_water_temperature', 'standard_name': 'sea_water_temperature', 'units': 'degrees_Celsius', 'valid_max': 40.0, 'valid_min': -2.5, 'cell_methods': 'TIME:mean 
(interval: 1 hr comment: time mid point)'}, 'TEMP_count': {'type': 'float', 'standard_name': 'sea_water_temperature number_of_observations', 'units': '1', 'long_name': 'count data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:count'}, 'TEMP_max': {'type': 'float', 'units': 'degrees_Celsius', 'standard_name': 'sea_water_temperature', 'long_name': 'max data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:max'}, 'TEMP_min': {'type': 'float', 'units': 'degrees_Celsius', 'standard_name': 'sea_water_temperature', 'long_name': 'min data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:min'}, 'TEMP_std': {'type': 'float', 'units': 'degrees_Celsius', 'standard_name': 'sea_water_temperature', 'long_name': 'std data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:std'}, 'PSAL': {'type': 'float', 'ancillary_variables': 'PSAL_min PSAL_max PSAL_std PSAL_count', 'long_name': 'mean sea_water_practical_salinity', 'standard_name': 'sea_water_practical_salinity', 'units': 'S m-1', 'valid_max': 41.0, 'valid_min': 2.0, 'cell_methods': 'TIME:mean (interval: 1 hr comment: time mid point)'}, 'PSAL_count': {'type': 'float', 'standard_name': 'sea_water_practical_salinity number_of_observations', 'units': '1', 'long_name': 'count data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:count'}, 'PSAL_max': {'type': 'float', 'units': 'S m-1', 'standard_name': 'sea_water_practical_salinity', 'long_name': 'max data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:max'}, 'PSAL_min': {'type': 'float', 'units': 'S m-1', 'standard_name': 'sea_water_practical_salinity', 'long_name': 'min data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:min'}, 'PSAL_std': {'type': 'float', 'units': 'S m-1', 'standard_name': 'sea_water_practical_salinity', 'long_name': 'std data value in the bin, after rejection of flagged data', 
'cell_methods': 'TIME:std'}, 'TURB': {'type': 'float', 'ancillary_variables': 'TURB_min TURB_max TURB_std TURB_count', 'long_name': 'median sea_water_turbidity', 'standard_name': 'sea_water_turbidity', 'units': '1', 'valid_max': 1000.0, 'valid_min': 0.0, 'cell_methods': 'TIME:median (interval: 1 hr comment: time mid point)'}, 'TURB_count': {'type': 'float', 'standard_name': 'sea_water_turbidity number_of_observations', 'units': '1', 'long_name': 'count data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:count'}, 'TURB_max': {'type': 'float', 'units': '1', 'standard_name': 'sea_water_turbidity', 'long_name': 'max data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:max'}, 'TURB_min': {'type': 'float', 'units': '1', 'standard_name': 'sea_water_turbidity', 'long_name': 'min data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:min'}, 'TURB_std': {'type': 'float', 'units': '1', 'standard_name': 'sea_water_turbidity', 'long_name': 'std data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:std'}, 'CHLF': {'type': 'float', 'ancillary_variables': 'CHLF_min CHLF_max CHLF_std CHLF_count', 'comment': 'Artificial chlorophyll data', 'long_name': 'median mass_concentration_of_inferred_chlorophyll_from_relative_fluorescence_units_in_sea_water', 'units': 'mg m-3', 'valid_max': 100.0, 'valid_min': 0.0, 'cell_methods': 'TIME:median (interval: 1 hr comment: time mid point)'}, 'CHLF_count': {'type': 'float', 'units': '1', 'long_name': 'count data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:count'}, 'CHLF_max': {'type': 'float', 'units': 'mg m-3', 'long_name': 'max data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:max'}, 'CHLF_min': {'type': 'float', 'units': 'mg m-3', 'long_name': 'min data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:min'}, 'CHLF_std': {'type': 'float', 'units': 'mg m-3', 
'long_name': 'std data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:std'}, 'CHLU': {'type': 'float', 'comment': 'Artificial chlorophyll data', 'long_name': 'median mass_concentration_of_inferred_chlorophyll_from_relative_fluorescence_units_in_sea_water', 'units': 'mg m-3', 'valid_min': 0.0, 'valid_max': 100.0, 'ancillary_variables': 'CHLU_min CHLU_max CHLU_std CHLU_count', 'cell_methods': 'TIME:median (interval: 1 hr comment: time mid point)'}, 'CHLU_count': {'type': 'float', 'units': '1', 'long_name': 'count data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:count'}, 'CHLU_max': {'type': 'float', 'units': 'mg m-3', 'long_name': 'max data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:max'}, 'CHLU_min': {'type': 'float', 'units': 'mg m-3', 'long_name': 'min data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:min'}, 'CHLU_std': {'type': 'float', 'units': 'mg m-3', 'long_name': 'std data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:std'}, 'CPHL': {'type': 'float', 'comment': 'Artificial chlorophyll data computed from bio-optical sensor raw counts measurements.', 'long_name': 'median mass_concentration_of_inferred_chlorophyll_from_relative_fluorescence_units_in_sea_water', 'units': 'mg m-3', 'valid_min': 0.0, 'valid_max': 100.0, 'ancillary_variables': 'CPHL_min CPHL_max CPHL_std CPHL_count', 'cell_methods': 'TIME:median (interval: 1 hr comment: time mid point)'}, 'CPHL_count': {'type': 'float', 'units': '1', 'long_name': 'count data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:count'}, 'CPHL_max': {'type': 'float', 'units': 'mg m-3', 'long_name': 'max data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:max'}, 'CPHL_min': {'type': 'float', 'units': 'mg m-3', 'long_name': 'min data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:min'}, 'CPHL_std': 
{'type': 'float', 'units': 'mg m-3', 'long_name': 'std data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:std'}, 'DOX': {'type': 'float', 'ancillary_variables': 'DOX_min DOX_max DOX_std DOX_count', 'long_name': 'mean volume_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'standard_name': 'volume_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'units': 'ml l-1', 'valid_max': 200.0, 'valid_min': 0.0, 'cell_methods': 'TIME:mean (interval: 1 hr comment: time mid point)'}, 'DOX_min': {'type': 'float', 'units': 'ml l-1', 'standard_name': 'volume_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'long_name': 'min data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:min'}, 'DOX_max': {'type': 'float', 'units': 'ml l-1', 'standard_name': 'volume_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'long_name': 'max data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:max'}, 'DOX_std': {'type': 'float', 'units': 'ml l-1', 'standard_name': 'volume_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'long_name': 'std data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:std'}, 'DOX_count': {'type': 'float', 'standard_name': 'volume_concentration_of_dissolved_molecular_oxygen_in_sea_water number_of_observations', 'units': '1', 'long_name': 'count data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:count'}, 'DOX1': {'type': 'float', 'ancillary_variables': 'DOX1_min DOX1_max DOX1_std DOX1_count', 'comment': 'oxygenPP.m: DOX1 derived using DOX1 = DOX * 44.6596.', 'long_name': 'mean mole_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'standard_name': 'mole_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'units': 'umol l-1', 'valid_max': 1000.0, 'valid_min': 0.0, 'cell_methods': 'TIME:mean (interval: 1 hr comment: time mid point)'}, 'DOX1_count': {'type': 'float', 'standard_name': 
'mole_concentration_of_dissolved_molecular_oxygen_in_sea_water number_of_observations', 'units': '1', 'long_name': 'count data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:count'}, 'DOX1_max': {'type': 'float', 'units': 'umol l-1', 'standard_name': 'mole_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'long_name': 'max data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:max'}, 'DOX1_min': {'type': 'float', 'units': 'umol l-1', 'standard_name': 'mole_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'long_name': 'min data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:min'}, 'DOX1_std': {'type': 'float', 'units': 'umol l-1', 'standard_name': 'mole_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'long_name': 'std data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:std'}, 'DOX1_2': {'type': 'float', 'ancillary_variables': 'DOX1_2_min DOX1_2_max DOX1_2_std DOX1_2_count', 'comment': 'Originally expressed in ml/l, 1ml/l = 44.660umol/l was assumed.', 'long_name': 'mean mole_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'standard_name': 'mole_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'units': 'umol l-1', 'valid_max': 1000.0, 'valid_min': 0.0, 'cell_methods': 'TIME:mean (interval: 1 hr comment: time mid point)'}, 'DOX1_2_min': {'type': 'float', 'units': 'umol l-1', 'standard_name': 'mole_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'long_name': 'min data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:min'}, 'DOX1_2_max': {'type': 'float', 'units': 'umol l-1', 'standard_name': 'mole_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'long_name': 'max data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:max'}, 'DOX1_2_std': {'type': 'float', 'units': 'umol l-1', 'standard_name': 
'mole_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'long_name': 'std data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:std'}, 'DOX1_2_count': {'type': 'float', 'standard_name': 'mole_concentration_of_dissolved_molecular_oxygen_in_sea_water number_of_observations', 'units': '1', 'long_name': 'count data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:count'}, 'DOX2': {'type': 'float', 'ancillary_variables': 'DOX2_min DOX2_max DOX2_std DOX2_count', 'comment': 'Originally expressed in ml/l, assuming 1ml/l = 44.660umol/l and using density computed from Temperature, Salinity and Pressure with the CSIRO SeaWater library (EOS-80) v1.1.', 'long_name': 'mean moles_of_oxygen_per_unit_mass_in_sea_water', 'standard_name': 'moles_of_oxygen_per_unit_mass_in_sea_water', 'units': 'umol kg-1', 'valid_max': 1000.0, 'valid_min': 0.0, 'cell_methods': 'TIME:mean (interval: 1 hr comment: time mid point)'}, 'DOX2_min': {'type': 'float', 'units': 'umol kg-1', 'standard_name': 'moles_of_oxygen_per_unit_mass_in_sea_water', 'long_name': 'min data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:min'}, 'DOX2_max': {'type': 'float', 'units': 'umol kg-1', 'standard_name': 'moles_of_oxygen_per_unit_mass_in_sea_water', 'long_name': 'max data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:max'}, 'DOX2_count': {'type': 'float', 'standard_name': 'moles_of_oxygen_per_unit_mass_in_sea_water number_of_observations', 'units': '1', 'long_name': 'count data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:count'}, 'DOX2_std': {'type': 'float', 'units': 'umol kg-1', 'standard_name': 'moles_of_oxygen_per_unit_mass_in_sea_water', 'long_name': 'std data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:std'}, 'DOX1_3': {'type': 'float', 'ancillary_variables': 'DOX1_3_min DOX1_3_max DOX1_3_std DOX1_3_count', 'comment': 'Originally expressed in 
mg/l, O2 density = 1.429kg/m3 and 1ml/l = 44.660umol/l were assumed.', 'long_name': 'mean mole_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'standard_name': 'mole_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'units': 'umol l-1', 'valid_max': 1000.0, 'valid_min': 0.0, 'cell_methods': 'TIME:mean (interval: 1 hr comment: time mid point)'}, 'DOX1_3_min': {'type': 'float', 'units': 'umol l-1', 'standard_name': 'mole_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'long_name': 'min data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:min'}, 'DOX1_3_max': {'type': 'float', 'units': 'umol l-1', 'standard_name': 'mole_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'long_name': 'max data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:max'}, 'DOX1_3_count': {'type': 'float', 'standard_name': 'mole_concentration_of_dissolved_molecular_oxygen_in_sea_water number_of_observations', 'units': '1', 'long_name': 'count data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:count'}, 'DOX1_3_std': {'type': 'float', 'units': 'umol l-1', 'standard_name': 'mole_concentration_of_dissolved_molecular_oxygen_in_sea_water', 'long_name': 'std data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:std'}, 'DOXY': {'type': 'float', 'ancillary_variables': 'DOXY_min DOXY_max DOXY_std DOXY_count', 'long_name': 'mean mass_concentration_of_oxygen_in_sea_water', 'standard_name': 'mass_concentration_of_oxygen_in_sea_water', 'units': 'mg l-1', 'cell_methods': 'TIME:mean (interval: 1 hr comment: time mid point)'}, 'DOXY_std': {'type': 'float', 'units': 'mg l-1', 'standard_name': 'mass_concentration_of_oxygen_in_sea_water', 'long_name': 'std data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:std'}, 'DOXY_min': {'type': 'float', 'units': 'mg l-1', 'standard_name': 'mass_concentration_of_oxygen_in_sea_water', 'long_name': 'min data 
value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:min'}, 'DOXY_max': {'type': 'float', 'units': 'mg l-1', 'standard_name': 'mass_concentration_of_oxygen_in_sea_water', 'long_name': 'max data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:max'}, 'DOXY_count': {'type': 'float', 'standard_name': 'mass_concentration_of_oxygen_in_sea_water number_of_observations', 'units': '1', 'long_name': 'count data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:count'}, 'DOXS': {'type': 'float', 'ancillary_variables': 'DOXS_min DOXS_max DOXS_std DOXS_count', 'comment': 'oxygenPP.m: DOXS derived using DOXS = 100 * DOX2 / OXSOL_SURFACE. OXSOL_SURFACE derived from TEMP, PSAL, PRES_REL , LATITUDE and LONGITUDE using gsw_O2sol_SP_pt, gsw_pt0_from_t and gsw_SA_from_SP from the Gibbs-SeaWater toolbox (TEOS-10) v3.06. See SeaBird data processing manual (http://www.seabird.com/document/sbe-data-processing-manual).', 'long_name': 'mean fractional_saturation_of_oxygen_in_sea_water', 'standard_name': 'fractional_saturation_of_oxygen_in_sea_water', 'units': '%', 'cell_methods': 'TIME:mean (interval: 1 hr comment: time mid point)'}, 'DOXS_std': {'type': 'float', 'units': '%', 'standard_name': 'fractional_saturation_of_oxygen_in_sea_water', 'long_name': 'std data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:std'}, 'DOXS_min': {'type': 'float', 'units': '%', 'standard_name': 'fractional_saturation_of_oxygen_in_sea_water', 'long_name': 'min data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:min'}, 'DOXS_max': {'type': 'float', 'units': '%', 'standard_name': 'fractional_saturation_of_oxygen_in_sea_water', 'long_name': 'max data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:max'}, 'DOXS_count': {'type': 'float', 'standard_name': 'fractional_saturation_of_oxygen_in_sea_water number_of_observations', 'units': '1', 'long_name': 'count data 
value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:count'}, 'PAR': {'type': 'float', 'ancillary_variables': 'PAR_min PAR_max PAR_std PAR_count', 'long_name': 'median downwelling_photosynthetic_photon_flux_in_sea_water', 'standard_name': 'downwelling_photosynthetic_photon_flux_in_sea_water', 'units': 'umole m-2 s-1', 'cell_methods': 'TIME:median (interval: 1 hr comment: time mid point)'}, 'PAR_std': {'type': 'float', 'units': 'umole m-2 s-1', 'standard_name': 'downwelling_photosynthetic_photon_flux_in_sea_water', 'long_name': 'std data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:std'}, 'PAR_min': {'type': 'float', 'units': 'umole m-2 s-1', 'standard_name': 'downwelling_photosynthetic_photon_flux_in_sea_water', 'long_name': 'min data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:min'}, 'PAR_max': {'type': 'float', 'units': 'umole m-2 s-1', 'standard_name': 'downwelling_photosynthetic_photon_flux_in_sea_water', 'long_name': 'max data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:max'}, 'PAR_count': {'type': 'float', 'standard_name': 'downwelling_photosynthetic_photon_flux_in_sea_water number_of_observations', 'units': '1', 'long_name': 'count data value in the bin, after rejection of flagged data', 'cell_methods': 'TIME:count'}, 'timestamp': {'type': 'int64'}, 'polygon': {'type': 'string'}, 'site_code': {'type': 'string'}, 'filename': {'type': 'string'}, 'dataset_metadata': {'metadata_uuid': 'efd8201c-1eca-412e-9ad2-0534e96cea14', 'title': 'ANMN hourly timeseries', 'featureType': 'timeSeries'}}
# Restrict the query to a time window and a latitude/longitude bounding box
filter_time = create_time_filter(
    parquet_ds,
    date_start='2022-09-01',
    date_end='2022-11-01',
)
filter_geo = create_bbox_filter(
    parquet_ds,
    lat_min=-37,
    lat_max=-34,
    lon_min=149,
    lon_max=151,
)

# NOTE(review): `filter` shadows the Python builtin of the same name; the name
# is kept because the read_parquet call further down refers to it.
filter = filter_geo & filter_time
%%time
# pandas (rather than pyarrow) is used here so that the filters are applied
# directly to the data, and not just to the partitions
df = pd.read_parquet(dname, engine="pyarrow", filters=filter)

# Summarise the columns, their dtypes and non-null counts
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 39307 entries, 0 to 39306 Data columns (total 96 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 instrument_index 39307 non-null int32 1 instrument_id 39307 non-null object 2 source_file 39307 non-null object 3 TIME 39307 non-null datetime64[ns] 4 LONGITUDE 39307 non-null float64 5 LATITUDE 39307 non-null float64 6 NOMINAL_DEPTH 39307 non-null float32 7 DEPTH 39307 non-null float32 8 DEPTH_count 39307 non-null float32 9 DEPTH_min 39307 non-null float32 10 DEPTH_max 39307 non-null float32 11 DEPTH_std 39307 non-null float32 12 PRES 19948 non-null float32 13 PRES_REL 5862 non-null float32 14 PRES_REL_count 5862 non-null float32 15 PRES_REL_max 5862 non-null float32 16 PRES_REL_min 5862 non-null float32 17 PRES_REL_std 5862 non-null float32 18 PRES_count 19948 non-null float32 19 PRES_max 19948 non-null float32 20 PRES_min 19948 non-null float32 21 PRES_std 19948 non-null float32 22 TEMP 36685 non-null float32 23 TEMP_count 39307 non-null float32 24 TEMP_max 36685 non-null float32 25 TEMP_min 36685 non-null float32 26 TEMP_std 36677 non-null float32 27 PSAL 2930 non-null float32 28 PSAL_count 2930 non-null float32 29 PSAL_max 2930 non-null float32 30 PSAL_min 2930 non-null float32 31 PSAL_std 2930 non-null float32 32 filename 39307 non-null object 33 TURB 0 non-null float32 34 TURB_count 0 non-null float32 35 TURB_max 0 non-null float32 36 TURB_min 0 non-null float32 37 TURB_std 0 non-null float32 38 CHLF 0 non-null float32 39 CHLF_count 0 non-null float32 40 CHLF_max 0 non-null float32 41 CHLF_min 0 non-null float32 42 CHLF_std 0 non-null float32 43 CHLU 0 non-null float32 44 CHLU_count 0 non-null float32 45 CHLU_max 0 non-null float32 46 CHLU_min 0 non-null float32 47 CHLU_std 0 non-null float32 48 CPHL 0 non-null float32 49 CPHL_count 0 non-null float32 50 CPHL_max 0 non-null float32 51 CPHL_min 0 non-null float32 52 CPHL_std 0 non-null float32 53 DOX 0 non-null float32 54 DOX_min 0 
non-null float32 55 DOX_max 0 non-null float32 56 DOX_std 0 non-null float32 57 DOX_count 0 non-null float32 58 DOX1 0 non-null float32 59 DOX1_count 0 non-null float32 60 DOX1_max 0 non-null float32 61 DOX1_min 0 non-null float32 62 DOX1_std 0 non-null float32 63 DOX1_2 0 non-null float32 64 DOX1_2_min 0 non-null float32 65 DOX1_2_max 0 non-null float32 66 DOX1_2_std 0 non-null float32 67 DOX1_2_count 0 non-null float32 68 DOX2 0 non-null float32 69 DOX2_min 0 non-null float32 70 DOX2_max 0 non-null float32 71 DOX2_count 0 non-null float32 72 DOX2_std 0 non-null float32 73 DOX1_3 0 non-null float32 74 DOX1_3_min 0 non-null float32 75 DOX1_3_max 0 non-null float32 76 DOX1_3_count 0 non-null float32 77 DOX1_3_std 0 non-null float32 78 DOXY 0 non-null float32 79 DOXY_std 0 non-null float32 80 DOXY_min 0 non-null float32 81 DOXY_max 0 non-null float32 82 DOXY_count 0 non-null float32 83 DOXS 0 non-null float32 84 DOXS_std 0 non-null float32 85 DOXS_min 0 non-null float32 86 DOXS_max 0 non-null float32 87 DOXS_count 0 non-null float32 88 PAR 0 non-null float32 89 PAR_std 0 non-null float32 90 PAR_min 0 non-null float32 91 PAR_max 0 non-null float32 92 PAR_count 0 non-null float32 93 site_code 39307 non-null category 94 timestamp 39307 non-null category 95 polygon 39307 non-null category dtypes: category(3), datetime64[ns](1), float32(86), float64(2), int32(1), object(3) memory usage: 15.0+ MB CPU times: user 577 ms, sys: 147 ms, total: 723 ms Wall time: 5.01 s
# Show which nominal depths and which sites are present in the filtered data
for column_name in ("NOMINAL_DEPTH", "site_code"):
    print(df[column_name].unique())
[ 69. 53. 61. 21. 37. 29. 14. 45. 72.9 71.9 44. 36. 52. 60. 68. 28. 20. 13. 116.9 17. 113. 33. 65. 81. 105. 73. 97. 25. 89. 57. 41. 49. 50. 34. 90. 58. 74. 82. 98. 106. 114. 18. 26. 42. 66. 117.9] ['BMP070', 'BMP120'] Categories (63, object): ['BMP070', 'BMP090', 'BMP120', 'CAM050', ..., 'WATR10', 'WATR15', 'WATR20', 'WATR50']
import matplotlib.pyplot as plt

# Keep a single site, ordered by TIME so each line is drawn chronologically
df = df[df["site_code"] == "BMP120"].sort_values('TIME')

# Unique NOMINAL_DEPTH values, sorted in ascending order
depths = sorted(df['NOMINAL_DEPTH'].unique())

# Create subplots per NOMINAL_DEPTH values
n = len(depths)
if n == 0:
    # plt.subplots(0, 1) would fail with a far less helpful message.
    raise ValueError("No rows left to plot for site_code 'BMP120'")

# squeeze=False always returns a 2-D array of Axes; without it a single depth
# yields a bare Axes object that cannot be iterated over. ravel() restores the
# 1-D sequence of axes the loop expects.
fig, axes = plt.subplots(n, 1, figsize=(10, 5 * n), sharex=True, sharey=True, squeeze=False)
axes = axes.ravel()

# Plot data for each depth in a separate subplot
for ax, depth in zip(axes, depths):
    subset = df[df['NOMINAL_DEPTH'] == depth]
    ax.plot(subset['TIME'], subset['TEMP'], label=f'Depth {depth}')
    ax.set_title(f'Depth {depth}')
    ax.set_xlabel('Time')
    ax.set_ylabel('Temperature')
    ax.legend()

# Adjust layout
plt.tight_layout()
plt.show()
# Create a colormap
import numpy as np
import matplotlib.cm as cm

# pyplot.get_cmap is used instead of matplotlib.cm.get_cmap, which has been
# deprecated since Matplotlib 3.7 and is scheduled for removal.
cmap = plt.get_cmap('viridis')

# Scale the colours to the observed temperature range
norm = plt.Normalize(vmin=df['TEMP'].min(), vmax=df['TEMP'].max())

fig, ax = plt.subplots(figsize=(10, 6))

# Scatter plot with reduced dot size and transparency
sc = ax.scatter(df['TIME'], df['DEPTH'], c=df['TEMP'], cmap=cmap, norm=norm,
                edgecolor='k', s=0.5, alpha=0.8)  # Adjust size (s) and transparency (alpha)

# Create a colorbar
cbar = plt.colorbar(sc, ax=ax, orientation='vertical')
cbar.set_label('Temperature')

# Labels and title
ax.set_xlabel('Time')
ax.set_ylabel('Depth')
ax.set_title('Depth over Time with Temperature Color-Coding')
plt.show()
/tmp/ipykernel_2856679/4173067584.py:4: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead. cmap = cm.get_cmap('viridis')
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.transform import linear_cmap
from bokeh.palettes import Viridis256
from bokeh.models import ColorBar, LinearColorMapper
import pandas as pd

output_notebook()  # If using Jupyter Notebook or JupyterLab

# Convert TIME to millisecond resolution for Bokeh
df['TIME'] = df['TIME'].astype('datetime64[ms]')

# Hand Bokeh only the three columns the plot reads, so the other columns of
# the DataFrame are not passed along as part of the data source.
plot_df = df[['TIME', 'DEPTH', 'TEMP']]

# Create a color mapper spanning the observed temperature range
mapper = linear_cmap(field_name='TEMP', palette=Viridis256,
                     low=plot_df['TEMP'].min(), high=plot_df['TEMP'].max())

# Create the figure with a datetime x-axis
p = figure(width=800, height=400, title="Depth over Time with Temperature Color-Coding",
           x_axis_label='Time', y_axis_label='Depth', x_axis_type='datetime')

# Add the scatter renderer
p.scatter(x='TIME', y='DEPTH', source=plot_df, size=6, color=mapper, fill_alpha=0.8, line_color='black')

# Create and add the color bar
color_bar = ColorBar(color_mapper=mapper['transform'], width=8, location=(0, 0))
p.add_layout(color_bar, 'right')

# Invert the y-axis: the range starts at the largest depth and ends at the
# smallest, so depth increases downwards on the plot.
p.y_range.start = plot_df['DEPTH'].max()
p.y_range.end = plot_df['DEPTH'].min()

# Show the plot
show(p)