# Name of the dataset explored in this notebook; it is interpolated into the
# S3 path of the parquet store further down.
dataset_name = "vessel_sst_delayed_qc"
# only run once, then restart session if needed
!pip install uv
import os
import sys
def is_colab() -> bool:
    """Return True when the notebook is running inside Google Colab.

    Detection relies solely on whether the ``google.colab`` module can be
    imported in the current interpreter.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    return True
import subprocess

# Pick the requirements file: prefer a requirements.txt sitting in the
# notebook's working directory, otherwise fall back to the copy published in
# the project's GitHub repository.
current_dir = os.getcwd()
local_requirements = os.path.join(current_dir, 'requirements.txt')
if os.path.exists(local_requirements):
    requirements_path = local_requirements
else:
    requirements_path = 'https://raw.githubusercontent.com/aodn/aodn_cloud_optimised/main/notebooks/requirements.txt'


def _run_uv(*args):
    """Run ``uv`` with *args*; report a failure instead of raising.

    The command is passed as an argument list (no shell), so a requirements
    path containing spaces or shell metacharacters is handed to ``uv`` intact.
    Returns True when the command exited with status 0.
    """
    command = ['uv', *args]
    try:
        completed = subprocess.run(command, check=False)
    except OSError as err:
        # e.g. the ``uv`` executable is not on PATH
        print(f"Could not run {' '.join(command)}: {err}")
        return False
    if completed.returncode != 0:
        print(f"{' '.join(command)} exited with status {completed.returncode}")
    return completed.returncode == 0


# Install packages using uv and the determined requirements file
if is_colab():
    # On Colab, install straight into the system interpreter.
    _run_uv('pip', 'install', '--system', '-r', requirements_path)
else:
    # Elsewhere, create a virtual environment first, then install.
    _run_uv('venv')
    _run_uv('pip', 'install', '-r', requirements_path)
Requirement already satisfied: uv in /home/lbesnard/miniforge3/envs/AodnCloudOptimised/lib/python3.12/site-packages (0.4.18)
Using CPython 3.12.6 interpreter at: /home/lbesnard/miniforge3/envs/AodnCloudOptimised/bin/python Creating virtual environment at: .venv Activate with: source .venv/bin/activate Audited 230 packages in 36ms
import requests
import os
# Fetch the query helper module once; later runs reuse the local copy.
if not os.path.exists('parquet_queries.py'):
    print('Downloading parquet_queries.py')
    url = 'https://raw.githubusercontent.com/aodn/aodn_cloud_optimised/main/aodn_cloud_optimised/lib/ParquetDataQuery.py'
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    # Fail loudly on an HTTP error rather than saving the error page as a
    # Python module, which would also block any later re-download because the
    # file would then exist.
    response.raise_for_status()
    with open('parquet_queries.py', 'w', encoding='utf-8') as f:
        f.write(response.text)
from parquet_queries import create_time_filter, create_bbox_filter, query_unique_value, plot_spatial_extent, get_spatial_extent, get_temporal_extent, get_schema_metadata
import pyarrow.parquet as pq
import pyarrow.dataset as pds
import pyarrow as pa
import os
import pandas as pd
import pyarrow.compute as pc
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import numpy as np
# Bucket that holds the cloud-optimised datasets.
BUCKET_OPTIMISED_DEFAULT = "aodn-cloud-optimised"

# URI of this dataset's parquet store, accessed with the "anonymous" user.
dname = f"s3://anonymous@{BUCKET_OPTIMISED_DEFAULT}/{dataset_name}.parquet/"

# Handle on the hive-partitioned store, passed to the query helpers below.
parquet_ds = pq.ParquetDataset(dname, partitioning="hive")
Partitioning in Parquet involves organising data files based on the values of one or more columns, known as partition keys. When data is written to Parquet files with partitioning enabled, the files are physically stored in a directory structure that reflects the partition keys. This directory structure makes it easier to retrieve and process specific subsets of data based on the partition keys.
# Open the store through the pyarrow dataset API and show the schema of its
# hive partition keys.
dataset = pds.dataset(dname, partitioning="hive", format="parquet")
partition_keys = dataset.partitioning.schema
print(partition_keys)
timestamp: int32 polygon: string platform_code: string
%%time
unique_partition_value = query_unique_value(parquet_ds, "platform_code")
# Only the first two values are printed to keep the output short.
platform_code_preview = list(unique_partition_value)[:2]
print(platform_code_preview)
['VRDU8', 'VNCF'] CPU times: user 111 ms, sys: 3.18 ms, total: 114 ms Wall time: 113 ms
In this section, we plot the polygons where data exists. This helps when choosing a bounding box that actually contains data.
plot_spatial_extent(parquet_ds)
/home/lbesnard/miniforge3/envs/AodnCloudOptimised/lib/python3.12/site-packages/cartopy/mpl/feature_artist.py:144: UserWarning: facecolor will have no effect as it has been defined as "never". warnings.warn('facecolor will have no effect as it has been ' /home/lbesnard/github_repo/aodn_cloud_optimised/notebooks/parquet_queries.py:449: UserWarning: Legend does not support handles for PatchCollection instances. See: https://matplotlib.org/stable/tutorials/intermediate/legend_guide.html#implementing-a-custom-legend-handler ax.legend() /home/lbesnard/github_repo/aodn_cloud_optimised/notebooks/parquet_queries.py:449: UserWarning: No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. ax.legend()
Similarly to the spatial extent, we retrieve the minimum and maximum timestamp partition values of the dataset. These are not necessarily an accurate representation of the TIME values, as the timestamp partition can be yearly, monthly, etc., but they give an idea of the temporal coverage.
get_temporal_extent(parquet_ds)
(datetime.datetime(2008, 1, 1, 11, 0), datetime.datetime(2024, 7, 1, 10, 0))
For every parquet dataset, we create a sidecar file named _common_metadata in the root of the dataset. It contains the variable attributes.
# parquet_meta = pa.parquet.read_schema(os.path.join(dname + '_common_metadata')) # parquet metadata
metadata = get_schema_metadata(dname) # schema metadata
metadata
{'TIME': {'type': 'timestamp[ns]', 'standard_name': 'time', 'long_name': 'time', 'axis': 'T', 'valid_min': 0, 'valid_max': 90000.0, 'comment': 'Relative julian days with decimal part as parts of the day', 'ancillary_variables': 'TIME_quality_control'}, 'LATITUDE': {'type': 'double', 'long_name': 'latitude', 'units': 'degrees_north', 'instrument': 'unknown', 'observation_type': 'measured', 'standard_name': 'latitude', 'axis': 'Y', 'valid_min': -90, 'valid_max': 90, 'reference_datum': 'geographical coordinates, WGS84', 'ancillary_variables': 'LATITUDE_quality_control'}, 'LONGITUDE': {'type': 'double', 'long_name': 'longitude', 'units': 'degrees_east', 'instrument': 'unknown', 'observation_type': 'measured', 'standard_name': 'longitude', 'axis': 'X', 'valid_min': -180, 'valid_max': 180, 'reference_datum': 'geographical coordinates, WGS84', 'ancillary_variables': 'LONGITUDE_quality_control'}, 'TEMP': {'type': 'float', 'long_name': 'sea temperature', 'units': 'celsius', 'instrument': 'Seabird SBE 38', 'observation_type': 'measured', 'distance_from_bow': -9999.0, 'centerline_offset': -9999.0, 'sensor_depth': 1.899999976158142, 'standard_name': 'sea_surface_temperature', 'ancillary_variables': 'TEMP_quality_control'}, 'TEMP_2': {'type': 'float', 'long_name': 'sea temperature', 'units': 'celsius', 'instrument': 'Seabird SBE38 (s/n 0434)', 'observation_type': 'measured', 'distance_from_bow': 29.0, 'centerline_offset': 4.099999904632568, 'sensor_depth': 0.800000011920929, 'standard_name': 'sea_surface_temperature', 'ancillary_variables': 'TEMP_2_quality_control'}, 'TEMP_2_quality_control': {'type': 'string', 'standard_name': 'sea_surface_temperature status_flag', 'long_name': 'quality flags for sea_surface_temperature', 'quality_control_conventions': 'IMOS Reference Table F', 'quality_control_set': 3, 'quality_control_flag_values': 'B, C, D, E, F, G, H, J, K, L, M, Q, S, T, U, V, X, Z', 'quality_control_flag_meanings': 'Value_out_of_bounds Time_not_sequential 
Failed_T_Tw_Td_test Failed_true_wind_recomputation_test Platform_velocity_unrealistic Value_exceeds_threshold Discontinuity Erroneous_value Suspect_value_(visual) Value_located_over_land Instrument_malfunction Pre-flagged_as_suspect Spike_in_data_(visual) Time_duplicate Suspect_value_(statistical) Step_in_data_(statistical) Spike_in_data_(statistical) Value_passed_all_tests'}, 'PSAL': {'type': 'float', 'long_name': 'sea salinity', 'units': '1e-3', 'instrument': 'Seabird SBE 21', 'observation_type': 'calculated', 'distance_from_bow': -9999.0, 'centerline_offset': -9999.0, 'sensor_depth': 1.899999976158142, 'standard_name': 'sea_water_salinity', 'ancillary_variables': 'PSAL_quality_control'}, 'history': {'type': 'string', 'long_name': 'file history information'}, 'TIME_quality_control': {'type': 'string', 'standard_name': 'time status_flag', 'long_name': 'quality flags for time', 'quality_control_conventions': 'IMOS Reference Table F', 'quality_control_set': 3, 'quality_control_flag_values': 'B, C, D, E, F, G, H, J, K, L, M, Q, S, T, U, V, X, Z', 'quality_control_flag_meanings': 'Value_out_of_bounds Time_not_sequential Failed_T_Tw_Td_test Failed_true_wind_recomputation_test Platform_velocity_unrealistic Value_exceeds_threshold Discontinuity Erroneous_value Suspect_value_(visual) Value_located_over_land Instrument_malfunction Pre-flagged_as_suspect Spike_in_data_(visual) Time_duplicate Suspect_value_(statistical) Step_in_data_(statistical) Spike_in_data_(statistical) Value_passed_all_tests'}, 'LATITUDE_quality_control': {'type': 'string', 'standard_name': 'latitude status_flag', 'long_name': 'quality flags for latitude', 'quality_control_conventions': 'IMOS Reference Table F', 'quality_control_set': 3, 'quality_control_flag_values': 'B, C, D, E, F, G, H, J, K, L, M, Q, S, T, U, V, X, Z', 'quality_control_flag_meanings': 'Value_out_of_bounds Time_not_sequential Failed_T_Tw_Td_test Failed_true_wind_recomputation_test Platform_velocity_unrealistic Value_exceeds_threshold 
Discontinuity Erroneous_value Suspect_value_(visual) Value_located_over_land Instrument_malfunction Pre-flagged_as_suspect Spike_in_data_(visual) Time_duplicate Suspect_value_(statistical) Step_in_data_(statistical) Spike_in_data_(statistical) Value_passed_all_tests'}, 'LONGITUDE_quality_control': {'type': 'string', 'standard_name': 'longitude status_flag', 'long_name': 'quality flags for longitude', 'quality_control_conventions': 'IMOS Reference Table F', 'quality_control_set': 3, 'quality_control_flag_values': 'B, C, D, E, F, G, H, J, K, L, M, Q, S, T, U, V, X, Z', 'quality_control_flag_meanings': 'Value_out_of_bounds Time_not_sequential Failed_T_Tw_Td_test Failed_true_wind_recomputation_test Platform_velocity_unrealistic Value_exceeds_threshold Discontinuity Erroneous_value Suspect_value_(visual) Value_located_over_land Instrument_malfunction Pre-flagged_as_suspect Spike_in_data_(visual) Time_duplicate Suspect_value_(statistical) Step_in_data_(statistical) Spike_in_data_(statistical) Value_passed_all_tests'}, 'TEMP_quality_control': {'type': 'string', 'standard_name': 'sea_surface_temperature status_flag', 'long_name': 'quality flags for sea_surface_temperature', 'quality_control_conventions': 'IMOS Reference Table F', 'quality_control_set': 3, 'quality_control_flag_values': 'B, C, D, E, F, G, H, J, K, L, M, Q, S, T, U, V, X, Z', 'quality_control_flag_meanings': 'Value_out_of_bounds Time_not_sequential Failed_T_Tw_Td_test Failed_true_wind_recomputation_test Platform_velocity_unrealistic Value_exceeds_threshold Discontinuity Erroneous_value Suspect_value_(visual) Value_located_over_land Instrument_malfunction Pre-flagged_as_suspect Spike_in_data_(visual) Time_duplicate Suspect_value_(statistical) Step_in_data_(statistical) Spike_in_data_(statistical) Value_passed_all_tests'}, 'PSAL_quality_control': {'type': 'string', 'standard_name': 'sea_water_salinity status_flag', 'long_name': 'quality flags for sea_water_salinity', 'quality_control_conventions': 'IMOS 
Reference Table F', 'quality_control_set': 3, 'quality_control_flag_values': 'B, C, D, E, F, G, H, J, K, L, M, Q, S, T, U, V, X, Z', 'quality_control_flag_meanings': 'Value_out_of_bounds Time_not_sequential Failed_T_Tw_Td_test Failed_true_wind_recomputation_test Platform_velocity_unrealistic Value_exceeds_threshold Discontinuity Erroneous_value Suspect_value_(visual) Value_located_over_land Instrument_malfunction Pre-flagged_as_suspect Spike_in_data_(visual) Time_duplicate Suspect_value_(statistical) Step_in_data_(statistical) Spike_in_data_(statistical) Value_passed_all_tests'}, 'AIRT_quality_control': {'type': 'string', 'standard_name': 'air_temperature status_flag', 'long_name': 'quality flags for air_temperature', 'quality_control_conventions': 'IMOS Reference Table F', 'quality_control_set': 3, 'quality_control_flag_values': 'B, C, D, E, F, G, H, J, K, L, M, Q, S, T, U, V, X, Z', 'quality_control_flag_meanings': 'Value_out_of_bounds Time_not_sequential Failed_T_Tw_Td_test Failed_true_wind_recomputation_test Platform_velocity_unrealistic Value_exceeds_threshold Discontinuity Erroneous_value Suspect_value_(visual) Value_located_over_land Instrument_malfunction Pre-flagged_as_suspect Spike_in_data_(visual) Time_duplicate Suspect_value_(statistical) Step_in_data_(statistical) Spike_in_data_(statistical) Value_passed_all_tests'}, 'WSPD': {'type': 'float', 'long_name': 'earth-relative wind speed', 'units': 'meter second-1', 'instrument': 'Calculated on ship from relative wind and GPS', 'observation_type': 'calculated', 'distance_from_bow': 55.0, 'centerline_offset': 0.0, 'sensor_height': 38.79999923706055, 'standard_name': 'wind_speed', 'ancillary_variables': 'WSPD_quality_control'}, 'WSPD_quality_control': {'type': 'string', 'standard_name': 'wind_speed status_flag', 'long_name': 'quality flags for wind_speed', 'quality_control_conventions': 'IMOS Reference Table F', 'quality_control_set': 3, 'quality_control_flag_values': 'B, C, D, E, F, G, H, J, K, L, M, Q, S, T, 
U, V, X, Z', 'quality_control_flag_meanings': 'Value_out_of_bounds Time_not_sequential Failed_T_Tw_Td_test Failed_true_wind_recomputation_test Platform_velocity_unrealistic Value_exceeds_threshold Discontinuity Erroneous_value Suspect_value_(visual) Value_located_over_land Instrument_malfunction Pre-flagged_as_suspect Spike_in_data_(visual) Time_duplicate Suspect_value_(statistical) Step_in_data_(statistical) Spike_in_data_(statistical) Value_passed_all_tests'}, 'WDIR': {'type': 'float', 'long_name': 'earth-relative wind direction', 'units': 'degrees (clockwise from true north)', 'instrument': 'Calculated on ship from relative wind and GPS', 'observation_type': 'calculated', 'distance_from_bow': 55.0, 'centerline_offset': 0.0, 'sensor_height': 38.79999923706055, 'standard_name': 'wind_from_direction', 'ancillary_variables': 'WDIR_quality_control'}, 'WDIR_quality_control': {'type': 'string', 'standard_name': 'wind_from_direction status_flag', 'long_name': 'quality flags for wind_from_direction', 'quality_control_conventions': 'IMOS Reference Table F', 'quality_control_set': 3, 'quality_control_flag_values': 'B, C, D, E, F, G, H, J, K, L, M, Q, S, T, U, V, X, Z', 'quality_control_flag_meanings': 'Value_out_of_bounds Time_not_sequential Failed_T_Tw_Td_test Failed_true_wind_recomputation_test Platform_velocity_unrealistic Value_exceeds_threshold Discontinuity Erroneous_value Suspect_value_(visual) Value_located_over_land Instrument_malfunction Pre-flagged_as_suspect Spike_in_data_(visual) Time_duplicate Suspect_value_(statistical) Step_in_data_(statistical) Spike_in_data_(statistical) Value_passed_all_tests'}, 'PL_WSPD_quality_control': {'type': 'string', 'standard_name': 'wind_speed status_flag', 'long_name': 'quality flags for wind_speed', 'quality_control_conventions': 'IMOS Reference Table F', 'quality_control_set': 3, 'quality_control_flag_values': 'B, C, D, E, F, G, H, J, K, L, M, Q, S, T, U, V, X, Z', 'quality_control_flag_meanings': 'Value_out_of_bounds 
Time_not_sequential Failed_T_Tw_Td_test Failed_true_wind_recomputation_test Platform_velocity_unrealistic Value_exceeds_threshold Discontinuity Erroneous_value Suspect_value_(visual) Value_located_over_land Instrument_malfunction Pre-flagged_as_suspect Spike_in_data_(visual) Time_duplicate Suspect_value_(statistical) Step_in_data_(statistical) Spike_in_data_(statistical) Value_passed_all_tests'}, 'PL_WDIR': {'type': 'float', 'long_name': 'platform relative wind direction', 'units': 'degrees (clockwise from true north)', 'instrument': 'Vaisala WAV151 (s/n W29135)', 'sensor_height': 21.100000381469727, 'distance_from_bow': 20.5, 'centerline_offset': -9999.0, 'observation_type': 'measured', 'standard_name': 'wind_from_direction', 'ancillary_variables': 'PL_WDIR_quality_control'}, 'PL_WDIR_quality_control': {'type': 'string', 'standard_name': 'wind_from_direction status_flag', 'long_name': 'quality flags for wind_from_direction', 'quality_control_conventions': 'IMOS Reference Table F', 'quality_control_set': 3, 'quality_control_flag_values': 'B, C, D, E, F, G, H, J, K, L, M, Q, S, T, U, V, X, Z', 'quality_control_flag_meanings': 'Value_out_of_bounds Time_not_sequential Failed_T_Tw_Td_test Failed_true_wind_recomputation_test Platform_velocity_unrealistic Value_exceeds_threshold Discontinuity Erroneous_value Suspect_value_(visual) Value_located_over_land Instrument_malfunction Pre-flagged_as_suspect Spike_in_data_(visual) Time_duplicate Suspect_value_(statistical) Step_in_data_(statistical) Spike_in_data_(statistical) Value_passed_all_tests'}, 'PL_WSPD': {'type': 'float', 'long_name': 'platform relative wind speed', 'units': 'meter second-1', 'instrument': 'Vaisala WAA151 (s/n W42236)', 'sensor_height': 21.100000381469727, 'distance_from_bow': 20.5, 'centerline_offset': -9999.0, 'observation_type': 'measured', 'standard_name': 'wind_speed', 'ancillary_variables': 'PL_WSPD_quality_control'}, 'RAD_PAR': {'type': 'float', 'long_name': 'photosynthetically active radiation', 
'units': 'microeinstein meter-2', 'instrument': 'unknown', 'rad_direction': 'downwelling', 'observation_type': 'measured', 'distance_from_bow': -9999.0, 'centerline_offset': -9999.0, 'sensor_height': -9999.0, 'standard_name': 'surface_downwelling_photosynthetic_radiative_flux_in_air', 'ancillary_variables': 'RAD_PAR_quality_control'}, 'RAD_PAR_quality_control': {'type': 'string', 'standard_name': 'surface_downwelling_photosynthetic_radiative_flux_in_air status_flag', 'long_name': 'quality flags for surface_downwelling_photosynthetic_radiative_flux_in_air', 'quality_control_conventions': 'IMOS Reference Table F', 'quality_control_set': 3, 'quality_control_flag_values': 'B, C, D, E, F, G, H, J, K, L, M, Q, S, T, U, V, X, Z', 'quality_control_flag_meanings': 'Value_out_of_bounds Time_not_sequential Failed_T_Tw_Td_test Failed_true_wind_recomputation_test Platform_velocity_unrealistic Value_exceeds_threshold Discontinuity Erroneous_value Suspect_value_(visual) Value_located_over_land Instrument_malfunction Pre-flagged_as_suspect Spike_in_data_(visual) Time_duplicate Suspect_value_(statistical) Step_in_data_(statistical) Spike_in_data_(statistical) Value_passed_all_tests'}, 'RELH': {'type': 'float', 'long_name': 'relative humidity (starboard)', 'units': 'percent', 'instrument': 'Vaisala HMP233 (s/n X20303107)', 'observation_type': 'measured', 'distance_from_bow': -9999.0, 'centerline_offset': -9999.0, 'sensor_height': 11.600000381469727, 'standard_name': 'relative_humidity', 'ancillary_variables': 'RELH_quality_control'}, 'RELH_quality_control': {'type': 'string', 'standard_name': 'relative_humidity status_flag', 'long_name': 'quality flags for relative_humidity', 'quality_control_conventions': 'IMOS Reference Table F', 'quality_control_set': 3, 'quality_control_flag_values': 'B, C, D, E, F, G, H, J, K, L, M, Q, S, T, U, V, X, Z', 'quality_control_flag_meanings': 'Value_out_of_bounds Time_not_sequential Failed_T_Tw_Td_test Failed_true_wind_recomputation_test 
Platform_velocity_unrealistic Value_exceeds_threshold Discontinuity Erroneous_value Suspect_value_(visual) Value_located_over_land Instrument_malfunction Pre-flagged_as_suspect Spike_in_data_(visual) Time_duplicate Suspect_value_(statistical) Step_in_data_(statistical) Spike_in_data_(statistical) Value_passed_all_tests'}, 'PL_CRS': {'type': 'float32', 'long_name': 'platform course', 'units': 'degrees (clockwise towards true north)', 'instrument': 'unknown', 'observation_type': 'measured', 'standard_name': 'platform_course', 'ancillary_variables': 'PL_CRS_quality_control'}, 'ATMP': {'type': 'float', 'long_name': 'atmospheric pressure', 'units': 'millibar', 'instrument': 'Vaisala PTB220B (s/n V0430002)', 'mslp_indicator': 'adjusted to sea level', 'observation_type': 'measured', 'distance_from_bow': -9999.0, 'centerline_offset': -9999.0, 'sensor_height': 24.600000381469727, 'standard_name': 'air_pressure', 'ancillary_variables': 'ATMP_quality_control'}, 'ATMP_quality_control': {'type': 'string', 'standard_name': 'air_pressure status_flag', 'long_name': 'quality flags for air_pressure', 'quality_control_conventions': 'IMOS Reference Table F', 'quality_control_set': 3, 'quality_control_flag_values': 'B, C, D, E, F, G, H, J, K, L, M, Q, S, T, U, V, X, Z', 'quality_control_flag_meanings': 'Value_out_of_bounds Time_not_sequential Failed_T_Tw_Td_test Failed_true_wind_recomputation_test Platform_velocity_unrealistic Value_exceeds_threshold Discontinuity Erroneous_value Suspect_value_(visual) Value_located_over_land Instrument_malfunction Pre-flagged_as_suspect Spike_in_data_(visual) Time_duplicate Suspect_value_(statistical) Step_in_data_(statistical) Spike_in_data_(statistical) Value_passed_all_tests'}, 'AIRT': {'type': 'float', 'long_name': 'air temperature', 'units': 'celsius', 'instrument': 'Rosemount ST2401 (s/n 0512)', 'observation_type': 'measured', 'distance_from_bow': -9999.0, 'centerline_offset': -9999.0, 'sensor_height': 31.899999618530273, 'standard_name': 
'air_temperature', 'ancillary_variables': 'AIRT_quality_control'}, 'PL_SPD': {'type': 'float', 'long_name': 'platform speed over ground', 'units': 'meter second-1', 'instrument': 'unknown', 'observation_type': 'measured', 'standard_name': 'platform_speed_wrt_ground', 'ancillary_variables': 'PL_SPD_quality_control'}, 'PL_SPD_quality_control': {'type': 'string', 'standard_name': 'platform_speed_wrt_ground status_flag', 'long_name': 'quality flags for platform_speed_wrt_ground', 'quality_control_conventions': 'IMOS Reference Table F', 'quality_control_set': 3, 'quality_control_flag_values': 'B, C, D, E, F, G, H, J, K, L, M, Q, S, T, U, V, X, Z', 'quality_control_flag_meanings': 'Value_out_of_bounds Time_not_sequential Failed_T_Tw_Td_test Failed_true_wind_recomputation_test Platform_velocity_unrealistic Value_exceeds_threshold Discontinuity Erroneous_value Suspect_value_(visual) Value_located_over_land Instrument_malfunction Pre-flagged_as_suspect Spike_in_data_(visual) Time_duplicate Suspect_value_(statistical) Step_in_data_(statistical) Spike_in_data_(statistical) Value_passed_all_tests'}, 'WETT': {'type': 'float', 'long_name': 'wet-bulb temperature', 'units': 'celsius', 'instrument': 'Rosemount ST2401 (s/n 0512) + Vaisala HMP45D (s/n C3640004)', 'observation_type': 'calculated', 'distance_from_bow': -9999.0, 'centerline_offset': -9999.0, 'sensor_height': 31.899999618530273, 'standard_name': 'wet_bulb_temperature', 'ancillary_variables': 'WETT_quality_control'}, 'WETT_quality_control': {'type': 'string', 'standard_name': 'wet_bulb_temperature status_flag', 'long_name': 'quality flags for wet_bulb_temperature', 'quality_control_conventions': 'IMOS Reference Table F', 'quality_control_set': 3, 'quality_control_flag_values': 'B, C, D, E, F, G, H, J, K, L, M, Q, S, T, U, V, X, Z', 'quality_control_flag_meanings': 'Value_out_of_bounds Time_not_sequential Failed_T_Tw_Td_test Failed_true_wind_recomputation_test Platform_velocity_unrealistic Value_exceeds_threshold 
Discontinuity Erroneous_value Suspect_value_(visual) Value_located_over_land Instrument_malfunction Pre-flagged_as_suspect Spike_in_data_(visual) Time_duplicate Suspect_value_(statistical) Step_in_data_(statistical) Spike_in_data_(statistical) Value_passed_all_tests'}, 'DEWT': {'type': 'float', 'long_name': 'dew-point temperature', 'units': 'celsius', 'instrument': 'Rosemount ST2401 (s/n 0512) + Vaisala HMP45D (s/n C3640004)', 'observation_type': 'calculated', 'distance_from_bow': -9999.0, 'centerline_offset': -9999.0, 'sensor_height': 31.899999618530273, 'standard_name': 'dew_point_temperature', 'ancillary_variables': 'DEWT_quality_control'}, 'DEWT_quality_control': {'type': 'string', 'standard_name': 'dew_point_temperature status_flag', 'long_name': 'quality flags for dew_point_temperature', 'quality_control_conventions': 'IMOS Reference Table F', 'quality_control_set': 3, 'quality_control_flag_values': 'B, C, D, E, F, G, H, J, K, L, M, Q, S, T, U, V, X, Z', 'quality_control_flag_meanings': 'Value_out_of_bounds Time_not_sequential Failed_T_Tw_Td_test Failed_true_wind_recomputation_test Platform_velocity_unrealistic Value_exceeds_threshold Discontinuity Erroneous_value Suspect_value_(visual) Value_located_over_land Instrument_malfunction Pre-flagged_as_suspect Spike_in_data_(visual) Time_duplicate Suspect_value_(statistical) Step_in_data_(statistical) Spike_in_data_(statistical) Value_passed_all_tests'}, 'PL_CRS_quality_control': {'type': 'string', 'standard_name': 'platform_course status_flag', 'long_name': 'quality flags for platform_course', 'quality_control_conventions': 'IMOS Reference Table F', 'quality_control_set': 3, 'quality_control_flag_values': 'B, C, D, E, F, G, H, J, K, L, M, Q, S, T, U, V, X, Z', 'quality_control_flag_meanings': 'Value_out_of_bounds Time_not_sequential Failed_T_Tw_Td_test Failed_true_wind_recomputation_test Platform_velocity_unrealistic Value_exceeds_threshold Discontinuity Erroneous_value Suspect_value_(visual) 
Value_located_over_land Instrument_malfunction Pre-flagged_as_suspect Spike_in_data_(visual) Time_duplicate Suspect_value_(statistical) Step_in_data_(statistical) Spike_in_data_(statistical) Value_passed_all_tests'}, 'timestamp': {'type': 'int64'}, 'polygon': {'type': 'string'}, 'platform_code': {'type': 'string'}, 'filename': {'type': 'string'}, 'dataset_metadata': {'metadata_uuid': '63db5801-cc19-40ef-83b3-85ccba884cf7', 'title': '', 'principal_investigator': '', 'principal_investigator_email': '', 'featureType': 'trajectory'}}
# Spatial constraint: a latitude/longitude bounding box.
filter_geo = create_bbox_filter(
    parquet_ds,
    lat_min=-34,
    lat_max=-32,
    lon_min=150,
    lon_max=155,
)
# Temporal constraint: start and end timestamps.
filter_time = create_time_filter(
    parquet_ds,
    date_start="2011-12-23 10:14:00",
    date_end="2012-01-01 07:50:00",
)
# Both constraints must hold. NOTE: this name shadows the builtin ``filter``;
# it is kept because the read cell that follows refers to it.
filter = filter_geo & filter_time
%%time
# pandas is used rather than pyarrow so that the filter is applied to the
# data itself and not only to the partitions.
df = pd.read_parquet(
    dname,
    engine="pyarrow",
    filters=filter,
)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 840 entries, 0 to 839 Data columns (total 41 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 TIME 840 non-null datetime64[ns] 1 LATITUDE 840 non-null float64 2 LONGITUDE 840 non-null float64 3 TEMP 660 non-null float32 4 TEMP_2 0 non-null float32 5 TEMP_2_quality_control 0 non-null object 6 PSAL 0 non-null float32 7 history 840 non-null object 8 TIME_quality_control 840 non-null object 9 LATITUDE_quality_control 840 non-null object 10 LONGITUDE_quality_control 840 non-null object 11 TEMP_quality_control 840 non-null object 12 PSAL_quality_control 0 non-null object 13 AIRT_quality_control 840 non-null object 14 WSPD 840 non-null float32 15 WSPD_quality_control 840 non-null object 16 WDIR 840 non-null float32 17 WDIR_quality_control 840 non-null object 18 PL_WSPD_quality_control 0 non-null object 19 PL_WDIR 0 non-null float32 20 PL_WDIR_quality_control 0 non-null object 21 PL_WSPD 0 non-null float32 22 RELH 0 non-null float32 23 RELH_quality_control 0 non-null object 24 PL_CRS 840 non-null float32 25 ATMP 840 non-null float32 26 ATMP_quality_control 840 non-null object 27 AIRT 840 non-null float32 28 PL_SPD 840 non-null float32 29 PL_SPD_quality_control 840 non-null object 30 PL_CRS_quality_control 840 non-null object 31 filename 840 non-null object 32 RAD_PAR 0 non-null float32 33 RAD_PAR_quality_control 0 non-null object 34 WETT 840 non-null float32 35 WETT_quality_control 840 non-null object 36 DEWT 840 non-null float32 37 DEWT_quality_control 840 non-null object 38 timestamp 840 non-null category 39 polygon 840 non-null category 40 platform_code 840 non-null category dtypes: category(3), datetime64[ns](1), float32(15), float64(2), object(20) memory usage: 233.8+ KB CPU times: user 4.01 s, sys: 271 ms, total: 4.28 s Wall time: 14.6 s
# Order the observations chronologically so that consecutive rows form the track.
df_sorted = df.sort_values('TIME')
temperature = df_sorted['TEMP'].to_numpy()

# Build one (start, end) coordinate pair per pair of consecutive positions.
points = np.array([df_sorted['LONGITUDE'], df_sorted['LATITUDE']]).T.reshape(-1, 1, 2)
segments = np.concatenate([points[:-1], points[1:]], axis=1)

# Colour each segment by the temperature at its starting point. There is one
# segment fewer than there are points, so the last value is dropped to keep
# the colour array the same length as the segment list.
norm = plt.Normalize(df_sorted['TEMP'].min(), df_sorted['TEMP'].max())
lc = LineCollection(segments, cmap='RdYlBu_r', norm=norm)
lc.set_array(temperature[:-1])
lc.set_linewidth(2)

fig, ax = plt.subplots()
ax.add_collection(lc)
ax.autoscale()
ax.set_xlabel(metadata['LONGITUDE']['standard_name'])
ax.set_ylabel(metadata['LATITUDE']['standard_name'])
# The y axis is intentionally NOT inverted: latitude increases northwards, so
# the default orientation already draws north at the top of the figure.

# Colour bar, labelled from the variable attributes when they are available.
temp_attrs = metadata.get('TEMP', {})
cbar_label = temp_attrs.get('long_name', 'Temperature')
if temp_attrs.get('units'):
    cbar_label = f"{cbar_label} ({temp_attrs['units']})"
cbar = plt.colorbar(lc, ax=ax)
cbar.set_label(cbar_label)
plt.show()
# Constraint on the platform_code partition: keep a single platform code.
expr_1 = pc.field("platform_code") == pa.scalar("FHZI")
# Temporal constraint: start and end timestamps.
filter_time = create_time_filter(
    parquet_ds,
    date_start="2010-01-31 10:14:00",
    date_end="2010-02-01 07:50:00",
)
# Both constraints must hold. NOTE: this name shadows the builtin ``filter``;
# it is kept because the read cell that follows refers to it.
filter = expr_1 & filter_time
%%time
# pandas is used rather than pyarrow so that the filter is applied to the
# data itself and not only to the partitions.
df = pd.read_parquet(
    dname,
    engine="pyarrow",
    filters=filter,
)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 26340 entries, 0 to 26339 Data columns (total 41 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 TIME 26340 non-null datetime64[ns] 1 LATITUDE 26340 non-null float64 2 LONGITUDE 26340 non-null float64 3 TEMP 26340 non-null float32 4 TEMP_2 0 non-null float32 5 TEMP_2_quality_control 0 non-null object 6 PSAL 0 non-null float32 7 history 26340 non-null object 8 TIME_quality_control 26340 non-null object 9 LATITUDE_quality_control 26340 non-null object 10 LONGITUDE_quality_control 26340 non-null object 11 TEMP_quality_control 26340 non-null object 12 PSAL_quality_control 0 non-null object 13 AIRT_quality_control 26340 non-null object 14 WSPD 26340 non-null float32 15 WSPD_quality_control 26340 non-null object 16 WDIR 26320 non-null float32 17 WDIR_quality_control 26340 non-null object 18 PL_WSPD_quality_control 25920 non-null object 19 PL_WDIR 25900 non-null float32 20 PL_WDIR_quality_control 25920 non-null object 21 PL_WSPD 25920 non-null float32 22 RELH 25920 non-null float32 23 RELH_quality_control 25920 non-null object 24 PL_CRS 26340 non-null float32 25 ATMP 26340 non-null float32 26 ATMP_quality_control 26340 non-null object 27 AIRT 26340 non-null float32 28 PL_SPD 26340 non-null float32 29 PL_SPD_quality_control 26340 non-null object 30 PL_CRS_quality_control 26340 non-null object 31 filename 26340 non-null object 32 RAD_PAR 25920 non-null float32 33 RAD_PAR_quality_control 25920 non-null object 34 WETT 420 non-null float32 35 WETT_quality_control 420 non-null object 36 DEWT 420 non-null float32 37 DEWT_quality_control 420 non-null object 38 timestamp 26340 non-null category 39 polygon 26340 non-null category 40 platform_code 26340 non-null category dtypes: category(3), datetime64[ns](1), float32(15), float64(2), object(20) memory usage: 6.3+ MB CPU times: user 4.47 s, sys: 216 ms, total: 4.68 s Wall time: 17.2 s
# Order the observations chronologically so that consecutive rows form the track.
df_sorted = df.sort_values('TIME')
temperature = df_sorted['TEMP'].to_numpy()

# Build one (start, end) coordinate pair per pair of consecutive positions.
points = np.array([df_sorted['LONGITUDE'], df_sorted['LATITUDE']]).T.reshape(-1, 1, 2)
segments = np.concatenate([points[:-1], points[1:]], axis=1)

# Colour each segment by the temperature at its starting point. There is one
# segment fewer than there are points, so the last value is dropped to keep
# the colour array the same length as the segment list.
norm = plt.Normalize(df_sorted['TEMP'].min(), df_sorted['TEMP'].max())
lc = LineCollection(segments, cmap='RdYlBu_r', norm=norm)
lc.set_array(temperature[:-1])
lc.set_linewidth(2)

fig, ax = plt.subplots()
ax.add_collection(lc)
ax.autoscale()
ax.set_xlabel(metadata['LONGITUDE']['standard_name'])
ax.set_ylabel(metadata['LATITUDE']['standard_name'])
# The y axis is intentionally NOT inverted: latitude increases northwards, so
# the default orientation already draws north at the top of the figure.

# Colour bar, labelled from the variable attributes when they are available.
temp_attrs = metadata.get('TEMP', {})
cbar_label = temp_attrs.get('long_name', 'Temperature')
if temp_attrs.get('units'):
    cbar_label = f"{cbar_label} ({temp_attrs['units']})"
cbar = plt.colorbar(lc, ax=ax)
cbar.set_label(cbar_label)
plt.show()