# Name of the cloud-optimised dataset; it is interpolated into the S3 path of
# the Parquet store further down in this notebook.
dataset_name = "vessel_co2_delayed_qc"
# only run once, then restart session if needed
!pip install uv
import os
def is_colab():
    """Report whether this notebook is running inside Google Colab.

    Returns:
        bool: True when the ``google.colab`` module can be imported,
        False when the import raises ImportError.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        return True
# Install the notebook requirements with uv. The requirements file is shared by
# both branches, so its URL is defined once.
requirements_url = 'https://raw.githubusercontent.com/aodn/aodn_cloud_optimised/main/notebooks/requirements.txt'

if is_colab():
    # On Colab the packages are installed with uv's --system flag.
    install_commands = [f'uv pip install --system -r {requirements_url}']
else:
    # Elsewhere a virtual environment is created first, then populated.
    install_commands = ['uv venv', f'uv pip install -r {requirements_url}']

for command in install_commands:
    exit_status = os.system(command)
    if exit_status != 0:
        # os.system never raises on failure; report it here so that a broken
        # install is noticed now rather than as an ImportError in a later cell.
        # The remaining commands still run (best effort, as before).
        print(f'WARNING: "{command}" exited with status {exit_status}')
import requests
import os
# Fetch the query helper module once; later runs reuse the local copy.
if not os.path.exists('parquet_queries.py'):
    print('Downloading parquet_queries.py')
    url = 'https://raw.githubusercontent.com/aodn/aodn_cloud_optimised/main/aodn_cloud_optimised/lib/ParquetDataQuery.py'
    # A timeout prevents the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    # Fail before writing anything: without this check an HTTP error page
    # (e.g. "404: Not Found") would be saved as parquet_queries.py, break the
    # import below, and never be re-downloaded because the file then exists.
    response.raise_for_status()
    with open('parquet_queries.py', 'w', encoding='utf-8') as f:
        f.write(response.text)
from parquet_queries import create_time_filter, create_bbox_filter, query_unique_value, plot_spatial_extent, \
get_temporal_extent, get_schema_metadata
import pyarrow.parquet as pq
import pyarrow.dataset as pds
import pyarrow as pa
import pandas as pd
import pyarrow.compute as pc
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import numpy as np
/home/lbesnard/miniforge3/envs/AodnCloudOptimised/lib/python3.12/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
# Bucket holding the cloud-optimised datasets.
BUCKET_OPTIMISED_DEFAULT = "aodn-cloud-optimised"

# S3 URI of the Parquet store for the selected dataset.
# NOTE(review): the "anonymous@" user part presumably requests unauthenticated
# access -- confirm against the pyarrow S3 filesystem documentation.
dname = f"s3://anonymous@{BUCKET_OPTIMISED_DEFAULT}/{dataset_name}.parquet/"

# Open the store using hive-style partition directories.
parquet_ds = pq.ParquetDataset(dname, partitioning="hive")
Partitioning in Parquet involves organising data files based on the values of one or more columns, known as partition keys. When data is written to Parquet files with partitioning enabled, the files are physically stored in a directory structure that reflects the partition keys. This directory structure makes it easier to retrieve and process specific subsets of data based on the partition keys.
# Open the same store through the pyarrow.dataset API, which exposes the
# partitioning schema (the partition keys and their types).
dataset = pds.dataset(dname, partitioning="hive", format="parquet")
partition_keys = dataset.partitioning.schema
print(partition_keys)
timestamp: int32 polygon: string platform_code: string
%%time
unique_partition_value = query_unique_value(parquet_ds, 'platform_code')
print(list(unique_partition_value)[0:2]) # showing a subset only
['ZMFR', 'VLMJ'] CPU times: user 19.8 ms, sys: 0 ns, total: 19.8 ms Wall time: 18.9 ms
In this section, we plot the polygons where data exists. This helps when choosing a bounding box that actually contains data.
# Draw the polygons of the dataset that contain data.
plot_spatial_extent(parquet_ds)
Similarly to the spatial extent, we retrieve the minimum and maximum timestamp partition values of the dataset. These are not necessarily an accurate representation of the TIME values, as the timestamp partition can be yearly, monthly, etc., but they give an idea of the temporal coverage.
# Bare expression: the notebook displays the returned (start, end) datetimes.
get_temporal_extent(parquet_ds)
(datetime.datetime(2008, 1, 1, 11, 0), datetime.datetime(2024, 4, 1, 11, 0))
For every parquet dataset, we create a sidecar file named _common_metadata in the root of the dataset. It contains the variable attributes.
# Alternative kept for reference: read the raw schema from the sidecar file.
# parquet_meta = pa.parquet.read_schema(os.path.join(dname + '_common_metadata')) # parquet metadata
# Per-variable attributes keyed by variable name (plus a 'dataset_metadata'
# entry), as shown in the cell output; reused later for the plot axis labels.
metadata = get_schema_metadata(dname) # schema metadata
# Bare expression so the notebook displays the mapping.
metadata
{'TIME': {'type': 'timestamp[ns]', 'standard_name': 'time', 'long_name': 'analysis_time', 'axis': 'T', 'valid_min': 0.0, 'valid_max': 999999.0, 'ancillary_variables': 'TIME_quality_control'}, 'TIME_quality_control': {'type': 'float', 'standard_name': 'time status_flag', 'long_name': 'Quality Control flag for time', 'quality_control_conventions': 'WOCE quality control procedure', 'valid_min': 2, 'valid_max': 4, 'flag_values': [2, 3, 4], 'flag_meanings': 'good questionable bad', 'references': 'Pierrot,D. et al. 2009, Recommendations for Autonomous Underway pCO2 Measuring Systems and Data Reduction Routines, Deep-Sea Research II, doi:10.1016/j.dsr2.2008.12.005', 'ancillary_variables': 'SUBFLAG'}, 'LATITUDE': {'type': 'double', 'standard_name': 'latitude', 'long_name': 'latitude', 'units': 'degrees_north', 'axis': 'Y', 'valid_min': -90.0, 'valid_max': 90.0, 'reference_datum': 'geographical coordinates, WGS84 projection', 'ancillary_variables': 'LATITUDE_quality_control'}, 'LATITUDE_quality_control': {'type': 'float', 'standard_name': 'latitude status_flag', 'long_name': 'Quality Control flag for latitude', 'quality_control_conventions': 'WOCE quality control procedure', 'valid_min': 2, 'valid_max': 4, 'flag_values': [2, 3, 4], 'flag_meanings': 'good questionable bad', 'references': 'Pierrot,D. et al. 
2009, Recommendations for Autonomous Underway pCO2 Measuring Systems and Data Reduction Routines, Deep-Sea Research II, doi:10.1016/j.dsr2.2008.12.005', 'ancillary_variables': 'SUBFLAG'}, 'LONGITUDE': {'type': 'double', 'standard_name': 'longitude', 'long_name': 'longitude', 'units': 'degrees_east', 'axis': 'X', 'valid_min': -180.0, 'valid_max': 180.0, 'reference_datum': 'geographical coordinates, WGS84 projection', 'ancillary_variables': 'LONGITUDE_quality_control'}, 'LONGITUDE_quality_control': {'type': 'float', 'standard_name': 'longitude status_flag', 'long_name': 'Quality Control flag for longitude', 'quality_control_conventions': 'WOCE quality control procedure', 'valid_min': 2, 'valid_max': 4, 'flag_values': [2, 3, 4], 'flag_meanings': 'good questionable bad', 'references': 'Pierrot,D. et al. 2009, Recommendations for Autonomous Underway pCO2 Measuring Systems and Data Reduction Routines, Deep-Sea Research II, doi:10.1016/j.dsr2.2008.12.005', 'ancillary_variables': 'SUBFLAG'}, 'TEMP': {'type': 'double', 'standard_name': 'sea_surface_temperature', 'long_name': 'sea surface temperature', 'units': 'degree_Celsius', 'valid_min': -2.0, 'valid_max': 40.0, 'ancillary_variables': 'TEMP_quality_control'}, 'TEMP_quality_control': {'type': 'float', 'standard_name': 'sea_surface_temperature status_flag', 'long_name': 'Quality Control flag for sea_surface_temperature', 'quality_control_conventions': 'WOCE quality control procedure', 'valid_min': 2, 'valid_max': 4, 'flag_values': [2, 3, 4], 'flag_meanings': 'good questionable bad', 'references': 'Pierrot,D. et al. 
2009, Recommendations for Autonomous Underway pCO2 Measuring Systems and Data Reduction Routines, Deep-Sea Research II, doi:10.1016/j.dsr2.2008.12.005', 'ancillary_variables': 'SUBFLAG'}, 'TEMP_2': {'type': 'double', 'long_name': 'equilibrator water temperature', 'units': 'degree_Celsius', 'valid_min': -2.0, 'valid_max': 40.0, 'ancillary_variables': 'TEMP_2_quality_control'}, 'TEMP_2_quality_control': {'type': 'float', 'long_name': 'Quality Control flag for sea_surface_temperature', 'quality_control_conventions': 'WOCE quality control procedure', 'valid_min': 2, 'valid_max': 4, 'flag_values': [2, 3, 4], 'flag_meanings': 'good questionable bad', 'references': 'Pierrot,D. et al. 2009, Recommendations for Autonomous Underway pCO2 Measuring Systems and Data Reduction Routines, Deep-Sea Research II, doi:10.1016/j.dsr2.2008.12.005', 'ancillary_variables': 'SUBFLAG'}, 'PSAL': {'type': 'double', 'standard_name': 'sea_surface_salinity', 'long_name': 'sea surface salinity', 'units': '1e-3', 'valid_min': 0.0, 'valid_max': 42.0, 'ancillary_variables': 'PSAL_quality_control'}, 'PSAL_quality_control': {'type': 'float', 'standard_name': 'sea_surface_salinity status_flag', 'long_name': 'Quality Control flag for sea_surface_salinity', 'quality_control_conventions': 'WOCE quality control procedure', 'valid_min': 2, 'valid_max': 4, 'flag_values': [2, 3, 4], 'flag_meanings': 'good questionable bad', 'references': 'Pierrot,D. et al. 
2009, Recommendations for Autonomous Underway pCO2 Measuring Systems and Data Reduction Routines, Deep-Sea Research II, doi:10.1016/j.dsr2.2008.12.005', 'ancillary_variables': 'SUBFLAG'}, 'WSPD': {'type': 'double', 'standard_name': 'wind_speed', 'long_name': 'wind speed', 'units': 'm s-1', 'ancillary_variables': 'WSPD_quality_control'}, 'WSPD_quality_control': {'type': 'float', 'standard_name': 'wind_speed status_flag', 'long_name': 'Quality Control flag for wind speed', 'quality_control_conventions': 'WOCE quality control procedure', 'valid_min': 2, 'valid_max': 4, 'flag_values': [2, 3, 4], 'flag_meanings': 'good questionable bad', 'references': 'Pierrot,D. et al. 2009, Recommendations for Autonomous Underway pCO2 Measuring Systems and Data Reduction Routines, Deep-Sea Research II, doi:10.1016/j.dsr2.2008.12.005', 'ancillary_variables': 'SUBFLAG'}, 'WDIR': {'type': 'double', 'long_name': 'wind direction', 'units': 'degree', 'ancillary_variables': 'WDIR_quality_control', 'comment': 'true wind direction where 0 is North and 90 is East'}, 'WDIR_quality_control': {'type': 'float', 'long_name': 'Quality Control flag for wind direction', 'quality_control_conventions': 'WOCE quality control procedure', 'valid_min': 2, 'valid_max': 4, 'flag_values': [2, 3, 4], 'flag_meanings': 'good questionable bad', 'references': 'Pierrot,D. et al. 
2009, Recommendations for Autonomous Underway pCO2 Measuring Systems and Data Reduction Routines, Deep-Sea Research II, doi:10.1016/j.dsr2.2008.12.005', 'ancillary_variables': 'SUBFLAG'}, 'Press_Equil': {'type': 'double', 'long_name': 'equilibrator head space pressure', 'units': 'hPa', 'ancillary_variables': 'Press_Equil_quality_control'}, 'Press_Equil_quality_control': {'type': 'float', 'long_name': 'Quality Control flag for equilibrator head space pressure', 'quality_control_conventions': 'WOCE quality control procedure', 'valid_min': 2, 'valid_max': 4, 'flag_values': [2, 3, 4], 'flag_meanings': 'good questionable bad', 'references': 'Pierrot,D. et al. 2009, Recommendations for Autonomous Underway pCO2 Measuring Systems and Data Reduction Routines, Deep-Sea Research II, doi:10.1016/j.dsr2.2008.12.005', 'ancillary_variables': 'SUBFLAG'}, 'Press_ATM': {'type': 'double', 'long_name': 'barometric pressure', 'units': 'hPa', 'ancillary_variables': 'Press_ATM_quality_control'}, 'Press_ATM_quality_control': {'type': 'float', 'long_name': 'Quality Control flag for barometric pressure', 'quality_control_conventions': 'WOCE quality control procedure', 'valid_min': 2, 'valid_max': 4, 'flag_values': [2, 3, 4], 'flag_meanings': 'good questionable bad', 'references': 'Pierrot,D. et al. 
2009, Recommendations for Autonomous Underway pCO2 Measuring Systems and Data Reduction Routines, Deep-Sea Research II, doi:10.1016/j.dsr2.2008.12.005', 'ancillary_variables': 'SUBFLAG'}, 'xCO2EQ_PPM': {'type': 'double', 'long_name': 'mole fraction of CO2 in the equilibrator head space (dry)', 'units': '1e-6', 'ancillary_variables': 'xCO2EQ_PPM_quality_control', 'comment': 'the unit 1e-6 is also called parts per million (ppm)'}, 'xCO2EQ_PPM_quality_control': {'type': 'float', 'long_name': 'Quality Control flag for xCO2EQ_PPM', 'quality_control_conventions': 'WOCE quality control procedure', 'valid_min': 2, 'valid_max': 4, 'flag_values': [2, 3, 4], 'flag_meanings': 'good questionable bad', 'references': 'Pierrot,D. et al. 2009, Recommendations for Autonomous Underway pCO2 Measuring Systems and Data Reduction Routines, Deep-Sea Research II, doi:10.1016/j.dsr2.2008.12.005', 'ancillary_variables': 'SUBFLAG'}, 'xCO2ATM_PPM': {'type': 'double', 'long_name': 'mole fraction of CO2 in the atmosphere (dry) measured every 4 hours after standard runs', 'units': '1e-6', 'ancillary_variables': 'xCO2ATM_PPM_quality_control', 'comment': 'the unit 1e-6 is also called parts per million (ppm)'}, 'xCO2ATM_PPM_quality_control': {'type': 'float', 'long_name': 'Quality Control flag for xCO2ATM_PPM', 'quality_control_conventions': 'WOCE quality control procedure', 'valid_min': 2, 'valid_max': 4, 'flag_values': [2, 3, 4], 'flag_meanings': 'good questionable bad', 'references': 'Pierrot,D. et al. 
2009, Recommendations for Autonomous Underway pCO2 Measuring Systems and Data Reduction Routines, Deep-Sea Research II, doi:10.1016/j.dsr2.2008.12.005', 'ancillary_variables': 'SUBFLAG'}, 'xCO2ATM_PPM_INTERPOLATED': {'type': 'double', 'long_name': 'mole fraction of CO2 in the atmosphere (dry) measured every 4 hours after standard runs and values linearly interpolated to the times shown', 'units': '1e-6', 'ancillary_variables': 'xCO2ATM_PPM_INTERPOLATED_quality_control', 'comment': 'the unit 1e-6 is also called parts per million (ppm)'}, 'xCO2ATM_PPM_INTERPOLATED_quality_control': {'type': 'float', 'long_name': 'Quality Control flag for xCO2ATM_PPM_INTERPOLATED', 'quality_control_conventions': 'WOCE quality control procedure', 'valid_min': 2, 'valid_max': 4, 'flag_values': [2, 3, 4], 'flag_meanings': 'good questionable bad', 'references': 'Pierrot,D. et al. 2009, Recommendations for Autonomous Underway pCO2 Measuring Systems and Data Reduction Routines, Deep-Sea Research II, doi:10.1016/j.dsr2.2008.12.005', 'ancillary_variables': 'SUBFLAG'}, 'fCO2SW_UATM': {'type': 'double', 'long_name': 'fugacity of carbon dioxide at surface water salinity and temperature', 'units': 'microatmospheres', 'ancillary_variables': 'fCO2SW_UATM_quality_control'}, 'fCO2SW_UATM_quality_control': {'type': 'float', 'long_name': 'Quality Control flag for fCO2SW_UATM', 'quality_control_conventions': 'WOCE quality control procedure', 'valid_min': 2, 'valid_max': 4, 'flag_values': [2, 3, 4], 'flag_meanings': 'good questionable bad', 'references': 'Pierrot,D. et al. 
2009, Recommendations for Autonomous Underway pCO2 Measuring Systems and Data Reduction Routines, Deep-Sea Research II, doi:10.1016/j.dsr2.2008.12.005', 'ancillary_variables': 'SUBFLAG'}, 'fCO2ATM_UATM_INTERPOLATED': {'type': 'double', 'long_name': 'fugacity of CO2 in the atmosphere', 'units': 'microatmospheres', 'ancillary_variables': 'fCO2ATM_UATM_INTERPOLATED_quality_control'}, 'fCO2ATM_UATM_INTERPOLATED_quality_control': {'type': 'float', 'long_name': 'Quality Control flag for fCO2ATM_UATM_INTERPOLATED', 'quality_control_conventions': 'WOCE quality control procedure', 'valid_min': 2, 'valid_max': 4, 'flag_values': [2, 3, 4], 'flag_meanings': 'good questionable bad', 'references': 'Pierrot,D. et al. 2009, Recommendations for Autonomous Underway pCO2 Measuring Systems and Data Reduction Routines, Deep-Sea Research II, doi:10.1016/j.dsr2.2008.12.005', 'ancillary_variables': 'SUBFLAG'}, 'DfCO2': {'type': 'double', 'long_name': 'Difference between fCO2SW and fCO2ATM', 'units': 'microatmospheres', 'ancillary_variables': 'DfCO2_quality_control'}, 'DfCO2_quality_control': {'type': 'float', 'long_name': 'Quality Control flag for DfCO2', 'quality_control_conventions': 'WOCE quality control procedure', 'valid_min': 2, 'valid_max': 4, 'flag_values': [2, 3, 4], 'flag_meanings': 'good questionable bad', 'references': 'Pierrot,D. et al. 2009, Recommendations for Autonomous Underway pCO2 Measuring Systems and Data Reduction Routines, Deep-Sea Research II, doi:10.1016/j.dsr2.2008.12.005', 'ancillary_variables': 'SUBFLAG'}, 'LICORflow': {'type': 'double', 'long_name': 'Gas flow through infrared gas analyser', 'units': 'ml min-1', 'ancillary_variables': 'LICORflow_quality_control'}, 'LICORflow_quality_control': {'type': 'float', 'long_name': 'Quality Control flag for LICORflow', 'quality_control_conventions': 'WOCE quality control procedure', 'valid_min': 2, 'valid_max': 4, 'flag_values': [2, 3, 4], 'flag_meanings': 'good questionable bad', 'references': 'Pierrot,D. et al. 
2009, Recommendations for Autonomous Underway pCO2 Measuring Systems and Data Reduction Routines, Deep-Sea Research II, doi:10.1016/j.dsr2.2008.12.005', 'ancillary_variables': 'SUBFLAG'}, 'H2OFLOW': {'type': 'double', 'long_name': 'water flow to equilibrator', 'units': 'L min-1', 'ancillary_variables': 'H2OFLOW_quality_control'}, 'H2OFLOW_quality_control': {'type': 'float', 'long_name': 'Quality Control flag for H2OFLOW', 'quality_control_conventions': 'WOCE quality control procedure', 'valid_min': 2, 'valid_max': 4, 'flag_values': [2, 3, 4], 'flag_meanings': 'good questionable bad', 'references': 'Pierrot,D. et al. 2009, Recommendations for Autonomous Underway pCO2 Measuring Systems and Data Reduction Routines, Deep-Sea Research II, doi:10.1016/j.dsr2.2008.12.005', 'ancillary_variables': 'SUBFLAG'}, 'SUBFLAG': {'type': 'float', 'long_name': 'secondary flags, only for questionable measurements, WOCE flag 3 (Pierrot et Al 2009)', 'valid_min': 1, 'valid_max': 10, 'flag_values': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'flag_meanings': 'Outside_of_standard_range Questionable_or_interpolated_SST Questionable_EQU_temperature Anomalous_EQU_temperature-SST_+or-1degC Questionable_sea-surface_salinity Questionable_pressure Low_EQU_gas_flow Questionable_air_value Interpolated_standard Other_see_metadata', 'references': 'Pierrot,D. et al. 
2009, Recommendations for Autonomous Underway pCO2 Measuring Systems and Data Reduction Routines, Deep-Sea Research II, doi:10.1016/j.dsr2.2008.12.005'}, 'TYPE': {'type': 'string', 'long_name': 'measurement type (equilibrator, standard or atmosphere)', 'units': 'categorical'}, 'timestamp': {'type': 'int64'}, 'polygon': {'type': 'string'}, 'platform_code': {'type': 'string'}, 'cruise_id': {'type': 'string'}, 'vessel_name': {'type': 'string'}, 'filename': {'type': 'string'}, 'dataset_metadata': {'metadata_uuid': '63db5801-cc19-40ef-83b3-85ccba884cf7', 'title': 'Upper Ocean Thermal Data collected using XBT (expendable bathythermographs)', 'principal_investigator': 'Cowley, Rebecca', 'principal_investigator_email': 'rebecca.cowley@csiro.au', 'featureType': 'profile'}}
filter_time = create_time_filter(parquet_ds, date_start='2020-12-23 10:14:00', date_end='2024-01-01 07:50:00')
filter_geo = create_bbox_filter(parquet_ds, lat_min=-34, lat_max=-32, lon_min=150, lon_max=155)
filter = filter_geo & filter_time
%%time
# using pandas instead of pyarrow so that filters can directly be applied to the data, and not just the partition
df = pd.read_parquet(dname, engine='pyarrow',filters=filter)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10199 entries, 0 to 10198 Data columns (total 44 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 TIME 10199 non-null datetime64[ns] 1 TIME_quality_control 10199 non-null float32 2 LATITUDE 10199 non-null float64 3 LATITUDE_quality_control 10199 non-null float32 4 LONGITUDE 10199 non-null float64 5 LONGITUDE_quality_control 10199 non-null float32 6 TEMP 10199 non-null float64 7 TEMP_quality_control 10199 non-null float32 8 TEMP_2 10196 non-null float64 9 TEMP_2_quality_control 10199 non-null float32 10 PSAL 10199 non-null float64 11 PSAL_quality_control 10199 non-null float32 12 WSPD 9592 non-null float64 13 WSPD_quality_control 10199 non-null float32 14 WDIR 10157 non-null float64 15 WDIR_quality_control 10199 non-null float32 16 Press_Equil 10199 non-null float64 17 Press_Equil_quality_control 10199 non-null float32 18 Press_ATM 10199 non-null float64 19 Press_ATM_quality_control 10199 non-null float32 20 xCO2EQ_PPM 9931 non-null float64 21 xCO2EQ_PPM_quality_control 10199 non-null float32 22 xCO2ATM_PPM 268 non-null float64 23 xCO2ATM_PPM_quality_control 10199 non-null float32 24 xCO2ATM_PPM_INTERPOLATED 10199 non-null float64 25 xCO2ATM_PPM_INTERPOLATED_quality_control 10199 non-null float32 26 fCO2SW_UATM 9931 non-null float64 27 fCO2SW_UATM_quality_control 10199 non-null float32 28 fCO2ATM_UATM_INTERPOLATED 10199 non-null float64 29 fCO2ATM_UATM_INTERPOLATED_quality_control 10199 non-null float32 30 DfCO2 9931 non-null float64 31 DfCO2_quality_control 10199 non-null float32 32 LICORflow 10199 non-null float64 33 LICORflow_quality_control 10199 non-null float32 34 H2OFLOW 10199 non-null float64 35 H2OFLOW_quality_control 10199 non-null float32 36 SUBFLAG 0 non-null float32 37 TYPE 10199 non-null object 38 cruise_id 10199 non-null object 39 vessel_name 10199 non-null object 40 filename 10199 non-null object 41 timestamp 10199 non-null category 42 polygon 10199 non-null category 43 
platform_code 10199 non-null category dtypes: category(3), datetime64[ns](1), float32(19), float64(17), object(4) memory usage: 2.5+ MB CPU times: user 415 ms, sys: 54.1 ms, total: 469 ms Wall time: 3.38 s
# Bare expression: the notebook renders a truncated preview of the DataFrame.
df
TIME | TIME_quality_control | LATITUDE | LATITUDE_quality_control | LONGITUDE | LONGITUDE_quality_control | TEMP | TEMP_quality_control | TEMP_2 | TEMP_2_quality_control | ... | H2OFLOW | H2OFLOW_quality_control | SUBFLAG | TYPE | cruise_id | vessel_name | filename | timestamp | polygon | platform_code | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2021-05-10 18:05:01.000000000 | 2.0 | -33.999871 | 2.0 | 151.395629 | 2.0 | 20.435 | 2.0 | 20.66 | 2.0 | ... | 2.09 | 2.0 | NaN | EQU | IN2021_V03 | RV Investigator | IMOS_SOOP-CO2_GST_20210508T033536Z_VLMJ_FV01.nc | 1619827200 | 0103000000010000000500000000000000002062400000... | VLMJ |
1 | 2021-05-10 18:06:23.000000000 | 2.0 | -33.996769 | 2.0 | 151.399301 | 2.0 | 20.429 | 2.0 | 20.66 | 2.0 | ... | 2.09 | 2.0 | NaN | EQU | IN2021_V03 | RV Investigator | IMOS_SOOP-CO2_GST_20210508T033536Z_VLMJ_FV01.nc | 1619827200 | 0103000000010000000500000000000000002062400000... | VLMJ |
2 | 2021-05-10 18:07:41.999999744 | 2.0 | -33.993654 | 2.0 | 151.402956 | 2.0 | 20.438 | 2.0 | 20.67 | 2.0 | ... | 2.09 | 2.0 | NaN | EQU | IN2021_V03 | RV Investigator | IMOS_SOOP-CO2_GST_20210508T033536Z_VLMJ_FV01.nc | 1619827200 | 0103000000010000000500000000000000002062400000... | VLMJ |
3 | 2021-05-10 18:09:03.999999744 | 2.0 | -33.990543 | 2.0 | 151.406601 | 2.0 | 20.428 | 2.0 | 20.66 | 2.0 | ... | 2.09 | 2.0 | NaN | EQU | IN2021_V03 | RV Investigator | IMOS_SOOP-CO2_GST_20210508T033536Z_VLMJ_FV01.nc | 1619827200 | 0103000000010000000500000000000000002062400000... | VLMJ |
4 | 2021-05-10 18:10:25.000000000 | 2.0 | -33.987426 | 2.0 | 151.410262 | 2.0 | 20.432 | 2.0 | 20.66 | 2.0 | ... | 2.09 | 2.0 | NaN | EQU | IN2021_V03 | RV Investigator | IMOS_SOOP-CO2_GST_20210508T033536Z_VLMJ_FV01.nc | 1619827200 | 0103000000010000000500000000000000002062400000... | VLMJ |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10194 | 2023-11-01 16:37:56.000000000 | 2.0 | -33.852700 | 2.0 | 151.471700 | 2.0 | 20.172 | 2.0 | 20.45 | 2.0 | ... | 2.12 | 2.0 | NaN | EQU | IN2023_V06 | RV Investigator | IMOS_SOOP-CO2_GST_20231009T002809Z_VLMJ_FV01.nc | 1698796800 | 0103000000010000000500000000000000002062400000... | VLMJ |
10195 | 2023-11-01 16:39:14.000000000 | 2.0 | -33.850300 | 2.0 | 151.470400 | 2.0 | 20.164 | 2.0 | 20.45 | 2.0 | ... | 2.12 | 2.0 | NaN | EQU | IN2023_V06 | RV Investigator | IMOS_SOOP-CO2_GST_20231009T002809Z_VLMJ_FV01.nc | 1698796800 | 0103000000010000000500000000000000002062400000... | VLMJ |
10196 | 2023-11-01 16:40:34.000000000 | 2.0 | -33.847900 | 2.0 | 151.469200 | 2.0 | 20.147 | 2.0 | 20.43 | 2.0 | ... | 2.12 | 2.0 | NaN | EQU | IN2023_V06 | RV Investigator | IMOS_SOOP-CO2_GST_20231009T002809Z_VLMJ_FV01.nc | 1698796800 | 0103000000010000000500000000000000002062400000... | VLMJ |
10197 | 2023-11-01 16:41:54.000000000 | 2.0 | -33.845800 | 2.0 | 151.468100 | 2.0 | 20.129 | 2.0 | 20.42 | 2.0 | ... | 2.12 | 2.0 | NaN | EQU | IN2023_V06 | RV Investigator | IMOS_SOOP-CO2_GST_20231009T002809Z_VLMJ_FV01.nc | 1698796800 | 0103000000010000000500000000000000002062400000... | VLMJ |
10198 | 2023-11-01 16:43:14.000000000 | 2.0 | -33.843700 | 2.0 | 151.467100 | 2.0 | 20.110 | 2.0 | 20.40 | 2.0 | ... | 2.12 | 2.0 | NaN | EQU | IN2023_V06 | RV Investigator | IMOS_SOOP-CO2_GST_20231009T002809Z_VLMJ_FV01.nc | 1698796800 | 0103000000010000000500000000000000002062400000... | VLMJ |
10199 rows × 44 columns
# Order the samples chronologically so consecutive rows trace the track.
df_sorted = df.sort_values('TIME')

# Create a list of segments: one (start, end) coordinate pair for every pair
# of consecutive samples, i.e. one fewer segment than samples.
points = np.array([df_sorted['LONGITUDE'], df_sorted['LATITUDE']]).T.reshape(-1, 1, 2)
segments = np.concatenate([points[:-1], points[1:]], axis=1)

# Create a LineCollection with segments colored by temperature
norm = plt.Normalize(df_sorted['TEMP'].min(), df_sorted['TEMP'].max())
lc = LineCollection(segments, cmap='RdYlBu_r', norm=norm)
# Colour each segment by the temperature at its starting sample; the last
# sample starts no segment, so it is dropped to match the segment count.
lc.set_array(df_sorted['TEMP'].to_numpy()[:-1])
lc.set_linewidth(2)

fig, ax = plt.subplots()
ax.add_collection(lc)
ax.autoscale()
ax.set_xlabel(metadata['LONGITUDE']['standard_name'])
ax.set_ylabel(metadata['LATITUDE']['standard_name'])
# The y axis is deliberately NOT inverted: latitude grows northwards, so the
# default orientation already draws the map north-up.

# Adding color bar, labelled from the variable attributes like the axes are.
cbar = plt.colorbar(lc, ax=ax)
cbar.set_label(f"{metadata['TEMP']['long_name']} ({metadata['TEMP']['units']})")
plt.show()
filter_time = create_time_filter(parquet_ds, date_start='2020-01-31 10:14:00', date_end='2022-02-01 07:50:00')
expr_1 = pc.field('platform_code') == pa.scalar("VLMJ")
filter = expr_1 & filter_time
%%time
# using pandas instead of pyarrow so that filters can directly be applied to the data, and not just the partition
df = pd.read_parquet(dname, engine='pyarrow',filters=filter)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 244944 entries, 0 to 244943 Data columns (total 44 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 TIME 244944 non-null datetime64[ns] 1 TIME_quality_control 244944 non-null float32 2 LATITUDE 244944 non-null float64 3 LATITUDE_quality_control 244944 non-null float32 4 LONGITUDE 244944 non-null float64 5 LONGITUDE_quality_control 244944 non-null float32 6 TEMP 244680 non-null float64 7 TEMP_quality_control 244944 non-null float32 8 TEMP_2 244780 non-null float64 9 TEMP_2_quality_control 244944 non-null float32 10 PSAL 244944 non-null float64 11 PSAL_quality_control 244944 non-null float32 12 WSPD 222686 non-null float64 13 WSPD_quality_control 244944 non-null float32 14 WDIR 237361 non-null float64 15 WDIR_quality_control 244944 non-null float32 16 Press_Equil 244944 non-null float64 17 Press_Equil_quality_control 244944 non-null float32 18 Press_ATM 241825 non-null float64 19 Press_ATM_quality_control 244944 non-null float32 20 xCO2EQ_PPM 236906 non-null float64 21 xCO2EQ_PPM_quality_control 244944 non-null float32 22 xCO2ATM_PPM 8038 non-null float64 23 xCO2ATM_PPM_quality_control 244944 non-null float32 24 xCO2ATM_PPM_INTERPOLATED 244944 non-null float64 25 xCO2ATM_PPM_INTERPOLATED_quality_control 244944 non-null float32 26 fCO2SW_UATM 236647 non-null float64 27 fCO2SW_UATM_quality_control 244944 non-null float32 28 fCO2ATM_UATM_INTERPOLATED 241561 non-null float64 29 fCO2ATM_UATM_INTERPOLATED_quality_control 244944 non-null float32 30 DfCO2 233528 non-null float64 31 DfCO2_quality_control 244944 non-null float32 32 LICORflow 244944 non-null float64 33 LICORflow_quality_control 244944 non-null float32 34 H2OFLOW 244944 non-null float64 35 H2OFLOW_quality_control 244944 non-null float32 36 SUBFLAG 0 non-null float32 37 TYPE 244944 non-null object 38 cruise_id 244944 non-null object 39 vessel_name 244944 non-null object 40 filename 244944 non-null object 41 timestamp 244944 non-null 
category 42 polygon 244944 non-null category 43 platform_code 244944 non-null category dtypes: category(3), datetime64[ns](1), float32(19), float64(17), object(4) memory usage: 60.0+ MB CPU times: user 1.73 s, sys: 305 ms, total: 2.03 s Wall time: 8.04 s
# Order the samples chronologically so consecutive rows trace the track.
df_sorted = df.sort_values('TIME')

# Create a list of segments: one (start, end) coordinate pair for every pair
# of consecutive samples, i.e. one fewer segment than samples.
points = np.array([df_sorted['LONGITUDE'], df_sorted['LATITUDE']]).T.reshape(-1, 1, 2)
segments = np.concatenate([points[:-1], points[1:]], axis=1)

# Create a LineCollection with segments colored by temperature
norm = plt.Normalize(df_sorted['TEMP'].min(), df_sorted['TEMP'].max())
lc = LineCollection(segments, cmap='RdYlBu_r', norm=norm)
# Colour each segment by the temperature at its starting sample; the last
# sample starts no segment, so it is dropped to match the segment count.
lc.set_array(df_sorted['TEMP'].to_numpy()[:-1])
lc.set_linewidth(2)

fig, ax = plt.subplots()
ax.add_collection(lc)
ax.autoscale()
ax.set_xlabel(metadata['LONGITUDE']['standard_name'])
ax.set_ylabel(metadata['LATITUDE']['standard_name'])
# The y axis is deliberately NOT inverted: latitude grows northwards, so the
# default orientation already draws the map north-up.

# Adding color bar, labelled from the variable attributes like the axes are.
cbar = plt.colorbar(lc, ax=ax)
cbar.set_label(f"{metadata['TEMP']['long_name']} ({metadata['TEMP']['units']})")
plt.show()