import numpy as np # this module handles arrays, but here we need it for its NaN value
import pandas as pd # this module contains a lot of tools for handling tabular data
import re
# define paths to the source files and eventual output file
#pathBottle='/ocean/eolson/MEOPAR/obs/Nina/All 2018 SoG bottle.xlsx'
#pathPhyto='/ocean/eolson/MEOPAR/obs/Nina/2015-2018 Abs phyto groups.xlsx'
pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2019 SoG bottle_revWithNew044.xlsx'
pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2019 Abs phyto groups.xlsx'
pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2019.csv'
#formatting function to get year-(3digit) format
def fmtCruise(istr):
if re.fullmatch('[0-9]{4}-[0-9]{2}',istr):
sp=re.split('-',istr)
rstr=sp[0]+'-0'+sp[1]
elif re.fullmatch('[0-9]{4}-[0-9]{3}',istr):
rstr=istr
else:
raise ValueError('Input had unexpected format:',istr)
return rstr
# get names of sheets in notebook
with pd.ExcelFile(pathBottle) as xl:
sheets=xl.sheet_names
print(sheets)
['2019-001', '2019-007', '2019-016', '2019-044', '2019-005', '2019-045', '2019-050', '2019-023', '2019-062']
# load each sheet in the bottle Excel file and concatenate them together into one table
dfbotlist=list()
for sheet in sheets:
df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,
na_values=(-99,-99.9)) # read each sheet; include additional na values
df0['Cruise']=fmtCruise(sheet) # create and populate Cruise column based on sheet name
dfbotlist.append(df0) # append the sheet to a list
dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False) # concatenate the list into a single table
Reading sheet 2019-001 Reading sheet 2019-007 Reading sheet 2019-016 Reading sheet 2019-044 Reading sheet 2019-005 Reading sheet 2019-045 Reading sheet 2019-050 Reading sheet 2019-023 Reading sheet 2019-062
# Drop columns with no data in them
l1=set(dfbot.keys())
dfbot.dropna(axis=1,how='all',inplace=True)
print('removed empty columns:',l1-set(dfbot.keys()))
removed empty columns: {'Flag:Dimethylsulfoniopropionate_Total', 'Dimethyl_Sulphide [nmol/L]', 'Flag:Dimethyl_Sulphide', 'Flag:Dimethylsulfoniopropionate_Dissolve', 'Dimethylsulfoniopropionate_Total [nmol/L]', 'Ammonium [umol/L]', 'Dimethylsulfoniopropionate_Dissolved [nmol/L]', 'Flag:Ammonium'}
# list the column names in the resulting table
for el in sorted(dfbot.keys()):
if not el.startswith('ADM') and not el.startswith('Flag') and not el.startswith('LOC') \
and not el.startswith('Bottle'):
print(el)
Chlorophyll:Extracted Chlorophyll:Extracted [mg/m^3] Comments by sample_numbeR Conductivity:Primary [S/m] Conductivity:Secondary Conductivity:Secondary [S/m] Cruise Depth Depth [metres] Depth:Nominal [metres] FIL:DATA DESCRIPTION FIL:START TIME YYYY/MM/DD HH:MM:SS File Name Fluorescence:URU:Seapoint Fluorescence:URU:Seapoint [mg/m^3] Fluorescence:URU:Wetlabs [mg/m^3] Nitrate_plus_Nitrite Nitrate_plus_Nitrite [umol/L] Number_of_bin_records Oxygen:Dissolved Oxygen:Dissolved [mL/L] Oxygen:Dissolved [umol/kg] Oxygen:Dissolved.1 Oxygen:Dissolved:SBE Oxygen:Dissolved:SBE [mL/L] Oxygen:Dissolved:SBE [umol/kg] Oxygen:Dissolved:SBE.1 PAR PAR [uE/m^2/sec] PAR:Reference PAR:Reference [uE/m^2/sec] Phaeo-Pigment:Extracted Phaeo-Pigment:Extracted [mg/m^3] Phosphate Phosphate [umol/L] Pressure Pressure [decibar] Salinity:Bottle Salinity:Bottle [PSS-78] Salinity:T0:C0 [PSS-78] Salinity:T1:C1 Salinity:T1:C1 [PSS-78] Sample_Number Silicate Silicate [umol/L] Temperature:Draw Temperature:Draw [deg C (ITS90)] Temperature:Primary [deg C (ITS90)] Temperature:Secondary Temperature:Secondary [deg C (ITS90)] Transmissivity Transmissivity [*/metre] Transmissivity2 [*/metre] Transmissivity:Green [*/metre] YYYY/MM/DD HH:MM:SS Zone Zone.1 pH:SBE:Nominal
for el in dfbot.keys():
if 'Oxy' in el:
print(el,np.max(dfbot[el]))
Oxygen:Dissolved:SBE [mL/L] 275.2 Oxygen:Dissolved:SBE [umol/kg] 447.5 Oxygen:Dissolved [mL/L] 9.571 Oxygen:Dissolved [umol/kg] 419.6 Flag:Oxygen:Dissolved 46.0 Oxygen:Dissolved:SBE 9.19 Oxygen:Dissolved:SBE.1 9.19 Oxygen:Dissolved 8.924 Oxygen:Dissolved.1 8.924
# no rows returned, so there are no rows with both primary and secondary temperature values
print(np.sum(dfbot['Depth [metres]']>=0),
np.sum(dfbot['Pressure [decibar]']>=0),
np.sum(dfbot['Pressure']>=0),
np.sum(dfbot['Depth']>=0))
len(dfbot.loc[~(dfbot['Depth [metres]']>=0)&\
~(dfbot['Pressure [decibar]']>=0)&\
(dfbot['Pressure']>=0)])
1104 1104 340 340
340
# no rows returned, so there are no rows with both primary and secondary temperature values
print(np.sum(dfbot['Temperature:Primary [deg C (ITS90)]']>=0),
np.sum(dfbot['Temperature:Secondary [deg C (ITS90)]']>=0),
np.sum(dfbot['Temperature:Secondary']>=0))
dfbot.loc[(np.array([int(ii) for ii in (dfbot['Temperature:Primary [deg C (ITS90)]']>=0)])+\
np.array([int(ii) for ii in (dfbot['Temperature:Secondary [deg C (ITS90)]']>=0)])+\
np.array([int(ii) for ii in (dfbot['Temperature:Secondary']>=0)]))>1]
726 378 340
Zone | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | ADM:MISSION | LOC:STATION | ADM:PROJECT | YYYY/MM/DD HH:MM:SS | ... | Temperature:Draw | Salinity:Bottle | Chlorophyll:Extracted | Phaeo-Pigment:Extracted | Oxygen:Dissolved | Oxygen:Dissolved.1 | Nitrate_plus_Nitrite | Silicate | Phosphate | Transmissivity:Green [*/metre] |
---|
0 rows × 77 columns
# no rows returned, so there are no rows with both primary and secondary temperature values
print(np.sum(dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0),
np.sum(dfbot['Fluorescence:URU:Seapoint']>=0),
np.sum(dfbot['Fluorescence:URU:Wetlabs [mg/m^3]']>=0))
dfbot.loc[(np.array([int(ii) for ii in (dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0)])+\
np.array([int(ii) for ii in (dfbot['Fluorescence:URU:Seapoint']>=0)])+\
np.array([int(ii) for ii in (dfbot['Fluorescence:URU:Wetlabs [mg/m^3]']>=0)]))>1]
993 340 110
Zone | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | ADM:MISSION | LOC:STATION | ADM:PROJECT | YYYY/MM/DD HH:MM:SS | ... | Temperature:Draw | Salinity:Bottle | Chlorophyll:Extracted | Phaeo-Pigment:Extracted | Oxygen:Dissolved | Oxygen:Dissolved.1 | Nitrate_plus_Nitrite | Silicate | Phosphate | Transmissivity:Green [*/metre] |
---|
0 rows × 77 columns
# no rows returned, so there are no rows with both both salinity fields
print(np.sum(dfbot['Salinity:T0:C0 [PSS-78]']>=0),
np.sum(dfbot['Salinity:T1:C1 [PSS-78]']>=0),
np.sum(dfbot['Salinity:T1:C1']>=0))
dfbot.loc[(np.array([int(ii) for ii in (dfbot['Salinity:T0:C0 [PSS-78]']>=0)])+\
np.array([int(ii) for ii in (dfbot['Salinity:T1:C1 [PSS-78]']>=0)])+\
np.array([int(ii) for ii in (dfbot['Salinity:T1:C1']>=0)]))>1]
726 378 340
Zone | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | ADM:MISSION | LOC:STATION | ADM:PROJECT | YYYY/MM/DD HH:MM:SS | ... | Temperature:Draw | Salinity:Bottle | Chlorophyll:Extracted | Phaeo-Pigment:Extracted | Oxygen:Dissolved | Oxygen:Dissolved.1 | Nitrate_plus_Nitrite | Silicate | Phosphate | Transmissivity:Green [*/metre] |
---|
0 rows × 77 columns
# no rows returned, so there are no rows with both primary and secondary temperature values
print(np.sum(dfbot['Pressure [decibar]']>=0),
np.sum(dfbot['Pressure']>=0))
dfbot.loc[(dfbot['Pressure [decibar]']>=0)&\
(dfbot['Pressure']>=0)]
1104 340
Zone | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | ADM:MISSION | LOC:STATION | ADM:PROJECT | YYYY/MM/DD HH:MM:SS | ... | Temperature:Draw | Salinity:Bottle | Chlorophyll:Extracted | Phaeo-Pigment:Extracted | Oxygen:Dissolved | Oxygen:Dissolved.1 | Nitrate_plus_Nitrite | Silicate | Phosphate | Transmissivity:Green [*/metre] |
---|
0 rows × 77 columns
def subval(idf,colList):
# first value in colList should be the column you are going to keep
# follow with other columns that will be used to fill in when that column is NaN
# in order of precedence
if len(colList)==2:
idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \
else r[colList[1]] for i,r in idf.iterrows()]
elif len(colList)==3:
idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \
else r[colList[1]] if not pd.isna(r[colList[1]]) \
else r[colList[2]] for i,r in idf.iterrows()]
else:
raise NotImplementedError('Add to code to handle this case')
idf.drop(columns=list(colList[1:]),inplace=True)
return idf
# there are some duplicate columns here; handle them:
dfbot=subval(dfbot,('Depth [metres]','Depth'))
dfbot=subval(dfbot,('Pressure [decibar]','Pressure'))
dfbot=subval(dfbot,('Temperature:Primary [deg C (ITS90)]',
'Temperature:Secondary [deg C (ITS90)]',
'Temperature:Secondary'))
dfbot=subval(dfbot,('Salinity:T0:C0 [PSS-78]',
'Salinity:T1:C1 [PSS-78]',
'Salinity:T1:C1'))
dfbot=subval(dfbot,('Fluorescence:URU:Seapoint [mg/m^3]',
'Fluorescence:URU:Wetlabs [mg/m^3]',
'Fluorescence:URU:Seapoint'))
dfbot=subval(dfbot,('PAR [uE/m^2/sec]','PAR'))
dfbot=subval(dfbot,('PAR:Reference [uE/m^2/sec]','PAR:Reference'))
dfbot=subval(dfbot,('Oxygen:Dissolved:SBE [mL/L]',
'Oxygen:Dissolved:SBE',
'Oxygen:Dissolved:SBE.1'))
dfbot=subval(dfbot,('Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved', 'Oxygen:Dissolved.1'))
dfbot=subval(dfbot,('Transmissivity [*/metre]','Transmissivity2 [*/metre]','Transmissivity'))
dfbot=subval(dfbot,('Nitrate_plus_Nitrite [umol/L]', 'Nitrate_plus_Nitrite'))
dfbot=subval(dfbot,('Silicate [umol/L]','Silicate'))
dfbot=subval(dfbot,('Phosphate [umol/L]','Phosphate'))
dfbot=subval(dfbot,('Phaeo-Pigment:Extracted [mg/m^3]','Phaeo-Pigment:Extracted'))
dfbot=subval(dfbot,('Chlorophyll:Extracted [mg/m^3]','Chlorophyll:Extracted'))
dfbot=subval(dfbot,('Conductivity:Primary [S/m]',
'Conductivity:Secondary [S/m]',
'Conductivity:Secondary'))
dfbot=subval(dfbot,('FIL:START TIME YYYY/MM/DD HH:MM:SS','YYYY/MM/DD HH:MM:SS'))
dfbot=subval(dfbot,('Zone', 'Zone.1'))
dfbot=subval(dfbot,('Temperature:Draw [deg C (ITS90)]','Temperature:Draw'))
dfbot=subval(dfbot,('Salinity:Bottle [PSS-78]','Salinity:Bottle'))
dfbot.rename(columns={'Temperature:Primary [deg C (ITS90)]':'Temperature [deg C (ITS90)]'},
inplace=True)
dfbot.rename(columns={'Salinity:T0:C0 [PSS-78]':'Salinity [PSS-78]'},
inplace=True)
dfbot.rename(columns={'Fluorescence:URU:Seapoint [mg/m^3]':'Fluorescence:URU [mg/m^3]'},
inplace=True)
dfbot.rename(columns={'Conductivity:Primary [S/m]':'Conductivity [S/m]'},
inplace=True)
# define a function that will be applied to the values in the index column;
# this makes it easier to drop non-data rows later
def convertIndex(val):
try:
x =int(val)
except ValueError:
x=np.nan
return x
# load the phytoplankton data with the following options:
# sheet_name='???? CHEMTAX abs results' -> choose the sheet
# usecols='A:I,T:AC' -> read only columns A:I and T:AC from the Excel sheet
# skiprows=2 -> start reading at the 3rd row of the sheet,
# which contains the column headings
# converters={'Index': convertIndex,} -> apply the function defined above to the Index column
# verbose = True -> print extra information/ warnings/ errors
dfPhyto=pd.read_excel(pathPhyto,sheet_name='2019 CHEMTAX abs rerun results',usecols='A:S',
skiprows=2,converters={'Index': convertIndex,},
verbose=True)
Reading sheet 2019 CHEMTAX abs rerun results
# display rows 48 to 59 of the resulting table
dfPhyto[48:60]
Bin # | Index | Subgroup | Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | 7 | 46.0 | 2 | 2019-016 | 2019-04-10 00:00:00 | 56 | 289 | mean | 0 | 0.235929 | 0.0557088 | 0.000116464 | 0.252259 | 0.00720253 | 0.015618 | 0.00990757 | 0.00374288 | 0.00721907 | 0.587704 |
49 | 7 | 47.0 | 2 | 2019-016 | 2019-04-10 00:00:00 | 59 | 303 | mean | 0 | 1.17636 | 0.0649978 | 0.00151012 | 0.116654 | 0.0114187 | 0.0189902 | 0.00697747 | 0.000317128 | 0.00746592 | 1.40469 |
50 | 7 | 48.0 | 2 | 2019-016 | 2019-04-11 00:00:00 | 102 | 318 | mean | 0 | 1.81861 | 0.254111 | 0.0210325 | 0.106842 | 0.0526873 | 0.222644 | 0 | 0.00369154 | 0.0117118 | 2.49133 |
51 | 7 | 49.0 | 2 | 2019-016 | 2019-04-11 00:00:00 | 75 | 332 | mean | 0 | 1.66103 | 0.145549 | 0.0161815 | 0.113534 | 0.0543728 | 0.236785 | 0 | 0.00986979 | 0.00749211 | 2.24481 |
52 | 7 | 50.0 | 2 | 2019-016 | 2019-04-11 00:00:00 | 72 | 345 | mean | 0 | 1.69253 | 0.0607043 | 0.0273842 | 0.20995 | 0.107062 | 0.235914 | 0.0171222 | 0.019343 | 0.00543582 | 2.37545 |
53 | 7 | 51.0 | 2 | 2019-016 | 2019-04-11 00:00:00 | 69 | 357 | mean | 0 | 1.20815 | 0.189921 | 0.0237731 | 0.219837 | 0.0697194 | 0.275599 | 0 | 0.0128893 | 0.00430002 | 2.00419 |
54 | 7 | 52.0 | 2 | 2019-016 | 2019-04-11 00:00:00 | ADCP | 367 | mean | 0 | 0.277147 | 0.0376473 | 0.0182011 | 0.222086 | 0.0207715 | 0.0508863 | 0.0027147 | 0.00533482 | 0.00312848 | 0.637917 |
55 | 7 | 53.0 | 2 | 2019-016 | 2019-04-11 00:00:00 | 65 | 377 | mean | 0 | 0.91255 | 0.0374169 | 0.0295607 | 0.226087 | 0.046938 | 0.232094 | 0.00658676 | 0.0116996 | 0.00512161 | 1.50806 |
56 | 7 | 54.0 | 2 | 2019-016 | 2019-04-11 00:00:00 | 63 | 380 | mean | 0 | 0.704281 | 0.0266105 | 0.0317602 | 0.254517 | 0.0525936 | 0.207993 | 0.00172955 | 0.0059513 | 0.00552183 | 1.29096 |
57 | 7 | 55.0 | 2 | 2019-016 | 2019-04-11 00:00:00 | 62 | 391 | mean | 0 | 0.52177 | 0.0800526 | 0.0118601 | 0.117773 | 0.0196437 | 0.0626985 | 0.0115089 | 0 | 0.0046034 | 0.82991 |
58 | 7 | 56.0 | 2 | 2019-016 | 2019-04-12 00:00:00 | SC04 | 394 | mean | 0 | 3.5872 | 0.139061 | 0 | 0.112151 | 0.0294817 | 0.0468341 | 0 | 0 | 0.0152213 | 3.92995 |
59 | Absolute Pigment Compositions - Bin # 13 | NaN | NaN | NaN | NaN | From Sheet: OutR9 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# now, drop any rows from the table that have NaN values in either of the columns
# 'Index' or 'TchlA (ug/L)'
# This is why we applied a function to the Index column to make sure all
# non-numeric Index values would have a consistent NaN entry, making them easy to identify
# and remove
dfPhyto.dropna(subset=['Index', 'TchlA'],how='any',inplace=True)
# pandas creates its own index, and after dropping rows I like to reset it -
# this is just for convenience
dfPhyto.reset_index(drop=True,inplace=True)
# apply formatting function all rows in Cruise column to get year-3digit format
dfPhyto['Cruise']=[fmtCruise(ii) for ii in dfPhyto['Cruise']]
# display part of the table, confirming that non-data rows have been removed
dfPhyto[48:60]
Bin # | Index | Subgroup | Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | 7 | 49.0 | 2 | 2019-016 | 2019-04-11 00:00:00 | 75 | 332 | mean | 0 | 1.66103 | 0.145549 | 0.0161815 | 0.113534 | 0.0543728 | 0.236785 | 0 | 0.00986979 | 0.00749211 | 2.24481 |
49 | 7 | 50.0 | 2 | 2019-016 | 2019-04-11 00:00:00 | 72 | 345 | mean | 0 | 1.69253 | 0.0607043 | 0.0273842 | 0.20995 | 0.107062 | 0.235914 | 0.0171222 | 0.019343 | 0.00543582 | 2.37545 |
50 | 7 | 51.0 | 2 | 2019-016 | 2019-04-11 00:00:00 | 69 | 357 | mean | 0 | 1.20815 | 0.189921 | 0.0237731 | 0.219837 | 0.0697194 | 0.275599 | 0 | 0.0128893 | 0.00430002 | 2.00419 |
51 | 7 | 52.0 | 2 | 2019-016 | 2019-04-11 00:00:00 | ADCP | 367 | mean | 0 | 0.277147 | 0.0376473 | 0.0182011 | 0.222086 | 0.0207715 | 0.0508863 | 0.0027147 | 0.00533482 | 0.00312848 | 0.637917 |
52 | 7 | 53.0 | 2 | 2019-016 | 2019-04-11 00:00:00 | 65 | 377 | mean | 0 | 0.91255 | 0.0374169 | 0.0295607 | 0.226087 | 0.046938 | 0.232094 | 0.00658676 | 0.0116996 | 0.00512161 | 1.50806 |
53 | 7 | 54.0 | 2 | 2019-016 | 2019-04-11 00:00:00 | 63 | 380 | mean | 0 | 0.704281 | 0.0266105 | 0.0317602 | 0.254517 | 0.0525936 | 0.207993 | 0.00172955 | 0.0059513 | 0.00552183 | 1.29096 |
54 | 7 | 55.0 | 2 | 2019-016 | 2019-04-11 00:00:00 | 62 | 391 | mean | 0 | 0.52177 | 0.0800526 | 0.0118601 | 0.117773 | 0.0196437 | 0.0626985 | 0.0115089 | 0 | 0.0046034 | 0.82991 |
55 | 7 | 56.0 | 2 | 2019-016 | 2019-04-12 00:00:00 | SC04 | 394 | mean | 0 | 3.5872 | 0.139061 | 0 | 0.112151 | 0.0294817 | 0.0468341 | 0 | 0 | 0.0152213 | 3.92995 |
56 | 13 | 57.0 | 3 | 2019-044 | 2019-04-29 00:00:00 | GEO1 | 9 | mean | 0 | 5.62557 | 0.409893 | 0.3874 | 0.241505 | 0.0396982 | 0.155595 | 0.070028 | 0.117273 | 0.0390135 | 7.08597 |
57 | 13 | 58.0 | 3 | 2019-044 | 2019-04-30 00:00:00 | 39 | 10 | mean | 0 | 8.91175 | 0.803637 | 0.238999 | 0.523477 | 0.23522 | 0.153088 | 0.120047 | 0.100953 | 0 | 11.0872 |
58 | 13 | 59.0 | 3 | 2019-044 | 2019-04-30 00:00:00 | 27 | 11 | mean | 0 | 12.1251 | 1.06535 | 0.168672 | 0.28809 | 0.00246724 | 0.240602 | 0.0952687 | 0.251948 | 0.0393948 | 14.2769 |
59 | 13 | 60.0 | 3 | 2019-044 | 2019-04-30 00:00:00 | 22 | 12 | mean | 0 | 10.9518 | 0.681424 | 0.174355 | 0.573097 | 0.0855425 | 0 | 0 | 0 | 0.062748 | 12.529 |
# due to repeated column names in the original spreadsheet, '.1' was appended to the names
# of the phytoplankton columns;
# these lines correct the column names, removing the '.1':
renameDict=dict()
for colName in dfPhyto.keys():
if colName.endswith('.1'):
renameDict[colName]=colName.split('.1')[0]
dfPhyto.rename(columns=renameDict,inplace=True)
dfPhyto
Bin # | Index | Subgroup | Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 16 | 1.0 | 1 | 2019-001 | 2019-02-20 00:00:00 | 14 | 729 | mean | 0 | 0.0331146 | 0.101655 | 0.123836 | 0.3117 | 0.00876211 | 0.1462 | 0.0225388 | 0.0234123 | 0.00090972 | 0.772129 |
1 | 16 | 2.0 | 1 | 2019-001 | 2019-02-20 00:00:00 | 12 | 732 | mean | 0 | 0.156416 | 0.17666 | 0.112519 | 0.237514 | 0.0127872 | 0.179982 | 0.0256941 | 0.0165342 | 0.000955474 | 0.919063 |
2 | 16 | 3.0 | 1 | 2019-001 | 2019-02-20 00:00:00 | 22 | 743 | mean | 0 | 0.356374 | 0.340304 | 0.125008 | 0.315779 | 0.0132582 | 0.07692 | 0.0203337 | 0.0362638 | 0.00278554 | 1.28703 |
3 | 16 | 4.0 | 1 | 2019-001 | 2019-02-20 00:00:00 | CPF2 | 746 | mean | 0 | 0.0606656 | 0.116013 | 0.135416 | 0.332063 | 0.007642 | 0.159874 | 0.0260862 | 0.0274673 | 0.00137643 | 0.866603 |
4 | 16 | 5.0 | 1 | 2019-001 | 2019-02-20 00:00:00 | CPF1 | 749 | mean | 0 | 0.404852 | 0.41367 | 0.131683 | 0.197787 | 0.0128683 | 0.0582133 | 0.0158865 | 0.0375323 | 0.00585506 | 1.27835 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
166 | 19 | 167.0 | 7 | 2019-062 | 2019-10-05 00:00:00 | IND0 | 349 | mean | 0 | 9.11885 | 0 | 0.289786 | 0.612077 | 0.0876324 | 0.0656088 | 0.0868017 | 0.0365193 | 0 | 10.2973 |
167 | 19 | 168.0 | 7 | 2019-062 | 2019-10-05 00:00:00 | IND2 | 361 | mean | 0 | 6.2426 | 0 | 0.312427 | 0.510974 | 0.0604039 | 0.146785 | 0.0594772 | 0.0247012 | 0.0121179 | 7.36948 |
168 | 19 | 169.0 | 7 | 2019-062 | 2019-10-05 00:00:00 | IND4 | 375 | mean | 0 | 8.54631 | 0 | 0.520754 | 0.511822 | 0.158578 | 0.177592 | 0.0323331 | 0.0887445 | 0.0333497 | 10.0695 |
169 | 19 | 170.0 | 7 | 2019-062 | 2019-10-05 00:00:00 | IND7 | 389 | mean | 0 | 4.28515 | 0 | 0.365631 | 8.53139 | 0 | 0 | 0.00631162 | 0.00703949 | 0.0631797 | 13.2587 |
170 | 19 | 171.0 | 7 | 2019-062 | 2019-10-05 00:00:00 | VAN5 | 395 | mean | 0 | 1.94256 | 0 | 0.279896 | 0.383945 | 0.106905 | 0.0399761 | 0.0289777 | 0.0341708 | 0.00224957 | 2.81868 |
171 rows × 19 columns
# This is the important step- join the two tables ('left' and 'right'),
# matching the cruise IDs and sample numbers
# how='outer' -> all rows from both the left and the right tables will be included,
# even if they cannot be matched; this makes it easy to check for
# unmatched data later
# left_on specifies the name of the column to match in the left table (dfbot)
# right_on specifies the name of the column to match in the right table (dfPhyto)
dfout = pd.merge(dfbot, dfPhyto, how='outer',
left_on=['Cruise','Sample_Number'], right_on = ['Cruise','Sample#'])
# Identify cases where phytoplankton data were matched to multiple samples in bottle data:
dftest=pd.merge(dfbot, dfPhyto,how='right', left_on=['Cruise','Sample_Number'],right_on = ['Cruise','Sample#'])
# show the column names in the resulting table
dfout.keys()
Index(['Zone', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'ADM:MISSION', 'LOC:STATION', 'ADM:PROJECT', 'Bottle_Number', 'Bottle:Firing_Sequence', 'Sample_Number', 'Pressure [decibar]', 'Depth [metres]', 'Temperature [deg C (ITS90)]', 'Transmissivity [*/metre]', 'Fluorescence:URU [mg/m^3]', 'PAR [uE/m^2/sec]', 'Salinity [PSS-78]', 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]', 'Conductivity [S/m]', 'Number_of_bin_records', 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted [mg/m^3]', 'Nitrate_plus_Nitrite [umol/L]', 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate', 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR', 'Cruise', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'Depth:Nominal [metres]', 'Temperature:Draw [deg C (ITS90)]', 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle', 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]', 'Flag:Oxygen:Dissolved', 'PAR:Reference [uE/m^2/sec]', 'pH:SBE:Nominal', 'File Name', 'LOC:ALTIMETER (M)', 'ADM:PLATFORM', 'LOC:GEOGRAPHIC AREA', 'FIL:DATA DESCRIPTION', 'Transmissivity:Green [*/metre]', 'Bin #', 'Index', 'Subgroup', 'Date', 'Station', 'Sample#', 'rep', 'depth', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido', 'Cyanobacteria', 'TchlA'], dtype='object')
temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})
temp.columns = ['icount']
np.unique(temp.icount)
array([1])
# check for Phyto samples matched to multiple bottle samples:
temp.loc[temp.icount>1]
icount | ||
---|---|---|
Cruise | Sample# |
# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]
icount | ||
---|---|---|
Cruise | Sample# |
temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})
temp2.columns = ['icount']
# this will catch phyto matched to multiple bottle but also bottle with duplicate sample numbers per cruise:
temp2.loc[temp2.icount>1]
icount | ||
---|---|---|
Cruise | Sample_Number |
# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]
icount | ||
---|---|---|
Cruise | Sample# |
# if the output table is longer than either of the input tables, some columns were not matched
len(dfout), len(dfPhyto), len(dfbot)
(1444, 171, 1444)
# Check that the number of cells with data in the 'Cyanobacteria' column is
# the same for the input and output tables to show that no rows are missing:
np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)
(171, 171)
# If there were data rows from the phytoplankton table that were not matched to
# rows from the bottle table, their indices from the phytoplankton table would be
# displayed below (the series [] would not be empty)
print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Index'])
Series([], Name: Index, dtype: float64)
dfout.loc[dfout['ADM:SCIENTIST'].isna(),['Index','Cruise','Diatoms-1','Prasinophytes']]
Index | Cruise | Diatoms-1 | Prasinophytes |
---|
# drop repetetive/unecessary columns:
dfout.drop(labels=['Bin #', 'Index', 'Subgroup', 'Station', 'Sample#', 'rep',
'depth',],axis=1,inplace=True)
# truncate phyto group values to 3 decimal places:
for col in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',
'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',
'TchlA'):
dfout[col]=[np.round(ii,decimals=3) for ii in dfout[col]] # use list comprehension to set values for entire column
dfout['Cyanobacteria']
0 NaN 1 NaN 2 0.001 3 NaN 4 NaN ... 1439 NaN 1440 NaN 1441 NaN 1442 NaN 1443 0.002 Name: Cyanobacteria, Length: 1444, dtype: float64
# now write the output table to a .csv file:
dfout.to_csv(pathOut, index=False)
dfout.keys()
Index(['Zone', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'ADM:MISSION', 'LOC:STATION', 'ADM:PROJECT', 'Bottle_Number', 'Bottle:Firing_Sequence', 'Sample_Number', 'Pressure [decibar]', 'Depth [metres]', 'Temperature [deg C (ITS90)]', 'Transmissivity [*/metre]', 'Fluorescence:URU [mg/m^3]', 'PAR [uE/m^2/sec]', 'Salinity [PSS-78]', 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]', 'Conductivity [S/m]', 'Number_of_bin_records', 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted [mg/m^3]', 'Nitrate_plus_Nitrite [umol/L]', 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate', 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR', 'Cruise', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'Depth:Nominal [metres]', 'Temperature:Draw [deg C (ITS90)]', 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle', 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]', 'Flag:Oxygen:Dissolved', 'PAR:Reference [uE/m^2/sec]', 'pH:SBE:Nominal', 'File Name', 'LOC:ALTIMETER (M)', 'ADM:PLATFORM', 'LOC:GEOGRAPHIC AREA', 'FIL:DATA DESCRIPTION', 'Transmissivity:Green [*/metre]', 'Date', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido', 'Cyanobacteria', 'TchlA'], dtype='object')