import numpy as np # this module handles arrays, but here we need it for its NaN value
import pandas as pd # this module contains a lot of tools for handling tabular data
import re
# define paths to the source files and eventual output file
#pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2018 SoG bottle.xlsx'
#pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2018 Abs phyto groups.xlsx'
pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2018 SoG bottleCorrected.xlsx'
pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2019 SoG abs values New ALLO.xlsx'
pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2018_NewALLO.csv'
#formatting function to get year-(3digit) format
def fmtCruise(istr):
if re.fullmatch('[0-9]{4}-[0-9]{2}',istr):
sp=re.split('-',istr)
rstr=sp[0]+'-0'+sp[1]
elif re.fullmatch('[0-9]{4}-[0-9]{3}',istr):
rstr=istr
else:
raise ValueError('Input had unexpected format:',istr)
return rstr
# get names of sheets in notebook
with pd.ExcelFile(pathBottle,engine='openpyxl') as xl:
sheets=xl.sheet_names
print(sheets)
['2018-005', '2018-01', '2018-96', '2018-29', '2018-035', '2018-39', '2018-036', '2018-030', '2018-037', '2018-34', '2018-40', '2018-31', '2018-032']
# load each sheet in the bottle Excel file and concatenate them together into one table
dfbotlist=list()
for sheet in sheets:
df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,engine='openpyxl',
na_values=(-99,-99.9)) # read each sheet; include additional na values
df0['Cruise']=fmtCruise(sheet) # create and populate Cruise column based on sheet name
dfbotlist.append(df0) # append the sheet to a list
dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False) # concatenate the list into a single table
# Drop columns with no data in them
l1=set(dfbot.keys())
dfbot.dropna(axis=1,how='all',inplace=True)
print('removed empty columns:',l1-set(dfbot.keys()))
Reading sheet 2018-005 Reading sheet 2018-01
/home/eolson/anaconda3/envs/py39/lib/python3.9/site-packages/openpyxl/worksheet/_reader.py:300: UserWarning: Unknown extension is not supported and will be removed warn(msg)
Reading sheet 2018-96 Reading sheet 2018-29 Reading sheet 2018-035 Reading sheet 2018-39 Reading sheet 2018-036 Reading sheet 2018-030 Reading sheet 2018-037 Reading sheet 2018-34 Reading sheet 2018-40 Reading sheet 2018-31 Reading sheet 2018-032 removed empty columns: {'Ammonium [umol/L]', 'Flag:Ammonium'}
# list the column names in the resulting table
print(dfbot.keys())
Index(['File Name', 'Zone', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'ADM:MISSION', 'LOC:STATION', 'ADM:PROJECT', 'Zone.1', 'YYYY/MM/DD HH:MM:SS', 'Sample_Number', 'Pressure [decibar]', 'Depth [metres]', 'Temperature:Primary [deg C (ITS90)]', 'Transmissivity [*/metre]', 'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]', 'PAR:Reference [uE/m^2/sec]', 'pH:SBE:Nominal', 'Salinity:T0:C0 [PSS-78]', 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]', 'Temperature:Draw [deg C (ITS90)]', 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted [mg/m^3]', 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]', 'Flag:Oxygen:Dissolved', 'Nitrate_plus_Nitrite [umol/L]', 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate', 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR', 'Cruise', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'Temperature:Secondary [deg C (ITS90)]', 'Pressure:CTD [decibar]', 'Depth:CTD [metres]', 'Temperature:CTD [deg C (ITS90)]', 'Salinity:CTD [PSS-78]', 'Oxygen:Dissolved:Volume:CTD [mL/L]', 'Oxygen:Dissolved:Mass:CTD [umol/kg]', 'Fluorescence:URU:CTD [mg/m^3]', 'Depth:Nominal [metres]', 'Alkalinity:Total [umol/L]', 'Flag:Alkalinity:Total', 'Carbon:Dissolved:Inorganic [umol/kg]', 'Flag:Carbon:Dissolved:Inorganic', 'Salinity:T1:C1 [PSS-78]', 'Fluorescence:URU:Wetlabs [mg/m^3]', 'Bottle_Number', 'Bottle:Firing_Sequence'], dtype='object')
# no rows returned, so there are no rows with both primary and secondary temperature values
print(np.sum(dfbot['Depth [metres]']>=0),
np.sum(dfbot['Depth:CTD [metres]']>=0))
dfbot.loc[(dfbot['Depth [metres]']>=0)&\
(dfbot['Depth:CTD [metres]']>=0)]
2010 36
File Name | Zone | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | ADM:MISSION | LOC:STATION | ADM:PROJECT | ... | Fluorescence:URU:CTD [mg/m^3] | Depth:Nominal [metres] | Alkalinity:Total [umol/L] | Flag:Alkalinity:Total | Carbon:Dissolved:Inorganic [umol/kg] | Flag:Carbon:Dissolved:Inorganic | Salinity:T1:C1 [PSS-78] | Fluorescence:URU:Wetlabs [mg/m^3] | Bottle_Number | Bottle:Firing_Sequence |
---|
0 rows × 59 columns
# no rows returned, so there are no rows with both primary and secondary temperature values
print(np.sum(dfbot['Temperature:Primary [deg C (ITS90)]']>=0),
np.sum(dfbot['Temperature:Secondary [deg C (ITS90)]']>=0),
np.sum(dfbot['Temperature:CTD [deg C (ITS90)]']>=0))
dfbot.loc[(np.array([int(ii) for ii in (dfbot['Temperature:Primary [deg C (ITS90)]']>=0)])+\
np.array([int(ii) for ii in (dfbot['Temperature:Secondary [deg C (ITS90)]']>=0)])+\
np.array([int(ii) for ii in (dfbot['Temperature:CTD [deg C (ITS90)]']>=0)]))>1]
1014 956 36
File Name | Zone | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | ADM:MISSION | LOC:STATION | ADM:PROJECT | ... | Fluorescence:URU:CTD [mg/m^3] | Depth:Nominal [metres] | Alkalinity:Total [umol/L] | Flag:Alkalinity:Total | Carbon:Dissolved:Inorganic [umol/kg] | Flag:Carbon:Dissolved:Inorganic | Salinity:T1:C1 [PSS-78] | Fluorescence:URU:Wetlabs [mg/m^3] | Bottle_Number | Bottle:Firing_Sequence |
---|
0 rows × 59 columns
# no rows returned, so there are no rows with both primary and secondary temperature values
print(np.sum(dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0),
np.sum(dfbot['Fluorescence:URU:CTD [mg/m^3]']>=0),
np.sum(dfbot['Fluorescence:URU:Wetlabs [mg/m^3]']>=0))
dfbot.loc[(np.array([int(ii) for ii in (dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0)])+\
np.array([int(ii) for ii in (dfbot['Fluorescence:URU:CTD [mg/m^3]']>=0)])+\
np.array([int(ii) for ii in (dfbot['Fluorescence:URU:Wetlabs [mg/m^3]']>=0)]))>1]
1517 36 104
File Name | Zone | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | ADM:MISSION | LOC:STATION | ADM:PROJECT | ... | Fluorescence:URU:CTD [mg/m^3] | Depth:Nominal [metres] | Alkalinity:Total [umol/L] | Flag:Alkalinity:Total | Carbon:Dissolved:Inorganic [umol/kg] | Flag:Carbon:Dissolved:Inorganic | Salinity:T1:C1 [PSS-78] | Fluorescence:URU:Wetlabs [mg/m^3] | Bottle_Number | Bottle:Firing_Sequence |
---|
0 rows × 59 columns
# no rows returned, so there are no rows with both primary and secondary temperature values
print(np.sum(dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0),
np.sum(dfbot['Fluorescence:URU:CTD [mg/m^3]']>=0))
dfbot.loc[(dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0)&\
(dfbot['Fluorescence:URU:CTD [mg/m^3]']>=0)]
1517 36
File Name | Zone | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | ADM:MISSION | LOC:STATION | ADM:PROJECT | ... | Fluorescence:URU:CTD [mg/m^3] | Depth:Nominal [metres] | Alkalinity:Total [umol/L] | Flag:Alkalinity:Total | Carbon:Dissolved:Inorganic [umol/kg] | Flag:Carbon:Dissolved:Inorganic | Salinity:T1:C1 [PSS-78] | Fluorescence:URU:Wetlabs [mg/m^3] | Bottle_Number | Bottle:Firing_Sequence |
---|
0 rows × 59 columns
# no rows returned, so there are no rows with both both salinity fields
print(np.sum(dfbot['Salinity:T0:C0 [PSS-78]']>=0),
np.sum(dfbot['Salinity:T1:C1 [PSS-78]']>=0),
np.sum(dfbot['Salinity:CTD [PSS-78]']>=0))
dfbot.loc[(np.array([int(ii) for ii in (dfbot['Salinity:T0:C0 [PSS-78]']>=0)])+\
np.array([int(ii) for ii in (dfbot['Salinity:T1:C1 [PSS-78]']>=0)])+\
np.array([int(ii) for ii in (dfbot['Salinity:CTD [PSS-78]']>=0)]))>1]
1132 836 36
File Name | Zone | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | ADM:MISSION | LOC:STATION | ADM:PROJECT | ... | Fluorescence:URU:CTD [mg/m^3] | Depth:Nominal [metres] | Alkalinity:Total [umol/L] | Flag:Alkalinity:Total | Carbon:Dissolved:Inorganic [umol/kg] | Flag:Carbon:Dissolved:Inorganic | Salinity:T1:C1 [PSS-78] | Fluorescence:URU:Wetlabs [mg/m^3] | Bottle_Number | Bottle:Firing_Sequence |
---|
0 rows × 59 columns
# no rows returned, so there are no rows with both primary and secondary temperature values
print(np.sum(dfbot['Pressure [decibar]']>=0),
np.sum(dfbot['Pressure:CTD [decibar]']>=0))
dfbot.loc[(dfbot['Pressure [decibar]']>=0)&\
(dfbot['Pressure:CTD [decibar]']>=0)]
1983 36
File Name | Zone | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | ADM:MISSION | LOC:STATION | ADM:PROJECT | ... | Fluorescence:URU:CTD [mg/m^3] | Depth:Nominal [metres] | Alkalinity:Total [umol/L] | Flag:Alkalinity:Total | Carbon:Dissolved:Inorganic [umol/kg] | Flag:Carbon:Dissolved:Inorganic | Salinity:T1:C1 [PSS-78] | Fluorescence:URU:Wetlabs [mg/m^3] | Bottle_Number | Bottle:Firing_Sequence |
---|
0 rows × 59 columns
# no rows returned, so there are no rows with both primary and secondary temperature values
print(np.sum(dfbot['Depth [metres]']>=0),
np.sum(dfbot['Depth:CTD [metres]']>=0))
dfbot.loc[(dfbot['Depth [metres]']>=0)&\
(dfbot['Depth:CTD [metres]']>=0)]
2010 36
File Name | Zone | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | ADM:MISSION | LOC:STATION | ADM:PROJECT | ... | Fluorescence:URU:CTD [mg/m^3] | Depth:Nominal [metres] | Alkalinity:Total [umol/L] | Flag:Alkalinity:Total | Carbon:Dissolved:Inorganic [umol/kg] | Flag:Carbon:Dissolved:Inorganic | Salinity:T1:C1 [PSS-78] | Fluorescence:URU:Wetlabs [mg/m^3] | Bottle_Number | Bottle:Firing_Sequence |
---|
0 rows × 59 columns
# no rows returned, so there are no rows with both primary and secondary temperature values
print(np.sum(dfbot['Oxygen:Dissolved:SBE [mL/L]']>=0),
np.sum(dfbot['Oxygen:Dissolved:Volume:CTD [mL/L]']>=0))
dfbot.loc[(dfbot['Oxygen:Dissolved:SBE [mL/L]']>=0)&\
(dfbot['Oxygen:Dissolved:Volume:CTD [mL/L]']>=0)]
1975 36
File Name | Zone | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | ADM:MISSION | LOC:STATION | ADM:PROJECT | ... | Fluorescence:URU:CTD [mg/m^3] | Depth:Nominal [metres] | Alkalinity:Total [umol/L] | Flag:Alkalinity:Total | Carbon:Dissolved:Inorganic [umol/kg] | Flag:Carbon:Dissolved:Inorganic | Salinity:T1:C1 [PSS-78] | Fluorescence:URU:Wetlabs [mg/m^3] | Bottle_Number | Bottle:Firing_Sequence |
---|
0 rows × 59 columns
# no rows returned, so there are no rows with both primary and secondary temperature values
print(np.sum(dfbot['Oxygen:Dissolved:SBE [umol/kg]']>=0),
np.sum(dfbot['Oxygen:Dissolved:Mass:CTD [umol/kg]']>=0))
dfbot.loc[(dfbot['Oxygen:Dissolved:SBE [umol/kg]']>=0)&\
(dfbot['Oxygen:Dissolved:Mass:CTD [umol/kg]']>=0)]
1952 36
File Name | Zone | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | ADM:MISSION | LOC:STATION | ADM:PROJECT | ... | Fluorescence:URU:CTD [mg/m^3] | Depth:Nominal [metres] | Alkalinity:Total [umol/L] | Flag:Alkalinity:Total | Carbon:Dissolved:Inorganic [umol/kg] | Flag:Carbon:Dissolved:Inorganic | Salinity:T1:C1 [PSS-78] | Fluorescence:URU:Wetlabs [mg/m^3] | Bottle_Number | Bottle:Firing_Sequence |
---|
0 rows × 59 columns
def subval(idf,colList):
# first value in colList should be the column you are going to keep
# follow with other columns that will be used to fill in when that column is NaN
# in order of precedence
if len(colList)==2:
idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \
else r[colList[1]] for i,r in idf.iterrows()]
elif len(colList)==3:
idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \
else r[colList[1]] if not pd.isna(r[colList[1]]) \
else r[colList[2]] for i,r in idf.iterrows()]
else:
raise NotImplementedError('Add to code to handle this case')
idf.drop(columns=list(colList[1:]),inplace=True)
return idf
# there are some duplicate columns here; handle them:
dfbot=subval(dfbot,('FIL:START TIME YYYY/MM/DD HH:MM:SS',
'YYYY/MM/DD HH:MM:SS'))
dfbot=subval(dfbot,('Temperature:Primary [deg C (ITS90)]',
'Temperature:Secondary [deg C (ITS90)]',
'Temperature:CTD [deg C (ITS90)]'))
dfbot=subval(dfbot,('Salinity:T0:C0 [PSS-78]',
'Salinity:T1:C1 [PSS-78]',
'Salinity:CTD [PSS-78]'))
dfbot=subval(dfbot,('Pressure [decibar]',
'Pressure:CTD [decibar]'))
dfbot=subval(dfbot,('Depth [metres]',
'Depth:CTD [metres]'))
dfbot=subval(dfbot,('Oxygen:Dissolved:SBE [mL/L]',
'Oxygen:Dissolved:Volume:CTD [mL/L]'))
dfbot=subval(dfbot,('Oxygen:Dissolved:SBE [umol/kg]',
'Oxygen:Dissolved:Mass:CTD [umol/kg]'))
dfbot=subval(dfbot,('Fluorescence:URU:Seapoint [mg/m^3]',
'Fluorescence:URU:CTD [mg/m^3]',
'Fluorescence:URU:Wetlabs [mg/m^3]'))
dfbot.rename(columns={'Temperature:Primary [deg C (ITS90)]':'Temperature [deg C (ITS90)]'},
inplace=True)
dfbot.rename(columns={'Salinity:T0:C0 [PSS-78]':'Salinity [PSS-78]'},
inplace=True)
dfbot.rename(columns={'Oxygen:Dissolved:SBE [mL/L]':'Oxygen:Dissolved:CTD [mL/L]'},
inplace=True)
dfbot.rename(columns={'Oxygen:Dissolved:SBE [umol/kg]':'Oxygen:Dissolved:CTD [umol/kg]'},
inplace=True)
dfbot.rename(columns={'Fluorescence:URU:Seapoint [mg/m^3]':'Fluorescence [mg/m^3]'},
inplace=True)
# define a function that will be applied to the values in the index column;
# this makes it easier to drop non-data rows later
def convertIndex(val):
try:
x =int(val)
except ValueError:
x=np.nan
return x
# load the phytoplankton data with the following options:
# sheet_name='???? CHEMTAX abs results' -> choose the sheet
# usecols='A:I,T:AC' -> read only columns A:I and T:AC from the Excel sheet
# skiprows=2 -> start reading at the 3rd row of the sheet,
# which contains the column headings
# converters={'Index': convertIndex,} -> apply the function defined above to the Index column
# verbose = True -> print extra information/ warnings/ errors
dfPhyto=pd.read_excel(pathPhyto,sheet_name='2018',engine='openpyxl',
skiprows=0,converters={'Sample#': convertIndex,},
verbose=True)
Reading sheet 2018
# display rows 48 to 59 of the resulting table
dfPhyto[48:60]
Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | 2018-029 | NaN | SI | 9.0 | mean | 0 | 32.464 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 32.464 |
49 | 2018-029 | NaN | 59 | 24.0 | mean | 0 | 0.531625 | 0.128984 | 0 | 0.0842221 | 0.0166345 | 0 | 0 | 0 | 0.00753434 | 0.769 |
50 | 2018-029 | NaN | 69 | 36.0 | mean | 0 | 0.0367559 | 0.150469 | 0.0224271 | 0.0859842 | 0.000162244 | 0.0132018 | 0 | 0 | 0 | 0.309 |
51 | 2018-029 | NaN | 72 | 50.0 | mean | 0 | 0.0841301 | 0.132112 | 0.0228435 | 0.183018 | 0.0137391 | 0.0471567 | 0 | 0 | 0 | 0.483 |
52 | 2018-029 | NaN | ADCP | 60.0 | mean | 0 | 0.0744871 | 0.115963 | 0.0283835 | 0.192215 | 8.99369e-05 | 0 | 0.0138611 | 0 | 0 | 0.425 |
53 | 2018-029 | NaN | 65 | 70.0 | mean | 0 | 0.0761432 | 0.126493 | 0.0372582 | 0.17367 | 0.000112182 | 0.0059944 | 0 | 0 | 0 | 0.41967 |
54 | 2018-029 | NaN | 56 | 87.0 | mean | 0 | 0.803652 | 0.137031 | 0 | 0.044934 | 0.0307319 | 0 | 0.0170716 | 0 | 0.00457965 | 1.038 |
55 | 2018-029 | NaN | 46 | 99.0 | mean | 0 | 3.3556 | 0.550971 | 0 | 0.243657 | 0.0557609 | 0 | 0 | 0 | 0.0250118 | 4.231 |
56 | 2018-029 | NaN | 42 | 115.0 | mean | 0 | 11.3383 | 0.707286 | 0 | 0.0503767 | 0.0230193 | 0 | 0 | 0 | 0.0119893 | 12.131 |
57 | 2018-029 | NaN | 39 | 132.0 | mean | 0 | 8.91487 | 0.569452 | 0.0183908 | 0 | 0.0620335 | 0 | 0 | 0 | 0.0232555 | 9.588 |
58 | 2018-029 | NaN | GEO1 | 135.0 | mean | 0 | 8.34946 | 0.737569 | 0 | 0.180029 | 0.113739 | 0.0133282 | 0.0532258 | 0.0716479 | 0 | 9.519 |
59 | 2018-029 | NaN | 27 | 151.0 | mean | 0 | 6.06706 | 0.676182 | 0 | 0.290192 | 0.102723 | 0.129099 | 0.062599 | 0.0323922 | 0.0427484 | 7.403 |
# now, drop any rows from the table that have NaN values in either of the columns
# 'Index' or 'TchlA (ug/L)'
# This is why we applied a function to the Index column to make sure all
# non-numeric Index values would have a consistent NaN entry, making them easy to identify
# and remove
dfPhyto.dropna(subset=['Sample#', 'TchlA'],how='any',inplace=True)
# pandas creates its own index, and after dropping rows I like to reset it -
# this is just for convenience
dfPhyto.reset_index(drop=True,inplace=True)
# apply formatting function all rows in Cruise column to get year-3digit format
dfPhyto['Cruise']=[fmtCruise(ii) for ii in dfPhyto['Cruise']]
# display part of the table, confirming that non-data rows have been removed
dfPhyto[48:60]
Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | 2018-029 | NaN | 69 | 36.0 | mean | 0 | 0.0367559 | 0.150469 | 0.0224271 | 0.0859842 | 0.000162244 | 0.0132018 | 0 | 0 | 0 | 0.309 |
49 | 2018-029 | NaN | 72 | 50.0 | mean | 0 | 0.0841301 | 0.132112 | 0.0228435 | 0.183018 | 0.0137391 | 0.0471567 | 0 | 0 | 0 | 0.483 |
50 | 2018-029 | NaN | ADCP | 60.0 | mean | 0 | 0.0744871 | 0.115963 | 0.0283835 | 0.192215 | 8.99369e-05 | 0 | 0.0138611 | 0 | 0 | 0.425 |
51 | 2018-029 | NaN | 65 | 70.0 | mean | 0 | 0.0761432 | 0.126493 | 0.0372582 | 0.17367 | 0.000112182 | 0.0059944 | 0 | 0 | 0 | 0.41967 |
52 | 2018-029 | NaN | 56 | 87.0 | mean | 0 | 0.803652 | 0.137031 | 0 | 0.044934 | 0.0307319 | 0 | 0.0170716 | 0 | 0.00457965 | 1.038 |
53 | 2018-029 | NaN | 46 | 99.0 | mean | 0 | 3.3556 | 0.550971 | 0 | 0.243657 | 0.0557609 | 0 | 0 | 0 | 0.0250118 | 4.231 |
54 | 2018-029 | NaN | 42 | 115.0 | mean | 0 | 11.3383 | 0.707286 | 0 | 0.0503767 | 0.0230193 | 0 | 0 | 0 | 0.0119893 | 12.131 |
55 | 2018-029 | NaN | 39 | 132.0 | mean | 0 | 8.91487 | 0.569452 | 0.0183908 | 0 | 0.0620335 | 0 | 0 | 0 | 0.0232555 | 9.588 |
56 | 2018-029 | NaN | GEO1 | 135.0 | mean | 0 | 8.34946 | 0.737569 | 0 | 0.180029 | 0.113739 | 0.0133282 | 0.0532258 | 0.0716479 | 0 | 9.519 |
57 | 2018-029 | NaN | 27 | 151.0 | mean | 0 | 6.06706 | 0.676182 | 0 | 0.290192 | 0.102723 | 0.129099 | 0.062599 | 0.0323922 | 0.0427484 | 7.403 |
58 | 2018-029 | NaN | CPF1 | 154.0 | mean | 0 | 2.1667 | 0.328524 | 0.000788839 | 0.00118597 | 0.0210415 | 0 | 0.000905106 | 0.000282141 | 0.00956884 | 2.529 |
59 | 2018-029 | NaN | 2 | 169.0 | mean | 0 | 1.70794 | 0.468611 | 0 | 0.246463 | 0.0261223 | 0 | 0 | 0 | 0.0228652 | 2.472 |
# due to repeated column names in the original spreadsheet, '.1' was appended to the names
# of the phytoplankton columns;
# these lines correct the column names, removing the '.1':
renameDict=dict()
for colName in dfPhyto.keys():
if colName.endswith('.1'):
renameDict[colName]=colName.split('.1')[0]
dfPhyto.rename(columns=renameDict,inplace=True)
dfPhyto
Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-005 | 0 | SI | 9.0 | mean | 0 | 0 | 0.217929 | 0.12314 | 0.146544 | 0.0212777 | 0.163098 | 0.00798932 | 0.0730214 | 0 | 0.753 |
1 | 2018-005 | 0 | 59 | 23.0 | mean | 0 | 0.095598 | 0.0651501 | 0.0169969 | 0.0517847 | 0.000411987 | 0 | 5.82829e-05 | 0 | 0 | 0.23 |
2 | 2018-005 | 0 | 56 | 37.0 | mean | 0 | 0.101019 | 0.0498707 | 0.0242176 | 0.0238925 | 0 | 0 | 0 | 0 | 0 | 0.199 |
3 | 2018-005 | 0 | 46 | 49.0 | mean | 0 | 0.165891 | 0.174852 | 0.102298 | 0.139463 | 0 | 0.126996 | 0.00403917 | 0.0434603 | 0 | 0.757 |
4 | 2018-005 | 0 | 42 | 65.0 | mean | 0 | 0.127544 | 0.152836 | 0.0788997 | 0.237937 | 0.00868366 | 0.0404586 | 0.00414668 | 0.0224942 | 0 | 0.673 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
221 | 2018-032 | NaN | 9 | 263.0 | mean | 5 | 0.0084716 | 0.135204 | 0.0643302 | 0.0550352 | 7.61252e-05 | 0.000371439 | 9.52715e-06 | 0.0109626 | 0.000745806 | 0.275206 |
222 | 2018-032 | NaN | 12 | 281.0 | mean | 0 | 0.0231077 | 0.133241 | 0.111226 | 0.129287 | 0.0118486 | 0 | 0.0050674 | 0.0119497 | 0.00591481 | 0.431642 |
223 | 2018-032 | NaN | 14 | 297.0 | mean | 0 | 0.00613478 | 0.103514 | 0.100642 | 0.0847234 | 4.34756e-05 | 0.000583632 | 0 | 0 | 0.00408381 | 0.299725 |
224 | 2018-032 | NaN | 16 | 308.0 | mean | 0 | 0.0266453 | 0.0981853 | 0.0827848 | 0.119032 | 0.000139516 | 0.00146098 | 0 | 0 | 0.00457703 | 0.332825 |
225 | 2018-032 | NaN | 22 | 311.0 | mean | 0 | 0.00639727 | 0.148778 | 0.166602 | 0.201075 | 0.00016008 | 0.00618898 | 0.011432 | 0.0147686 | 0.00678334 | 0.562185 |
226 rows × 16 columns
# This is the important step- join the two tables ('left' and 'right'),
# matching the cruise IDs and sample numbers
# how='outer' -> all rows from both the left and the right tables will be included,
# even if they cannot be matched; this makes it easy to check for
# unmatched data later
# left_on specifies the name of the column to match in the left table (dfbot)
# right_on specifies the name of the column to match in the right table (dfPhyto)
dfout = pd.merge(dfbot, dfPhyto, how='outer',
left_on=['Cruise','Sample_Number'], right_on = ['Cruise','Sample#'])
# Identify cases where phytoplankton data were matched to multiple samples in bottle data:
dftest=pd.merge(dfbot, dfPhyto,how='right', left_on=['Cruise','Sample_Number'],right_on = ['Cruise','Sample#'])
# show the column names in the resulting table
dfout.keys()
Index(['File Name', 'Zone', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'ADM:MISSION', 'LOC:STATION', 'ADM:PROJECT', 'Zone.1', 'Sample_Number', 'Pressure [decibar]', 'Depth [metres]', 'Temperature [deg C (ITS90)]', 'Transmissivity [*/metre]', 'Fluorescence [mg/m^3]', 'PAR [uE/m^2/sec]', 'PAR:Reference [uE/m^2/sec]', 'pH:SBE:Nominal', 'Salinity [PSS-78]', 'Oxygen:Dissolved:CTD [mL/L]', 'Oxygen:Dissolved:CTD [umol/kg]', 'Temperature:Draw [deg C (ITS90)]', 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted [mg/m^3]', 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]', 'Flag:Oxygen:Dissolved', 'Nitrate_plus_Nitrite [umol/L]', 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate', 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR', 'Cruise', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'Depth:Nominal [metres]', 'Alkalinity:Total [umol/L]', 'Flag:Alkalinity:Total', 'Carbon:Dissolved:Inorganic [umol/kg]', 'Flag:Carbon:Dissolved:Inorganic', 'Bottle_Number', 'Bottle:Firing_Sequence', 'Date', 'Station', 'Sample#', 'rep', 'depth', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido', 'Cyanobacteria', 'TchlA'], dtype='object')
temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})
temp.columns = ['icount']
np.unique(temp.icount)
array([1])
# check for Phyto samples matched to multiple bottle samples:
temp.loc[temp.icount>1]
icount | ||
---|---|---|
Cruise | Sample# |
# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]
icount | ||
---|---|---|
Cruise | Sample# |
temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})
temp2.columns = ['icount']
# this will catch phyto matched to multiple bottle but also bottle with duplicate sample numbers per cruise:
temp2.loc[temp2.icount>1]
icount | ||
---|---|---|
Cruise | Sample_Number | |
2018-035 | 13.0 | 2 |
# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]
icount | ||
---|---|---|
Cruise | Sample# |
# if the output table is longer than either of the input tables, some columns were not matched
len(dfout), len(dfPhyto), len(dfbot)
(2082, 226, 2080)
# Check that the number of cells with data in the 'Cyanobacteria' column is
# the same for the input and output tables to show that no rows are missing:
np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)
(226, 226)
# If there were data rows from the phytoplankton table that were not matched to
# rows from the bottle table, their indices from the phytoplankton table would be
# displayed below (the series [] would not be empty)
print(dfout.loc[dfout['ADM:SCIENTIST'].isna(),['Cruise','Sample#']])
Cruise Sample# 2080 2018-035 8.0 2081 2018-039 999.0
dfout.loc[dfout['ADM:SCIENTIST'].isna(),['Sample#','Cruise','Diatoms-1','Prasinophytes']]
Sample# | Cruise | Diatoms-1 | Prasinophytes | |
---|---|---|---|---|
2080 | 8.0 | 2018-035 | 2.47287 | 0.407371 |
2081 | 999.0 | 2018-039 | 0.461796 | 0.0354468 |
# drop repetetive/unecessary columns:
dfout.drop(labels=['Date', 'Station', 'Sample#', 'rep',
'depth',],axis=1,inplace=True)
# truncate phyto group values to 3 decimal places:
for col in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',
'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',
'TchlA'):
dfout[col]=[np.round(ii,decimals=3) for ii in dfout[col]] # use list comprehension to set values for entire column
# Check for unexpected depths
iii=((dfout['Pressure [decibar]']>10)|(dfout['Depth [metres]']>10))&(dfout['Diatoms-1']>=0)
dfout.loc[iii,['FIL:START TIME YYYY/MM/DD HH:MM:SS','Cruise','LOC:STATION','Sample_Number',
'Pressure [decibar]','Depth [metres]','Diatoms-1','Temperature [deg C (ITS90)]','Salinity [PSS-78]']]
FIL:START TIME YYYY/MM/DD HH:MM:SS | Cruise | LOC:STATION | Sample_Number | Pressure [decibar] | Depth [metres] | Diatoms-1 | Temperature [deg C (ITS90)] | Salinity [PSS-78] | |
---|---|---|---|---|---|---|---|---|---|
1240 | 2018-07-18 14:31:00 | 2018-037 | BS11 | 12.0 | 1.3 | 332.0 | 5.119 | 17.695 | 27.1723 |
if np.sum(iii)>0:
if (np.sum(iii)==1)&(dfout.loc[iii].index==1240)&(dfout.loc[iii,['Pressure [decibar]']].values<=10):
dfout.loc[iii,['Depth [metres]']]=np.nan
else:
print('Check row above for different case than expected')
dfout.loc[iii,['FIL:START TIME YYYY/MM/DD HH:MM:SS','Cruise','LOC:STATION','Sample_Number',
'Pressure [decibar]','Depth [metres]','Diatoms-1','Temperature [deg C (ITS90)]','Salinity [PSS-78]']]
FIL:START TIME YYYY/MM/DD HH:MM:SS | Cruise | LOC:STATION | Sample_Number | Pressure [decibar] | Depth [metres] | Diatoms-1 | Temperature [deg C (ITS90)] | Salinity [PSS-78] | |
---|---|---|---|---|---|---|---|---|---|
1240 | 2018-07-18 14:31:00 | 2018-037 | BS11 | 12.0 | 1.3 | NaN | 5.119 | 17.695 | 27.1723 |
dfout['Cyanobacteria']
0 NaN 1 NaN 2 NaN 3 NaN 4 NaN ... 2077 NaN 2078 NaN 2079 0.007 2080 0.046 2081 0.000 Name: Cyanobacteria, Length: 2082, dtype: float64
# now write the output table to a .csv file:
dfout.to_csv(pathOut, index=False)
dfout.keys()
Index(['File Name', 'Zone', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'ADM:MISSION', 'LOC:STATION', 'ADM:PROJECT', 'Zone.1', 'Sample_Number', 'Pressure [decibar]', 'Depth [metres]', 'Temperature [deg C (ITS90)]', 'Transmissivity [*/metre]', 'Fluorescence [mg/m^3]', 'PAR [uE/m^2/sec]', 'PAR:Reference [uE/m^2/sec]', 'pH:SBE:Nominal', 'Salinity [PSS-78]', 'Oxygen:Dissolved:CTD [mL/L]', 'Oxygen:Dissolved:CTD [umol/kg]', 'Temperature:Draw [deg C (ITS90)]', 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted [mg/m^3]', 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]', 'Flag:Oxygen:Dissolved', 'Nitrate_plus_Nitrite [umol/L]', 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate', 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR', 'Cruise', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'Depth:Nominal [metres]', 'Alkalinity:Total [umol/L]', 'Flag:Alkalinity:Total', 'Carbon:Dissolved:Inorganic [umol/kg]', 'Flag:Carbon:Dissolved:Inorganic', 'Bottle_Number', 'Bottle:Firing_Sequence', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido', 'Cyanobacteria', 'TchlA'], dtype='object')