import numpy as np # this module handles arrays, but here we need it for its NaN value
import pandas as pd # this module contains a lot of tools for handling tabular data
import re
# define paths to the source files and eventual output file
pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2016 SoG bottle.xlsx'
pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2019 SoG abs values New ALLO.xlsx'
pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2016_NewALLO.csv'
#formatting function to get year-(3digit) format
def fmtCruise(istr):
if re.fullmatch('[0-9]{4}-[0-9]{2}',istr):
sp=re.split('-',istr)
rstr=sp[0]+'-0'+sp[1]
elif re.fullmatch('[0-9]{4}-[0-9]{3}',istr):
rstr=istr
else:
raise ValueError('Input had unexpected format:',istr)
return rstr
# get names of sheets in notebook
with pd.ExcelFile(pathBottle,engine='openpyxl') as xl:
sheets=xl.sheet_names
print(sheets)
['2016-05', '2016-47', '2016-07', '2016-62', '2016-010', '2016-071']
# load each sheet in the 2015 bottle Excel file and concatenate them together into one table
dfbotlist=list()
for sheet in sheets:
df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,engine='openpyxl',
na_values=(-99,-99.9)) # read each sheet; include additional na values
df0['Cruise']=fmtCruise(sheet) # create and populate Cruise column based on sheet name
dfbotlist.append(df0) # append the sheet to a list
dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False) # concatenate the list into a single table
# Drop columns with no data in them
l1=set(dfbot.keys())
dfbot.dropna(axis=1,how='all',inplace=True)
print('removed empty columns:',l1-set(dfbot.keys()))
Reading sheet 2016-05 Reading sheet 2016-47 Reading sheet 2016-07 Reading sheet 2016-62 Reading sheet 2016-010 Reading sheet 2016-071 removed empty columns: {'Flag:Ammonium', 'Oxygen:Dissolved.1', 'Ammonium'}
# list the column names in the resulting table
print(dfbot.keys())
Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'LOC:STATION', 'Sample_Number', 'Pressure', 'Temperature:Secondary', 'Transmissivity', 'Fluorescence:URU:Seapoint', 'PAR', 'PAR:Reference', 'pH:SBE:Nominal', 'Salinity:T1:C1', 'Oxygen:Dissolved:SBE', 'Temperature:Draw', 'Salinity:Bottle', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted', 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted', 'Oxygen:Dissolved', 'Flag:Oxygen:Dissolved', 'Nitrate_plus_Nitrite', 'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate', 'Flag:Phosphate', 'Cruise', 'PAR1', 'Sample_Number.1', 'Number_of_bin_records', 'YYYY/MM/DD HH:MM:SS', 'ADM:MISSION', 'ADM:PROJECT', 'Pressure [decibar]', 'Temperature:Secondary [deg C (ITS90)]', 'Transmissivity [*/metre]', 'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]', 'PAR:Reference [uE/m^2/sec]', 'Salinity:T1:C1 [PSS-78]', 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]', 'Salinity:Bottle [PSS-78]', 'Chlorophyll:Extracted [mg/m^3]', 'Phaeo-Pigment:Extracted [mg/m^3]', 'Nitrate_plus_Nitrite [umol/L]', 'Silicate [umol/L]', 'Phosphate [umol/L]', 'Bottle_Number', 'Bottle:Firing_Sequence', 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]'], dtype='object')
def subval(idf,colList):
# first value in colList should be the column you are going to keep
# follow with other columns that will be used to fill in when that column is NaN
# in order of precedence
if len(colList)==2:
idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \
else r[colList[1]] for i,r in idf.iterrows()]
elif len(colList)==3:
idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \
else r[colList[1]] if not pd.isna(r[colList[1]]) \
else r[colList[2]] for i,r in idf.iterrows()]
else:
raise NotImplementedError('Add to code to handle this case')
return idf
# there are some duplicate columns here; handle them:
print(np.max(np.abs(dfbot['Sample_Number']-dfbot['Sample_Number.1'])),
': zero means no difference between Sample_Number and Sample_number.1 columns containing data')
# check there are no rows missing a value in 'Sample_Number':
print(np.sum(pd.isna(dfbot['Sample_Number'])),
': zero means no missing values in Sample_Number')
# -> extra copy is repeat so drop it
# 'ADM:MISSION' contains cruise id so drop it
# 'YYYY/MM/DD HH:MM:SS' data should be in 'FIL:START TIME YYYY/MM/DD HH:MM:SS'; move it there:
dfbot=subval(dfbot,('FIL:START TIME YYYY/MM/DD HH:MM:SS','YYYY/MM/DD HH:MM:SS'))
dfbot=subval(dfbot,('Pressure [decibar]','Pressure'))
dfbot=subval(dfbot,('Temperature:Secondary [deg C (ITS90)]','Temperature:Secondary'))
dfbot=subval(dfbot,('Transmissivity [*/metre]','Transmissivity'))
dfbot=subval(dfbot,('Fluorescence:URU:Seapoint [mg/m^3]','Fluorescence:URU:Seapoint'))
dfbot=subval(dfbot,('PAR [uE/m^2/sec]','PAR'))
dfbot=subval(dfbot,('PAR:Reference [uE/m^2/sec]','PAR:Reference'))
dfbot=subval(dfbot,('Salinity:T1:C1 [PSS-78]','Salinity:T1:C1'))
dfbot=subval(dfbot,('Salinity:Bottle [PSS-78]','Salinity:Bottle'))
dfbot=subval(dfbot,('Chlorophyll:Extracted [mg/m^3]','Chlorophyll:Extracted'))
dfbot=subval(dfbot,('Phaeo-Pigment:Extracted [mg/m^3]','Phaeo-Pigment:Extracted'))
dfbot=subval(dfbot,('Oxygen:Dissolved:SBE [mL/L]','Oxygen:Dissolved:SBE'))
dfbot=subval(dfbot,('Oxygen:Dissolved [mL/L]','Oxygen:Dissolved'))
dfbot=subval(dfbot,('Nitrate_plus_Nitrite [umol/L]','Nitrate_plus_Nitrite'))
dfbot=subval(dfbot,('Silicate [umol/L]','Silicate'))
dfbot=subval(dfbot,('Phosphate [umol/L]','Phosphate'))
dfbot=subval(dfbot,('PAR [uE/m^2/sec]','PAR1'))
0.0 : zero means no difference between Sample_Number and Sample_number.1 columns containing data 0 : zero means no missing values in Sample_Number
# drop repetetive/unecessary columns:
dfbot.drop(labels=['Sample_Number.1','ADM:MISSION','YYYY/MM/DD HH:MM:SS','Transmissivity',
'Pressure','Temperature:Secondary','Fluorescence:URU:Seapoint','PAR',
'PAR:Reference','Salinity:T1:C1','Chlorophyll:Extracted',
'Oxygen:Dissolved:SBE','Oxygen:Dissolved',
'Nitrate_plus_Nitrite','Silicate','PAR1'],axis=1,inplace=True)
# define a function that will be applied to the values in the index column;
# this makes it easier to drop non-data rows later
def convertIndex(val):
try:
x =int(val)
except ValueError:
x=np.nan
return x
# load the 2015 phytoplankton data with the following options:
# sheet_name='2015 CHEMTAX abs results' -> choose the 2015 sheet
# usecols='A:I,T:AC' -> read only columns A:I and T:AC from the Excel sheet
# skiprows=2 -> start reading at the 3rd row of the sheet,
# which contains the column headings
# converters={'Index': convertIndex,} -> apply the function defined above to the Index column
# verbose = True -> print extra information/ warnings/ errors
dfPhyto=pd.read_excel(pathPhyto,sheet_name='2016',engine='openpyxl',
skiprows=0,converters={'Sample#': convertIndex,},
verbose=True)
Reading sheet 2016
# display rows 48 to 59 of the resulting table
dfPhyto[48:60]
Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | 2016-007 | June | 14 | 306.0 | mean | 0 | 0 | 0 | 0.215782 | 0.568259 | 0.155735 | 1.02837 | 0 | 0 | 0.00485905 | 1.973 |
49 | 2016-007 | June | 16 | 318.0 | mean | 0 | 0.176595 | 0 | 0.197922 | 0.556587 | 0.129862 | 0.704256 | 0.0753036 | 0.0859203 | 0.0115539 | 1.938 |
50 | 2016-007 | June | OKE | 329.0 | mean | 0 | 0.534565 | 1.30459 | 0.310359 | 1.03751 | 0.435442 | 2.29758 | 2.14958 | 0.247075 | 0.0502969 | 8.367 |
51 | 2016-007 | June | 22 | 333.0 | mean | 0 | 0.748735 | 2.2245 | 0.0510914 | 0.270685 | 0.417001 | 0.156895 | 3.49932 | 0.0870553 | 0.0197189 | 7.475 |
52 | Cruise | Date | Station | NaN | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA |
53 | 2016-047 | June | 14 | 438.0 | mean | 0 | 1.03723 | 0.402856 | 0.0826985 | 0.215331 | 0.117579 | 0.224789 | 0.037861 | 0.0581045 | 0.0265498 | 2.203 |
54 | 2016-047 | June | 11 | 449.0 | mean | 0 | 0.37708 | 0.165123 | 0.087552 | 0.113769 | 0.101932 | 0.26103 | 0.0118683 | 0.0485396 | 0.0161065 | 1.183 |
55 | 2016-047 | June | CPF1 | 452.0 | mean | 0 | 0.00981477 | 0 | 0.125556 | 0.143086 | 0.0616604 | 0.287119 | 0 | 0.0535119 | 0.00525178 | 0.686 |
56 | 2016-047 | June | CPF2 | 455.0 | mean | 0 | 0.223128 | 0 | 0.0825152 | 0.324354 | 0.067458 | 0.251326 | 0.0179209 | 0.125422 | 0.026876 | 1.119 |
57 | 2016-047 | June | 22 | 466.0 | mean | 0 | 0.612099 | 0.124002 | 0.0585194 | 0.158803 | 0.0759229 | 0.119778 | 0.0561802 | 0.0190252 | 0.00866994 | 1.233 |
58 | 2016-047 | June | 24 | 469.0 | mean | 0 | 0.696984 | 0.106455 | 0.0752098 | 0.160245 | 0.0776968 | 0.182455 | 0.0441467 | 0.0226451 | 0.00516366 | 1.371 |
59 | 2016-047 | June | 28 | 472.0 | mean | 0 | 0.818837 | 0 | 0.0915953 | 0.174323 | 0.055937 | 0.0619514 | 0.0265586 | 0.0612878 | 0.0205105 | 1.311 |
# now, drop any rows from the table that have NaN values in either of the columns
# 'Index' or 'TchlA (ug/L)'
# This is why we applied a function to the Index column to make sure all
# non-numeric Index values would have a consistent NaN entry, making them easy to identify
# and remove
dfPhyto.dropna(subset=['Sample#', 'TchlA'],how='any',inplace=True)
# pandas creates its own index, and after dropping rows I like to reset it -
# this is just for convenience
dfPhyto.reset_index(drop=True,inplace=True)
# apply formatting function all rows in Cruise column to get year-3digit format
dfPhyto['Cruise']=[fmtCruise(ii) for ii in dfPhyto['Cruise']]
# display part of the table, confirming that non-data rows have been removed
dfPhyto[48:60]
Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | 2016-007 | June | 16 | 318.0 | mean | 0 | 0.176595 | 0 | 0.197922 | 0.556587 | 0.129862 | 0.704256 | 0.0753036 | 0.0859203 | 0.0115539 | 1.938 |
49 | 2016-007 | June | OKE | 329.0 | mean | 0 | 0.534565 | 1.30459 | 0.310359 | 1.03751 | 0.435442 | 2.29758 | 2.14958 | 0.247075 | 0.0502969 | 8.367 |
50 | 2016-007 | June | 22 | 333.0 | mean | 0 | 0.748735 | 2.2245 | 0.0510914 | 0.270685 | 0.417001 | 0.156895 | 3.49932 | 0.0870553 | 0.0197189 | 7.475 |
51 | 2016-047 | June | 14 | 438.0 | mean | 0 | 1.03723 | 0.402856 | 0.0826985 | 0.215331 | 0.117579 | 0.224789 | 0.037861 | 0.0581045 | 0.0265498 | 2.203 |
52 | 2016-047 | June | 11 | 449.0 | mean | 0 | 0.37708 | 0.165123 | 0.087552 | 0.113769 | 0.101932 | 0.26103 | 0.0118683 | 0.0485396 | 0.0161065 | 1.183 |
53 | 2016-047 | June | CPF1 | 452.0 | mean | 0 | 0.00981477 | 0 | 0.125556 | 0.143086 | 0.0616604 | 0.287119 | 0 | 0.0535119 | 0.00525178 | 0.686 |
54 | 2016-047 | June | CPF2 | 455.0 | mean | 0 | 0.223128 | 0 | 0.0825152 | 0.324354 | 0.067458 | 0.251326 | 0.0179209 | 0.125422 | 0.026876 | 1.119 |
55 | 2016-047 | June | 22 | 466.0 | mean | 0 | 0.612099 | 0.124002 | 0.0585194 | 0.158803 | 0.0759229 | 0.119778 | 0.0561802 | 0.0190252 | 0.00866994 | 1.233 |
56 | 2016-047 | June | 24 | 469.0 | mean | 0 | 0.696984 | 0.106455 | 0.0752098 | 0.160245 | 0.0776968 | 0.182455 | 0.0441467 | 0.0226451 | 0.00516366 | 1.371 |
57 | 2016-047 | June | 28 | 472.0 | mean | 0 | 0.818837 | 0 | 0.0915953 | 0.174323 | 0.055937 | 0.0619514 | 0.0265586 | 0.0612878 | 0.0205105 | 1.311 |
58 | 2016-047 | June | GE01 | 483.0 | mean | 0 | 1.71173 | 0 | 0.120347 | 0.210731 | 0.0864201 | 0.0260724 | 0.0207969 | 0.0199069 | 0 | 2.196 |
59 | 2016-047 | June | 38 | 486.0 | mean | 0 | 6.43463 | 0 | 0.186531 | 0.432792 | 0.137378 | 0 | 0 | 0.196669 | 0 | 7.388 |
# due to repeated column names in the original spreadsheet, '.1' was appended to the names
# of the phytoplankton columns;
# these lines correct the column names, removing the '.1':
renameDict=dict()
for colName in dfPhyto.keys():
if colName.endswith('.1'):
renameDict[colName]=colName.split('.1')[0]
dfPhyto.rename(columns=renameDict,inplace=True)
dfPhyto
Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-005 | April | 59 | 23.0 | mean | 0 | 4.50144 | 0.438107 | 0.048483 | 0.338791 | 0.0478202 | 0 | 0.00362949 | 0.000129969 | 0.0086402 | 5.38704 |
1 | 2016-005 | April | 102 | 37.0 | mean | 0 | 3.55538 | 1.73326 | 0.385956 | 1.82211 | 0.139308 | 0.186364 | 0.0440193 | 0.0767997 | 0 | 7.9432 |
2 | 2016-005 | April | 75 | 51.0 | mean | 0 | 14.7795 | 0.563364 | 0.397391 | 1.18036 | 0.075357 | 0.0941965 | 0 | 0 | 0 | 17.0901 |
3 | 2016-005 | April | 72 | 64.0 | mean | 0 | 0.923699 | 0.745832 | 0.337639 | 1.20012 | 0.0704119 | 0.0893556 | 0.00326812 | 0.0523156 | 0 | 3.42264 |
4 | 2016-005 | April | 69 | 76.0 | mean | 0 | 3.88279 | 0.290172 | 0.183778 | 0.531872 | 0.0478852 | 0.0609817 | 0 | 0 | 0 | 4.99748 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
115 | 2016-071 | November | 9 | 250.0 | mean | 0 | 0.0142285 | 0.082281 | 0.0897016 | 0.0562649 | 0.000186116 | 0.00849243 | 0.017071 | 0.00322275 | 0.00355171 | 0.275 |
116 | 2016-071 | November | 6 | 264.0 | mean | 0 | 0.00399836 | 0.104654 | 0.145557 | 0.109826 | 0.00047569 | 0.040834 | 0.0139237 | 0.00948663 | 0.00424517 | 0.433 |
117 | 2016-071 | November | 3 | 281.0 | mean | 0 | 0 | 0.0634224 | 0.0775545 | 0.0395764 | 0.0314152 | 0.0276703 | 0.0142296 | 0.0202238 | 0.00390783 | 0.278 |
118 | 2016-071 | November | 2 | 297.0 | mean | 0 | 0 | 0.0636933 | 0.181767 | 0.274402 | 0.0402761 | 0.0463697 | 0.0196038 | 0.0268882 | 0 | 0.653 |
119 | 2016-071 | November | 27 | 314.0 | mean | 0 | 0.00343804 | 0.112049 | 0.343388 | 0.400357 | 0.035914 | 0.0469057 | 0.00552154 | 0.0769891 | 0.00243813 | 1.027 |
120 rows × 16 columns
# This is the important step- join the two tables ('left' and 'right'),
# matching the cruise IDs and sample numbers
# how='outer' -> all rows from both the left and the right tables will be included,
# even if they cannot be matched; this makes it easy to check for
# unmatched data later
# left_on specifies the name of the column to match in the left table (dfbot)
# right_on specifies the name of the column to match in the right table (dfPhyto)
dfout = pd.merge(dfbot, dfPhyto, how='outer',
left_on=['Cruise','Sample_Number'], right_on = ['Cruise','Sample#'])
# show the column names in the resulting table
dfout.keys()
Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'LOC:STATION', 'Sample_Number', 'pH:SBE:Nominal', 'Temperature:Draw', 'Salinity:Bottle', 'Flag:Salinity:Bottle', 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted', 'Flag:Oxygen:Dissolved', 'Flag:Nitrate_plus_Nitrite', 'Flag:Silicate', 'Phosphate', 'Flag:Phosphate', 'Cruise', 'Number_of_bin_records', 'ADM:PROJECT', 'Pressure [decibar]', 'Temperature:Secondary [deg C (ITS90)]', 'Transmissivity [*/metre]', 'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]', 'PAR:Reference [uE/m^2/sec]', 'Salinity:T1:C1 [PSS-78]', 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]', 'Salinity:Bottle [PSS-78]', 'Chlorophyll:Extracted [mg/m^3]', 'Phaeo-Pigment:Extracted [mg/m^3]', 'Nitrate_plus_Nitrite [umol/L]', 'Silicate [umol/L]', 'Phosphate [umol/L]', 'Bottle_Number', 'Bottle:Firing_Sequence', 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]', 'Date', 'Station', 'Sample#', 'rep', 'depth', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido', 'Cyanobacteria', 'TchlA'], dtype='object')
# Identify cases where phytoplankton data were matched to multiple samples in bottle data:
dftest=pd.merge(dfbot, dfPhyto,how='right', left_on=['Cruise','Sample_Number'],right_on = ['Cruise','Sample#'])
temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})
temp.columns = ['icount']
np.unique(temp.icount)
array([1])
# check for Phyto samples matched to multiple bottle samples:
temp.loc[temp.icount>1]
icount | ||
---|---|---|
Cruise | Sample# |
# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]
icount | ||
---|---|---|
Cruise | Sample# |
temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})
temp2.columns = ['icount']
# this will catch phyto matched to multiple bottle but also bottle with duplicate sample numbers per cruise:
temp2.loc[temp2.icount>1]
icount | ||
---|---|---|
Cruise | Sample_Number |
# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]
icount | ||
---|---|---|
Cruise | Sample# |
# if the output table is longer than either of the input tables, some columns were not matched
len(dfout), len(dfPhyto), len(dfbot)
(1420, 120, 1420)
# Check that the number of cells with data in the 'Cyanobacteria' column is
# the same for the input and output tables to show that no rows are missing:
np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)
(120, 120)
# If there were data rows from the phytoplankton table that were not matched to
# rows from the bottle table, their indices from the phytoplankton table would be
# displayed below (the series [] would not be empty)
print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Sample#'])
Series([], Name: Sample#, dtype: float64)
# drop repetetive/unecessary columns:
dfout.drop(labels=['Date', 'Station', 'Sample#', 'rep',
'depth',],axis=1,inplace=True)
# truncate phyto group values to 3 decimal places:
for col in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',
'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',
'TchlA'):
dfout[col]=[np.round(ii,decimals=3) for ii in dfout[col]] # use list comprehension to set values for entire column
dfout['Cyanobacteria']
0 NaN 1 NaN 2 NaN 3 NaN 4 NaN ... 1415 NaN 1416 NaN 1417 NaN 1418 0.002 1419 NaN Name: Cyanobacteria, Length: 1420, dtype: float64
# Check for unexpected depths
dfout.loc[((dfout['Pressure [decibar]']>10)|(dfout['Depth [metres]']>10))&(dfout['Diatoms-1']>=0)]#,
# ['FIL:START TIME YYYY/MM/DD HH:MM:SS','Pressure [decibar]','Depth [metres]','Diatoms-1']]
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) ~/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 2897 try: -> 2898 return self._engine.get_loc(casted_key) 2899 except KeyError as err: pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'Depth [metres]' The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) <ipython-input-34-98a40a30babe> in <module> 1 # Check for unexpected depths ----> 2 dfout.loc[((dfout['Pressure [decibar]']>10)|(dfout['Depth [metres]']>10))&(dfout['Diatoms-1']>=0)]#, 3 # ['FIL:START TIME YYYY/MM/DD HH:MM:SS','Pressure [decibar]','Depth [metres]','Diatoms-1']] ~/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/frame.py in __getitem__(self, key) 2904 if self.columns.nlevels > 1: 2905 return self._getitem_multilevel(key) -> 2906 indexer = self.columns.get_loc(key) 2907 if is_integer(indexer): 2908 indexer = [indexer] ~/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 2898 return self._engine.get_loc(casted_key) 2899 except KeyError as err: -> 2900 raise KeyError(key) from err 2901 2902 if tolerance is not None: KeyError: 'Depth [metres]'
# now write the output table to a .csv file:
dfout.to_csv(pathOut, index=False)
dfout.keys()