import numpy as np # this module handles arrays, but here we need it for its NaN value
import pandas as pd # this module contains a lot of tools for handling tabular data
import re
# define paths to the source files and eventual output file
pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2015 SoG_bottle.xlsx'  # bottle data: one Excel sheet per 2015 cruise
pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2019 SoG abs values New ALLO.xlsx'  # CHEMTAX phytoplankton absolute values
pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2015_NewALLO.csv'  # merged bottle+phyto table (csv)
def fmtCruise(istr):
    """Normalize a cruise ID to the 'YYYY-NNN' form.

    'YYYY-NN' gains a leading zero on the number part ('2015-17' -> '2015-017');
    'YYYY-NNN' is returned unchanged.

    Raises:
        ValueError: if istr matches neither pattern.
    """
    if re.fullmatch(r'[0-9]{4}-[0-9]{2}', istr):
        year, num = istr.split('-')
        return f'{year}-0{num}'
    if re.fullmatch(r'[0-9]{4}-[0-9]{3}', istr):
        return istr
    # format the offending value into the message instead of passing it as a
    # second exception arg (which would make str(exc) a tuple repr)
    raise ValueError(f'Input had unexpected format: {istr}')
# load every per-cruise sheet of the 2015 bottle Excel file, then stack them
# into a single table with a normalized Cruise column
sheetFrames = []
for sheetName in ('2015-17', '2015-20', '2015-18', '2015-21', '2015-19'):
    frame = pd.read_excel(pathBottle, sheet_name=sheetName, verbose=True,
                          na_values=(-99, -99.9), engine='openpyxl',
                          dtype={'Cruise': str, 'Sample_Number': np.int64})
    frame['Cruise'] = fmtCruise(sheetName)  # cruise ID is derived from the sheet name
    sheetFrames.append(frame)
dfbot = pd.concat(sheetFrames, ignore_index=True, sort=False)
# drop columns that contain no data at all, and report which ones were removed
colsBefore = set(dfbot.keys())
dfbot.dropna(axis=1, how='all', inplace=True)
print('removed empty columns:', colsBefore - set(dfbot.keys()))
Reading sheet 2015-17 Reading sheet 2015-20 Reading sheet 2015-18 Reading sheet 2015-21 Reading sheet 2015-19 removed empty columns: {'Ammonium', 'Flag:Ammonium'}
# list the column names in the resulting table (after empty-column removal)
print(dfbot.keys())
Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER', 'LOC:STATION', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'Sample_Number', 'Pressure', 'Temperature:Primary', 'Transmissivity', 'Fluorescence:URU:Seapoint', 'PAR', 'Salinity:T0:C0', 'Oxygen:Dissolved:SBE', 'Oxygen:Dissolved:SBE.1', 'pH:SBE:Nominal', 'Salinity:Bottle', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted', 'Flag:Chlorophyll:Extracted', 'Nitrate_plus_Nitrite', 'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate', 'Flag:Phosphate', 'Cruise', 'Temperature:Draw', 'Phaeo-Pigment:Extracted', 'Oxygen:Dissolved', 'Oxygen:Dissolved.1', 'Flag:Oxygen:Dissolved', 'Temperature:Secondary', 'Salinity:T1:C1', 'Number_of_bin_records'], dtype='object')
def subval(idf, colList):
    """Consolidate several columns of idf into one.

    colList[0] is the column to keep; subsequent entries are fallback columns
    used, in order of precedence, to fill NaNs in colList[0]. The fallback
    columns are then dropped from the table.

    Args:
        idf: DataFrame to modify in place.
        colList: sequence of column names; first is kept, the rest are
            fallbacks in order of precedence. Generalized to any number of
            fallbacks (the original implementation only handled 2-3 columns).

    Returns:
        The same DataFrame, modified in place, for chaining convenience.
    """
    primary = colList[0]
    for fallback in colList[1:]:
        # fillna keeps existing values and substitutes only where primary is NaN,
        # replacing the original row-by-row iterrows() scan with a vectorized op
        idf[primary] = idf[primary].fillna(idf[fallback])
    idf.drop(columns=list(colList[1:]), inplace=True)
    return idf
# consolidate duplicated sensor columns: each primary column is filled from
# its backup column, and the backup is dropped
for primaryAndBackup in (('Oxygen:Dissolved', 'Oxygen:Dissolved.1'),
                         ('Temperature:Primary', 'Temperature:Secondary'),
                         ('Salinity:T0:C0', 'Salinity:T1:C1'),
                         ('Oxygen:Dissolved:SBE', 'Oxygen:Dissolved:SBE.1')):
    dfbot = subval(dfbot, primaryAndBackup)
# shorten/standardize the remaining column names
dfbot.rename(columns={'LOC:LATITUDE': 'Lat',
                      'LOC:LONGITUDE': 'Lon',
                      'Temperature:Primary': 'Temperature',
                      'Oxygen:Dissolved:SBE': 'Oxygen:Dissolved:CTD',
                      'Salinity:T0:C0': 'Salinity'}, inplace=True)
# converter applied to the sample-number column on read;
# mapping non-numeric entries to a sentinel makes non-data rows easy to drop later
def convertIndex(val):
    """Coerce val to int; return the sentinel -999 when it is not numeric."""
    try:
        return int(val)
    except ValueError:
        # e.g. repeated header rows read as text
        return -999
# load the 2015 phytoplankton data with the following options:
# sheet_name='2015' -> choose the 2015 sheet
# skiprows=0 -> column headings are in the first row of the sheet
# converters={'Sample#': convertIndex,} -> apply the function defined above to the
#     Sample# column so non-numeric entries become the -999 sentinel
# verbose=True -> print extra information/warnings/errors
dfPhyto=pd.read_excel(pathPhyto,sheet_name='2015',engine='openpyxl',
                      skiprows=0,converters={'Sample#': convertIndex,},
                      verbose=True)
Reading sheet 2015
# display rows 48 to 59 of the resulting table; note row 48 is a repeated
# header row, which convertIndex has flagged with Sample# = -999
dfPhyto[48:60]
Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | Cruise | Date | Station | -999 | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA |
49 | 2015-019 | September | SI | 9 | mean | 0 | 0 | 0 | 0.262077 | 2.54479 | 0.541801 | 1.61053 | 0 | 0 | 0.00480152 | 4.964 |
50 | 2015-019 | September | 59 | 23 | mean | 0 | 0.824291 | 0.14173 | 0.0878956 | 0.241363 | 0.0219083 | 0.0624404 | 0.00493941 | 0.0391662 | 0.0122657 | 1.436 |
51 | 2015-019 | September | 102 | 38 | mean | 0 | 4.23697 | 1.2806 | 0.138505 | 0.452335 | 0.0626771 | 0.273765 | 0 | 0.0581462 | 0 | 6.503 |
52 | 2015-019 | September | 75 | 52 | mean | 0 | 1.97283 | 0.851509 | 0.415309 | 0.712103 | 0.100579 | 0.339134 | 0.0147496 | 0.159789 | 0 | 4.566 |
53 | 2015-019 | September | 72 | 65 | mean | 0 | 0.82108 | 0.189251 | 0.166689 | 0.258274 | 0.050802 | 0.134443 | 0.00717662 | 0.0565812 | 0.000703135 | 1.685 |
54 | 2015-019 | September | 69 | 77 | mean | 0 | 0.69704 | 0.191174 | 0.104355 | 0.156305 | 0.000429999 | 0.0710586 | 0.011922 | 0.0397155 | 0 | 1.272 |
55 | 2015-019 | September | ADCP | 87 | mean | 0 | 0.747473 | 0.233891 | 0.0708116 | 0.209912 | 0.0259796 | 0.0823137 | 0.00108747 | 0.0345318 | 0 | 1.406 |
56 | 2015-019 | September | 65 | 97 | mean | 0 | 0.734673 | 0.152787 | 0.0414143 | 0.169293 | 0.0156013 | 0.0699448 | 0.00710006 | 0.0217077 | 0.00447809 | 1.217 |
57 | 2015-019 | September | 63 | 108 | mean | 0 | 0.621756 | 0.132632 | 0.0337293 | 0.105831 | 0.0101963 | 0.0339016 | 0.013503 | 0.021451 | 0 | 0.973 |
58 | 2015-019 | September | 62 | 119 | mean | 0 | 0.63392 | 0.136818 | 0.0389149 | 0.146725 | 0.00871505 | 0.00812858 | 0.0221678 | 0 | 0.000610712 | 0.996 |
59 | 2015-019 | September | 56 | 133 | mean | 0 | 1.16464 | 0.0481537 | 0.193494 | 0.282189 | 0.0108959 | 0.0248796 | 0.0161239 | 0.0417164 | 0.0489061 | 1.831 |
# drop non-data rows: keep only rows with a TchlA value and a non-negative
# Sample#. convertIndex mapped every non-numeric Sample# entry (e.g. repeated
# header rows) to -999, so they are easy to identify and remove here.
dropNoChl = dfPhyto['TchlA'].isna()
dropBadSample = dfPhyto['Sample#'] < 0
dfPhyto = dfPhyto.loc[~(dropNoChl | dropBadSample)].reset_index(drop=True)
# (pandas keeps the original index after filtering; resetting it is just for convenience)
# display part of the table, confirming that non-data rows have been removed
# (all Sample# values should now be valid non-negative integers)
dfPhyto.loc[:10,['Cruise','Sample#']]
Cruise | Sample# | |
---|---|---|
0 | 2015-017 | 9 |
1 | 2015-017 | 23 |
2 | 2015-017 | 37 |
3 | 2015-017 | 51 |
4 | 2015-017 | 64 |
5 | 2015-017 | 76 |
6 | 2015-017 | 86 |
7 | 2015-017 | 96 |
8 | 2015-017 | 109 |
9 | 2015-017 | 120 |
10 | 2015-017 | 134 |
# display the first rows of the bottle table to compare cruise IDs and sample
# numbering with the phyto table above
dfbot.loc[:20,['Cruise','Sample_Number']]
Cruise | Sample_Number | |
---|---|---|
0 | 2015-017 | 1 |
1 | 2015-017 | 2 |
2 | 2015-017 | 3 |
3 | 2015-017 | 4 |
4 | 2015-017 | 5 |
5 | 2015-017 | 6 |
6 | 2015-017 | 7 |
7 | 2015-017 | 8 |
8 | 2015-017 | 9 |
9 | 2015-017 | 10 |
10 | 2015-017 | 11 |
11 | 2015-017 | 12 |
12 | 2015-017 | 13 |
13 | 2015-017 | 14 |
14 | 2015-017 | 15 |
15 | 2015-017 | 16 |
16 | 2015-017 | 17 |
17 | 2015-017 | 18 |
18 | 2015-017 | 19 |
19 | 2015-017 | 20 |
20 | 2015-017 | 21 |
# due to repeated column names in the original spreadsheet, '.1' was appended
# to the names of the phytoplankton columns;
# these lines correct the column names by removing the trailing '.1'.
# Slicing off the last two characters (rather than split('.1')[0]) avoids
# truncating a name at the FIRST occurrence of '.1' anywhere in the string.
renameDict = dict()
for colName in dfPhyto.keys():
    if colName.endswith('.1'):
        renameDict[colName] = colName[:-2]
dfPhyto.rename(columns=renameDict, inplace=True)
dfPhyto
Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-017 | April | SI | 9 | mean | 0 | 23.5805 | 2.41835 | 0 | 0 | 0 | 0 | 0 | 0.163169 | 0 | 26.162 |
1 | 2015-017 | April | 59 | 23 | mean | 0 | 0.271799 | 0.0303002 | 0 | 0.0521275 | 0.00467348 | 0.0801 | 0 | 0 | 0 | 0.439 |
2 | 2015-017 | April | 102 | 37 | mean | 0 | 2.03334 | 0.809783 | 0.0442965 | 0.18086 | 0.0245126 | 0.00410068 | 0.00483189 | 0.0233698 | 0.00390702 | 3.129 |
3 | 2015-017 | April | 75 | 51 | mean | 0 | 2.72243 | 1.09346 | 0.0387971 | 0.25975 | 0.0294629 | 0.0185552 | 0.0136585 | 0.0328907 | 0.0100036 | 4.219 |
4 | 2015-017 | April | 72 | 64 | mean | 0 | 3.34139 | 1.59186 | 0.172997 | 0.924837 | 0.329514 | 0 | 0.00213607 | 0.0222662 | 0 | 6.385 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
65 | 2015-019 | September | 9 | 248 | mean | 0 | 0.558597 | 0.207741 | 0.605681 | 0.433424 | 0.000954912 | 0.200797 | 0.031296 | 0.0739315 | 0.138577 | 2.251 |
66 | 2015-019 | September | 6 | 261 | mean | 0 | 0.256106 | 0.254845 | 0.556941 | 0.826706 | 5.79139e-05 | 0.378296 | 0.0619777 | 0.0707657 | 0.124304 | 2.53 |
67 | 2015-019 | September | BS11 | 269 | mean | 0 | 9.83601 | 0.0171224 | 0.0109873 | 0.344913 | 0.00197183 | 0.033415 | 0 | 0 | 0.025584 | 10.27 |
68 | 2015-019 | September | 3 | 285 | mean | 0 | 0.574524 | 0.140787 | 0.34785 | 0.697251 | 0.00785798 | 0.309306 | 0.0253154 | 0.0967167 | 0.171391 | 2.371 |
69 | 2015-019 | September | 2 | 300 | mean | 0 | 1.63844 | 0 | 0.71315 | 0.459814 | 0.00299931 | 0.270579 | 0.0199415 | 0.188252 | 0.169828 | 3.463 |
70 rows × 16 columns
# This is the important step: join the bottle ('left') and phytoplankton
# ('right') tables, matching cruise IDs and sample numbers.
# how='outer' -> all rows from BOTH tables are kept, even unmatched ones,
#                which makes it easy to check for unmatched data later
# left_on/right_on name the matching columns in dfbot and dfPhyto respectively
dfout = dfbot.merge(dfPhyto, how='outer',
                    left_on=['Cruise', 'Sample_Number'],
                    right_on=['Cruise', 'Sample#'])
# show the column names in the resulting table
dfout.keys()
Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER', 'LOC:STATION', 'Lat', 'Lon', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'Sample_Number', 'Pressure', 'Temperature', 'Transmissivity', 'Fluorescence:URU:Seapoint', 'PAR', 'Salinity', 'Oxygen:Dissolved:CTD', 'pH:SBE:Nominal', 'Salinity:Bottle', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted', 'Flag:Chlorophyll:Extracted', 'Nitrate_plus_Nitrite', 'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate', 'Flag:Phosphate', 'Cruise', 'Temperature:Draw', 'Phaeo-Pigment:Extracted', 'Oxygen:Dissolved', 'Flag:Oxygen:Dissolved', 'Number_of_bin_records', 'Date', 'Station', 'Sample#', 'rep', 'depth', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido', 'Cyanobacteria', 'TchlA'], dtype='object')
# Identify cases where phytoplankton data were matched to multiple samples in
# bottle data: a right join keeps every phyto row, so counting rows per
# (Cruise, Sample#) group reveals any duplicates
dftest = dfbot.merge(dfPhyto, how='right',
                     left_on=['Cruise', 'Sample_Number'],
                     right_on=['Cruise', 'Sample#'])
temp = dftest.groupby(['Cruise', 'Sample#']).agg(icount=('Cruise', 'count'))
np.unique(temp.icount)
array([1])
# check for Phyto samples matched to multiple bottle samples
# (an empty result means there are none):
temp.loc[temp.icount>1]
icount | ||
---|---|---|
Cruise | Sample# |
# check for phyto samples not matched to bottle samples:
# NOTE(review): a groupby count over existing rows cannot be 0, so this can
# only ever be empty; unmatched phyto rows are caught by the ADM:SCIENTIST
# check further below — confirm intent
temp.loc[temp.icount==0]
icount | ||
---|---|---|
Cruise | Sample# |
# count merged rows per (Cruise, Sample_Number) in the full outer-merge result
temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})
temp2.columns = ['icount']  # flatten the MultiIndex column produced by agg
# this will catch phyto matched to multiple bottle but also bottle with duplicate sample numbers per cruise:
temp2.loc[temp2.icount>1]
icount | ||
---|---|---|
Cruise | Sample_Number |
# check for phyto samples not matched to bottle samples:
# NOTE(review): this repeats the earlier check on `temp`; it may have been
# intended to inspect `temp2` instead — confirm
temp.loc[temp.icount==0]
icount | ||
---|---|---|
Cruise | Sample# |
# if the output table is longer than either of the input tables, some columns were not matched
# (here len(dfout) == len(dfbot), so every phyto row found exactly one bottle row)
len(dfout), len(dfPhyto), len(dfbot)
(938, 70, 938)
# display the merged table
dfout
File Name | Zone | FIL:START TIME YYYY/MM/DD HH:MM:SS | LOC:EVENT_NUMBER | LOC:STATION | Lat | Lon | LOC:WATER DEPTH | ADM:SCIENTIST | Sample_Number | ... | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-17-0001.che | UTC | 2015-04-01 23:42:17 | 1 | SI | 48.65233 | -123.50183 | 187 | Chandler P. | 1 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 2015-17-0001.che | UTC | 2015-04-01 23:42:17 | 1 | SI | 48.65233 | -123.50183 | 187 | Chandler P. | 2 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 2015-17-0001.che | UTC | 2015-04-01 23:42:17 | 1 | SI | 48.65233 | -123.50183 | 187 | Chandler P. | 3 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 2015-17-0001.che | UTC | 2015-04-01 23:42:17 | 1 | SI | 48.65233 | -123.50183 | 187 | Chandler P. | 4 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 2015-17-0001.che | UTC | 2015-04-01 23:42:17 | 1 | SI | 48.65233 | -123.50183 | 187 | Chandler P. | 5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
933 | 2015-19-0091.che | UTC | 2015-10-04 00:30:55 | 91 | 2 | 49.40167 | -124.15600 | 279 | Chandler P. | 296 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
934 | 2015-19-0091.che | UTC | 2015-10-04 00:30:55 | 91 | 2 | 49.40167 | -124.15600 | 279 | Chandler P. | 297 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
935 | 2015-19-0091.che | UTC | 2015-10-04 00:30:55 | 91 | 2 | 49.40167 | -124.15600 | 279 | Chandler P. | 298 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
936 | 2015-19-0091.che | UTC | 2015-10-04 00:30:55 | 91 | 2 | 49.40167 | -124.15600 | 279 | Chandler P. | 299 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
937 | 2015-19-0091.che | UTC | 2015-10-04 00:30:55 | 91 | 2 | 49.40167 | -124.15600 | 279 | Chandler P. | 300 | ... | 1.63844 | 0 | 0.71315 | 0.459814 | 0.00299931 | 0.270579 | 0.0199415 | 0.188252 | 0.169828 | 3.463 |
938 rows × 49 columns
# Check that the number of cells with data in the 'Cyanobacteria' column is
# the same for the input and output tables (i.e. no phyto data were lost or
# duplicated in the merge):
np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)
(70, 70)
# If there were data rows from the phytoplankton table that were not matched to
# rows from the bottle table, their indices from the phytoplankton table would be
# displayed below (the series [] would not be empty).
# ADM:SCIENTIST comes from the bottle table, so it is NaN exactly on
# outer-merge rows that originated solely from the phyto table.
print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Sample#'])
Series([], Name: Sample#, dtype: float64)
# show the column names in the merged table before dropping duplicated columns
dfout.keys()
Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER', 'LOC:STATION', 'Lat', 'Lon', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'Sample_Number', 'Pressure', 'Temperature', 'Transmissivity', 'Fluorescence:URU:Seapoint', 'PAR', 'Salinity', 'Oxygen:Dissolved:CTD', 'pH:SBE:Nominal', 'Salinity:Bottle', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted', 'Flag:Chlorophyll:Extracted', 'Nitrate_plus_Nitrite', 'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate', 'Flag:Phosphate', 'Cruise', 'Temperature:Draw', 'Phaeo-Pigment:Extracted', 'Oxygen:Dissolved', 'Flag:Oxygen:Dissolved', 'Number_of_bin_records', 'Date', 'Station', 'Sample#', 'rep', 'depth', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido', 'Cyanobacteria', 'TchlA'], dtype='object')
# drop columns duplicated from the phytoplankton table (the bottle table
# already carries equivalent identifying information)
dfout.drop(columns=['Date', 'Station', 'Sample#', 'rep', 'depth'],
           inplace=True)
# truncate phyto group values to 3 decimal places:
for groupCol in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',
                 'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo',
                 'Raphido', 'TchlA'):
    # round element-wise; the merged columns may be object dtype, so operate
    # value-by-value rather than with a dtype-dependent Series.round
    dfout[groupCol] = dfout[groupCol].map(lambda v: np.round(v, decimals=3))
# Check for unexpected depths: presumably phyto samples were taken from
# near-surface bottles, so no rows with Pressure > 10 and phyto data are
# expected (empty result confirms this) — TODO confirm the 10 dbar cutoff
dfout.loc[((dfout['Pressure']>10))&(dfout['Diatoms-1']>=0)]#,
# ['FIL:START TIME YYYY/MM/DD HH:MM:SS','Pressure [decibar]','Depth [metres]','Diatoms-1']]
File Name | Zone | FIL:START TIME YYYY/MM/DD HH:MM:SS | LOC:EVENT_NUMBER | LOC:STATION | Lat | Lon | LOC:WATER DEPTH | ADM:SCIENTIST | Sample_Number | ... | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA |
---|
0 rows × 44 columns
# final column listing before writing the merged table out
dfout.keys()
Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER', 'LOC:STATION', 'Lat', 'Lon', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'Sample_Number', 'Pressure', 'Temperature', 'Transmissivity', 'Fluorescence:URU:Seapoint', 'PAR', 'Salinity', 'Oxygen:Dissolved:CTD', 'pH:SBE:Nominal', 'Salinity:Bottle', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted', 'Flag:Chlorophyll:Extracted', 'Nitrate_plus_Nitrite', 'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate', 'Flag:Phosphate', 'Cruise', 'Temperature:Draw', 'Phaeo-Pigment:Extracted', 'Oxygen:Dissolved', 'Flag:Oxygen:Dissolved', 'Number_of_bin_records', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido', 'Cyanobacteria', 'TchlA'], dtype='object')
# now write the output table to a .csv file:
# index=False -> do not write the pandas row index as an extra column
dfout.to_csv(pathOut, index=False)