import numpy as np # this module handles arrays, but here we need it for its NaN value
import pandas as pd # this module contains a lot of tools for handling tabular data
import re
import gsw
# define paths to the source files and eventual output file
pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2017 SoG bottle.xlsx'  # input: bottle (CTD/chemistry) workbook, one sheet per cruise
pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2019 SoG abs values New ALLO.xlsx'  # input: phytoplankton CHEMTAX workbook, one sheet per year
pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2017_NewALLO.csv'  # output: merged bottle+phyto table for 2017
#formatting function to get year-(3digit) format
def fmtCruise(istr):
    """Normalize a cruise ID to year-(3 digit) format, e.g. '2017-01' -> '2017-001'.

    Parameters:
        istr: cruise ID string, either 'YYYY-NN' or 'YYYY-NNN'
    Returns:
        cruise ID string in 'YYYY-NNN' form
    Raises:
        ValueError: if istr matches neither expected pattern
    """
    if re.fullmatch('[0-9]{4}-[0-9]{2}', istr):
        # two-digit cruise number: pad with a leading zero
        sp = re.split('-', istr)
        rstr = sp[0] + '-0' + sp[1]
    elif re.fullmatch('[0-9]{4}-[0-9]{3}', istr):
        # already in the desired format
        rstr = istr
    else:
        # fix: the original passed a tuple to ValueError, which prints as
        # ("Input had unexpected format:", istr); format the value into the message
        raise ValueError(f'Input had unexpected format: {istr}')
    return rstr
# get names of sheets in notebook
# open the bottle workbook just long enough to list its sheets (one sheet per cruise)
with pd.ExcelFile(pathBottle,engine='openpyxl') as xl:
    sheets=xl.sheet_names
print(sheets)
['2017-01', '2017-63', '2017-05', '2017-64', '2017-09', '2017-65']
# load each sheet in the bottle Excel file and concatenate them together into one table
dfbotlist=list()  # accumulates one DataFrame per sheet
for sheet in sheets:
    df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,engine='openpyxl',
                      na_values=(-99,-99.9)) # read each sheet; include additional na values
    df0['Cruise']=fmtCruise(sheet) # create and populate Cruise column based on sheet name
    dfbotlist.append(df0) # append the sheet to a list
dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False) # concatenate the list into a single table
# Drop columns with no data in them
l1=set(dfbot.keys())  # snapshot column names before the drop so we can report what was removed
dfbot.dropna(axis=1,how='all',inplace=True)  # drop columns that contain no data at all
print('removed empty columns:',l1-set(dfbot.keys()))
Reading sheet 2017-01 Reading sheet 2017-63 Reading sheet 2017-05 Reading sheet 2017-64 Reading sheet 2017-09 Reading sheet 2017-65 removed empty columns: set()
# list the column names in the resulting table
print(dfbot.keys())
Index(['FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'LOC:STATION', 'ADM:PROJECT', 'Bottle_Number', 'Sample_Number', 'Pressure [decibar]', 'Depth [metres]', 'Temperature:Primary [deg C (ITS90)]', 'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]', 'Salinity:T0:C0 [PSS-78]', 'Temperature:Draw [deg C (ITS90)]', 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted [mg/m^3]', 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]', 'Flag:Oxygen:Dissolved', 'Nitrate_plus_Nitrite [umol/L]', 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate', 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR', 'Cruise', 'Zone', 'ADM:MISSION', 'Bottle:Firing_Sequence', 'Temperature*', 'Transmissivity [*/metre]', 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]', 'Temperature:Secondary [deg C (ITS90)]', 'Salinity:T1:C1 [PSS-78]', 'Number_of_bin_records'], dtype='object')
# no rows returned, so there are no rows with multiple temperature values
# count non-NaN (>=0 comparison is False for NaN) entries in each temperature column
print(np.sum(dfbot['Temperature:Primary [deg C (ITS90)]']>=0),
      np.sum(dfbot['Temperature:Secondary [deg C (ITS90)]']>=0),
      np.sum(dfbot['Temperature*']>=0))
# select rows where more than one of the three temperature columns holds a value;
# an empty result means the columns never overlap and can be safely merged below
dfbot.loc[(np.array([int(ii) for ii in (dfbot['Temperature:Primary [deg C (ITS90)]']>=0)])+\
           np.array([int(ii) for ii in (dfbot['Temperature:Secondary [deg C (ITS90)]']>=0)])+\
           np.array([int(ii) for ii in (dfbot['Temperature*']>=0)]))>1]
204 71 1061
FIL:START TIME YYYY/MM/DD HH:MM:SS | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | LOC:STATION | ADM:PROJECT | Bottle_Number | Sample_Number | ... | Zone | ADM:MISSION | Bottle:Firing_Sequence | Temperature* | Transmissivity [*/metre] | Oxygen:Dissolved:SBE [mL/L] | Oxygen:Dissolved:SBE [umol/kg] | Temperature:Secondary [deg C (ITS90)] | Salinity:T1:C1 [PSS-78] | Number_of_bin_records |
---|
0 rows × 43 columns
# no rows returned, so there are no rows with both salinity fields
# count non-NaN entries in each of the two salinity channels
print(np.sum(dfbot['Salinity:T0:C0 [PSS-78]']>=0),
      np.sum(dfbot['Salinity:T1:C1 [PSS-78]']>=0))
# rows holding values in BOTH salinity columns; empty result -> safe to merge them
dfbot.loc[(dfbot['Salinity:T0:C0 [PSS-78]']>=0)&\
          (dfbot['Salinity:T1:C1 [PSS-78]']>=0)]
931 71
FIL:START TIME YYYY/MM/DD HH:MM:SS | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | LOC:STATION | ADM:PROJECT | Bottle_Number | Sample_Number | ... | Zone | ADM:MISSION | Bottle:Firing_Sequence | Temperature* | Transmissivity [*/metre] | Oxygen:Dissolved:SBE [mL/L] | Oxygen:Dissolved:SBE [umol/kg] | Temperature:Secondary [deg C (ITS90)] | Salinity:T1:C1 [PSS-78] | Number_of_bin_records |
---|
0 rows × 43 columns
def subval(idf,colList):
    """Gap-fill one column of a DataFrame from fallback columns.

    The first entry in colList is the column to keep; the remaining entries
    are used, in order of precedence, to fill in values where the kept column
    is NaN. Generalized from the original hard-coded 2- and 3-column cases
    (which raised NotImplementedError otherwise) to any number of fallbacks;
    behavior for 2- and 3-element colList is unchanged.

    Parameters:
        idf: DataFrame to modify (modified in place and also returned)
        colList: sequence of column names; first is the target column
    Returns:
        the same DataFrame, with idf[colList[0]] gap-filled
    """
    target = colList[0]
    filled = idf[target]
    for fallback in colList[1:]:
        # fillna keeps existing values and substitutes the fallback only where NaN,
        # matching the original row-by-row precedence logic
        filled = filled.fillna(idf[fallback])
    idf[target] = filled
    return idf
# there are some duplicate columns here; handle them:
# (primary temperature/salinity channels are gap-filled from the secondary channels,
# which the checks above showed never overlap with the primaries)
dfbot=subval(dfbot,('Temperature:Primary [deg C (ITS90)]',
                    'Temperature:Secondary [deg C (ITS90)]',
                    'Temperature*'))
dfbot=subval(dfbot,('Salinity:T0:C0 [PSS-78]',
                    'Salinity:T1:C1 [PSS-78]'))
# rename the merged columns to channel-neutral names:
dfbot.rename(columns={'Temperature:Primary [deg C (ITS90)]':'Temperature [deg C (ITS90)]'},
             inplace=True)
dfbot.rename(columns={'Salinity:T0:C0 [PSS-78]':'Salinity [PSS-78]'},
             inplace=True)
# drop repetitive/unnecessary columns (their contents are now folded into the kept ones):
dfbot.drop(labels=['Temperature:Secondary [deg C (ITS90)]',
                   'Salinity:T1:C1 [PSS-78]','Temperature*'],axis=1,inplace=True)
# define a function that will be applied to the values in the index column;
# this makes it easier to drop non-data rows later
def convertIndex(val):
    """Convert a sample-number cell to int, or NaN if it is not numeric.

    Used as a pandas read_excel converter so that non-data rows (e.g. repeated
    header rows inside the sheet) get a consistent NaN entry that is easy to
    drop later with dropna.
    """
    try:
        x = int(val)
    except (ValueError, TypeError):
        # TypeError added: int(None) raises TypeError, not ValueError, so empty
        # cells would otherwise crash the converter instead of becoming NaN
        x = np.nan
    return x
# load the 2017 phytoplankton data with the following options:
# sheet_name='2017' -> choose the 2017 sheet
# skiprows=0 -> start reading at the first row of the sheet,
#  which contains the column headings
# converters={'Sample#': convertIndex,} -> apply the function defined above to the Sample# column
# verbose = True -> print extra information/ warnings/ errors
# read the 2017 CHEMTAX sheet; Sample# values are coerced to int-or-NaN by convertIndex
dfPhyto=pd.read_excel(pathPhyto,sheet_name='2017',engine='openpyxl',
                      skiprows=0,converters={'Sample#': convertIndex,},
                      verbose=True)
Reading sheet 2017
# display rows 48 to 59 of the resulting table
dfPhyto[48:60]
Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | 2017-005 | May | CPF1 | 398.0 | mean | 0 | 9.72624 | 1.18163 | 0.0900009 | 0.466654 | 0.12447 | 0 | 0 | 0 | 0 | 11.589 |
49 | 2017-005 | May | 22 | 410.0 | mean | 0 | 2.29192 | 0.0859758 | 0.0356223 | 0.272079 | 0.056573 | 0.176352 | 0.00451122 | 0.0184467 | 0.00451809 | 2.946 |
50 | 2017-005 | May | 24 | 413.0 | mean | 0 | 3.28244 | 0.0612094 | 0.0354772 | 0.698303 | 0.0780227 | 0.195614 | 0.00391747 | 0.0660138 | 0 | 4.421 |
51 | 2017-005 | May | 28 | 416.0 | mean | 0 | 1.73834 | 0 | 0.0320477 | 0.427669 | 0.0395294 | 0.146105 | 0.00864831 | 0.032897 | 0.00176814 | 2.427 |
52 | 2017-005 | May | GEO1 | 425.0 | mean | 0 | 2.74901 | 0.136714 | 0.00276401 | 0.373035 | 0.0327157 | 0 | 0.00170031 | 0.0384725 | 0.0245927 | 3.359 |
53 | 2017-005 | May | 38 | 428.0 | mean | 0 | 1.42461 | 0 | 0.152071 | 2.29114 | 0.217939 | 0.181603 | 0.0160822 | 0.0325562 | 0 | 4.316 |
54 | 2017-005 | May | 41 | 441.0 | mean | 0 | 0.13052 | 0 | 0.0224933 | 0.257959 | 0.0319908 | 0 | 2.00975e-05 | 0.0263617 | 0.0226547 | 0.492 |
55 | Cruise | Date | Station | NaN | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA |
56 | 2017-064 | June | SI | 9.0 | mean | 0 | 0.186265 | 0 | 0.0965613 | 0.577516 | 0.457205 | 0.713054 | 0.0895957 | 0.0658043 | 0 | 2.186 |
57 | 2017-064 | June | 59 | 23.0 | mean | 0 | 0.0964428 | 0.0675701 | 0.0290869 | 0.179129 | 0.0353787 | 0.0676973 | 0.00145237 | 0.0142423 | 0 | 0.491 |
58 | 2017-064 | June | 102 | 39.0 | mean | 0 | 0.432713 | 0.0959644 | 0.24912 | 0.702623 | 0.265839 | 0.45136 | 0.0362003 | 0.0883046 | 0.0208753 | 2.343 |
59 | 2017-064 | June | 75 | 54.0 | mean | 0 | 0.554319 | 0.148322 | 0.150959 | 0.854182 | 0.296814 | 0.573149 | 0.0448703 | 0.07966 | 0.0287253 | 2.731 |
# now, drop any rows from the table that have NaN values in either of the columns
# 'Sample#' or 'TchlA'
# This is why we applied a function to the Sample# column to make sure all
# non-numeric Sample# values would have a consistent NaN entry, making them easy to identify
# and remove
# drop rows lacking a sample number or a total chlorophyll value; the repeated
# header rows embedded in the sheet became NaN in Sample# via convertIndex
dfPhyto.dropna(subset=['Sample#', 'TchlA'],how='any',inplace=True)
# pandas creates its own index, and after dropping rows I like to reset it -
# this is just for convenience
dfPhyto.reset_index(drop=True,inplace=True)
# apply formatting function to all rows in Cruise column to get year-3digit format
dfPhyto['Cruise']=[fmtCruise(ii) for ii in dfPhyto['Cruise']]
# display part of the table, confirming that non-data rows have been removed
dfPhyto[48:60]
Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | 2017-005 | May | 24 | 413.0 | mean | 0 | 3.28244 | 0.0612094 | 0.0354772 | 0.698303 | 0.0780227 | 0.195614 | 0.00391747 | 0.0660138 | 0 | 4.421 |
49 | 2017-005 | May | 28 | 416.0 | mean | 0 | 1.73834 | 0 | 0.0320477 | 0.427669 | 0.0395294 | 0.146105 | 0.00864831 | 0.032897 | 0.00176814 | 2.427 |
50 | 2017-005 | May | GEO1 | 425.0 | mean | 0 | 2.74901 | 0.136714 | 0.00276401 | 0.373035 | 0.0327157 | 0 | 0.00170031 | 0.0384725 | 0.0245927 | 3.359 |
51 | 2017-005 | May | 38 | 428.0 | mean | 0 | 1.42461 | 0 | 0.152071 | 2.29114 | 0.217939 | 0.181603 | 0.0160822 | 0.0325562 | 0 | 4.316 |
52 | 2017-005 | May | 41 | 441.0 | mean | 0 | 0.13052 | 0 | 0.0224933 | 0.257959 | 0.0319908 | 0 | 2.00975e-05 | 0.0263617 | 0.0226547 | 0.492 |
53 | 2017-064 | June | SI | 9.0 | mean | 0 | 0.186265 | 0 | 0.0965613 | 0.577516 | 0.457205 | 0.713054 | 0.0895957 | 0.0658043 | 0 | 2.186 |
54 | 2017-064 | June | 59 | 23.0 | mean | 0 | 0.0964428 | 0.0675701 | 0.0290869 | 0.179129 | 0.0353787 | 0.0676973 | 0.00145237 | 0.0142423 | 0 | 0.491 |
55 | 2017-064 | June | 102 | 39.0 | mean | 0 | 0.432713 | 0.0959644 | 0.24912 | 0.702623 | 0.265839 | 0.45136 | 0.0362003 | 0.0883046 | 0.0208753 | 2.343 |
56 | 2017-064 | June | 75 | 54.0 | mean | 0 | 0.554319 | 0.148322 | 0.150959 | 0.854182 | 0.296814 | 0.573149 | 0.0448703 | 0.07966 | 0.0287253 | 2.731 |
57 | 2017-064 | June | 72 | 68.0 | mean | 0 | 0 | 0 | 0.159731 | 0.756824 | 0.0962737 | 0.128563 | 5.65439e-05 | 0.104552 | 0 | 1.246 |
58 | 2017-064 | June | 69 | 81.0 | mean | 0 | 0 | 0.0696512 | 0.146823 | 0.51675 | 0.0494223 | 0.114177 | 0.0103499 | 0.0511945 | 0 | 0.958368 |
59 | 2017-064 | June | ADCP | 92.0 | mean | 0 | 0 | 0.0229016 | 0.0760985 | 0.405645 | 0.0180952 | 0.0551009 | 0 | 0.0331584 | 0 | 0.611 |
# due to repeated column names in the original spreadsheet, '.1' was appended to the names
# of the phytoplankton columns;
# these lines correct the column names, removing the '.1':
# build the rename mapping in one step: map every column name ending in '.1'
# (pandas' suffix for duplicated column names) back to its text before the '.1'
renameDict = {col: col.split('.1')[0]
              for col in dfPhyto.keys() if col.endswith('.1')}
dfPhyto.rename(columns=renameDict, inplace=True)
dfPhyto
Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017-001 | Feb | 102 | 615.0 | mean | 1 | 0.153031 | 0.0070906 | 0.0951162 | 0.285236 | 0.057834 | 0.158406 | 0 | 0.0170267 | 0 | 1.77374 |
1 | 2017-001 | Feb | JF2 | 627.0 | mean | 1 | 0.0466544 | 0.0369462 | 0.133572 | 0.36522 | 0.0794122 | 0.179547 | 0.00376303 | 0.0606366 | 0 | 1.90575 |
2 | 2017-001 | Feb | 59 | 641.0 | mean | 1 | 0.119854 | 0.179411 | 0.0629763 | 0.176599 | 0.0232654 | 0 | 0.00529869 | 0.0125153 | 0 | 1.57992 |
3 | 2017-001 | Feb | 56 | 655.0 | mean | 1 | 0.346102 | 0.246237 | 0.137605 | 0.395798 | 0.059873 | 0.0607372 | 0.0109077 | 0.0422404 | 0.00985842 | 2.30936 |
4 | 2017-001 | Feb | 46 | 667.0 | mean | 2 | 0.336588 | 0.262766 | 0.117003 | 0.33433 | 0.0532367 | 0.047056 | 0.0141236 | 0.0194676 | 0.0122297 | 3.1968 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
114 | 2017-065 | Oct | 9 | 260.0 | mean | 0 | 0 | 0 | 0.0632929 | 0.109627 | 0 | 0.406064 | 0 | 0.0207089 | 0.0383072 | 0.638 |
115 | 2017-065 | Oct | 12 | 276.0 | mean | 0 | 1.29519 | 0.29206 | 0 | 0.198517 | 0.0181637 | 0.102396 | 0.0107763 | 0.00904861 | 0.0108495 | 1.937 |
116 | 2017-065 | Oct | 14 | 291.0 | mean | 0 | 0.100137 | 0 | 0.114614 | 0.169733 | 0.000143127 | 0.289479 | 0 | 0.0510349 | 0.0258585 | 0.751 |
117 | 2017-065 | Oct | 16 | 304.0 | mean | 0 | 0.344155 | 0.053402 | 0.0148634 | 0.150205 | 0 | 0.069992 | 0.00990213 | 0.0152399 | 0.00724086 | 0.665 |
118 | 2017-065 | Oct | 22 | 307.0 | mean | 0 | 1.52571 | 0.346386 | 0.0609604 | 0.261251 | 0.0640507 | 0.180817 | 0.0377362 | 0.0397075 | 0.0473817 | 2.564 |
119 rows × 16 columns
# sample number corrections
# (remap specific mis-recorded Sample# values in the phyto table -- presumably to the
# numbers used in the bottle data so the merge below matches; verify against source sheets)
iii=(dfPhyto['Cruise']=='2017-005')&(dfPhyto['Station']==28)&(dfPhyto['Sample#']==416)
if len(dfPhyto.loc[iii])>0:
    dfPhyto.loc[iii,['Sample#']]=9416
# re-evaluate the selection; after the correction it should match nothing
iii=(dfPhyto['Cruise']=='2017-005')&(dfPhyto['Station']==28)&(dfPhyto['Sample#']==416)
print('9416 This number should be zero:',len(dfPhyto.loc[iii]))
iii=(dfPhyto['Cruise']=='2017-064')&(dfPhyto['Station']==2)&(dfPhyto['Sample#']==213)
if len(dfPhyto.loc[iii])>0:
    dfPhyto.loc[iii,['Sample#']]=9213
iii=(dfPhyto['Cruise']=='2017-064')&(dfPhyto['Station']==2)&(dfPhyto['Sample#']==213)
print('9213 This number should be zero:',len(dfPhyto.loc[iii]))
# this one was correct already, but test it just in case:
# NOTE(review): Station is compared to int 28/2 above but str 'CPF1' here -- the
# column appears to hold mixed types; confirm the int comparisons actually match
iii=(dfPhyto['Cruise']=='2017-064')&(dfPhyto['Station']=='CPF1')&(dfPhyto['Sample#']==217)
if len(dfPhyto.loc[iii])>0:
    dfPhyto.loc[iii,['Sample#']]=9217
iii=(dfPhyto['Cruise']=='2017-064')&(dfPhyto['Station']=='CPF1')&(dfPhyto['Sample#']==217)
print('9217 This number should be zero:',len(dfPhyto.loc[iii]))
9416 This number should be zero: 0 9213 This number should be zero: 0 9217 This number should be zero: 0
# This is the important step- join the two tables ('left' and 'right'),
# matching the cruise IDs and sample numbers
# how='outer' -> all rows from both the left and the right tables will be included,
# even if they cannot be matched; this makes it easy to check for
# unmatched data later
# left_on specifies the name of the column to match in the left table (dfbot)
# right_on specifies the name of the column to match in the right table (dfPhyto)
# outer join on cruise ID + sample number; unmatched rows from either side are kept
dfout = pd.merge(dfbot, dfPhyto, how='outer',
                 left_on=['Cruise','Sample_Number'], right_on = ['Cruise','Sample#'])
# show the column names in the resulting table
dfout.keys()
Index(['FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'LOC:STATION', 'ADM:PROJECT', 'Bottle_Number', 'Sample_Number', 'Pressure [decibar]', 'Depth [metres]', 'Temperature [deg C (ITS90)]', 'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]', 'Salinity [PSS-78]', 'Temperature:Draw [deg C (ITS90)]', 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted [mg/m^3]', 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]', 'Flag:Oxygen:Dissolved', 'Nitrate_plus_Nitrite [umol/L]', 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate', 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR', 'Cruise', 'Zone', 'ADM:MISSION', 'Bottle:Firing_Sequence', 'Transmissivity [*/metre]', 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]', 'Number_of_bin_records', 'Date', 'Station', 'Sample#', 'rep', 'depth', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido', 'Cyanobacteria', 'TchlA'], dtype='object')
# Identify cases where phytoplankton data were matched to multiple samples in bottle data:
# right join keeps every phyto row; count how many bottle rows matched each phyto sample
dftest=pd.merge(dfbot, dfPhyto,how='right', left_on=['Cruise','Sample_Number'],right_on = ['Cruise','Sample#'])
temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})
temp.columns = ['icount']  # flatten the MultiIndex column to a single name
np.unique(temp.icount)  # a result of array([1]) means every phyto sample matched exactly one bottle row
array([1])
# check for Phyto samples matched to multiple bottle samples:
temp.loc[temp.icount>1]
icount | ||
---|---|---|
Cruise | Sample# |
# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]
icount | ||
---|---|---|
Cruise | Sample# |
# count rows in the merged table per (cruise, sample number) pair
temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})
temp2.columns = ['icount']  # flatten the MultiIndex column to a single name
# this will catch phyto matched to multiple bottle but also bottle with duplicate sample numbers per cruise:
temp2.loc[temp2.icount>1]
icount | ||
---|---|---|
Cruise | Sample_Number | |
2017-064 | -99 | 7 |
dfbot.loc[(dfbot.Cruise=='2017-064')&(dfbot.Sample_Number<0)]
FIL:START TIME YYYY/MM/DD HH:MM:SS | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | LOC:STATION | ADM:PROJECT | Bottle_Number | Sample_Number | ... | Flag:Phosphate | Comments by sample_numbeR | Cruise | Zone | ADM:MISSION | Bottle:Firing_Sequence | Transmissivity [*/metre] | Oxygen:Dissolved:SBE [mL/L] | Oxygen:Dissolved:SBE [umol/kg] | Number_of_bin_records | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
647 | 2017-06-22 17:07:45 | 7 | 48.50000 | -124.73450 | 252 | Chandler P. | 102 | NaN | NaN | -99 | ... | NaN | NaN | 2017-064 | UTC | 2017-64 | NaN | 35.5 | 6.13 | 267.4 | 241.0 |
662 | 2017-06-22 20:56:06 | 10 | 48.46950 | -124.54600 | 220 | Chandler P. | 75 | NaN | NaN | -99 | ... | NaN | NaN | 2017-064 | UTC | 2017-64 | NaN | 34.8 | 6.34 | 276.6 | 83.0 |
714 | 2017-06-23 12:35:06 | 27 | 48.24317 | -122.97500 | 153 | Chandler P. | 63 | NaN | NaN | -99 | ... | NaN | NaN | 2017-064 | UTC | 2017-64 | NaN | 35.7 | 2.75 | 119.7 | 241.0 |
744 | 2017-06-23 22:43:15 | 34 | 48.77350 | -123.02900 | 205 | Chandler P. | 56 | NaN | NaN | -99 | ... | NaN | NaN | 2017-064 | UTC | 2017-64 | NaN | 36.9 | 5.59 | 244.8 | 241.0 |
819 | 2017-06-24 19:30:30 | 61 | 49.31816 | -123.79933 | 342 | Chandler P. | 27 | NaN | NaN | -99 | ... | NaN | NaN | 2017-064 | UTC | 2017-64 | NaN | 12.4 | 8.18 | 359.1 | 241.0 |
835 | 2017-06-25 00:47:05 | 68 | 49.40200 | -124.15434 | 277 | Chandler P. | 2 | NaN | NaN | -99 | ... | NaN | NaN | 2017-064 | UTC | 2017-64 | NaN | 23.9 | 7.26 | 318.2 | 241.0 |
936 | 2017-06-26 01:21:43 | 95 | 49.88317 | -124.99350 | 316 | Chandler P. | 14 | NaN | NaN | -99 | ... | NaN | NaN | 2017-064 | UTC | 2017-64 | NaN | 31.7 | 6.96 | 304.5 | 241.0 |
7 rows × 40 columns
dfout.loc[(dfout.Cruise=='2017-064')&(dfout.Sample_Number<0)]
FIL:START TIME YYYY/MM/DD HH:MM:SS | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | LOC:STATION | ADM:PROJECT | Bottle_Number | Sample_Number | ... | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
647 | 2017-06-22 17:07:45 | 7 | 48.50000 | -124.73450 | 252 | Chandler P. | 102 | NaN | NaN | -99 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
648 | 2017-06-22 20:56:06 | 10 | 48.46950 | -124.54600 | 220 | Chandler P. | 75 | NaN | NaN | -99 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
649 | 2017-06-23 12:35:06 | 27 | 48.24317 | -122.97500 | 153 | Chandler P. | 63 | NaN | NaN | -99 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
650 | 2017-06-23 22:43:15 | 34 | 48.77350 | -123.02900 | 205 | Chandler P. | 56 | NaN | NaN | -99 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
651 | 2017-06-24 19:30:30 | 61 | 49.31816 | -123.79933 | 342 | Chandler P. | 27 | NaN | NaN | -99 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
652 | 2017-06-25 00:47:05 | 68 | 49.40200 | -124.15434 | 277 | Chandler P. | 2 | NaN | NaN | -99 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
653 | 2017-06-26 01:21:43 | 95 | 49.88317 | -124.99350 | 316 | Chandler P. | 14 | NaN | NaN | -99 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
7 rows × 55 columns
# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]
icount | ||
---|---|---|
Cruise | Sample# |
# if the output table is longer than either of the input tables, some columns were not matched
len(dfout), len(dfPhyto), len(dfbot)
(1336, 119, 1336)
# Check that the number of cells with data in the 'Cyanobacteria' column is
# the same for the input and output tables to show that no rows are missing:
np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)
(119, 119)
# If there were data rows from the phytoplankton table that were not matched to
# rows from the bottle table, their indices from the phytoplankton table would be
# displayed below (the series [] would not be empty)
print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Sample#'])
Series([], Name: Sample#, dtype: float64)
# drop repetitive/unnecessary columns:
dfout.drop(labels=['Date', 'Station', 'Sample#', 'rep',
                   'depth',], axis=1, inplace=True)
# round each phytoplankton group column (and total chl) to 3 decimal places,
# element by element; NaN entries stay NaN
phytoCols = ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',
             'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',
             'TchlA')
for phytoCol in phytoCols:
    dfout[phytoCol] = [np.round(val, decimals=3) for val in dfout[phytoCol]]
# Check for unexpected depths
dfout.loc[((dfout['Pressure [decibar]']>10)|(dfout['Depth [metres]']>10))&(dfout['Diatoms-1']>=0)]#,
# ['FIL:START TIME YYYY/MM/DD HH:MM:SS','Pressure [decibar]','Depth [metres]','Diatoms-1']]
FIL:START TIME YYYY/MM/DD HH:MM:SS | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | LOC:STATION | ADM:PROJECT | Bottle_Number | Sample_Number | ... | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA |
---|
0 rows × 50 columns
dfout['Cyanobacteria']
0 NaN 1 NaN 2 NaN 3 NaN 4 NaN ... 1331 NaN 1332 0.007 1333 NaN 1334 NaN 1335 0.047 Name: Cyanobacteria, Length: 1336, dtype: float64
# now write the output table to a .csv file:
dfout.to_csv(pathOut, index=False)  # index=False: omit the pandas row index from the CSV
dfout.keys()  # display the final column set for the record
Index(['FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'LOC:STATION', 'ADM:PROJECT', 'Bottle_Number', 'Sample_Number', 'Pressure [decibar]', 'Depth [metres]', 'Temperature [deg C (ITS90)]', 'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]', 'Salinity [PSS-78]', 'Temperature:Draw [deg C (ITS90)]', 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted [mg/m^3]', 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]', 'Flag:Oxygen:Dissolved', 'Nitrate_plus_Nitrite [umol/L]', 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate', 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR', 'Cruise', 'Zone', 'ADM:MISSION', 'Bottle:Firing_Sequence', 'Transmissivity [*/metre]', 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]', 'Number_of_bin_records', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido', 'Cyanobacteria', 'TchlA'], dtype='object')