import numpy as np # this module handles arrays, but here we need it for its NaN value
import pandas as pd # this module contains a lot of tools for handling tabular data
import re
import gsw
# define paths to the source files and eventual output file
pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2017 SoG bottle.xlsx'  # input: bottle (CTD/chemistry) workbook, one sheet per cruise
pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2019 SoG abs values New ALLO.xlsx'  # input: phytoplankton CHEMTAX workbook, one sheet per year
pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2017_NewALLO.csv'  # output: merged bottle+phyto table for 2017
#formatting function to get year-(3digit) format
def fmtCruise(istr):
    """Normalize a cruise ID to year-(3 digit) format, e.g. '2017-01' -> '2017-001'.

    Parameters:
        istr: cruise ID string, either 'YYYY-NN' or 'YYYY-NNN'
    Returns:
        cruise ID string in 'YYYY-NNN' form
    Raises:
        ValueError: if istr matches neither expected pattern
    """
    if re.fullmatch('[0-9]{4}-[0-9]{2}', istr):
        # two-digit cruise number: pad with a leading zero
        sp = re.split('-', istr)
        rstr = sp[0] + '-0' + sp[1]
    elif re.fullmatch('[0-9]{4}-[0-9]{3}', istr):
        # already in the desired format
        rstr = istr
    else:
        # fix: the original passed a tuple to ValueError, which prints as
        # ("Input had unexpected format:", istr); format the value into the message
        raise ValueError(f'Input had unexpected format: {istr}')
    return rstr
# get names of sheets in notebook
# open the bottle workbook just long enough to list its sheets (one sheet per cruise)
with pd.ExcelFile(pathBottle,engine='openpyxl') as xl:
    sheets=xl.sheet_names
print(sheets)
['2017-01', '2017-63', '2017-05', '2017-64', '2017-09', '2017-65']
# load each sheet in the bottle Excel file and concatenate them together into one table
dfbotlist=list()  # accumulates one DataFrame per sheet
for sheet in sheets:
    df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,engine='openpyxl',
                      na_values=(-99,-99.9)) # read each sheet; include additional na values
    df0['Cruise']=fmtCruise(sheet) # create and populate Cruise column based on sheet name
    dfbotlist.append(df0) # append the sheet to a list
dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False) # concatenate the list into a single table
# Drop columns with no data in them
l1=set(dfbot.keys())  # snapshot column names before the drop so we can report what was removed
dfbot.dropna(axis=1,how='all',inplace=True)  # drop columns that contain no data at all
print('removed empty columns:',l1-set(dfbot.keys()))
Reading sheet 2017-01 Reading sheet 2017-63 Reading sheet 2017-05 Reading sheet 2017-64 Reading sheet 2017-09 Reading sheet 2017-65 removed empty columns: set()
# list the column names in the resulting table
print(dfbot.keys())
Index(['FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'LOC:STATION', 'ADM:PROJECT', 'Bottle_Number', 'Sample_Number', 'Pressure [decibar]', 'Depth [metres]', 'Temperature:Primary [deg C (ITS90)]', 'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]', 'Salinity:T0:C0 [PSS-78]', 'Temperature:Draw [deg C (ITS90)]', 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted [mg/m^3]', 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]', 'Flag:Oxygen:Dissolved', 'Nitrate_plus_Nitrite [umol/L]', 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate', 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR', 'Cruise', 'Zone', 'ADM:MISSION', 'Bottle:Firing_Sequence', 'Temperature*', 'Transmissivity [*/metre]', 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]', 'Temperature:Secondary [deg C (ITS90)]', 'Salinity:T1:C1 [PSS-78]', 'Number_of_bin_records'], dtype='object')
# no rows returned, so there are no rows with multiple temperature values
# count non-NaN (>=0 comparison is False for NaN) entries in each temperature column
print(np.sum(dfbot['Temperature:Primary [deg C (ITS90)]']>=0),
      np.sum(dfbot['Temperature:Secondary [deg C (ITS90)]']>=0),
      np.sum(dfbot['Temperature*']>=0))
# select rows where more than one of the three temperature columns holds a value;
# an empty result means the columns never overlap and can be safely merged below
dfbot.loc[(np.array([int(ii) for ii in (dfbot['Temperature:Primary [deg C (ITS90)]']>=0)])+\
           np.array([int(ii) for ii in (dfbot['Temperature:Secondary [deg C (ITS90)]']>=0)])+\
           np.array([int(ii) for ii in (dfbot['Temperature*']>=0)]))>1]
204 71 1061
FIL:START TIME YYYY/MM/DD HH:MM:SS | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | LOC:STATION | ADM:PROJECT | Bottle_Number | Sample_Number | ... | Zone | ADM:MISSION | Bottle:Firing_Sequence | Temperature* | Transmissivity [*/metre] | Oxygen:Dissolved:SBE [mL/L] | Oxygen:Dissolved:SBE [umol/kg] | Temperature:Secondary [deg C (ITS90)] | Salinity:T1:C1 [PSS-78] | Number_of_bin_records |
---|
0 rows × 43 columns
# no rows returned, so there are no rows with both salinity fields
# count non-NaN entries in each of the two salinity channels
print(np.sum(dfbot['Salinity:T0:C0 [PSS-78]']>=0),
      np.sum(dfbot['Salinity:T1:C1 [PSS-78]']>=0))
# rows holding values in BOTH salinity columns; empty result -> safe to merge them
dfbot.loc[(dfbot['Salinity:T0:C0 [PSS-78]']>=0)&\
          (dfbot['Salinity:T1:C1 [PSS-78]']>=0)]
931 71
FIL:START TIME YYYY/MM/DD HH:MM:SS | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | LOC:STATION | ADM:PROJECT | Bottle_Number | Sample_Number | ... | Zone | ADM:MISSION | Bottle:Firing_Sequence | Temperature* | Transmissivity [*/metre] | Oxygen:Dissolved:SBE [mL/L] | Oxygen:Dissolved:SBE [umol/kg] | Temperature:Secondary [deg C (ITS90)] | Salinity:T1:C1 [PSS-78] | Number_of_bin_records |
---|
0 rows × 43 columns
def subval(idf,colList):
    """Gap-fill one column of a DataFrame from fallback columns.

    The first entry in colList is the column to keep; the remaining entries
    are used, in order of precedence, to fill in values where the kept column
    is NaN. Generalized from the original hard-coded 2- and 3-column cases
    (which raised NotImplementedError otherwise) to any number of fallbacks;
    behavior for 2- and 3-element colList is unchanged.

    Parameters:
        idf: DataFrame to modify (modified in place and also returned)
        colList: sequence of column names; first is the target column
    Returns:
        the same DataFrame, with idf[colList[0]] gap-filled
    """
    target = colList[0]
    filled = idf[target]
    for fallback in colList[1:]:
        # fillna keeps existing values and substitutes the fallback only where NaN,
        # matching the original row-by-row precedence logic
        filled = filled.fillna(idf[fallback])
    idf[target] = filled
    return idf
# there are some duplicate columns here; handle them:
# (primary temperature/salinity channels are gap-filled from the secondary channels,
# which the checks above showed never overlap with the primaries)
dfbot=subval(dfbot,('Temperature:Primary [deg C (ITS90)]',
                    'Temperature:Secondary [deg C (ITS90)]',
                    'Temperature*'))
dfbot=subval(dfbot,('Salinity:T0:C0 [PSS-78]',
                    'Salinity:T1:C1 [PSS-78]'))
# rename the merged columns to channel-neutral names:
dfbot.rename(columns={'Temperature:Primary [deg C (ITS90)]':'Temperature [deg C (ITS90)]'},
             inplace=True)
dfbot.rename(columns={'Salinity:T0:C0 [PSS-78]':'Salinity [PSS-78]'},
             inplace=True)
# drop repetitive/unnecessary columns (their contents are now folded into the kept ones):
dfbot.drop(labels=['Temperature:Secondary [deg C (ITS90)]',
                   'Salinity:T1:C1 [PSS-78]','Temperature*'],axis=1,inplace=True)
# define a function that will be applied to the values in the index column;
# this makes it easier to drop non-data rows later
def convertIndex(val):
    """Convert a sample-number cell to int, or NaN if it is not numeric.

    Used as a pandas read_excel converter so that non-data rows (e.g. repeated
    header rows inside the sheet) get a consistent NaN entry that is easy to
    drop later with dropna.
    """
    try:
        x = int(val)
    except (ValueError, TypeError):
        # TypeError added: int(None) raises TypeError, not ValueError, so empty
        # cells would otherwise crash the converter instead of becoming NaN
        x = np.nan
    return x
# load the 2017 phytoplankton data with the following options:
# sheet_name='2017' -> choose the 2017 sheet
# skiprows=0 -> start reading at the first row of the sheet,
#  which contains the column headings
# converters={'Sample#': convertIndex,} -> apply the function defined above to the Sample# column
# verbose = True -> print extra information/ warnings/ errors
# read the 2017 CHEMTAX sheet; Sample# values are coerced to int-or-NaN by convertIndex
dfPhyto=pd.read_excel(pathPhyto,sheet_name='2017',engine='openpyxl',
                      skiprows=0,converters={'Sample#': convertIndex,},
                      verbose=True)
Reading sheet 2017
# display rows 48 to 59 of the resulting table
dfPhyto[48:60]
Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | 2017-005 | May | CPF1 | 398.0 | mean | 0 | 9.72624 | 1.18163 | 0.0900009 | 0.466654 | 0.12447 | 0 | 0 | 0 | 0 | 11.589 |
49 | 2017-005 | May | 22 | 410.0 | mean | 0 | 2.29192 | 0.0859758 | 0.0356223 | 0.272079 | 0.056573 | 0.176352 | 0.00451122 | 0.0184467 | 0.00451809 | 2.946 |
50 | 2017-005 | May | 24 | 413.0 | mean | 0 | 3.28244 | 0.0612094 | 0.0354772 | 0.698303 | 0.0780227 | 0.195614 | 0.00391747 | 0.0660138 | 0 | 4.421 |
51 | 2017-005 | May | 28 | 416.0 | mean | 0 | 1.73834 | 0 | 0.0320477 | 0.427669 | 0.0395294 | 0.146105 | 0.00864831 | 0.032897 | 0.00176814 | 2.427 |
52 | 2017-005 | May | GEO1 | 425.0 | mean | 0 | 2.74901 | 0.136714 | 0.00276401 | 0.373035 | 0.0327157 | 0 | 0.00170031 | 0.0384725 | 0.0245927 | 3.359 |
53 | 2017-005 | May | 38 | 428.0 | mean | 0 | 1.42461 | 0 | 0.152071 | 2.29114 | 0.217939 | 0.181603 | 0.0160822 | 0.0325562 | 0 | 4.316 |
54 | 2017-005 | May | 41 | 441.0 | mean | 0 | 0.13052 | 0 | 0.0224933 | 0.257959 | 0.0319908 | 0 | 2.00975e-05 | 0.0263617 | 0.0226547 | 0.492 |
55 | Cruise | Date | Station | NaN | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA |
56 | 2017-064 | June | SI | 9.0 | mean | 0 | 0.186265 | 0 | 0.0965613 | 0.577516 | 0.457205 | 0.713054 | 0.0895957 | 0.0658043 | 0 | 2.186 |
57 | 2017-064 | June | 59 | 23.0 | mean | 0 | 0.0964428 | 0.0675701 | 0.0290869 | 0.179129 | 0.0353787 | 0.0676973 | 0.00145237 | 0.0142423 | 0 | 0.491 |
58 | 2017-064 | June | 102 | 39.0 | mean | 0 | 0.432713 | 0.0959644 | 0.24912 | 0.702623 | 0.265839 | 0.45136 | 0.0362003 | 0.0883046 | 0.0208753 | 2.343 |
59 | 2017-064 | June | 75 | 54.0 | mean | 0 | 0.554319 | 0.148322 | 0.150959 | 0.854182 | 0.296814 | 0.573149 | 0.0448703 | 0.07966 | 0.0287253 | 2.731 |
# now, drop any rows from the table that have NaN values in either of the columns
# 'Sample#' or 'TchlA'
# This is why we applied a function to the Sample# column to make sure all
# non-numeric Sample# values would have a consistent NaN entry, making them easy to identify
# and remove
# drop rows lacking a sample number or a total chlorophyll value; the repeated
# header rows embedded in the sheet became NaN in Sample# via convertIndex
dfPhyto.dropna(subset=['Sample#', 'TchlA'],how='any',inplace=True)
# pandas creates its own index, and after dropping rows I like to reset it -
# this is just for convenience
dfPhyto.reset_index(drop=True,inplace=True)
# apply formatting function to all rows in Cruise column to get year-3digit format
dfPhyto['Cruise']=[fmtCruise(ii) for ii in dfPhyto['Cruise']]
# display part of the table, confirming that non-data rows have been removed
dfPhyto[48:60]
Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | 2017-005 | May | 24 | 413.0 | mean | 0 | 3.28244 | 0.0612094 | 0.0354772 | 0.698303 | 0.0780227 | 0.195614 | 0.00391747 | 0.0660138 | 0 | 4.421 |
49 | 2017-005 | May | 28 | 416.0 | mean | 0 | 1.73834 | 0 | 0.0320477 | 0.427669 | 0.0395294 | 0.146105 | 0.00864831 | 0.032897 | 0.00176814 | 2.427 |
50 | 2017-005 | May | GEO1 | 425.0 | mean | 0 | 2.74901 | 0.136714 | 0.00276401 | 0.373035 | 0.0327157 | 0 | 0.00170031 | 0.0384725 | 0.0245927 | 3.359 |
51 | 2017-005 | May | 38 | 428.0 | mean | 0 | 1.42461 | 0 | 0.152071 | 2.29114 | 0.217939 | 0.181603 | 0.0160822 | 0.0325562 | 0 | 4.316 |
52 | 2017-005 | May | 41 | 441.0 | mean | 0 | 0.13052 | 0 | 0.0224933 | 0.257959 | 0.0319908 | 0 | 2.00975e-05 | 0.0263617 | 0.0226547 | 0.492 |
53 | 2017-064 | June | SI | 9.0 | mean | 0 | 0.186265 | 0 | 0.0965613 | 0.577516 | 0.457205 | 0.713054 | 0.0895957 | 0.0658043 | 0 | 2.186 |
54 | 2017-064 | June | 59 | 23.0 | mean | 0 | 0.0964428 | 0.0675701 | 0.0290869 | 0.179129 | 0.0353787 | 0.0676973 | 0.00145237 | 0.0142423 | 0 | 0.491 |
55 | 2017-064 | June | 102 | 39.0 | mean | 0 | 0.432713 | 0.0959644 | 0.24912 | 0.702623 | 0.265839 | 0.45136 | 0.0362003 | 0.0883046 | 0.0208753 | 2.343 |
56 | 2017-064 | June | 75 | 54.0 | mean | 0 | 0.554319 | 0.148322 | 0.150959 | 0.854182 | 0.296814 | 0.573149 | 0.0448703 | 0.07966 | 0.0287253 | 2.731 |
57 | 2017-064 | June | 72 | 68.0 | mean | 0 | 0 | 0 | 0.159731 | 0.756824 | 0.0962737 | 0.128563 | 5.65439e-05 | 0.104552 | 0 | 1.246 |
58 | 2017-064 | June | 69 | 81.0 | mean | 0 | 0 | 0.0696512 | 0.146823 | 0.51675 | 0.0494223 | 0.114177 | 0.0103499 | 0.0511945 | 0 | 0.958368 |
59 | 2017-064 | June | ADCP | 92.0 | mean | 0 | 0 | 0.0229016 | 0.0760985 | 0.405645 | 0.0180952 | 0.0551009 | 0 | 0.0331584 | 0 | 0.611 |
# due to repeated column names in the original spreadsheet, '.1' was appended to the names
# of the phytoplankton columns;
# these lines correct the column names, removing the '.1':
# build the rename mapping in one step: map every column name ending in '.1'
# (pandas' suffix for duplicated column names) back to its text before the '.1'
renameDict = {col: col.split('.1')[0]
              for col in dfPhyto.keys() if col.endswith('.1')}
dfPhyto.rename(columns=renameDict, inplace=True)
dfPhyto
Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017-001 | Feb | 102 | 615.0 | mean | 1 | 0.153031 | 0.0070906 | 0.0951162 | 0.285236 | 0.057834 | 0.158406 | 0 | 0.0170267 | 0 | 1.77374 |
1 | 2017-001 | Feb | JF2 | 627.0 | mean | 1 | 0.0466544 | 0.0369462 | 0.133572 | 0.36522 | 0.0794122 | 0.179547 | 0.00376303 | 0.0606366 | 0 | 1.90575 |
2 | 2017-001 | Feb | 59 | 641.0 | mean | 1 | 0.119854 | 0.179411 | 0.0629763 | 0.176599 | 0.0232654 | 0 | 0.00529869 | 0.0125153 | 0 | 1.57992 |
3 | 2017-001 | Feb | 56 | 655.0 | mean | 1 | 0.346102 | 0.246237 | 0.137605 | 0.395798 | 0.059873 | 0.0607372 | 0.0109077 | 0.0422404 | 0.00985842 | 2.30936 |
4 | 2017-001 | Feb | 46 | 667.0 | mean | 2 | 0.336588 | 0.262766 | 0.117003 | 0.33433 | 0.0532367 | 0.047056 | 0.0141236 | 0.0194676 | 0.0122297 | 3.1968 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
114 | 2017-065 | Oct | 9 | 260.0 | mean | 0 | 0 | 0 | 0.0632929 | 0.109627 | 0 | 0.406064 | 0 | 0.0207089 | 0.0383072 | 0.638 |
115 | 2017-065 | Oct | 12 | 276.0 | mean | 0 | 1.29519 | 0.29206 | 0 | 0.198517 | 0.0181637 | 0.102396 | 0.0107763 | 0.00904861 | 0.0108495 | 1.937 |
116 | 2017-065 | Oct | 14 | 291.0 | mean | 0 | 0.100137 | 0 | 0.114614 | 0.169733 | 0.000143127 | 0.289479 | 0 | 0.0510349 | 0.0258585 | 0.751 |
117 | 2017-065 | Oct | 16 | 304.0 | mean | 0 | 0.344155 | 0.053402 | 0.0148634 | 0.150205 | 0 | 0.069992 | 0.00990213 | 0.0152399 | 0.00724086 | 0.665 |
118 | 2017-065 | Oct | 22 | 307.0 | mean | 0 | 1.52571 | 0.346386 | 0.0609604 | 0.261251 | 0.0640507 | 0.180817 | 0.0377362 | 0.0397075 | 0.0473817 | 2.564 |
119 rows × 16 columns
# sample number corrections
# (remap specific mis-recorded Sample# values in the phyto table -- presumably to the
# numbers used in the bottle data so the merge below matches; verify against source sheets)
iii=(dfPhyto['Cruise']=='2017-005')&(dfPhyto['Station']==28)&(dfPhyto['Sample#']==416)
if len(dfPhyto.loc[iii])>0:
    dfPhyto.loc[iii,['Sample#']]=9416
# re-evaluate the selection; after the correction it should match nothing
iii=(dfPhyto['Cruise']=='2017-005')&(dfPhyto['Station']==28)&(dfPhyto['Sample#']==416)
print('9416 This number should be zero:',len(dfPhyto.loc[iii]))
iii=(dfPhyto['Cruise']=='2017-064')&(dfPhyto['Station']==2)&(dfPhyto['Sample#']==213)
if len(dfPhyto.loc[iii])>0:
    dfPhyto.loc[iii,['Sample#']]=9213
iii=(dfPhyto['Cruise']=='2017-064')&(dfPhyto['Station']==2)&(dfPhyto['Sample#']==213)
print('9213 This number should be zero:',len(dfPhyto.loc[iii]))
# this one was correct already, but test it just in case:
# NOTE(review): Station is compared to int 28/2 above but str 'CPF1' here -- the
# column appears to hold mixed types; confirm the int comparisons actually match
iii=(dfPhyto['Cruise']=='2017-064')&(dfPhyto['Station']=='CPF1')&(dfPhyto['Sample#']==217)
if len(dfPhyto.loc[iii])>0:
    dfPhyto.loc[iii,['Sample#']]=9217
iii=(dfPhyto['Cruise']=='2017-064')&(dfPhyto['Station']=='CPF1')&(dfPhyto['Sample#']==217)
print('9217 This number should be zero:',len(dfPhyto.loc[iii]))
9416 This number should be zero: 0 9213 This number should be zero: 0 9217 This number should be zero: 0
# This is the important step- join the two tables ('left' and 'right'),
# matching the cruise IDs and sample numbers
# how='outer' -> all rows from both the left and the right tables will be included,
# even if they cannot be matched; this makes it easy to check for
# unmatched data later
# left_on specifies the name of the column to match in the left table (dfbot)
# right_on specifies the name of the column to match in the right table (dfPhyto)
# outer join on cruise ID + sample number; unmatched rows from either side are kept
dfout = pd.merge(dfbot, dfPhyto, how='outer',
                 left_on=['Cruise','Sample_Number'], right_on = ['Cruise','Sample#'])
# show the column names in the resulting table
dfout.keys()
Index(['FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'LOC:STATION', 'ADM:PROJECT', 'Bottle_Number', 'Sample_Number', 'Pressure [decibar]', 'Depth [metres]', 'Temperature [deg C (ITS90)]', 'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]', 'Salinity [PSS-78]', 'Temperature:Draw [deg C (ITS90)]', 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted [mg/m^3]', 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]', 'Flag:Oxygen:Dissolved', 'Nitrate_plus_Nitrite [umol/L]', 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate', 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR', 'Cruise', 'Zone', 'ADM:MISSION', 'Bottle:Firing_Sequence', 'Transmissivity [*/metre]', 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]', 'Number_of_bin_records', 'Date', 'Station', 'Sample#', 'rep', 'depth', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido', 'Cyanobacteria', 'TchlA'], dtype='object')
# Identify cases where phytoplankton data were matched to multiple samples in bottle data:
# right join keeps every phyto row; count how many bottle rows matched each phyto sample
dftest=pd.merge(dfbot, dfPhyto,how='right', left_on=['Cruise','Sample_Number'],right_on = ['Cruise','Sample#'])
temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})
temp.columns = ['icount']  # flatten the MultiIndex column to a single name
np.unique(temp.icount)  # a result of array([1]) means every phyto sample matched exactly one bottle row
array([1])
# check for Phyto samples matched to multiple bottle samples:
temp.loc[temp.icount>1]
icount | ||
---|---|---|
Cruise | Sample# |
# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]
icount | ||
---|---|---|
Cruise | Sample# |
# count rows in the merged table per (cruise, sample number) pair
temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})
temp2.columns = ['icount']  # flatten the MultiIndex column to a single name
# this will catch phyto matched to multiple bottle but also bottle with duplicate sample numbers per cruise:
temp2.loc[temp2.icount>1]
icount | ||
---|---|---|
Cruise | Sample_Number | |
2017-064 | -99 | 7 |
dfbot.loc[(dfbot.Cruise=='2017-064')&(dfbot.Sample_Number<0)]
FIL:START TIME YYYY/MM/DD HH:MM:SS | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | LOC:STATION | ADM:PROJECT | Bottle_Number | Sample_Number | ... | Flag:Phosphate | Comments by sample_numbeR | Cruise | Zone | ADM:MISSION | Bottle:Firing_Sequence | Transmissivity [*/metre] | Oxygen:Dissolved:SBE [mL/L] | Oxygen:Dissolved:SBE [umol/kg] | Number_of_bin_records | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
647 | 2017-06-22 17:07:45 | 7 | 48.50000 | -124.73450 | 252 | Chandler P. | 102 | NaN | NaN | -99 | ... | NaN | NaN | 2017-064 | UTC | 2017-64 | NaN | 35.5 | 6.13 | 267.4 | 241.0 |
662 | 2017-06-22 20:56:06 | 10 | 48.46950 | -124.54600 | 220 | Chandler P. | 75 | NaN | NaN | -99 | ... | NaN | NaN | 2017-064 | UTC | 2017-64 | NaN | 34.8 | 6.34 | 276.6 | 83.0 |
714 | 2017-06-23 12:35:06 | 27 | 48.24317 | -122.97500 | 153 | Chandler P. | 63 | NaN | NaN | -99 | ... | NaN | NaN | 2017-064 | UTC | 2017-64 | NaN | 35.7 | 2.75 | 119.7 | 241.0 |
744 | 2017-06-23 22:43:15 | 34 | 48.77350 | -123.02900 | 205 | Chandler P. | 56 | NaN | NaN | -99 | ... | NaN | NaN | 2017-064 | UTC | 2017-64 | NaN | 36.9 | 5.59 | 244.8 | 241.0 |
819 | 2017-06-24 19:30:30 | 61 | 49.31816 | -123.79933 | 342 | Chandler P. | 27 | NaN | NaN | -99 | ... | NaN | NaN | 2017-064 | UTC | 2017-64 | NaN | 12.4 | 8.18 | 359.1 | 241.0 |
835 | 2017-06-25 00:47:05 | 68 | 49.40200 | -124.15434 | 277 | Chandler P. | 2 | NaN | NaN | -99 | ... | NaN | NaN | 2017-064 | UTC | 2017-64 | NaN | 23.9 | 7.26 | 318.2 | 241.0 |
936 | 2017-06-26 01:21:43 | 95 | 49.88317 | -124.99350 | 316 | Chandler P. | 14 | NaN | NaN | -99 | ... | NaN | NaN | 2017-064 | UTC | 2017-64 | NaN | 31.7 | 6.96 | 304.5 | 241.0 |
7 rows × 40 columns
dfout.loc[(dfout.Cruise=='2017-064')&(dfout.Sample_Number<0)]
FIL:START TIME YYYY/MM/DD HH:MM:SS | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | LOC:STATION | ADM:PROJECT | Bottle_Number | Sample_Number | ... | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
647 | 2017-06-22 17:07:45 | 7 | 48.50000 | -124.73450 | 252 | Chandler P. | 102 | NaN | NaN | -99 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
648 | 2017-06-22 20:56:06 | 10 | 48.46950 | -124.54600 | 220 | Chandler P. | 75 | NaN | NaN | -99 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
649 | 2017-06-23 12:35:06 | 27 | 48.24317 | -122.97500 | 153 | Chandler P. | 63 | NaN | NaN | -99 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
650 | 2017-06-23 22:43:15 | 34 | 48.77350 | -123.02900 | 205 | Chandler P. | 56 | NaN | NaN | -99 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
651 | 2017-06-24 19:30:30 | 61 | 49.31816 | -123.79933 | 342 | Chandler P. | 27 | NaN | NaN | -99 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
652 | 2017-06-25 00:47:05 | 68 | 49.40200 | -124.15434 | 277 | Chandler P. | 2 | NaN | NaN | -99 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
653 | 2017-06-26 01:21:43 | 95 | 49.88317 | -124.99350 | 316 | Chandler P. | 14 | NaN | NaN | -99 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
7 rows × 55 columns
# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]
icount | ||
---|---|---|
Cruise | Sample# |
# if the output table is longer than either of the input tables, some columns were not matched
len(dfout), len(dfPhyto), len(dfbot)
(1336, 119, 1336)
# Check that the number of cells with data in the 'Cyanobacteria' column is
# the same for the input and output tables to show that no rows are missing:
np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)
(119, 119)
# If there were data rows from the phytoplankton table that were not matched to
# rows from the bottle table, their indices from the phytoplankton table would be
# displayed below (the series [] would not be empty)
print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Sample#'])
Series([], Name: Sample#, dtype: float64)
# drop repetitive/unnecessary columns:
dfout.drop(labels=['Date', 'Station', 'Sample#', 'rep',
                   'depth',], axis=1, inplace=True)
# round each phytoplankton group column (and total chl) to 3 decimal places,
# element by element; NaN entries stay NaN
phytoCols = ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',
             'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',
             'TchlA')
for phytoCol in phytoCols:
    dfout[phytoCol] = [np.round(val, decimals=3) for val in dfout[phytoCol]]
# Check for unexpected depths
dfout.loc[((dfout['Pressure [decibar]']>10)|(dfout['Depth [metres]']>10))&(dfout['Diatoms-1']>=0)]#,
# ['FIL:START TIME YYYY/MM/DD HH:MM:SS','Pressure [decibar]','Depth [metres]','Diatoms-1']]
FIL:START TIME YYYY/MM/DD HH:MM:SS | LOC:EVENT_NUMBER | LOC:LATITUDE | LOC:LONGITUDE | LOC:WATER DEPTH | ADM:SCIENTIST | LOC:STATION | ADM:PROJECT | Bottle_Number | Sample_Number | ... | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA |
---|
0 rows × 50 columns
dfout['Cyanobacteria']
0 NaN 1 NaN 2 NaN 3 NaN 4 NaN ... 1331 NaN 1332 0.007 1333 NaN 1334 NaN 1335 0.047 Name: Cyanobacteria, Length: 1336, dtype: float64
# now write the output table to a .csv file:
dfout.to_csv(pathOut, index=False)  # index=False: omit the pandas row index from the CSV
dfout.keys()  # display the final column set for the record
Index(['FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'LOC:STATION', 'ADM:PROJECT', 'Bottle_Number', 'Sample_Number', 'Pressure [decibar]', 'Depth [metres]', 'Temperature [deg C (ITS90)]', 'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]', 'Salinity [PSS-78]', 'Temperature:Draw [deg C (ITS90)]', 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted [mg/m^3]', 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]', 'Flag:Oxygen:Dissolved', 'Nitrate_plus_Nitrite [umol/L]', 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate', 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR', 'Cruise', 'Zone', 'ADM:MISSION', 'Bottle:Firing_Sequence', 'Transmissivity [*/metre]', 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]', 'Number_of_bin_records', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido', 'Cyanobacteria', 'TchlA'], dtype='object')