In [1]:

import numpy as np  # this module handles arrays, but here we need it for its NaN value
import pandas as pd # this module contains a lot of tools for handling tabular data
import re

In [2]:

# define paths to the source files and eventual output file
#pathBottle='/ocean/eolson/MEOPAR/obs/Nina/All 2018 SoG bottle.xlsx'
#pathPhyto='/ocean/eolson/MEOPAR/obs/Nina/2015-2018 Abs phyto groups.xlsx'
pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2019 SoG bottle_revWithNew044.xlsx'
pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2019 Abs phyto groups.xlsx'

pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2019.csv'

In [3]:

#formatting function to get year-(3digit) format
def fmtCruise(istr):
    if re.fullmatch('[0-9]{4}-[0-9]{2}',istr): 
        sp=re.split('-',istr)
        rstr=sp[0]+'-0'+sp[1]
    elif re.fullmatch('[0-9]{4}-[0-9]{3}',istr):
        rstr=istr
    else:
        raise ValueError('Input had unexpected format:',istr)
    return rstr

In [4]:

# get names of sheets in notebook
with pd.ExcelFile(pathBottle) as xl:
    sheets=xl.sheet_names
print(sheets)

['2019-001', '2019-007', '2019-016', '2019-044', '2019-005', '2019-045', '2019-050', '2019-023', '2019-062']

In [5]:

# load each sheet in the bottle Excel file and concatenate them together into one table
dfbotlist=list()
for sheet in sheets:
    df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,
                      na_values=(-99,-99.9)) # read each sheet; include additional na values
    df0['Cruise']=fmtCruise(sheet)  # create and populate Cruise column based on sheet name
    dfbotlist.append(df0) # append the sheet to a list
dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False) # concatenate the list into a single table

Reading sheet 2019-001
Reading sheet 2019-007
Reading sheet 2019-016
Reading sheet 2019-044
Reading sheet 2019-005
Reading sheet 2019-045
Reading sheet 2019-050
Reading sheet 2019-023
Reading sheet 2019-062

In [6]:

# Drop columns with no data in them
l1=set(dfbot.keys())
dfbot.dropna(axis=1,how='all',inplace=True)
print('removed empty columns:',l1-set(dfbot.keys()))

removed empty columns: {'Flag:Dimethylsulfoniopropionate_Total', 'Dimethyl_Sulphide [nmol/L]', 'Flag:Dimethyl_Sulphide', 'Flag:Dimethylsulfoniopropionate_Dissolve', 'Dimethylsulfoniopropionate_Total [nmol/L]', 'Ammonium [umol/L]', 'Dimethylsulfoniopropionate_Dissolved [nmol/L]', 'Flag:Ammonium'}

In [7]:

# list the column names in the resulting table
for el in sorted(dfbot.keys()):
    if not el.startswith('ADM') and not el.startswith('Flag') and not el.startswith('LOC') \
     and not el.startswith('Bottle'):
        print(el)

Chlorophyll:Extracted
Chlorophyll:Extracted [mg/m^3]
Comments by sample_numbeR
Conductivity:Primary [S/m]
Conductivity:Secondary
Conductivity:Secondary [S/m]
Cruise
Depth
Depth [metres]
Depth:Nominal [metres]
FIL:DATA DESCRIPTION
FIL:START TIME YYYY/MM/DD HH:MM:SS
File Name
Fluorescence:URU:Seapoint
Fluorescence:URU:Seapoint [mg/m^3]
Fluorescence:URU:Wetlabs [mg/m^3]
Nitrate_plus_Nitrite
Nitrate_plus_Nitrite [umol/L]
Number_of_bin_records
Oxygen:Dissolved
Oxygen:Dissolved [mL/L]
Oxygen:Dissolved [umol/kg]
Oxygen:Dissolved.1
Oxygen:Dissolved:SBE
Oxygen:Dissolved:SBE [mL/L]
Oxygen:Dissolved:SBE [umol/kg]
Oxygen:Dissolved:SBE.1
PAR
PAR [uE/m^2/sec]
PAR:Reference
PAR:Reference [uE/m^2/sec]
Phaeo-Pigment:Extracted
Phaeo-Pigment:Extracted [mg/m^3]
Phosphate
Phosphate [umol/L]
Pressure
Pressure [decibar]
Salinity:Bottle
Salinity:Bottle [PSS-78]
Salinity:T0:C0 [PSS-78]
Salinity:T1:C1
Salinity:T1:C1 [PSS-78]
Sample_Number
Silicate
Silicate [umol/L]
Temperature:Draw
Temperature:Draw [deg C (ITS90)]
Temperature:Primary [deg C (ITS90)]
Temperature:Secondary
Temperature:Secondary [deg C (ITS90)]
Transmissivity
Transmissivity [*/metre]
Transmissivity2 [*/metre]
Transmissivity:Green [*/metre]
YYYY/MM/DD HH:MM:SS
Zone
Zone.1
pH:SBE:Nominal

In [8]:

for el in dfbot.keys():
    if 'Oxy' in el:
        print(el,np.max(dfbot[el]))

Oxygen:Dissolved:SBE [mL/L] 275.2
Oxygen:Dissolved:SBE [umol/kg] 447.5
Oxygen:Dissolved [mL/L] 9.571
Oxygen:Dissolved [umol/kg] 419.6
Flag:Oxygen:Dissolved 46.0
Oxygen:Dissolved:SBE 9.19
Oxygen:Dissolved:SBE.1 9.19
Oxygen:Dissolved 8.924
Oxygen:Dissolved.1 8.924

In [9]:

# no rows returned, so there are no rows with both primary and secondary temperature values
print(np.sum(dfbot['Depth [metres]']>=0),
      np.sum(dfbot['Pressure [decibar]']>=0),
      np.sum(dfbot['Pressure']>=0),
     np.sum(dfbot['Depth']>=0))
len(dfbot.loc[~(dfbot['Depth [metres]']>=0)&\
          ~(dfbot['Pressure [decibar]']>=0)&\
          (dfbot['Pressure']>=0)])

1104 1104 340 340

Out[9]:

In [10]:

# no rows returned, so there are no rows with both primary and secondary temperature values
print(np.sum(dfbot['Temperature:Primary [deg C (ITS90)]']>=0),
      np.sum(dfbot['Temperature:Secondary [deg C (ITS90)]']>=0),
      np.sum(dfbot['Temperature:Secondary']>=0))
dfbot.loc[(np.array([int(ii) for ii in (dfbot['Temperature:Primary [deg C (ITS90)]']>=0)])+\
          np.array([int(ii) for ii in (dfbot['Temperature:Secondary [deg C (ITS90)]']>=0)])+\
           np.array([int(ii) for ii in (dfbot['Temperature:Secondary']>=0)]))>1]

726 378 340

Out[10]:

	Zone	LOC:EVENT_NUMBER	LOC:LATITUDE	LOC:LONGITUDE	LOC:WATER DEPTH	ADM:SCIENTIST	ADM:MISSION	LOC:STATION	ADM:PROJECT	YYYY/MM/DD HH:MM:SS	...	Temperature:Draw	Salinity:Bottle	Chlorophyll:Extracted	Phaeo-Pigment:Extracted	Oxygen:Dissolved	Oxygen:Dissolved.1	Nitrate_plus_Nitrite	Silicate	Phosphate	Transmissivity:Green [*/metre]

0 rows × 77 columns

In [11]:

# no rows returned, so there are no rows with both primary and secondary temperature values
print(np.sum(dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0),
      np.sum(dfbot['Fluorescence:URU:Seapoint']>=0),
      np.sum(dfbot['Fluorescence:URU:Wetlabs [mg/m^3]']>=0))
dfbot.loc[(np.array([int(ii) for ii in (dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0)])+\
          np.array([int(ii) for ii in (dfbot['Fluorescence:URU:Seapoint']>=0)])+\
           np.array([int(ii) for ii in (dfbot['Fluorescence:URU:Wetlabs [mg/m^3]']>=0)]))>1]

993 340 110

Out[11]:

	Zone	LOC:EVENT_NUMBER	LOC:LATITUDE	LOC:LONGITUDE	LOC:WATER DEPTH	ADM:SCIENTIST	ADM:MISSION	LOC:STATION	ADM:PROJECT	YYYY/MM/DD HH:MM:SS	...	Temperature:Draw	Salinity:Bottle	Chlorophyll:Extracted	Phaeo-Pigment:Extracted	Oxygen:Dissolved	Oxygen:Dissolved.1	Nitrate_plus_Nitrite	Silicate	Phosphate	Transmissivity:Green [*/metre]

0 rows × 77 columns

In [12]:

# no rows returned, so there are no rows with both both salinity fields
print(np.sum(dfbot['Salinity:T0:C0 [PSS-78]']>=0),
      np.sum(dfbot['Salinity:T1:C1 [PSS-78]']>=0),
      np.sum(dfbot['Salinity:T1:C1']>=0))
dfbot.loc[(np.array([int(ii) for ii in (dfbot['Salinity:T0:C0 [PSS-78]']>=0)])+\
           np.array([int(ii) for ii in (dfbot['Salinity:T1:C1 [PSS-78]']>=0)])+\
           np.array([int(ii) for ii in (dfbot['Salinity:T1:C1']>=0)]))>1]

726 378 340

Out[12]:

	Zone	LOC:EVENT_NUMBER	LOC:LATITUDE	LOC:LONGITUDE	LOC:WATER DEPTH	ADM:SCIENTIST	ADM:MISSION	LOC:STATION	ADM:PROJECT	YYYY/MM/DD HH:MM:SS	...	Temperature:Draw	Salinity:Bottle	Chlorophyll:Extracted	Phaeo-Pigment:Extracted	Oxygen:Dissolved	Oxygen:Dissolved.1	Nitrate_plus_Nitrite	Silicate	Phosphate	Transmissivity:Green [*/metre]

0 rows × 77 columns

In [13]:

# no rows returned, so there are no rows with both primary and secondary temperature values
print(np.sum(dfbot['Pressure [decibar]']>=0),
      np.sum(dfbot['Pressure']>=0))
dfbot.loc[(dfbot['Pressure [decibar]']>=0)&\
          (dfbot['Pressure']>=0)]

1104 340

Out[13]:

	Zone	LOC:EVENT_NUMBER	LOC:LATITUDE	LOC:LONGITUDE	LOC:WATER DEPTH	ADM:SCIENTIST	ADM:MISSION	LOC:STATION	ADM:PROJECT	YYYY/MM/DD HH:MM:SS	...	Temperature:Draw	Salinity:Bottle	Chlorophyll:Extracted	Phaeo-Pigment:Extracted	Oxygen:Dissolved	Oxygen:Dissolved.1	Nitrate_plus_Nitrite	Silicate	Phosphate	Transmissivity:Green [*/metre]

0 rows × 77 columns

In [14]:

def subval(idf,colList):
    # first value in colList should be the column you are going to keep
    # follow with other columns that will be used to fill in when that column is NaN
    # in order of precedence
    if len(colList)==2:
        idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \
                         else  r[colList[1]] for i,r in idf.iterrows()]
    elif len(colList)==3:
        idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \
                         else  r[colList[1]] if not pd.isna(r[colList[1]]) \
                         else r[colList[2]] for i,r in idf.iterrows()]
    else:
        raise NotImplementedError('Add to code to handle this case')
    idf.drop(columns=list(colList[1:]),inplace=True)
    return idf

In [15]:

# there are some duplicate columns here; handle them:
dfbot=subval(dfbot,('Depth [metres]','Depth'))
dfbot=subval(dfbot,('Pressure [decibar]','Pressure'))
dfbot=subval(dfbot,('Temperature:Primary [deg C (ITS90)]',
                    'Temperature:Secondary [deg C (ITS90)]',
                    'Temperature:Secondary'))
dfbot=subval(dfbot,('Salinity:T0:C0 [PSS-78]',
                    'Salinity:T1:C1 [PSS-78]',
                    'Salinity:T1:C1'))
dfbot=subval(dfbot,('Fluorescence:URU:Seapoint [mg/m^3]',
                    'Fluorescence:URU:Wetlabs [mg/m^3]',
                    'Fluorescence:URU:Seapoint'))
dfbot=subval(dfbot,('PAR [uE/m^2/sec]','PAR'))
dfbot=subval(dfbot,('PAR:Reference [uE/m^2/sec]','PAR:Reference'))
dfbot=subval(dfbot,('Oxygen:Dissolved:SBE [mL/L]',
                    'Oxygen:Dissolved:SBE',
                    'Oxygen:Dissolved:SBE.1'))
dfbot=subval(dfbot,('Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved', 'Oxygen:Dissolved.1'))
dfbot=subval(dfbot,('Transmissivity [*/metre]','Transmissivity2 [*/metre]','Transmissivity'))
dfbot=subval(dfbot,('Nitrate_plus_Nitrite [umol/L]', 'Nitrate_plus_Nitrite'))
dfbot=subval(dfbot,('Silicate [umol/L]','Silicate'))
dfbot=subval(dfbot,('Phosphate [umol/L]','Phosphate'))
dfbot=subval(dfbot,('Phaeo-Pigment:Extracted [mg/m^3]','Phaeo-Pigment:Extracted'))
dfbot=subval(dfbot,('Chlorophyll:Extracted [mg/m^3]','Chlorophyll:Extracted'))
dfbot=subval(dfbot,('Conductivity:Primary [S/m]',
                    'Conductivity:Secondary [S/m]',
                    'Conductivity:Secondary'))
dfbot=subval(dfbot,('FIL:START TIME YYYY/MM/DD HH:MM:SS','YYYY/MM/DD HH:MM:SS'))
dfbot=subval(dfbot,('Zone', 'Zone.1'))
dfbot=subval(dfbot,('Temperature:Draw [deg C (ITS90)]','Temperature:Draw'))
dfbot=subval(dfbot,('Salinity:Bottle [PSS-78]','Salinity:Bottle'))

dfbot.rename(columns={'Temperature:Primary [deg C (ITS90)]':'Temperature [deg C (ITS90)]'},
             inplace=True)
dfbot.rename(columns={'Salinity:T0:C0 [PSS-78]':'Salinity [PSS-78]'},
             inplace=True)
dfbot.rename(columns={'Fluorescence:URU:Seapoint [mg/m^3]':'Fluorescence:URU [mg/m^3]'},
             inplace=True)
dfbot.rename(columns={'Conductivity:Primary [S/m]':'Conductivity [S/m]'},
             inplace=True)

In [16]:

# define a function that will be applied to the values in the index column;
# this makes it easier to drop non-data rows later
def convertIndex(val):
    try:
        x =int(val)
    except ValueError:
        x=np.nan
    return x

In [17]:

# load the  phytoplankton data with the following options:
#   sheet_name='???? CHEMTAX abs results'  -> choose the  sheet
#   usecols='A:I,T:AC'   -> read only columns A:I and T:AC from the Excel sheet
#   skiprows=2     -> start reading at the 3rd row of the sheet, 
#                     which contains the column headings
#   converters={'Index': convertIndex,}   -> apply the function defined above to the Index column
#   verbose = True   -> print extra information/ warnings/ errors
dfPhyto=pd.read_excel(pathPhyto,sheet_name='2019 CHEMTAX abs rerun results',usecols='A:S',
                      skiprows=2,converters={'Index': convertIndex,},
                      verbose=True)

Reading sheet 2019 CHEMTAX abs rerun results

In [18]:

# display rows 48 to 59 of the resulting table
dfPhyto[48:60]

Out[18]:

	Bin #	Index	Subgroup	Cruise	Date	Station	Sample#	rep	depth	Diatoms-1	Diatoms-2	Prasinophytes	Cryptophytes	Dinoflagellates-1	Haptophytes	Dictyo	Raphido	Cyanobacteria	TchlA
48	7	46.0	2	2019-016	2019-04-10 00:00:00	56	289	mean	0	0.235929	0.0557088	0.000116464	0.252259	0.00720253	0.015618	0.00990757	0.00374288	0.00721907	0.587704
49	7	47.0	2	2019-016	2019-04-10 00:00:00	59	303	mean	0	1.17636	0.0649978	0.00151012	0.116654	0.0114187	0.0189902	0.00697747	0.000317128	0.00746592	1.40469
50	7	48.0	2	2019-016	2019-04-11 00:00:00	102	318	mean	0	1.81861	0.254111	0.0210325	0.106842	0.0526873	0.222644	0	0.00369154	0.0117118	2.49133
51	7	49.0	2	2019-016	2019-04-11 00:00:00	75	332	mean	0	1.66103	0.145549	0.0161815	0.113534	0.0543728	0.236785	0	0.00986979	0.00749211	2.24481
52	7	50.0	2	2019-016	2019-04-11 00:00:00	72	345	mean	0	1.69253	0.0607043	0.0273842	0.20995	0.107062	0.235914	0.0171222	0.019343	0.00543582	2.37545
53	7	51.0	2	2019-016	2019-04-11 00:00:00	69	357	mean	0	1.20815	0.189921	0.0237731	0.219837	0.0697194	0.275599	0	0.0128893	0.00430002	2.00419
54	7	52.0	2	2019-016	2019-04-11 00:00:00	ADCP	367	mean	0	0.277147	0.0376473	0.0182011	0.222086	0.0207715	0.0508863	0.0027147	0.00533482	0.00312848	0.637917
55	7	53.0	2	2019-016	2019-04-11 00:00:00	65	377	mean	0	0.91255	0.0374169	0.0295607	0.226087	0.046938	0.232094	0.00658676	0.0116996	0.00512161	1.50806
56	7	54.0	2	2019-016	2019-04-11 00:00:00	63	380	mean	0	0.704281	0.0266105	0.0317602	0.254517	0.0525936	0.207993	0.00172955	0.0059513	0.00552183	1.29096
57	7	55.0	2	2019-016	2019-04-11 00:00:00	62	391	mean	0	0.52177	0.0800526	0.0118601	0.117773	0.0196437	0.0626985	0.0115089	0	0.0046034	0.82991
58	7	56.0	2	2019-016	2019-04-12 00:00:00	SC04	394	mean	0	3.5872	0.139061	0	0.112151	0.0294817	0.0468341	0	0	0.0152213	3.92995
59	Absolute Pigment Compositions - Bin # 13	NaN	NaN	NaN	NaN	From Sheet: OutR9	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

In [19]:

# now, drop any rows from the table that have NaN values in either of the columns
#  'Index' or 'TchlA (ug/L)'
# This is why we applied a function to the Index column to make sure all 
#  non-numeric Index values would have a consistent NaN entry, making them easy to identify
#  and remove
dfPhyto.dropna(subset=['Index', 'TchlA'],how='any',inplace=True)

In [20]:

# pandas creates its own index, and after dropping rows I like to reset it -
# this is just for convenience
dfPhyto.reset_index(drop=True,inplace=True)

In [21]:

# apply formatting function all rows in Cruise column to get year-3digit format
dfPhyto['Cruise']=[fmtCruise(ii) for ii in dfPhyto['Cruise']]

In [22]:

# display part of the table, confirming that non-data rows have been removed
dfPhyto[48:60]

Out[22]:

	Bin #	Index	Subgroup	Cruise	Date	Station	Sample#	rep	Diatoms-1	Diatoms-2	Prasinophytes	Cryptophytes	Dinoflagellates-1	Haptophytes	Dictyo	Raphido	Cyanobacteria	TchlA
48	7	49.0	2	2019-016	2019-04-11 00:00:00	75	332	mean	1.66103	0.145549	0.0161815	0.113534	0.0543728	0.236785	0	0.00986979	0.00749211	2.24481
49	7	50.0	2	2019-016	2019-04-11 00:00:00	72	345	mean	1.69253	0.0607043	0.0273842	0.20995	0.107062	0.235914	0.0171222	0.019343	0.00543582	2.37545
50	7	51.0	2	2019-016	2019-04-11 00:00:00	69	357	mean	1.20815	0.189921	0.0237731	0.219837	0.0697194	0.275599	0	0.0128893	0.00430002	2.00419
51	7	52.0	2	2019-016	2019-04-11 00:00:00	ADCP	367	mean	0.277147	0.0376473	0.0182011	0.222086	0.0207715	0.0508863	0.0027147	0.00533482	0.00312848	0.637917
52	7	53.0	2	2019-016	2019-04-11 00:00:00	65	377	mean	0.91255	0.0374169	0.0295607	0.226087	0.046938	0.232094	0.00658676	0.0116996	0.00512161	1.50806
53	7	54.0	2	2019-016	2019-04-11 00:00:00	63	380	mean	0.704281	0.0266105	0.0317602	0.254517	0.0525936	0.207993	0.00172955	0.0059513	0.00552183	1.29096
54	7	55.0	2	2019-016	2019-04-11 00:00:00	62	391	mean	0.52177	0.0800526	0.0118601	0.117773	0.0196437	0.0626985	0.0115089	0	0.0046034	0.82991
55	7	56.0	2	2019-016	2019-04-12 00:00:00	SC04	394	mean	3.5872	0.139061	0	0.112151	0.0294817	0.0468341	0	0	0.0152213	3.92995
56	13	57.0	3	2019-044	2019-04-29 00:00:00	GEO1	9	mean	5.62557	0.409893	0.3874	0.241505	0.0396982	0.155595	0.070028	0.117273	0.0390135	7.08597
57	13	58.0	3	2019-044	2019-04-30 00:00:00	39	10	mean	8.91175	0.803637	0.238999	0.523477	0.23522	0.153088	0.120047	0.100953	0	11.0872
58	13	59.0	3	2019-044	2019-04-30 00:00:00	27	11	mean	12.1251	1.06535	0.168672	0.28809	0.00246724	0.240602	0.0952687	0.251948	0.0393948	14.2769
59	13	60.0	3	2019-044	2019-04-30 00:00:00	22	12	mean	10.9518	0.681424	0.174355	0.573097	0.0855425	0	0	0	0.062748	12.529

In [23]:

# due to repeated column names in the original spreadsheet, '.1' was appended to the names
# of the phytoplankton columns; 
# these lines correct the column names, removing the '.1':
renameDict=dict()
for colName in dfPhyto.keys():
    if colName.endswith('.1'):
        renameDict[colName]=colName.split('.1')[0]
dfPhyto.rename(columns=renameDict,inplace=True)

In [24]:

dfPhyto

Out[24]:

	Bin #	Index	Subgroup	Cruise	Date	Station	Sample#	rep	depth	Diatoms-1	Diatoms-2	Prasinophytes	Cryptophytes	Dinoflagellates-1	Haptophytes	Dictyo	Raphido	Cyanobacteria	TchlA
0	16	1.0	1	2019-001	2019-02-20 00:00:00	14	729	mean	0	0.0331146	0.101655	0.123836	0.3117	0.00876211	0.1462	0.0225388	0.0234123	0.00090972	0.772129
1	16	2.0	1	2019-001	2019-02-20 00:00:00	12	732	mean	0	0.156416	0.17666	0.112519	0.237514	0.0127872	0.179982	0.0256941	0.0165342	0.000955474	0.919063
2	16	3.0	1	2019-001	2019-02-20 00:00:00	22	743	mean	0	0.356374	0.340304	0.125008	0.315779	0.0132582	0.07692	0.0203337	0.0362638	0.00278554	1.28703
3	16	4.0	1	2019-001	2019-02-20 00:00:00	CPF2	746	mean	0	0.0606656	0.116013	0.135416	0.332063	0.007642	0.159874	0.0260862	0.0274673	0.00137643	0.866603
4	16	5.0	1	2019-001	2019-02-20 00:00:00	CPF1	749	mean	0	0.404852	0.41367	0.131683	0.197787	0.0128683	0.0582133	0.0158865	0.0375323	0.00585506	1.27835
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
166	19	167.0	7	2019-062	2019-10-05 00:00:00	IND0	349	mean	0	9.11885	0	0.289786	0.612077	0.0876324	0.0656088	0.0868017	0.0365193	0	10.2973
167	19	168.0	7	2019-062	2019-10-05 00:00:00	IND2	361	mean	0	6.2426	0	0.312427	0.510974	0.0604039	0.146785	0.0594772	0.0247012	0.0121179	7.36948
168	19	169.0	7	2019-062	2019-10-05 00:00:00	IND4	375	mean	0	8.54631	0	0.520754	0.511822	0.158578	0.177592	0.0323331	0.0887445	0.0333497	10.0695
169	19	170.0	7	2019-062	2019-10-05 00:00:00	IND7	389	mean	0	4.28515	0	0.365631	8.53139	0	0	0.00631162	0.00703949	0.0631797	13.2587
170	19	171.0	7	2019-062	2019-10-05 00:00:00	VAN5	395	mean	0	1.94256	0	0.279896	0.383945	0.106905	0.0399761	0.0289777	0.0341708	0.00224957	2.81868

171 rows × 19 columns

In [25]:

# This is the important step- join the two tables ('left' and 'right'), 
#  matching the cruise IDs and sample numbers
#   how='outer'  -> all rows from both the left and the right tables will be included, 
#                   even if they cannot be matched; this makes it easy to check for 
#                   unmatched data later
#   left_on  specifies the name of the column to match in the left table (dfbot) 
#   right_on specifies the name of the column to match in the right table (dfPhyto)
dfout = pd.merge(dfbot, dfPhyto,  how='outer', 
                 left_on=['Cruise','Sample_Number'], right_on = ['Cruise','Sample#'])

In [26]:

# Identify cases where phytoplankton data were matched to multiple samples in bottle data:
dftest=pd.merge(dfbot, dfPhyto,how='right', left_on=['Cruise','Sample_Number'],right_on = ['Cruise','Sample#'])

In [27]:

# show the column names in the resulting table
dfout.keys()

Out[27]:

Index(['Zone', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE',
       'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'ADM:MISSION', 'LOC:STATION',
       'ADM:PROJECT', 'Bottle_Number', 'Bottle:Firing_Sequence',
       'Sample_Number', 'Pressure [decibar]', 'Depth [metres]',
       'Temperature [deg C (ITS90)]', 'Transmissivity [*/metre]',
       'Fluorescence:URU [mg/m^3]', 'PAR [uE/m^2/sec]', 'Salinity [PSS-78]',
       'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]',
       'Conductivity [S/m]', 'Number_of_bin_records',
       'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted',
       'Phaeo-Pigment:Extracted [mg/m^3]', 'Nitrate_plus_Nitrite [umol/L]',
       'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate',
       'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR',
       'Cruise', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',
       'Depth:Nominal [metres]', 'Temperature:Draw [deg C (ITS90)]',
       'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle',
       'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]',
       'Flag:Oxygen:Dissolved', 'PAR:Reference [uE/m^2/sec]', 'pH:SBE:Nominal',
       'File Name', 'LOC:ALTIMETER (M)', 'ADM:PLATFORM', 'LOC:GEOGRAPHIC AREA',
       'FIL:DATA DESCRIPTION', 'Transmissivity:Green [*/metre]', 'Bin #',
       'Index', 'Subgroup', 'Date', 'Station', 'Sample#', 'rep', 'depth',
       'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes',
       'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',
       'Cyanobacteria', 'TchlA'],
      dtype='object')

Checks¶

In [28]:

temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})
temp.columns = ['icount']

In [29]:

np.unique(temp.icount)

Out[29]:

array([1])

In [30]:

# check for Phyto samples matched to multiple bottle samples:
temp.loc[temp.icount>1]

Out[30]:

		icount
Cruise	Sample#

In [31]:

# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]

Out[31]:

		icount
Cruise	Sample#

In [32]:

temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})
temp2.columns = ['icount']
# this will catch phyto matched to multiple bottle but also bottle with duplicate sample numbers per cruise:
temp2.loc[temp2.icount>1]

Out[32]:

		icount
Cruise	Sample_Number

In [33]:

# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]

Out[33]:

		icount
Cruise	Sample#

In [34]:

# if the output table is longer than either of the input tables, some columns were not matched
len(dfout), len(dfPhyto), len(dfbot)

Out[34]:

(1444, 171, 1444)

In [35]:

# Check that the number of cells with data in the 'Cyanobacteria' column is 
#  the same for the input and output tables to show that no rows are missing:
np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)

Out[35]:

(171, 171)

In [36]:

# If there were data rows from the phytoplankton table that were not matched to 
#  rows from the bottle table, their indices from the phytoplankton table would be 
#  displayed below (the series [] would not be empty)
print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Index'])

Series([], Name: Index, dtype: float64)

In [37]:

dfout.loc[dfout['ADM:SCIENTIST'].isna(),['Index','Cruise','Diatoms-1','Prasinophytes']]

Out[37]:

	Index	Cruise	Diatoms-1	Prasinophytes

In [38]:

# drop repetetive/unecessary columns:
dfout.drop(labels=['Bin #', 'Index', 'Subgroup', 'Station', 'Sample#', 'rep',
                   'depth',],axis=1,inplace=True)

In [39]:

# truncate phyto group values to 3 decimal places:
for col in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',
       'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',
       'TchlA'):
    dfout[col]=[np.round(ii,decimals=3) for ii in dfout[col]] # use list comprehension to set values for entire column

In [40]:

dfout['Cyanobacteria']

Out[40]:

0         NaN
1         NaN
2       0.001
3         NaN
4         NaN
        ...  
1439      NaN
1440      NaN
1441      NaN
1442      NaN
1443    0.002
Name: Cyanobacteria, Length: 1444, dtype: float64

In [41]:

# now write the output table to a .csv file:
dfout.to_csv(pathOut, index=False)  

In [42]:

dfout.keys()

Out[42]:

Index(['Zone', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE',
       'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'ADM:MISSION', 'LOC:STATION',
       'ADM:PROJECT', 'Bottle_Number', 'Bottle:Firing_Sequence',
       'Sample_Number', 'Pressure [decibar]', 'Depth [metres]',
       'Temperature [deg C (ITS90)]', 'Transmissivity [*/metre]',
       'Fluorescence:URU [mg/m^3]', 'PAR [uE/m^2/sec]', 'Salinity [PSS-78]',
       'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]',
       'Conductivity [S/m]', 'Number_of_bin_records',
       'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted',
       'Phaeo-Pigment:Extracted [mg/m^3]', 'Nitrate_plus_Nitrite [umol/L]',
       'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate',
       'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR',
       'Cruise', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',
       'Depth:Nominal [metres]', 'Temperature:Draw [deg C (ITS90)]',
       'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle',
       'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]',
       'Flag:Oxygen:Dissolved', 'PAR:Reference [uE/m^2/sec]', 'pH:SBE:Nominal',
       'File Name', 'LOC:ALTIMETER (M)', 'ADM:PLATFORM', 'LOC:GEOGRAPHIC AREA',
       'FIL:DATA DESCRIPTION', 'Transmissivity:Green [*/metre]', 'Date',
       'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes',
       'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',
       'Cyanobacteria', 'TchlA'],
      dtype='object')

In [ ]: