import numpy as np # numpy provides array operations and the NaN value used below
import pandas as pd # this module contains a lot of tools for handling tabular data
import re
# define paths to the source files and eventual output file
#pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2018 SoG bottle.xlsx'
#pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2018 Abs phyto groups.xlsx'
pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2018 SoG bottleCorrected.xlsx'
pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2018 Abs phyto groupsCorrected.xlsx'
pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2018.csv'
# formatting function to convert cruise IDs to the year-(3 digit) format
def fmtCruise(istr):
if re.fullmatch('[0-9]{4}-[0-9]{2}',istr):
sp=re.split('-',istr)
rstr=sp[0]+'-0'+sp[1]
elif re.fullmatch('[0-9]{4}-[0-9]{3}',istr):
rstr=istr
else:
raise ValueError('Input had unexpected format:',istr)
return rstr
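# Quick sanity check of fmtCruise (illustrative only, not part of the original notebook):
# two-digit cruise numbers gain a leading zero, three-digit ones pass through unchanged.
assert fmtCruise('2018-01')=='2018-001'
assert fmtCruise('2018-005')=='2018-005'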
# get the names of the sheets in the bottle Excel workbook
with pd.ExcelFile(pathBottle) as xl:
sheets=xl.sheet_names
print(sheets)
['2018-005', '2018-01', '2018-96', '2018-29', '2018-035', '2018-39', '2018-036', '2018-030', '2018-037', '2018-34', '2018-40', '2018-31', '2018-032']
# load each sheet in the bottle Excel file and concatenate them together into one table
dfbotlist=list()
for sheet in sheets:
df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,
na_values=(-99,-99.9)) # read each sheet; include additional na values
df0['Cruise']=fmtCruise(sheet) # create and populate Cruise column based on sheet name
dfbotlist.append(df0) # append the sheet to a list
dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False) # concatenate the list into a single table
# Drop columns with no data in them
l1=set(dfbot.keys())
dfbot.dropna(axis=1,how='all',inplace=True)
print('removed empty columns:',l1-set(dfbot.keys()))
Reading sheet 2018-005
Reading sheet 2018-01
Reading sheet 2018-96
Reading sheet 2018-29
Reading sheet 2018-035
Reading sheet 2018-39
Reading sheet 2018-036
Reading sheet 2018-030
Reading sheet 2018-037
Reading sheet 2018-34
Reading sheet 2018-40
Reading sheet 2018-31
Reading sheet 2018-032
removed empty columns: {'Flag:Ammonium', 'Ammonium [umol/L]'}
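# Toy illustration of the dropna(axis=1,how='all') call above (assumed data,
# not part of the original notebook): a column is dropped only if every entry is NaN.
toycols=pd.DataFrame({'keep':[1.0,np.nan],'empty':[np.nan,np.nan]})
print(toycols.dropna(axis=1,how='all').columns.tolist())   # ['keep']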
# list the column names in the resulting table
print(dfbot.keys())
Index(['File Name', 'Zone', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'ADM:MISSION', 'LOC:STATION', 'ADM:PROJECT', 'Zone.1', 'YYYY/MM/DD HH:MM:SS', 'Sample_Number', 'Pressure [decibar]', 'Depth [metres]', 'Temperature:Primary [deg C (ITS90)]', 'Transmissivity [*/metre]', 'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]', 'PAR:Reference [uE/m^2/sec]', 'pH:SBE:Nominal', 'Salinity:T0:C0 [PSS-78]', 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]', 'Temperature:Draw [deg C (ITS90)]', 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted [mg/m^3]', 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]', 'Flag:Oxygen:Dissolved', 'Nitrate_plus_Nitrite [umol/L]', 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate', 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR', 'Cruise', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'Temperature:Secondary [deg C (ITS90)]', 'Pressure:CTD [decibar]', 'Depth:CTD [metres]', 'Temperature:CTD [deg C (ITS90)]', 'Salinity:CTD [PSS-78]', 'Oxygen:Dissolved:Volume:CTD [mL/L]', 'Oxygen:Dissolved:Mass:CTD [umol/kg]', 'Fluorescence:URU:CTD [mg/m^3]', 'Depth:Nominal [metres]', 'Alkalinity:Total [umol/L]', 'Flag:Alkalinity:Total', 'Carbon:Dissolved:Inorganic [umol/kg]', 'Flag:Carbon:Dissolved:Inorganic', 'Salinity:T1:C1 [PSS-78]', 'Fluorescence:URU:Wetlabs [mg/m^3]', 'Bottle_Number', 'Bottle:Firing_Sequence'], dtype='object')
# no rows returned, so there are no rows with values in both depth columns
print(np.sum(dfbot['Depth [metres]']>=0),
np.sum(dfbot['Depth:CTD [metres]']>=0))
dfbot.loc[(dfbot['Depth [metres]']>=0)&\
(dfbot['Depth:CTD [metres]']>=0)]
2010 36
0 rows × 59 columns
# no rows returned, so no row has values in more than one of the temperature columns
print(np.sum(dfbot['Temperature:Primary [deg C (ITS90)]']>=0),
np.sum(dfbot['Temperature:Secondary [deg C (ITS90)]']>=0),
np.sum(dfbot['Temperature:CTD [deg C (ITS90)]']>=0))
dfbot.loc[(np.array([int(ii) for ii in (dfbot['Temperature:Primary [deg C (ITS90)]']>=0)])+\
np.array([int(ii) for ii in (dfbot['Temperature:Secondary [deg C (ITS90)]']>=0)])+\
np.array([int(ii) for ii in (dfbot['Temperature:CTD [deg C (ITS90)]']>=0)]))>1]
1014 956 36
0 rows × 59 columns
# no rows returned, so no row has values in more than one of the fluorescence columns
print(np.sum(dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0),
np.sum(dfbot['Fluorescence:URU:CTD [mg/m^3]']>=0),
np.sum(dfbot['Fluorescence:URU:Wetlabs [mg/m^3]']>=0))
dfbot.loc[(np.array([int(ii) for ii in (dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0)])+\
np.array([int(ii) for ii in (dfbot['Fluorescence:URU:CTD [mg/m^3]']>=0)])+\
np.array([int(ii) for ii in (dfbot['Fluorescence:URU:Wetlabs [mg/m^3]']>=0)]))>1]
1517 36 104
0 rows × 59 columns
# no rows returned, so there are no rows with values in both the Seapoint and CTD fluorescence columns
print(np.sum(dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0),
np.sum(dfbot['Fluorescence:URU:CTD [mg/m^3]']>=0))
dfbot.loc[(dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0)&\
(dfbot['Fluorescence:URU:CTD [mg/m^3]']>=0)]
1517 36
0 rows × 59 columns
# no rows returned, so no row has values in more than one of the salinity columns
print(np.sum(dfbot['Salinity:T0:C0 [PSS-78]']>=0),
np.sum(dfbot['Salinity:T1:C1 [PSS-78]']>=0),
np.sum(dfbot['Salinity:CTD [PSS-78]']>=0))
dfbot.loc[(np.array([int(ii) for ii in (dfbot['Salinity:T0:C0 [PSS-78]']>=0)])+\
np.array([int(ii) for ii in (dfbot['Salinity:T1:C1 [PSS-78]']>=0)])+\
np.array([int(ii) for ii in (dfbot['Salinity:CTD [PSS-78]']>=0)]))>1]
1132 836 36
0 rows × 59 columns
# no rows returned, so there are no rows with values in both pressure columns
print(np.sum(dfbot['Pressure [decibar]']>=0),
np.sum(dfbot['Pressure:CTD [decibar]']>=0))
dfbot.loc[(dfbot['Pressure [decibar]']>=0)&\
(dfbot['Pressure:CTD [decibar]']>=0)]
1983 36
0 rows × 59 columns
# no rows returned, so there are no rows with values in both dissolved oxygen [mL/L] columns
print(np.sum(dfbot['Oxygen:Dissolved:SBE [mL/L]']>=0),
np.sum(dfbot['Oxygen:Dissolved:Volume:CTD [mL/L]']>=0))
dfbot.loc[(dfbot['Oxygen:Dissolved:SBE [mL/L]']>=0)&\
(dfbot['Oxygen:Dissolved:Volume:CTD [mL/L]']>=0)]
1975 36
0 rows × 59 columns
# no rows returned, so there are no rows with values in both dissolved oxygen [umol/kg] columns
print(np.sum(dfbot['Oxygen:Dissolved:SBE [umol/kg]']>=0),
np.sum(dfbot['Oxygen:Dissolved:Mass:CTD [umol/kg]']>=0))
dfbot.loc[(dfbot['Oxygen:Dissolved:SBE [umol/kg]']>=0)&\
(dfbot['Oxygen:Dissolved:Mass:CTD [umol/kg]']>=0)]
1952 36
0 rows × 59 columns
def subval(idf,colList):
# first value in colList should be the column you are going to keep
# follow with other columns that will be used to fill in when that column is NaN
# in order of precedence
if len(colList)==2:
idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \
else r[colList[1]] for i,r in idf.iterrows()]
elif len(colList)==3:
idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \
else r[colList[1]] if not pd.isna(r[colList[1]]) \
else r[colList[2]] for i,r in idf.iterrows()]
else:
raise NotImplementedError('Add to code to handle this case')
idf.drop(columns=list(colList[1:]),inplace=True)
return idf
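# Toy illustration of subval (assumed example data, not from the bottle files):
# column 'A' is kept, filled from 'B' and then 'C' where it is NaN, and the fallback columns are dropped.
dftoy=pd.DataFrame({'A':[1.0,np.nan,np.nan],'B':[np.nan,2.0,np.nan],'C':[9.0,9.0,3.0]})
print(subval(dftoy,('A','B','C')))   # A becomes [1.0, 2.0, 3.0]; B and C are removed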
# there are some duplicate columns here; handle them:
dfbot=subval(dfbot,('FIL:START TIME YYYY/MM/DD HH:MM:SS',
'YYYY/MM/DD HH:MM:SS'))
dfbot=subval(dfbot,('Temperature:Primary [deg C (ITS90)]',
'Temperature:Secondary [deg C (ITS90)]',
'Temperature:CTD [deg C (ITS90)]'))
dfbot=subval(dfbot,('Salinity:T0:C0 [PSS-78]',
'Salinity:T1:C1 [PSS-78]',
'Salinity:CTD [PSS-78]'))
dfbot=subval(dfbot,('Pressure [decibar]',
'Pressure:CTD [decibar]'))
dfbot=subval(dfbot,('Depth [metres]',
'Depth:CTD [metres]'))
dfbot=subval(dfbot,('Oxygen:Dissolved:SBE [mL/L]',
'Oxygen:Dissolved:Volume:CTD [mL/L]'))
dfbot=subval(dfbot,('Oxygen:Dissolved:SBE [umol/kg]',
'Oxygen:Dissolved:Mass:CTD [umol/kg]'))
dfbot=subval(dfbot,('Fluorescence:URU:Seapoint [mg/m^3]',
'Fluorescence:URU:CTD [mg/m^3]',
'Fluorescence:URU:Wetlabs [mg/m^3]'))
dfbot.rename(columns={'Temperature:Primary [deg C (ITS90)]':'Temperature [deg C (ITS90)]'},
inplace=True)
dfbot.rename(columns={'Salinity:T0:C0 [PSS-78]':'Salinity [PSS-78]'},
inplace=True)
dfbot.rename(columns={'Oxygen:Dissolved:SBE [mL/L]':'Oxygen:Dissolved:CTD [mL/L]'},
inplace=True)
dfbot.rename(columns={'Oxygen:Dissolved:SBE [umol/kg]':'Oxygen:Dissolved:CTD [umol/kg]'},
inplace=True)
dfbot.rename(columns={'Fluorescence:URU:Seapoint [mg/m^3]':'Fluorescence [mg/m^3]'},
inplace=True)
# define a function that will be applied to the values in the index column;
# this makes it easier to drop non-data rows later
def convertIndex(val):
try:
x =int(val)
except ValueError:
x=np.nan
return x
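# Illustrative check of convertIndex (assumed inputs, not part of the original notebook):
# numeric strings convert to integers, anything else becomes NaN so it can be dropped later.
print(convertIndex('42'), convertIndex('Bin #'))   # prints: 42 nan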
# load the phytoplankton data with the following options:
# sheet_name='???? CHEMTAX abs results' -> choose the sheet
# usecols='A:I,T:AC' -> read only columns A:I and T:AC from the Excel sheet
# skiprows=2 -> start reading at the 3rd row of the sheet,
# which contains the column headings
# converters={'Index': convertIndex,} -> apply the function defined above to the Index column
# verbose = True -> print extra information/ warnings/ errors
dfPhyto=pd.read_excel(pathPhyto,sheet_name='2018 CHEMTAX abs results ',usecols='A:I,T:AC',
skiprows=2,converters={'Index': convertIndex,},
verbose=True)
Reading sheet 2018 CHEMTAX abs results
# display rows 48 to 59 of the resulting table
dfPhyto[48:60]
  | Bin # | Index | Subgroup | Cruise | Month | Station | Sample# | rep | depth | Diatoms-1.1 | Diatoms-2.1 | Prasinophytes.1 | Cryptophytes.1 | Dinoflagellates | Haptophytes.1 | Dictyochophytes | Raphidophytes | Cyanobacteria.1 | TchlA.1
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | 20 | 44.0 | 1 | 2018-05 | 2018-02-06 00:00:00 | IND4 | 283 | A | 0 | 0.0649569 | 0 | 0.00302939 | 0.00476828 | 0.00010182 | 0 | 0 | 0 | 0.000196532 | 0.0730529 |
49 | 20 | 45.0 | 1 | 2018-05 | 2018-02-06 00:00:00 | IND4 | 283 | B | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
50 | 20 | 42.0 | 1 | 2018-05 | 2018-02-06 00:00:00 | IND7 | 258 | A | 0 | 0.108673 | 0 | 0.0150159 | 0.0356732 | 0.000435557 | 0.000280088 | 0 | 0 | 0.000201243 | 0.160279 |
51 | 20 | 43.0 | 1 | 2018-05 | 2018-02-06 00:00:00 | IND7 | 258 | B | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
52 | Absolute Pigment Compositions - Bin # 6 | NaN | NaN | NaN | NaN | From Sheet: OutR6 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
53 | Tchl_a | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
54 | Bin # | NaN | Subgroup | Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA |
55 | 6 | 52.0 | 2 | 2018-01 | 2018-03-05 00:00:00 | JF2 | 515 | A | 2.311 | 1.11284 | 0.191776 | 0.129091 | 0.120473 | 0.00356186 | 0 | 0.00256907 | 0.0637685 | 0 | 1.62408 |
56 | 6 | 53.0 | 2 | 2018-01 | 2018-03-05 00:00:00 | JF2 | 515 | B | 2.311 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
57 | 6 | 54.0 | 2 | 2018-01 | 2018-03-06 00:00:00 | 56 | 556 | A | 2.218 | 0.474503 | 0.066797 | 0.0240524 | 0.0413634 | 0 | 0.101717 | 0 | 0 | 0 | 0.708433 |
58 | 6 | 55.0 | 2 | 2018-01 | 2018-03-06 00:00:00 | 56 | 556 | B | 2.218 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
59 | 6 | 56.0 | 2 | 2018-01 | 2018-03-06 00:00:00 | 46 | 559 | A | 2.596 | 1.13658 | 0.252949 | 0.0463031 | 0.0438257 | 0 | 0.230516 | 0.0018858 | 0 | 0 | 1.71206 |
# now, drop any rows from the table that have NaN values in either of the columns
# 'Index' or 'TchlA.1'
# This is why we applied a function to the Index column to make sure all
# non-numeric Index values would have a consistent NaN entry, making them easy to identify
# and remove
dfPhyto.dropna(subset=['Index', 'TchlA.1'],how='any',inplace=True)
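# Toy illustration of dropna(subset=...,how='any') (assumed data, not from the spreadsheet):
# a row is removed if either listed column is NaN, which is what discards the non-data rows.
toy=pd.DataFrame({'Index':[1.0,np.nan,2.0],'TchlA.1':[0.3,0.4,np.nan]})
print(toy.dropna(subset=['Index','TchlA.1'],how='any'))   # only the first row survives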
# pandas creates its own index, and after dropping rows I like to reset it -
# this is just for convenience
dfPhyto.reset_index(drop=True,inplace=True)
# apply the formatting function to all rows in the Cruise column to get the year-(3 digit) format
dfPhyto['Cruise']=[fmtCruise(ii) for ii in dfPhyto['Cruise']]
# display part of the table, confirming that non-data rows have been removed
dfPhyto[48:60]
  | Bin # | Index | Subgroup | Cruise | Month | Station | Sample# | rep | depth | Diatoms-1.1 | Diatoms-2.1 | Prasinophytes.1 | Cryptophytes.1 | Dinoflagellates | Haptophytes.1 | Dictyochophytes | Raphidophytes | Cyanobacteria.1 | TchlA.1
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | 7 | 95.0 | 3 | 2018-029 | 2018-04-07 00:00:00 | 69 | 36 | A | 0 | 0.187208 | 0.053063 | 0.000209863 | 0.064098 | 0.000847701 | 0 | 0 | 0.00279431 | 0.000899571 | 0.30912 |
49 | 7 | 99.0 | 3 | 2018-029 | 2018-04-07 00:00:00 | ADCP | 60 | A | 0 | 0.19605 | 0.0455146 | 0.022445 | 0.150537 | 0.000617533 | 0 | 0.0101852 | 0 | 0 | 0.42535 |
50 | 7 | 101.0 | 3 | 2018-029 | 2018-04-07 00:00:00 | 65 | 70 | A | 0 | 0.190369 | 0.0611141 | 0.0298991 | 0.13724 | 0.00104771 | 0 | 0 | 0 | 0 | 0.41967 |
51 | 7 | 93.0 | 3 | 2018-029 | 2018-04-07 00:00:00 | 59 | 24 | A | 0 | 0.624601 | 0.0413709 | 0 | 0.0655129 | 0.000867299 | 0 | 0 | 0.00178241 | 0 | 0.734135 |
52 | 7 | 103.0 | 3 | 2018-029 | 2018-04-08 00:00:00 | 56 | 87 | A | 0 | 0.92955 | 0.00740467 | 0 | 0.0206606 | 0.0600011 | 0 | 0.0201656 | 0 | 0 | 1.03778 |
53 | 7 | 105.0 | 3 | 2018-029 | 2018-04-08 00:00:00 | 46 | 99 | A | 0 | 3.42264 | 0.392457 | 0 | 0.197309 | 0.0680703 | 0.150816 | 0 | 0 | 0 | 4.2313 |
54 | 7 | 107.0 | 3 | 2018-029 | 2018-04-08 00:00:00 | 42 | 115 | A | 0 | 11.6617 | 0 | 0 | 0 | 0.00175876 | 0.467956 | 0 | 0 | 0 | 12.1314 |
55 | 7 | 109.0 | 3 | 2018-029 | 2018-04-08 00:00:00 | 39 | 132 | A | 0 | 9.28337 | 0 | 0 | 0.00637019 | 0.090733 | 0.205921 | 0 | 0.00158903 | 0 | 9.58799 |
56 | 7 | 111.0 | 3 | 2018-029 | 2018-04-08 00:00:00 | GEO1 | 135 | A | 0 | 8.67672 | 0.0262928 | 0 | 0.146423 | 0.221538 | 0.415636 | 0.0321994 | 0 | 0 | 9.51881 |
57 | 7 | 113.0 | 3 | 2018-029 | 2018-04-09 00:00:00 | 27 | 151 | A | 0 | 6.3555 | 0.134811 | 0 | 0.234917 | 0.19905 | 0.418803 | 0.0603483 | 0 | 0 | 7.40343 |
58 | 7 | 115.0 | 3 | 2018-029 | 2018-04-09 00:00:00 | CPF1 | 154 | A | 0 | 2.43778 | 0.078365 | 7.92647e-05 | 0.00468596 | 0.00262092 | 0 | 0 | 0.00473855 | 0.000425651 | 2.52869 |
59 | 7 | 117.0 | 3 | 2018-029 | 2018-04-09 00:00:00 | 2 | 169 | A | 0 | 1.73363 | 0.417418 | 0.000101997 | 0.198702 | 0 | 0.100577 | 0 | 0 | 0.000486359 | 2.45091 |
# due to repeated column names in the original spreadsheet, '.1' was appended to the names
# of the phytoplankton columns;
# these lines correct the column names, removing the '.1':
renameDict=dict()
for colName in dfPhyto.keys():
if colName.endswith('.1'):
renameDict[colName]=colName.split('.1')[0]
dfPhyto.rename(columns=renameDict,inplace=True)
dfPhyto
  | Bin # | Index | Subgroup | Cruise | Month | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates | Haptophytes | Dictyochophytes | Raphidophytes | Cyanobacteria | TchlA
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 20 | 1.0 | 1 | 2018-005 | 2018-02-02 00:00:00 | SI | 9 | A | 0 | 0.153863 | 0.0714609 | 0.166427 | 0.0874306 | 0 | 0.185031 | 0.0149052 | 0.0734495 | 0 | 0.752567 |
1 | 20 | 3.0 | 1 | 2018-005 | 2018-02-02 00:00:00 | 59 | 23 | A | 0 | 0.205173 | 0.0166544 | 0.00668857 | 0.00172965 | 0.000179561 | 0 | 0 | 0 | 3.44658e-06 | 0.230429 |
2 | 20 | 5.0 | 1 | 2018-005 | 2018-02-02 00:00:00 | 56 | 37 | A | 0 | 0.175883 | 0.0100662 | 0.013261 | 0 | 0 | 0 | 1.1709e-05 | 0 | 0 | 0.199222 |
3 | 20 | 7.0 | 1 | 2018-005 | 2018-02-03 00:00:00 | 46 | 49 | A | 0 | 0.326609 | 0.0461415 | 0.131239 | 0.0709554 | 0 | 0.142245 | 0.00767466 | 0.0317834 | 0 | 0.756648 |
4 | 20 | 9.0 | 1 | 2018-005 | 2018-02-03 00:00:00 | 42 | 65 | A | 0 | 0.404061 | 0.0270814 | 0.095717 | 0.135733 | 0 | 0 | 0.0106509 | 0 | 0 | 0.673244 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
221 | 13 | 424.0 | 11 | 2018-032 | 2018-11-25 00:00:00 | 9 | 263 | A | 5 | 0.186443 | 0.000274801 | 0.0594893 | 0.0232346 | 0.000438217 | 0.00532654 | 0 | 0 | 0 | 0.275206 |
222 | 13 | 426.0 | 11 | 2018-032 | 2018-11-25 00:00:00 | 12 | 281 | A | 0 | 0.218516 | 0 | 0.10337 | 0.104374 | 0.000538635 | 0.00475885 | 8.41674e-05 | 0 | 0 | 0.431642 |
223 | 13 | 428.0 | 11 | 2018-032 | 2018-11-25 00:00:00 | 14 | 297 | A | 0 | 0.137821 | 0.000257954 | 0.091518 | 0.0677467 | 0.000241816 | 0.00206663 | 7.29151e-05 | 0 | 0 | 0.299725 |
224 | 13 | 430.0 | 11 | 2018-032 | 2018-11-25 00:00:00 | 16 | 308 | A | 0 | 0.159867 | 0 | 0.0727466 | 0.0962076 | 0.000445752 | 0.00355829 | 0 | 0 | 0 | 0.332825 |
225 | 13 | 432.0 | 11 | 2018-032 | 2018-11-25 00:00:00 | 22 | 311 | A | 0 | 0.228341 | 0 | 0.154744 | 0.162821 | 0.00065139 | 0.00554693 | 0.0100801 | 0 | 0 | 0.562185 |
226 rows × 19 columns
# This is the important step: join the two tables ('left' and 'right'),
# matching the cruise IDs and sample numbers
# how='outer' -> all rows from both the left and the right tables will be included,
# even if they cannot be matched; this makes it easy to check for
# unmatched data later
# left_on specifies the name of the column to match in the left table (dfbot)
# right_on specifies the name of the column to match in the right table (dfPhyto)
dfout = pd.merge(dfbot, dfPhyto, how='outer',
left_on=['Cruise','Sample_Number'], right_on = ['Cruise','Sample#'])
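# Tiny illustration of how='outer' merging on two keys (toy data, not from the files above):
# the unmatched bottle row is kept, with NaN in the phytoplankton columns.
left=pd.DataFrame({'Cruise':['2018-001','2018-001'],'Sample_Number':[1,2],'T':[8.1,7.9]})
right=pd.DataFrame({'Cruise':['2018-001'],'Sample#':[2],'TchlA':[0.5]})
print(pd.merge(left,right,how='outer',left_on=['Cruise','Sample_Number'],right_on=['Cruise','Sample#']))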
# Identify cases where phytoplankton data were matched to multiple samples in bottle data:
dftest=pd.merge(dfbot, dfPhyto,how='right', left_on=['Cruise','Sample_Number'],right_on = ['Cruise','Sample#'])
# show the column names in the resulting table
dfout.keys()
Index(['File Name', 'Zone', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'ADM:MISSION', 'LOC:STATION', 'ADM:PROJECT', 'Zone.1', 'Sample_Number', 'Pressure [decibar]', 'Depth [metres]', 'Temperature [deg C (ITS90)]', 'Transmissivity [*/metre]', 'Fluorescence [mg/m^3]', 'PAR [uE/m^2/sec]', 'PAR:Reference [uE/m^2/sec]', 'pH:SBE:Nominal', 'Salinity [PSS-78]', 'Oxygen:Dissolved:CTD [mL/L]', 'Oxygen:Dissolved:CTD [umol/kg]', 'Temperature:Draw [deg C (ITS90)]', 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted [mg/m^3]', 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]', 'Flag:Oxygen:Dissolved', 'Nitrate_plus_Nitrite [umol/L]', 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate', 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR', 'Cruise', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'Depth:Nominal [metres]', 'Alkalinity:Total [umol/L]', 'Flag:Alkalinity:Total', 'Carbon:Dissolved:Inorganic [umol/kg]', 'Flag:Carbon:Dissolved:Inorganic', 'Bottle_Number', 'Bottle:Firing_Sequence', 'Bin #', 'Index', 'Subgroup', 'Month', 'Station', 'Sample#', 'rep', 'depth', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates', 'Haptophytes', 'Dictyochophytes', 'Raphidophytes', 'Cyanobacteria', 'TchlA'], dtype='object')
temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})
temp.columns = ['icount']
np.unique(temp.icount)
array([1])
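# all group counts equal 1, so no phytoplankton sample was matched to more than one bottle row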
# check for Phyto samples matched to multiple bottle samples:
temp.loc[temp.icount>1]
(empty result: 0 rows)
# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]
(empty result: 0 rows)
temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})
temp2.columns = ['icount']
# this will catch phyto matched to multiple bottle but also bottle with duplicate sample numbers per cruise:
temp2.loc[temp2.icount>1]
Cruise | Sample_Number | icount
---|---|---|
2018-035 | 13 | 2
# if the output table is longer than either of the input tables, some rows were not matched
len(dfout), len(dfPhyto), len(dfbot)
(2082, 226, 2080)
# Check that the number of cells with data in the 'Cyanobacteria' column is
# the same for the input and output tables to show that no rows are missing:
np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)
(226, 226)
# If there were data rows from the phytoplankton table that were not matched to
# rows from the bottle table, their indices from the phytoplankton table would be
# displayed below (the series would not be empty); here, two such unmatched rows appear
print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Index'])
2080    141.0
2081    192.0
Name: Index, dtype: float64
dfout.loc[dfout['ADM:SCIENTIST'].isna(),['Index','Cruise','Diatoms-1','Prasinophytes']]
  | Index | Cruise | Diatoms-1 | Prasinophytes
---|---|---|---|---|
2080 | 141.0 | 2018-035 | 3.75868 | 0.36334 |
2081 | 192.0 | 2018-039 | 0.546726 | 0.00136988 |
# drop repetitive/unnecessary columns:
dfout.drop(labels=['Bin #', 'Index', 'Subgroup', 'Month', 'Station', 'Sample#', 'rep',
'depth',],axis=1,inplace=True)
# round phyto group values to 3 decimal places:
for col in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',
'Diatoms-2', 'Dinoflagellates', 'Haptophytes', 'Dictyochophytes', 'Raphidophytes',
'TchlA'):
dfout[col]=[np.round(ii,decimals=3) for ii in dfout[col]] # use list comprehension to set values for entire column
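# Note: an equivalent, vectorized alternative (assumed, not what the notebook ran) would be
# dfout[col]=dfout[col].round(3), which avoids the per-element list comprehension.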
dfout['Cyanobacteria']
0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
        ...
2077    NaN
2078    NaN
2079    0.000
2080    0.047
2081    0.001
Name: Cyanobacteria, Length: 2082, dtype: float64
# now write the output table to a .csv file:
dfout.to_csv(pathOut, index=False)
dfout.keys()
Index(['File Name', 'Zone', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'ADM:MISSION', 'LOC:STATION', 'ADM:PROJECT', 'Zone.1', 'Sample_Number', 'Pressure [decibar]', 'Depth [metres]', 'Temperature [deg C (ITS90)]', 'Transmissivity [*/metre]', 'Fluorescence [mg/m^3]', 'PAR [uE/m^2/sec]', 'PAR:Reference [uE/m^2/sec]', 'pH:SBE:Nominal', 'Salinity [PSS-78]', 'Oxygen:Dissolved:CTD [mL/L]', 'Oxygen:Dissolved:CTD [umol/kg]', 'Temperature:Draw [deg C (ITS90)]', 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted [mg/m^3]', 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]', 'Flag:Oxygen:Dissolved', 'Nitrate_plus_Nitrite [umol/L]', 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate', 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR', 'Cruise', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'Depth:Nominal [metres]', 'Alkalinity:Total [umol/L]', 'Flag:Alkalinity:Total', 'Carbon:Dissolved:Inorganic [umol/kg]', 'Flag:Carbon:Dissolved:Inorganic', 'Bottle_Number', 'Bottle:Firing_Sequence', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates', 'Haptophytes', 'Dictyochophytes', 'Raphidophytes', 'Cyanobacteria', 'TchlA'], dtype='object')