import numpy as np # this module handles arrays, but here we need it for its NaN value
import pandas as pd # this module contains a lot of tools for handling tabular data
from matplotlib import pyplot as plt
%matplotlib inline
# define paths to the source files and eventual output file
pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC//All 2015 SoG_bottle.xlsx'
pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2018 Abs phyto groupsCorrected.xlsx'
pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2015.csv'
# Read every cruise sheet from the 2015 bottle workbook and stack them into one table
sheetFrames=list()
for sheetName in ('2015-17','2015-20','2015-18','2015-21','2015-19'):
    # -99 and -99.9 are the workbook's missing-data sentinels -> NaN
    sdf=pd.read_excel(pathBottle,sheet_name=sheetName,verbose=True,na_values=(-99,-99.9))
    # record which sheet (cruise) each row came from
    sdf['Cruise']=sheetName
    sheetFrames.append(sdf)
dfbot=pd.concat(sheetFrames,ignore_index=True,sort=False)
# Drop columns that contain no data at all, and report which ones were removed
colsBefore=set(dfbot.keys())
dfbot.dropna(axis=1,how='all',inplace=True)
print('removed empty columns:',colsBefore-set(dfbot.keys()))
Reading sheet 2015-17 Reading sheet 2015-20 Reading sheet 2015-18 Reading sheet 2015-21 Reading sheet 2015-19 removed empty columns: {'Ammonium', 'Flag:Ammonium'}
# list the column names in the resulting table
# (useful for deciding which duplicated sensor columns to consolidate below)
print(dfbot.keys())
Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER', 'LOC:STATION', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'Sample_Number', 'Pressure', 'Temperature:Primary', 'Transmissivity', 'Fluorescence:URU:Seapoint', 'PAR', 'Salinity:T0:C0', 'Oxygen:Dissolved:SBE', 'Oxygen:Dissolved:SBE.1', 'pH:SBE:Nominal', 'Salinity:Bottle', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted', 'Flag:Chlorophyll:Extracted', 'Nitrate_plus_Nitrite', 'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate', 'Flag:Phosphate', 'Cruise', 'Temperature:Draw', 'Phaeo-Pigment:Extracted', 'Oxygen:Dissolved', 'Oxygen:Dissolved.1', 'Flag:Oxygen:Dissolved', 'Temperature:Secondary', 'Salinity:T1:C1', 'Number_of_bin_records'], dtype='object')
def subval(idf,colList):
    """Gap-fill a primary column from backup columns, then drop the backups.

    Parameters
    ----------
    idf : pandas.DataFrame
        Table to modify; edited in place and also returned.
    colList : sequence of str
        colList[0] is the column to keep; the remaining entries are backup
        columns consulted in order of precedence wherever the kept column
        is NaN. Any number of backups (>=1) is supported.

    Returns
    -------
    pandas.DataFrame
        idf with colList[0] gap-filled and the backup columns removed.
    """
    if len(colList)<2:
        raise ValueError('colList needs a primary column and at least one backup column')
    # chain fillna over the backups: earlier backups take precedence because
    # later fills only touch values that are still NaN
    filled=idf[colList[0]]
    for backup in colList[1:]:
        filled=filled.fillna(idf[backup])
    idf[colList[0]]=filled
    idf.drop(columns=list(colList[1:]),inplace=True)
    return idf
# Consolidate duplicated sensor columns: keep the primary reading, falling back
# to the secondary wherever the primary is missing
for primaryCol,secondaryCol in (('Oxygen:Dissolved','Oxygen:Dissolved.1'),
                                ('Temperature:Primary','Temperature:Secondary'),
                                ('Salinity:T0:C0','Salinity:T1:C1'),
                                ('Oxygen:Dissolved:SBE','Oxygen:Dissolved:SBE.1')):
    dfbot=subval(dfbot,(primaryCol,secondaryCol))
# shorten/standardize selected column names for the merged output
dfbot.rename(columns={'LOC:LATITUDE':'Lat',
                      'LOC:LONGITUDE':'Lon',
                      'Temperature:Primary':'Temperature',
                      'Oxygen:Dissolved:SBE':'Oxygen:Dissolved:CTD',
                      'Salinity:T0:C0':'Salinity',},inplace=True)
# helper applied to the spreadsheet's Index column; giving every non-numeric
# entry a consistent NaN makes it easy to drop non-data rows later
def convertIndex(val):
    """Return val as an int, or np.nan when it is not integer-like."""
    try:
        return int(val)
    except ValueError:
        return np.nan
# load the 2015 phytoplankton data with the following options:
# sheet_name='2015 CHEMTAX abs results' -> choose the 2015 sheet
# usecols='A:I,T:AC' -> read only columns A:I and T:AC from the Excel sheet
# skiprows=2 -> start reading at the 3rd row of the sheet,
#       which contains the column headings
# converters={'Index': convertIndex,} -> apply the function defined above to the
#       Index column so non-numeric entries become NaN
# verbose = True -> print extra information/ warnings/ errors
dfPhyto=pd.read_excel(pathPhyto,sheet_name='2015 CHEMTAX abs results',usecols='A:I,T:AC',
skiprows=2,converters={'Index': convertIndex,},
verbose=True)
Reading sheet 2015 CHEMTAX abs results
# display rows 48 to 59 of the resulting table
# (this slice spans a section boundary in the sheet, showing the embedded
# header/label rows that must be removed before the data can be used)
dfPhyto[48:60]
Bin # | Index | Subgroup | Cruise | Month | Station | Sample# | rep | depth | Cyanobacteria.1 | Prasinophytes.1 | Cryptophytes.1 | Diatoms-1.1 | Diatoms-2.1 | Dinoflagellates-1.1 | Haptophytes.1 | Dictyo.1 | Raphido.1 | TchlA (ug/L) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | 12 | 49.0 | 1 | 2015-17 | April | 22 | 303 | A | 0 | 0 | 0.0915533 | 0.0933184 | 9.81634 | 1.17253 | 0.00630329 | 0 | 0 | 0 | 11.1801 |
49 | 12 | 50.0 | 1 | 2015-17 | April | 22 | 303 | B | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
50 | Absolute Pigment Compositions - Bin # 16 | NaN | NaN | NaN | NaN | From Sheet: OutR4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
51 | Tchl_a | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
52 | Bin # | NaN | Subgroup | Cruise | Month | Station | Sample# | rep | depth | Cyanobacteria | Prasinophytes | Cryptophytes | Diatoms-1 | Diatoms-2 | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | TchlA |
53 | 16 | 51.0 | 2 | 2015-20 | June | CPF2 | 396 | A | 0 | 0.0672112 | 0.10427 | 0.0489969 | 0 | 0.0163572 | 0 | 0.184355 | 0 | 0.0422233 | 0.463414 |
54 | 16 | 52.0 | 2 | 2015-20 | June | CPF2 | 396 | B | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
55 | 16 | 53.0 | 2 | 2015-20 | June | CPF1 | 397 | A | 0 | 0.0043579 | 0.495659 | 0.462661 | 0.692599 | 0 | 0.0954998 | 0.246477 | 0.0565426 | 0.0569377 | 2.11073 |
56 | 16 | 54.0 | 2 | 2015-20 | June | CPF1 | 397 | B | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
57 | 16 | 55.0 | 2 | 2015-20 | June | 22 | 398 | A | 0 | 0.0140808 | 0.169415 | 0.087069 | 0.392301 | 0.0331626 | 3.15016e-06 | 0.0847585 | 0 | 0.00837901 | 0.789169 |
58 | 16 | 56.0 | 2 | 2015-20 | June | 22 | 398 | B | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
59 | 16 | 57.0 | 2 | 2015-20 | June | 24 | 399 | A | 0 | 0.0200784 | 0.101276 | 0.0692901 | 0.406107 | 0.00241722 | 0.000152693 | 0.00213347 | 6.5546e-05 | 0 | 0.60152 |
# Remove non-data rows: drop any row lacking a numeric 'Index' or a
# 'TchlA (ug/L)' value. This is why convertIndex was applied to the Index
# column -- every non-numeric Index entry is now a consistent NaN, so the
# section-header and label rows are easy to identify and remove here.
dfPhyto.dropna(subset=['Index', 'TchlA (ug/L)'],how='any',inplace=True)
# after dropping rows, rebuild a clean 0..N-1 row index for convenience
dfPhyto.reset_index(drop=True,inplace=True)
# display part of the table, confirming that non-data rows have been removed
# (the same 48:60 slice displayed earlier now contains only data rows)
dfPhyto[48:60]
Bin # | Index | Subgroup | Cruise | Month | Station | Sample# | rep | depth | Cyanobacteria.1 | Prasinophytes.1 | Cryptophytes.1 | Diatoms-1.1 | Diatoms-2.1 | Dinoflagellates-1.1 | Haptophytes.1 | Dictyo.1 | Raphido.1 | TchlA (ug/L) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | 7 | 95.0 | 3 | 2015-18 | June | 2 | 202 | A | 0 | 0.1201 | 0.0876395 | 0.151612 | 0 | 0 | 0 | 0.360452 | 0 | 0.0861721 | 0.805976 |
49 | 7 | 97.0 | 3 | 2015-18 | June | 3 | 218 | A | 0 | 0.108716 | 0.100695 | 0.113358 | 0 | 0 | 0 | 0.437597 | 0 | 0.00902126 | 0.769387 |
50 | 7 | 99.0 | 3 | 2015-18 | June | 6 | 249 | A | 0 | 0.0899446 | 0.0257313 | 0.0671602 | 0 | 0 | 0 | 0.347255 | 0 | 0 | 0.530091 |
51 | 7 | 101.0 | 3 | 2015-18 | June | 9 | 261 | A | 0 | 0.0929619 | 0.0873561 | 0.102004 | 0.00641763 | 0 | 0 | 0.279578 | 0 | 0 | 0.568318 |
52 | 7 | 103.0 | 3 | 2015-18 | June | 12 | 277 | A | 0 | 0.0786977 | 0.1356 | 0.168394 | 0 | 0 | 0 | 0.444634 | 0 | 0.0214413 | 0.848767 |
53 | 7 | 105.0 | 3 | 2015-18 | June | 14 | 292 | A | 0 | 0.0520265 | 0.135979 | 0.161225 | 0 | 0 | 0 | 0.933363 | 0 | 0 | 1.28259 |
54 | 7 | 107.0 | 3 | 2015-18 | June | 16 | 303 | A | 0 | 0.0162557 | 0.0659037 | 0.131307 | 0 | 0 | 0 | 0.79041 | 0 | 0 | 1.00388 |
55 | 1 | 109.0 | 4 | 2015-21 | Sept | 14 | 327 | A | 0 | 0.103725 | 0.411755 | 0.541998 | 0.644463 | 0.161644 | 0.0743051 | 0.794115 | 0.0455306 | 0.0427693 | 2.82031 |
56 | 1 | 111.0 | 4 | 2015-21 | Sept | 11 | 328 | A | 0 | 0.473666 | 0.863616 | 0.922717 | 0.198315 | 0.0325998 | 0.186888 | 1.37823 | 0.0334639 | 0.106627 | 4.19612 |
57 | 1 | 113.0 | 4 | 2015-21 | Sept | CPF2 | 329 | A | 0 | 0.270905 | 1.43341 | 0.910451 | 0.195409 | 0.61877 | 0 | 0.4964 | 0.127837 | 0.387924 | 4.4411 |
58 | 1 | 115.0 | 4 | 2015-21 | Sept | CPF1 | 330 | A | 0 | 0.369211 | 1.56642 | 0.752309 | 0.207375 | 0.392154 | 0.000738665 | 0.142002 | 0.130842 | 0.4017 | 3.96275 |
59 | 1 | 117.0 | 4 | 2015-21 | Sept | 22 | 331 | A | 0 | 0.21602 | 1.21956 | 0.471939 | 0.421989 | 0.256469 | 0.00168525 | 0.402357 | 0.0810156 | 0.180523 | 3.25155 |
# due to repeated column names in the original spreadsheet, pandas appended '.1'
# to the second occurrence of each phytoplankton column name;
# these lines correct the column names, removing the trailing '.1':
renameDict=dict()
for colName in dfPhyto.keys():
    if colName.endswith('.1'):
        # slice off exactly the last two characters; the previous
        # colName.split('.1')[0] cut at the FIRST '.1' occurrence and could
        # mangle any name containing '.1' in the middle
        renameDict[colName]=colName[:-2]
dfPhyto.rename(columns=renameDict,inplace=True)
dfPhyto
Bin # | Index | Subgroup | Cruise | Month | Station | Sample# | rep | depth | Cyanobacteria | Prasinophytes | Cryptophytes | Diatoms-1 | Diatoms-2 | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | TchlA (ug/L) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 12 | 1.0 | 1 | 2015-17 | April | SI | 9 | A | 0 | 0 | 0 | 0 | 24.4709 | 1.60687 | 0 | 0.0143717 | 0 | 0.0695617 | 26.1618 |
1 | 12 | 3.0 | 1 | 2015-17 | April | 59 | 23 | A | 0 | 3.57707e-05 | 4.01063e-05 | 0 | 0.337957 | 0.039984 | 0 | 0.0447033 | 2.42373e-05 | 0.000804232 | 0.423549 |
2 | 12 | 5.0 | 1 | 2015-17 | April | 102 | 37 | A | 0 | 0.00264467 | 0.021158 | 0.0450918 | 2.40675 | 0.610975 | 0.00463661 | 0.0242601 | 0.00118509 | 0.0118163 | 3.12852 |
3 | 12 | 7.0 | 1 | 2015-17 | April | 75 | 51 | A | 0 | 0.00488975 | 0.00514994 | 0.0671581 | 3.17296 | 0.881844 | 0.00563953 | 0.0438563 | 0.0151278 | 0.0224263 | 4.21906 |
4 | 12 | 9.0 | 1 | 2015-17 | April | 72 | 64 | A | 0 | 0 | 0.130189 | 0.326276 | 4.42956 | 0.991716 | 0.50695 | 0 | 0 | 0 | 6.38469 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
83 | 16 | 165.0 | 5 | 2015-19 | Sept | 9 | 248 | A | 0 | 0.0903742 | 0.844792 | 0.15371 | 0.552866 | 0.20032 | 0.000378653 | 0.314222 | 0.0342132 | 0.0465154 | 2.23739 |
84 | 16 | 167.0 | 5 | 2015-19 | Sept | 6 | 261 | A | 0 | 0.0838546 | 0.81372 | 0.308694 | 0.291579 | 0.32757 | 8.07944e-05 | 0.561729 | 0.0746783 | 0.0588162 | 2.52072 |
85 | 16 | 169.0 | 5 | 2015-19 | Sept | BS11 | 269 | A | 0 | 0 | 0 | 0.125485 | 10.1442 | 0 | 0 | 0 | 0 | 0 | 10.2697 |
86 | 16 | 171.0 | 5 | 2015-19 | Sept | 3 | 285 | A | 0 | 0.12782 | 0.515151 | 0.262525 | 0.634216 | 0.170571 | 0.000441982 | 0.48741 | 0.033452 | 0.1269 | 2.35849 |
87 | 16 | 173.0 | 5 | 2015-19 | Sept | 2 | 300 | A | 0 | 0.111882 | 1.00504 | 0.165712 | 1.52858 | 0.0134512 | 0 | 0.396609 | 0.0252189 | 0.216654 | 3.46315 |
88 rows × 19 columns
# This is the important step: join the two tables, matching cruise IDs and
# sample numbers between the bottle data (left) and phyto data (right).
# how='outer' -> keep all rows from BOTH tables even when they cannot be
#                matched, which makes unmatched data easy to check for later
# left_on  -> matching columns in the left table (dfbot)
# right_on -> matching columns in the right table (dfPhyto)
dfout = dfbot.merge(dfPhyto, how='outer',
                    left_on=['Cruise','Sample_Number'],
                    right_on=['Cruise','Sample#'])
# show the column names in the resulting table
dfout.keys()
Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER', 'LOC:STATION', 'Lat', 'Lon', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'Sample_Number', 'Pressure', 'Temperature', 'Transmissivity', 'Fluorescence:URU:Seapoint', 'PAR', 'Salinity', 'Oxygen:Dissolved:CTD', 'pH:SBE:Nominal', 'Salinity:Bottle', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted', 'Flag:Chlorophyll:Extracted', 'Nitrate_plus_Nitrite', 'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate', 'Flag:Phosphate', 'Cruise', 'Temperature:Draw', 'Phaeo-Pigment:Extracted', 'Oxygen:Dissolved', 'Flag:Oxygen:Dissolved', 'Number_of_bin_records', 'Bin #', 'Index', 'Subgroup', 'Month', 'Station', 'Sample#', 'rep', 'depth', 'Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1', 'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido', 'TchlA (ug/L)'], dtype='object')
# Identify cases where phytoplankton data were matched to multiple samples in
# bottle data: a right merge keeps every phyto row, so counting rows per
# (Cruise, Sample#) group reveals any one-to-many matches
dftest=dfbot.merge(dfPhyto,how='right',
                   left_on=['Cruise','Sample_Number'],
                   right_on=['Cruise','Sample#'])
temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})
# flatten the one-level MultiIndex column to a simple name
temp.columns = ['icount']
np.unique(temp.icount)
array([1])
# check for Phyto samples matched to multiple bottle samples:
# (np.unique above returned array([1]), so this selection should be empty)
temp.loc[temp.icount>1]
icount | ||
---|---|---|
Cruise | Sample# |
# check for phyto samples not matched to bottle samples:
# (every groupby group contains at least one row, so an empty result is expected)
temp.loc[temp.icount==0]
icount | ||
---|---|---|
Cruise | Sample# |
# count rows per (Cruise, Sample_Number) in the merged output table
temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})
temp2.columns = ['icount']
# this will catch phyto matched to multiple bottle but also bottle with duplicate sample numbers per cruise:
temp2.loc[temp2.icount>1]
icount | ||
---|---|---|
Cruise | Sample_Number |
# check for phyto samples not matched to bottle samples:
# NOTE(review): this repeats the earlier check verbatim (it queries `temp`);
# presumably `temp2` was intended here -- confirm
temp.loc[temp.icount==0]
icount | ||
---|---|---|
Cruise | Sample# |
# if the output table is longer than either of the input tables, some rows were not matched
# (here the output length equals the bottle-table length, consistent with every
# phyto row having found a bottle match)
len(dfout), len(dfPhyto), len(dfbot)
(938, 88, 938)
# Check that the number of cells with data in the 'Cyanobacteria' column is
# the same for the input and output tables:
# (the >=0 comparison is False for NaN, so each sum counts non-NaN data cells)
np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)
(88, 88)
# If there were data rows from the phytoplankton table that were not matched to
# rows from the bottle table, their indices from the phytoplankton table would be
# displayed below (the series [] would not be empty)
# ('ADM:SCIENTIST' comes only from the bottle table, so after the outer merge it
# is NaN exactly on phyto-only rows)
print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Index'])
Series([], Name: Index, dtype: float64)
# drop repetitive/unnecessary columns (bookkeeping fields from the phyto sheet
# and merge-key duplicates already present in the bottle data):
dfout.drop(labels=['Bin #', 'Index', 'Subgroup', 'Month', 'Station', 'Sample#', 'rep',
       'depth',],axis=1,inplace=True)
# round phyto group concentrations to 3 decimal places
for col in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',
       'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',
       'TchlA (ug/L)'):
    # vectorized Series.round replaces the previous per-element np.round list
    # comprehension; NaN entries stay NaN either way
    dfout[col]=dfout[col].round(3)
dfout.keys()
Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER', 'LOC:STATION', 'Lat', 'Lon', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'Sample_Number', 'Pressure', 'Temperature', 'Transmissivity', 'Fluorescence:URU:Seapoint', 'PAR', 'Salinity', 'Oxygen:Dissolved:CTD', 'pH:SBE:Nominal', 'Salinity:Bottle', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted', 'Flag:Chlorophyll:Extracted', 'Nitrate_plus_Nitrite', 'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate', 'Flag:Phosphate', 'Cruise', 'Temperature:Draw', 'Phaeo-Pigment:Extracted', 'Oxygen:Dissolved', 'Flag:Oxygen:Dissolved', 'Number_of_bin_records', 'Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1', 'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido', 'TchlA (ug/L)'], dtype='object')
# now write the output table to a .csv file:
# (index=False omits the pandas row index from the file)
dfout.to_csv(pathOut, index=False)