import numpy as np # this module handles arrays, but here we need it for its NaN value
import pandas as pd # this module contains a lot of tools for handling tabular data
import re
# define paths to the source files and eventual output file
pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2015 SoG_bottle.xlsx'  # bottle data: one Excel sheet per 2015 cruise
pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2019 SoG abs values New ALLO.xlsx'  # CHEMTAX phytoplankton absolute values
pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2015_NewALLO.csv'  # merged bottle+phyto table (csv)
def fmtCruise(istr):
    """Normalize a cruise ID to the 'YYYY-NNN' form.

    'YYYY-NN' gains a leading zero on the number part ('2015-17' -> '2015-017');
    'YYYY-NNN' is returned unchanged.

    Raises:
        ValueError: if istr matches neither pattern.
    """
    if re.fullmatch(r'[0-9]{4}-[0-9]{2}', istr):
        year, num = istr.split('-')
        return f'{year}-0{num}'
    if re.fullmatch(r'[0-9]{4}-[0-9]{3}', istr):
        return istr
    # format the offending value into the message instead of passing it as a
    # second exception arg (which would make str(exc) a tuple repr)
    raise ValueError(f'Input had unexpected format: {istr}')
# load every per-cruise sheet of the 2015 bottle Excel file, then stack them
# into a single table with a normalized Cruise column
sheetFrames = []
for sheetName in ('2015-17', '2015-20', '2015-18', '2015-21', '2015-19'):
    frame = pd.read_excel(pathBottle, sheet_name=sheetName, verbose=True,
                          na_values=(-99, -99.9), engine='openpyxl',
                          dtype={'Cruise': str, 'Sample_Number': np.int64})
    frame['Cruise'] = fmtCruise(sheetName)  # cruise ID is derived from the sheet name
    sheetFrames.append(frame)
dfbot = pd.concat(sheetFrames, ignore_index=True, sort=False)
# drop columns that contain no data at all, and report which ones were removed
colsBefore = set(dfbot.keys())
dfbot.dropna(axis=1, how='all', inplace=True)
print('removed empty columns:', colsBefore - set(dfbot.keys()))
Reading sheet 2015-17 Reading sheet 2015-20 Reading sheet 2015-18 Reading sheet 2015-21 Reading sheet 2015-19 removed empty columns: {'Ammonium', 'Flag:Ammonium'}
# list the column names in the resulting table (after empty-column removal)
print(dfbot.keys())
Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER', 'LOC:STATION', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'Sample_Number', 'Pressure', 'Temperature:Primary', 'Transmissivity', 'Fluorescence:URU:Seapoint', 'PAR', 'Salinity:T0:C0', 'Oxygen:Dissolved:SBE', 'Oxygen:Dissolved:SBE.1', 'pH:SBE:Nominal', 'Salinity:Bottle', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted', 'Flag:Chlorophyll:Extracted', 'Nitrate_plus_Nitrite', 'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate', 'Flag:Phosphate', 'Cruise', 'Temperature:Draw', 'Phaeo-Pigment:Extracted', 'Oxygen:Dissolved', 'Oxygen:Dissolved.1', 'Flag:Oxygen:Dissolved', 'Temperature:Secondary', 'Salinity:T1:C1', 'Number_of_bin_records'], dtype='object')
def subval(idf, colList):
    """Consolidate several columns of idf into one.

    colList[0] is the column to keep; subsequent entries are fallback columns
    used, in order of precedence, to fill NaNs in colList[0]. The fallback
    columns are then dropped from the table.

    Args:
        idf: DataFrame to modify in place.
        colList: sequence of column names; first is kept, the rest are
            fallbacks in order of precedence. Generalized to any number of
            fallbacks (the original implementation only handled 2-3 columns).

    Returns:
        The same DataFrame, modified in place, for chaining convenience.
    """
    primary = colList[0]
    for fallback in colList[1:]:
        # fillna keeps existing values and substitutes only where primary is NaN,
        # replacing the original row-by-row iterrows() scan with a vectorized op
        idf[primary] = idf[primary].fillna(idf[fallback])
    idf.drop(columns=list(colList[1:]), inplace=True)
    return idf
# consolidate duplicated sensor columns: each primary column is filled from
# its backup column, and the backup is dropped
for primaryAndBackup in (('Oxygen:Dissolved', 'Oxygen:Dissolved.1'),
                         ('Temperature:Primary', 'Temperature:Secondary'),
                         ('Salinity:T0:C0', 'Salinity:T1:C1'),
                         ('Oxygen:Dissolved:SBE', 'Oxygen:Dissolved:SBE.1')):
    dfbot = subval(dfbot, primaryAndBackup)
# shorten/standardize the remaining column names
dfbot.rename(columns={'LOC:LATITUDE': 'Lat',
                      'LOC:LONGITUDE': 'Lon',
                      'Temperature:Primary': 'Temperature',
                      'Oxygen:Dissolved:SBE': 'Oxygen:Dissolved:CTD',
                      'Salinity:T0:C0': 'Salinity'}, inplace=True)
# converter applied to the sample-number column on read;
# mapping non-numeric entries to a sentinel makes non-data rows easy to drop later
def convertIndex(val):
    """Coerce val to int; return the sentinel -999 when it is not numeric."""
    try:
        return int(val)
    except ValueError:
        # e.g. repeated header rows read as text
        return -999
# load the 2015 phytoplankton data with the following options:
# sheet_name='2015' -> choose the 2015 sheet
# skiprows=0 -> column headings are in the first row of the sheet
# converters={'Sample#': convertIndex,} -> apply the function defined above to the
#     Sample# column so non-numeric entries become the -999 sentinel
# verbose=True -> print extra information/warnings/errors
dfPhyto=pd.read_excel(pathPhyto,sheet_name='2015',engine='openpyxl',
                      skiprows=0,converters={'Sample#': convertIndex,},
                      verbose=True)
Reading sheet 2015
# display rows 48 to 59 of the resulting table; note row 48 is a repeated
# header row, which convertIndex has flagged with Sample# = -999
dfPhyto[48:60]
Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | Cruise | Date | Station | -999 | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA |
49 | 2015-019 | September | SI | 9 | mean | 0 | 0 | 0 | 0.262077 | 2.54479 | 0.541801 | 1.61053 | 0 | 0 | 0.00480152 | 4.964 |
50 | 2015-019 | September | 59 | 23 | mean | 0 | 0.824291 | 0.14173 | 0.0878956 | 0.241363 | 0.0219083 | 0.0624404 | 0.00493941 | 0.0391662 | 0.0122657 | 1.436 |
51 | 2015-019 | September | 102 | 38 | mean | 0 | 4.23697 | 1.2806 | 0.138505 | 0.452335 | 0.0626771 | 0.273765 | 0 | 0.0581462 | 0 | 6.503 |
52 | 2015-019 | September | 75 | 52 | mean | 0 | 1.97283 | 0.851509 | 0.415309 | 0.712103 | 0.100579 | 0.339134 | 0.0147496 | 0.159789 | 0 | 4.566 |
53 | 2015-019 | September | 72 | 65 | mean | 0 | 0.82108 | 0.189251 | 0.166689 | 0.258274 | 0.050802 | 0.134443 | 0.00717662 | 0.0565812 | 0.000703135 | 1.685 |
54 | 2015-019 | September | 69 | 77 | mean | 0 | 0.69704 | 0.191174 | 0.104355 | 0.156305 | 0.000429999 | 0.0710586 | 0.011922 | 0.0397155 | 0 | 1.272 |
55 | 2015-019 | September | ADCP | 87 | mean | 0 | 0.747473 | 0.233891 | 0.0708116 | 0.209912 | 0.0259796 | 0.0823137 | 0.00108747 | 0.0345318 | 0 | 1.406 |
56 | 2015-019 | September | 65 | 97 | mean | 0 | 0.734673 | 0.152787 | 0.0414143 | 0.169293 | 0.0156013 | 0.0699448 | 0.00710006 | 0.0217077 | 0.00447809 | 1.217 |
57 | 2015-019 | September | 63 | 108 | mean | 0 | 0.621756 | 0.132632 | 0.0337293 | 0.105831 | 0.0101963 | 0.0339016 | 0.013503 | 0.021451 | 0 | 0.973 |
58 | 2015-019 | September | 62 | 119 | mean | 0 | 0.63392 | 0.136818 | 0.0389149 | 0.146725 | 0.00871505 | 0.00812858 | 0.0221678 | 0 | 0.000610712 | 0.996 |
59 | 2015-019 | September | 56 | 133 | mean | 0 | 1.16464 | 0.0481537 | 0.193494 | 0.282189 | 0.0108959 | 0.0248796 | 0.0161239 | 0.0417164 | 0.0489061 | 1.831 |
# drop non-data rows: keep only rows with a TchlA value and a non-negative
# Sample#. convertIndex mapped every non-numeric Sample# entry (e.g. repeated
# header rows) to -999, so they are easy to identify and remove here.
dropNoChl = dfPhyto['TchlA'].isna()
dropBadSample = dfPhyto['Sample#'] < 0
dfPhyto = dfPhyto.loc[~(dropNoChl | dropBadSample)].reset_index(drop=True)
# (pandas keeps the original index after filtering; resetting it is just for convenience)
# display part of the table, confirming that non-data rows have been removed
# (all Sample# values should now be valid non-negative integers)
dfPhyto.loc[:10,['Cruise','Sample#']]
Cruise | Sample# | |
---|---|---|
0 | 2015-017 | 9 |
1 | 2015-017 | 23 |
2 | 2015-017 | 37 |
3 | 2015-017 | 51 |
4 | 2015-017 | 64 |
5 | 2015-017 | 76 |
6 | 2015-017 | 86 |
7 | 2015-017 | 96 |
8 | 2015-017 | 109 |
9 | 2015-017 | 120 |
10 | 2015-017 | 134 |
# display the first rows of the bottle table to compare cruise IDs and sample
# numbering with the phyto table above
dfbot.loc[:20,['Cruise','Sample_Number']]
Cruise | Sample_Number | |
---|---|---|
0 | 2015-017 | 1 |
1 | 2015-017 | 2 |
2 | 2015-017 | 3 |
3 | 2015-017 | 4 |
4 | 2015-017 | 5 |
5 | 2015-017 | 6 |
6 | 2015-017 | 7 |
7 | 2015-017 | 8 |
8 | 2015-017 | 9 |
9 | 2015-017 | 10 |
10 | 2015-017 | 11 |
11 | 2015-017 | 12 |
12 | 2015-017 | 13 |
13 | 2015-017 | 14 |
14 | 2015-017 | 15 |
15 | 2015-017 | 16 |
16 | 2015-017 | 17 |
17 | 2015-017 | 18 |
18 | 2015-017 | 19 |
19 | 2015-017 | 20 |
20 | 2015-017 | 21 |
# due to repeated column names in the original spreadsheet, '.1' was appended
# to the names of the phytoplankton columns;
# these lines correct the column names by removing the trailing '.1'.
# Slicing off the last two characters (rather than split('.1')[0]) avoids
# truncating a name at the FIRST occurrence of '.1' anywhere in the string.
renameDict = dict()
for colName in dfPhyto.keys():
    if colName.endswith('.1'):
        renameDict[colName] = colName[:-2]
dfPhyto.rename(columns=renameDict, inplace=True)
dfPhyto
Cruise | Date | Station | Sample# | rep | depth | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-017 | April | SI | 9 | mean | 0 | 23.5805 | 2.41835 | 0 | 0 | 0 | 0 | 0 | 0.163169 | 0 | 26.162 |
1 | 2015-017 | April | 59 | 23 | mean | 0 | 0.271799 | 0.0303002 | 0 | 0.0521275 | 0.00467348 | 0.0801 | 0 | 0 | 0 | 0.439 |
2 | 2015-017 | April | 102 | 37 | mean | 0 | 2.03334 | 0.809783 | 0.0442965 | 0.18086 | 0.0245126 | 0.00410068 | 0.00483189 | 0.0233698 | 0.00390702 | 3.129 |
3 | 2015-017 | April | 75 | 51 | mean | 0 | 2.72243 | 1.09346 | 0.0387971 | 0.25975 | 0.0294629 | 0.0185552 | 0.0136585 | 0.0328907 | 0.0100036 | 4.219 |
4 | 2015-017 | April | 72 | 64 | mean | 0 | 3.34139 | 1.59186 | 0.172997 | 0.924837 | 0.329514 | 0 | 0.00213607 | 0.0222662 | 0 | 6.385 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
65 | 2015-019 | September | 9 | 248 | mean | 0 | 0.558597 | 0.207741 | 0.605681 | 0.433424 | 0.000954912 | 0.200797 | 0.031296 | 0.0739315 | 0.138577 | 2.251 |
66 | 2015-019 | September | 6 | 261 | mean | 0 | 0.256106 | 0.254845 | 0.556941 | 0.826706 | 5.79139e-05 | 0.378296 | 0.0619777 | 0.0707657 | 0.124304 | 2.53 |
67 | 2015-019 | September | BS11 | 269 | mean | 0 | 9.83601 | 0.0171224 | 0.0109873 | 0.344913 | 0.00197183 | 0.033415 | 0 | 0 | 0.025584 | 10.27 |
68 | 2015-019 | September | 3 | 285 | mean | 0 | 0.574524 | 0.140787 | 0.34785 | 0.697251 | 0.00785798 | 0.309306 | 0.0253154 | 0.0967167 | 0.171391 | 2.371 |
69 | 2015-019 | September | 2 | 300 | mean | 0 | 1.63844 | 0 | 0.71315 | 0.459814 | 0.00299931 | 0.270579 | 0.0199415 | 0.188252 | 0.169828 | 3.463 |
70 rows × 16 columns
# This is the important step: join the bottle ('left') and phytoplankton
# ('right') tables, matching cruise IDs and sample numbers.
# how='outer' -> all rows from BOTH tables are kept, even unmatched ones,
#                which makes it easy to check for unmatched data later
# left_on/right_on name the matching columns in dfbot and dfPhyto respectively
dfout = dfbot.merge(dfPhyto, how='outer',
                    left_on=['Cruise', 'Sample_Number'],
                    right_on=['Cruise', 'Sample#'])
# show the column names in the resulting table
dfout.keys()
Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER', 'LOC:STATION', 'Lat', 'Lon', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'Sample_Number', 'Pressure', 'Temperature', 'Transmissivity', 'Fluorescence:URU:Seapoint', 'PAR', 'Salinity', 'Oxygen:Dissolved:CTD', 'pH:SBE:Nominal', 'Salinity:Bottle', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted', 'Flag:Chlorophyll:Extracted', 'Nitrate_plus_Nitrite', 'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate', 'Flag:Phosphate', 'Cruise', 'Temperature:Draw', 'Phaeo-Pigment:Extracted', 'Oxygen:Dissolved', 'Flag:Oxygen:Dissolved', 'Number_of_bin_records', 'Date', 'Station', 'Sample#', 'rep', 'depth', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido', 'Cyanobacteria', 'TchlA'], dtype='object')
# Identify cases where phytoplankton data were matched to multiple samples in
# bottle data: a right join keeps every phyto row, so counting rows per
# (Cruise, Sample#) group reveals any duplicates
dftest = dfbot.merge(dfPhyto, how='right',
                     left_on=['Cruise', 'Sample_Number'],
                     right_on=['Cruise', 'Sample#'])
temp = dftest.groupby(['Cruise', 'Sample#']).agg(icount=('Cruise', 'count'))
np.unique(temp.icount)
array([1])
# check for Phyto samples matched to multiple bottle samples
# (an empty result means there are none):
temp.loc[temp.icount>1]
icount | ||
---|---|---|
Cruise | Sample# |
# check for phyto samples not matched to bottle samples:
# NOTE(review): a groupby count over existing rows cannot be 0, so this can
# only ever be empty; unmatched phyto rows are caught by the ADM:SCIENTIST
# check further below — confirm intent
temp.loc[temp.icount==0]
icount | ||
---|---|---|
Cruise | Sample# |
# count merged rows per (Cruise, Sample_Number) in the full outer-merge result
temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})
temp2.columns = ['icount']  # flatten the MultiIndex column produced by agg
# this will catch phyto matched to multiple bottle but also bottle with duplicate sample numbers per cruise:
temp2.loc[temp2.icount>1]
icount | ||
---|---|---|
Cruise | Sample_Number |
# check for phyto samples not matched to bottle samples:
# NOTE(review): this repeats the earlier check on `temp`; it may have been
# intended to inspect `temp2` instead — confirm
temp.loc[temp.icount==0]
icount | ||
---|---|---|
Cruise | Sample# |
# if the output table is longer than either of the input tables, some columns were not matched
# (here len(dfout) == len(dfbot), so every phyto row found exactly one bottle row)
len(dfout), len(dfPhyto), len(dfbot)
(938, 70, 938)
# display the merged table
dfout
File Name | Zone | FIL:START TIME YYYY/MM/DD HH:MM:SS | LOC:EVENT_NUMBER | LOC:STATION | Lat | Lon | LOC:WATER DEPTH | ADM:SCIENTIST | Sample_Number | ... | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-17-0001.che | UTC | 2015-04-01 23:42:17 | 1 | SI | 48.65233 | -123.50183 | 187 | Chandler P. | 1 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 2015-17-0001.che | UTC | 2015-04-01 23:42:17 | 1 | SI | 48.65233 | -123.50183 | 187 | Chandler P. | 2 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 2015-17-0001.che | UTC | 2015-04-01 23:42:17 | 1 | SI | 48.65233 | -123.50183 | 187 | Chandler P. | 3 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 2015-17-0001.che | UTC | 2015-04-01 23:42:17 | 1 | SI | 48.65233 | -123.50183 | 187 | Chandler P. | 4 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 2015-17-0001.che | UTC | 2015-04-01 23:42:17 | 1 | SI | 48.65233 | -123.50183 | 187 | Chandler P. | 5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
933 | 2015-19-0091.che | UTC | 2015-10-04 00:30:55 | 91 | 2 | 49.40167 | -124.15600 | 279 | Chandler P. | 296 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
934 | 2015-19-0091.che | UTC | 2015-10-04 00:30:55 | 91 | 2 | 49.40167 | -124.15600 | 279 | Chandler P. | 297 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
935 | 2015-19-0091.che | UTC | 2015-10-04 00:30:55 | 91 | 2 | 49.40167 | -124.15600 | 279 | Chandler P. | 298 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
936 | 2015-19-0091.che | UTC | 2015-10-04 00:30:55 | 91 | 2 | 49.40167 | -124.15600 | 279 | Chandler P. | 299 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
937 | 2015-19-0091.che | UTC | 2015-10-04 00:30:55 | 91 | 2 | 49.40167 | -124.15600 | 279 | Chandler P. | 300 | ... | 1.63844 | 0 | 0.71315 | 0.459814 | 0.00299931 | 0.270579 | 0.0199415 | 0.188252 | 0.169828 | 3.463 |
938 rows × 49 columns
# Check that the number of cells with data in the 'Cyanobacteria' column is
# the same for the input and output tables (i.e. no phyto data were lost or
# duplicated in the merge):
np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)
(70, 70)
# If there were data rows from the phytoplankton table that were not matched to
# rows from the bottle table, their indices from the phytoplankton table would be
# displayed below (the series [] would not be empty).
# ADM:SCIENTIST comes from the bottle table, so it is NaN exactly on
# outer-merge rows that originated solely from the phyto table.
print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Sample#'])
Series([], Name: Sample#, dtype: float64)
# show the column names in the merged table before dropping duplicated columns
dfout.keys()
Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER', 'LOC:STATION', 'Lat', 'Lon', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'Sample_Number', 'Pressure', 'Temperature', 'Transmissivity', 'Fluorescence:URU:Seapoint', 'PAR', 'Salinity', 'Oxygen:Dissolved:CTD', 'pH:SBE:Nominal', 'Salinity:Bottle', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted', 'Flag:Chlorophyll:Extracted', 'Nitrate_plus_Nitrite', 'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate', 'Flag:Phosphate', 'Cruise', 'Temperature:Draw', 'Phaeo-Pigment:Extracted', 'Oxygen:Dissolved', 'Flag:Oxygen:Dissolved', 'Number_of_bin_records', 'Date', 'Station', 'Sample#', 'rep', 'depth', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido', 'Cyanobacteria', 'TchlA'], dtype='object')
# drop columns duplicated from the phytoplankton table (the bottle table
# already carries equivalent identifying information)
dfout.drop(columns=['Date', 'Station', 'Sample#', 'rep', 'depth'],
           inplace=True)
# truncate phyto group values to 3 decimal places:
for groupCol in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',
                 'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo',
                 'Raphido', 'TchlA'):
    # round element-wise; the merged columns may be object dtype, so operate
    # value-by-value rather than with a dtype-dependent Series.round
    dfout[groupCol] = dfout[groupCol].map(lambda v: np.round(v, decimals=3))
# Check for unexpected depths: presumably phyto samples were taken from
# near-surface bottles, so no rows with Pressure > 10 and phyto data are
# expected (empty result confirms this) — TODO confirm the 10 dbar cutoff
dfout.loc[((dfout['Pressure']>10))&(dfout['Diatoms-1']>=0)]#,
# ['FIL:START TIME YYYY/MM/DD HH:MM:SS','Pressure [decibar]','Depth [metres]','Diatoms-1']]
File Name | Zone | FIL:START TIME YYYY/MM/DD HH:MM:SS | LOC:EVENT_NUMBER | LOC:STATION | Lat | Lon | LOC:WATER DEPTH | ADM:SCIENTIST | Sample_Number | ... | Diatoms-1 | Diatoms-2 | Prasinophytes | Cryptophytes | Dinoflagellates-1 | Haptophytes | Dictyo | Raphido | Cyanobacteria | TchlA |
---|
0 rows × 44 columns
# final column listing before writing the merged table out
dfout.keys()
Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER', 'LOC:STATION', 'Lat', 'Lon', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'Sample_Number', 'Pressure', 'Temperature', 'Transmissivity', 'Fluorescence:URU:Seapoint', 'PAR', 'Salinity', 'Oxygen:Dissolved:CTD', 'pH:SBE:Nominal', 'Salinity:Bottle', 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted', 'Flag:Chlorophyll:Extracted', 'Nitrate_plus_Nitrite', 'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate', 'Flag:Phosphate', 'Cruise', 'Temperature:Draw', 'Phaeo-Pigment:Extracted', 'Oxygen:Dissolved', 'Flag:Oxygen:Dissolved', 'Number_of_bin_records', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido', 'Cyanobacteria', 'TchlA'], dtype='object')
# now write the output table to a .csv file:
# index=False -> do not write the pandas row index as an extra column
dfout.to_csv(pathOut, index=False)