#!/usr/bin/env python
# coding: utf-8

# Merge the 2018 Strait of Georgia bottle data with the CHEMTAX absolute
# phytoplankton-composition data and prepare a single combined table.

# In[1]:

import numpy as np   # this module handles arrays, but here we need it for its NaN value
import pandas as pd  # this module contains a lot of tools for handling tabular data
import re

# In[2]:

# define paths to the source files and eventual output file
#pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2018 SoG bottle.xlsx'
#pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2018 Abs phyto groups.xlsx'
pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2018 SoG bottleCorrected.xlsx'
pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2018 Abs phyto groupsCorrected.xlsx'
pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2018.csv'

# In[3]:

# formatting function to get year-(3digit) format
def fmtCruise(istr):
    """Normalize a cruise ID to the year-NNN (3-digit) format.

    '2018-01' becomes '2018-001'; '2018-001' is returned unchanged.

    :param istr: cruise identifier string
    :returns: identifier in 'YYYY-NNN' form
    :raises ValueError: if istr matches neither expected pattern
    """
    if re.fullmatch(r'[0-9]{4}-[0-9]{2}', istr):
        sp = re.split('-', istr)
        rstr = sp[0] + '-0' + sp[1]
    elif re.fullmatch(r'[0-9]{4}-[0-9]{3}', istr):
        rstr = istr
    else:
        raise ValueError('Input had unexpected format:', istr)
    return rstr

# In[4]:

# get names of sheets in notebook
with pd.ExcelFile(pathBottle) as xl:
    sheets = xl.sheet_names
print(sheets)

# In[5]:

# load each sheet in the bottle Excel file and concatenate them together into one table
dfbotlist = list()
for sheet in sheets:
    # read each sheet; include additional na values
    df0 = pd.read_excel(pathBottle, sheet_name=sheet, verbose=True,
                        na_values=(-99, -99.9))
    # create and populate Cruise column based on sheet name
    df0['Cruise'] = fmtCruise(sheet)
    dfbotlist.append(df0)  # append the sheet to a list
# concatenate the list into a single table
dfbot = pd.concat(dfbotlist, ignore_index=True, sort=False)
# Drop columns with no data in them
l1 = set(dfbot.keys())
dfbot.dropna(axis=1, how='all', inplace=True)
print('removed empty columns:', l1 - set(dfbot.keys()))

# In[6]:

# list the column names in the resulting table
print(dfbot.keys())

# In[7]:

# no rows returned, so there are no rows with both primary and CTD depth values
print(np.sum(dfbot['Depth [metres]'] >= 0),
      np.sum(dfbot['Depth:CTD [metres]'] >= 0))
dfbot.loc[(dfbot['Depth [metres]'] >= 0) &
          (dfbot['Depth:CTD [metres]'] >= 0)]

# In[8]:

# no rows returned, so no row has more than one of the three temperature fields
print(np.sum(dfbot['Temperature:Primary [deg C (ITS90)]'] >= 0),
      np.sum(dfbot['Temperature:Secondary [deg C (ITS90)]'] >= 0),
      np.sum(dfbot['Temperature:CTD [deg C (ITS90)]'] >= 0))
dfbot.loc[((dfbot['Temperature:Primary [deg C (ITS90)]'] >= 0).astype(int) +
           (dfbot['Temperature:Secondary [deg C (ITS90)]'] >= 0).astype(int) +
           (dfbot['Temperature:CTD [deg C (ITS90)]'] >= 0).astype(int)) > 1]

# In[9]:

# no rows returned, so no row has more than one of the three fluorescence fields
print(np.sum(dfbot['Fluorescence:URU:Seapoint [mg/m^3]'] >= 0),
      np.sum(dfbot['Fluorescence:URU:CTD [mg/m^3]'] >= 0),
      np.sum(dfbot['Fluorescence:URU:Wetlabs [mg/m^3]'] >= 0))
dfbot.loc[((dfbot['Fluorescence:URU:Seapoint [mg/m^3]'] >= 0).astype(int) +
           (dfbot['Fluorescence:URU:CTD [mg/m^3]'] >= 0).astype(int) +
           (dfbot['Fluorescence:URU:Wetlabs [mg/m^3]'] >= 0).astype(int)) > 1]

# In[10]:

# no rows returned, so there are no rows with both Seapoint and CTD fluorescence values
print(np.sum(dfbot['Fluorescence:URU:Seapoint [mg/m^3]'] >= 0),
      np.sum(dfbot['Fluorescence:URU:CTD [mg/m^3]'] >= 0))
dfbot.loc[(dfbot['Fluorescence:URU:Seapoint [mg/m^3]'] >= 0) &
          (dfbot['Fluorescence:URU:CTD [mg/m^3]'] >= 0)]

# In[11]:

# no rows returned, so no row has more than one of the three salinity fields
print(np.sum(dfbot['Salinity:T0:C0 [PSS-78]'] >= 0),
      np.sum(dfbot['Salinity:T1:C1 [PSS-78]'] >= 0),
      np.sum(dfbot['Salinity:CTD [PSS-78]'] >= 0))
dfbot.loc[((dfbot['Salinity:T0:C0 [PSS-78]'] >= 0).astype(int) +
           (dfbot['Salinity:T1:C1 [PSS-78]'] >= 0).astype(int) +
           (dfbot['Salinity:CTD [PSS-78]'] >= 0).astype(int)) > 1]

# In[12]:

# no rows returned, so there are no rows with both primary and CTD pressure values
print(np.sum(dfbot['Pressure [decibar]'] >= 0),
      np.sum(dfbot['Pressure:CTD [decibar]'] >= 0))
dfbot.loc[(dfbot['Pressure [decibar]'] >= 0) &
          (dfbot['Pressure:CTD [decibar]'] >= 0)]

# In[13]:

# no rows returned, so there are no rows with both primary and CTD depth values
# (duplicate of the In[7] check, kept from the original notebook)
print(np.sum(dfbot['Depth [metres]'] >= 0),
      np.sum(dfbot['Depth:CTD [metres]'] >= 0))
dfbot.loc[(dfbot['Depth [metres]'] >= 0) &
          (dfbot['Depth:CTD [metres]'] >= 0)]

# In[14]:

# no rows returned, so there are no rows with both SBE and CTD oxygen volume values
print(np.sum(dfbot['Oxygen:Dissolved:SBE [mL/L]'] >= 0),
      np.sum(dfbot['Oxygen:Dissolved:Volume:CTD [mL/L]'] >= 0))
dfbot.loc[(dfbot['Oxygen:Dissolved:SBE [mL/L]'] >= 0) &
          (dfbot['Oxygen:Dissolved:Volume:CTD [mL/L]'] >= 0)]

# In[15]:

# no rows returned, so there are no rows with both SBE and CTD oxygen mass values
print(np.sum(dfbot['Oxygen:Dissolved:SBE [umol/kg]'] >= 0),
      np.sum(dfbot['Oxygen:Dissolved:Mass:CTD [umol/kg]'] >= 0))
dfbot.loc[(dfbot['Oxygen:Dissolved:SBE [umol/kg]'] >= 0) &
          (dfbot['Oxygen:Dissolved:Mass:CTD [umol/kg]'] >= 0)]

# In[16]:

def subval(idf, colList):
    """Collapse duplicate columns into a single column.

    The first entry in colList is the column to keep; the remaining
    columns, in order of precedence, supply values where the kept column
    is NaN.  The fallback columns are then dropped from the table.

    Generalized from the original 2/3-column implementation: any number
    of fallback columns is supported, and Series.fillna replaces the
    original per-row iterrows() loop (identical result, vectorized).

    :param idf: DataFrame to modify in place
    :param colList: sequence of column names, kept column first
    :returns: the modified DataFrame (same object as idf)
    """
    keep = colList[0]
    for fallback in colList[1:]:
        idf[keep] = idf[keep].fillna(idf[fallback])
    idf.drop(columns=list(colList[1:]), inplace=True)
    return idf

# In[17]:

# there are some duplicate columns here; handle them:
dfbot = subval(dfbot, ('FIL:START TIME YYYY/MM/DD HH:MM:SS',
                       'YYYY/MM/DD HH:MM:SS'))
dfbot = subval(dfbot, ('Temperature:Primary [deg C (ITS90)]',
                       'Temperature:Secondary [deg C (ITS90)]',
                       'Temperature:CTD [deg C (ITS90)]'))
dfbot = subval(dfbot, ('Salinity:T0:C0 [PSS-78]',
                       'Salinity:T1:C1 [PSS-78]',
                       'Salinity:CTD [PSS-78]'))
dfbot = subval(dfbot, ('Pressure [decibar]',
                       'Pressure:CTD [decibar]'))
dfbot = subval(dfbot, ('Depth [metres]',
                       'Depth:CTD [metres]'))
dfbot = subval(dfbot, ('Oxygen:Dissolved:SBE [mL/L]',
                       'Oxygen:Dissolved:Volume:CTD [mL/L]'))
dfbot = subval(dfbot, ('Oxygen:Dissolved:SBE [umol/kg]',
                       'Oxygen:Dissolved:Mass:CTD [umol/kg]'))
dfbot = subval(dfbot, ('Fluorescence:URU:Seapoint [mg/m^3]',
                       'Fluorescence:URU:CTD [mg/m^3]',
                       'Fluorescence:URU:Wetlabs [mg/m^3]'))
# rename the collapsed columns to sensor-neutral names
dfbot.rename(columns={
    'Temperature:Primary [deg C (ITS90)]': 'Temperature [deg C (ITS90)]',
    'Salinity:T0:C0 [PSS-78]': 'Salinity [PSS-78]',
    'Oxygen:Dissolved:SBE [mL/L]': 'Oxygen:Dissolved:CTD [mL/L]',
    'Oxygen:Dissolved:SBE [umol/kg]': 'Oxygen:Dissolved:CTD [umol/kg]',
    'Fluorescence:URU:Seapoint [mg/m^3]': 'Fluorescence [mg/m^3]',
}, inplace=True)

# In[18]:

# define a function that will be applied to the values in the index column;
# this makes it easier to drop non-data rows later
def convertIndex(val):
    """Coerce an Index cell to int; non-numeric entries become NaN."""
    try:
        x = int(val)
    except ValueError:
        x = np.nan
    return x

# In[19]:

# load the phytoplankton data with the following options:
# sheet_name='2018 CHEMTAX abs results ' -> choose the sheet
#     (NOTE: the trailing space in the sheet name is present in the workbook)
# usecols='A:I,T:AC' -> read only columns A:I and T:AC from the Excel sheet
# skiprows=2 -> start reading at the 3rd row of the sheet,
#               which contains the column headings
# converters={'Index': convertIndex,} -> apply the function defined above to the Index column
# verbose = True -> print extra information/ warnings/ errors
dfPhyto = pd.read_excel(pathPhyto, sheet_name='2018 CHEMTAX abs results ',
                        usecols='A:I,T:AC', skiprows=2,
                        converters={'Index': convertIndex,},
                        verbose=True)

# In[20]:

# display rows 48 to 59 of the resulting table
dfPhyto[48:60]

# In[21]:

# now, drop any rows from the table that have NaN values in either of the columns
# 'Index' or 'TchlA.1'
# This is why we applied a function to the Index column to make sure all
# non-numeric Index values would have a consistent NaN entry, making them easy
# to identify and remove
dfPhyto.dropna(subset=['Index', 'TchlA.1'], how='any', inplace=True)

# In[22]:

# pandas creates its own index, and after dropping rows I like to reset it -
# this is just for convenience
dfPhyto.reset_index(drop=True, inplace=True)

# In[23]:

# apply formatting function all rows in Cruise column to get year-3digit format
dfPhyto['Cruise'] = [fmtCruise(ii) for ii in dfPhyto['Cruise']]

# In[24]:

# display part of the table, confirming that non-data rows have been removed
dfPhyto[48:60]

# In[25]:

# due to repeated column names in the original spreadsheet, '.1' was appended
# to the names of the phytoplankton columns;
# these lines correct the column names, removing the '.1':
renameDict = dict()
for colName in dfPhyto.keys():
    if colName.endswith('.1'):
        # strip only the trailing '.1'; the original split('.1')[0] would also
        # have truncated at any earlier '.1' occurring inside the name
        renameDict[colName] = colName[:-2]
dfPhyto.rename(columns=renameDict, inplace=True)

# In[26]:

dfPhyto

# In[27]:

# This is the important step- join the two tables ('left' and 'right'),
# matching the cruise IDs and sample numbers
# how='outer' -> all rows from both the left and the right tables will be
#                included, even if they cannot be matched;
#                this makes it easy to check for unmatched data later
# left_on specifies the name of the column to match in the left table (dfbot)
# right_on specifies the name of the column to match in the right table (dfPhyto)
dfout = pd.merge(dfbot, dfPhyto, how='outer',
                 left_on=['Cruise', 'Sample_Number'],
                 right_on=['Cruise', 'Sample#'])

# In[28]:

# Identify cases where phytoplankton data were matched to multiple samples in bottle data:
dftest = pd.merge(dfbot, dfPhyto, how='right',
                  left_on=['Cruise', 'Sample_Number'],
                  right_on=['Cruise', 'Sample#'])

# In[29]:

# show the column names in the resulting table
dfout.keys()

# ### Checks

# In[30]:

# count how many bottle rows each phyto sample was matched to
temp = dftest.groupby(['Cruise', 'Sample#']).agg({'Cruise': ['count']})
temp.columns = ['icount']

# In[31]:

np.unique(temp.icount)

# In[32]:

# check for Phyto samples matched to multiple bottle samples:
temp.loc[temp.icount > 1]

# In[33]:

# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount == 0]

# In[34]:

temp2 = dfout.groupby(['Cruise', 'Sample_Number']).agg({'Cruise': ['count']})
temp2.columns = ['icount']
# this will catch phyto matched to multiple bottle but also bottle with
# duplicate sample numbers per cruise:
temp2.loc[temp2.icount > 1]

# In[35]:

# check for phyto samples not matched to bottle samples (repeat of In[33]):
temp.loc[temp.icount == 0]

# In[36]:

# if the output table is longer than either of the input tables, some rows were not matched
len(dfout), len(dfPhyto), len(dfbot)

# In[37]:

# Check that the number of cells with data in the 'Cyanobacteria' column is
# the same for the input and output tables to show that no rows are missing:
np.sum(dfPhyto['Cyanobacteria'] >= 0), np.sum(dfout['Cyanobacteria'] >= 0)

# In[38]:

# If there were data rows from the phytoplankton table that were not matched to
# rows from the bottle table, their indices from the phytoplankton table would
# be displayed below (the series [] would not be empty)
print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Index'])

# In[39]:
dfout.loc[dfout['ADM:SCIENTIST'].isna(),
          ['Index', 'Cruise', 'Diatoms-1', 'Prasinophytes']]

# In[40]:

# drop repetitive/unnecessary columns:
dfout.drop(labels=['Bin #', 'Index', 'Subgroup', 'Month', 'Station',
                   'Sample#', 'rep', 'depth',], axis=1, inplace=True)

# In[41]:

# round phyto group values to 3 decimal places:
for col in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',
            'Diatoms-2', 'Dinoflagellates', 'Haptophytes', 'Dictyochophytes',
            'Raphidophytes', 'TchlA'):
    # Series.round is the vectorized equivalent of the original
    # per-element np.round list comprehension
    dfout[col] = dfout[col].round(decimals=3)

# In[42]:

dfout['Cyanobacteria']

# In[43]:

# now write the output table to a .csv file:
dfout.to_csv(pathOut, index=False)

# In[44]:

dfout.keys()

# In[ ]: