#!/usr/bin/env python
# coding: utf-8

# In[1]:

import numpy as np  # this module handles arrays, but here we need it for its NaN value
import pandas as pd  # this module contains a lot of tools for handling tabular data
import re

# In[2]:

# define paths to the source files and eventual output file
pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2016 SoG bottle.xlsx'
pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2019 SoG abs values New ALLO.xlsx'
pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2016_NewALLO.csv'

# In[3]:

# formatting function to convert cruise IDs to year-(3 digit) format
def fmtCruise(istr):
    if re.fullmatch('[0-9]{4}-[0-9]{2}',istr):
        sp=re.split('-',istr)
        rstr=sp[0]+'-0'+sp[1]
    elif re.fullmatch('[0-9]{4}-[0-9]{3}',istr):
        rstr=istr
    else:
        raise ValueError('Input had unexpected format:',istr)
    return rstr

# In[4]:

# get the names of the sheets in the bottle data workbook
with pd.ExcelFile(pathBottle,engine='openpyxl') as xl:
    sheets=xl.sheet_names
print(sheets)

# In[5]:

# load each sheet in the 2016 bottle Excel file and concatenate them together into one table
dfbotlist=list()
for sheet in sheets:
    df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,engine='openpyxl',
                      na_values=(-99,-99.9))  # read each sheet; include additional NA values
    df0['Cruise']=fmtCruise(sheet)  # create and populate Cruise column based on sheet name
    dfbotlist.append(df0)  # append the sheet to a list
dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False)  # concatenate the list into a single table
# drop columns with no data in them
l1=set(dfbot.keys())
dfbot.dropna(axis=1,how='all',inplace=True)
print('removed empty columns:',l1-set(dfbot.keys()))

# In[6]:

# list the column names in the resulting table
print(dfbot.keys())

# In[7]:

def subval(idf,colList):
    # first value in colList should be the column you are going to keep
    # follow with other columns that will be used to fill in when that column is NaN,
    # in order of precedence
    if len(colList)==2:
        idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]])
                         else r[colList[1]] for i,r in idf.iterrows()]
    elif len(colList)==3:
        idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]])
                         else r[colList[1]] if not pd.isna(r[colList[1]])
                         else r[colList[2]] for i,r in idf.iterrows()]
    else:
        raise NotImplementedError('Add code to handle this case')
    return idf
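# In[ ]:

# aside: pandas' built-in fillna can express the same NaN-filling cascade as subval
# more concisely and without a Python-level loop over rows; this alternative
# (subval_fillna is a hypothetical name, not used below) is a sketch of that
# approach and handles any number of fallback columns:
def subval_fillna(idf,colList):
    for col in colList[1:]:
        # fill remaining NaNs in the kept column from the next column, in order of precedence
        idf[colList[0]]=idf[colList[0]].fillna(idf[col])
    return idf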
# In[8]:

# there are some duplicate columns here; handle them:
print(np.max(np.abs(dfbot['Sample_Number']-dfbot['Sample_Number.1'])),
      ': zero means no difference between the Sample_Number and Sample_Number.1 columns containing data')
# check there are no rows missing a value in 'Sample_Number':
print(np.sum(pd.isna(dfbot['Sample_Number'])),
      ': zero means no missing values in Sample_Number')
# -> the extra copy is a repeat, so drop it
# 'ADM:MISSION' contains the cruise id, so drop it
# 'YYYY/MM/DD HH:MM:SS' data should be in 'FIL:START TIME YYYY/MM/DD HH:MM:SS'; move it there:
dfbot=subval(dfbot,('FIL:START TIME YYYY/MM/DD HH:MM:SS','YYYY/MM/DD HH:MM:SS'))
dfbot=subval(dfbot,('Pressure [decibar]','Pressure'))
dfbot=subval(dfbot,('Temperature:Secondary [deg C (ITS90)]','Temperature:Secondary'))
dfbot=subval(dfbot,('Transmissivity [*/metre]','Transmissivity'))
dfbot=subval(dfbot,('Fluorescence:URU:Seapoint [mg/m^3]','Fluorescence:URU:Seapoint'))
dfbot=subval(dfbot,('PAR [uE/m^2/sec]','PAR'))
dfbot=subval(dfbot,('PAR:Reference [uE/m^2/sec]','PAR:Reference'))
dfbot=subval(dfbot,('Salinity:T1:C1 [PSS-78]','Salinity:T1:C1'))
dfbot=subval(dfbot,('Salinity:Bottle [PSS-78]','Salinity:Bottle'))
dfbot=subval(dfbot,('Chlorophyll:Extracted [mg/m^3]','Chlorophyll:Extracted'))
dfbot=subval(dfbot,('Phaeo-Pigment:Extracted [mg/m^3]','Phaeo-Pigment:Extracted'))
dfbot=subval(dfbot,('Oxygen:Dissolved:SBE [mL/L]','Oxygen:Dissolved:SBE'))
dfbot=subval(dfbot,('Oxygen:Dissolved [mL/L]','Oxygen:Dissolved'))
dfbot=subval(dfbot,('Nitrate_plus_Nitrite [umol/L]','Nitrate_plus_Nitrite'))
dfbot=subval(dfbot,('Silicate [umol/L]','Silicate'))
dfbot=subval(dfbot,('Phosphate [umol/L]','Phosphate'))
dfbot=subval(dfbot,('PAR [uE/m^2/sec]','PAR1'))

# In[9]:

# drop repetitive/unnecessary columns:
dfbot.drop(labels=['Sample_Number.1','ADM:MISSION','YYYY/MM/DD HH:MM:SS','Transmissivity',
                   'Pressure','Temperature:Secondary','Fluorescence:URU:Seapoint','PAR',
                   'PAR:Reference','Salinity:T1:C1','Chlorophyll:Extracted',
                   'Oxygen:Dissolved:SBE','Oxygen:Dissolved',
                   'Nitrate_plus_Nitrite','Silicate','PAR1'],axis=1,inplace=True)

# In[10]:

# define a function that will be applied to the values in the sample number column;
# this makes it easier to drop non-data rows later
def convertIndex(val):
    try:
        x=int(val)
    except ValueError:
        x=np.nan
    return x

# In[11]:

# load the 2016 phytoplankton data with the following options:
# sheet_name='2016'                      -> choose the 2016 sheet
# converters={'Sample#': convertIndex,}  -> apply the function defined above to the Sample# column
# verbose=True                           -> print extra information/warnings/errors
dfPhyto=pd.read_excel(pathPhyto,sheet_name='2016',engine='openpyxl',
                      skiprows=0,converters={'Sample#': convertIndex,},
                      verbose=True)

# In[12]:

# display rows 48 to 59 of the resulting table
dfPhyto[48:60]

# In[13]:

# now, drop any rows from the table that have NaN values in either of the columns
# 'Sample#' or 'TchlA';
# this is why we applied a function to the Sample# column: to make sure all
# non-numeric Sample# values would have a consistent NaN entry, making them easy
# to identify and remove
dfPhyto.dropna(subset=['Sample#','TchlA'],how='any',inplace=True)

# In[14]:

# pandas creates its own index, and after dropping rows I like to reset it -
# this is just for convenience
dfPhyto.reset_index(drop=True,inplace=True)

# In[15]:

# apply the formatting function to all rows in the Cruise column to get year-(3 digit) format
dfPhyto['Cruise']=[fmtCruise(ii) for ii in dfPhyto['Cruise']]

# In[16]:

# display part of the table, confirming that non-data rows have been removed
dfPhyto[48:60]

# In[17]:

# due to repeated column names in the original spreadsheet, '.1' was appended to the names
# of the phytoplankton columns;
# these lines correct the column names, removing the '.1':
renameDict=dict()
for colName in dfPhyto.keys():
    if colName.endswith('.1'):
        renameDict[colName]=colName.split('.1')[0]
dfPhyto.rename(columns=renameDict,inplace=True)

# In[18]:

dfPhyto

# In[19]:

# This is the important step: join the two tables ('left' and 'right'),
# matching the cruise IDs and sample numbers
# how='outer' -> all rows from both the left and the right tables will be included,
#                even if they cannot be matched; this makes it easy to check for
#                unmatched data later
# left_on specifies the names of the columns to match in the left table (dfbot)
# right_on specifies the names of the columns to match in the right table (dfPhyto)
dfout=pd.merge(dfbot,dfPhyto,how='outer',
               left_on=['Cruise','Sample_Number'],right_on=['Cruise','Sample#'])

# In[20]:

# show the column names in the resulting table
dfout.keys()
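# In[ ]:

# aside: pandas can also flag unmatched rows directly via the indicator option of
# pd.merge, which adds a '_merge' column taking the values 'left_only',
# 'right_only', or 'both'; this cell is a sketch of that alternative check, not
# part of the original workflow:
dfcheck=pd.merge(dfbot,dfPhyto,how='outer',
                 left_on=['Cruise','Sample_Number'],right_on=['Cruise','Sample#'],
                 indicator=True)
print(dfcheck['_merge'].value_counts())  # 'right_only' rows are phyto samples with no bottle match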
# ### Checks

# In[21]:

# identify cases where phytoplankton data were matched to multiple samples in the bottle data:
dftest=pd.merge(dfbot,dfPhyto,how='right',
                left_on=['Cruise','Sample_Number'],right_on=['Cruise','Sample#'])

# In[22]:

temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})
temp.columns=['icount']

# In[23]:

np.unique(temp.icount)

# In[24]:

# check for phyto samples matched to multiple bottle samples:
temp.loc[temp.icount>1]

# In[25]:

# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]

# In[26]:

temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})
temp2.columns=['icount']
# this will catch phyto samples matched to multiple bottle rows, but also bottle rows
# with duplicate sample numbers per cruise:
temp2.loc[temp2.icount>1]

# In[28]:

# if the output table is longer than either of the input tables, some rows were not matched
len(dfout), len(dfPhyto), len(dfbot)

# In[29]:

# check that the number of cells with data in the 'Cyanobacteria' column is
# the same for the input and output tables, to show that no rows are missing:
np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)

# In[30]:

# if there were data rows from the phytoplankton table that were not matched to
# rows from the bottle table, their indices from the phytoplankton table would be
# displayed below (the series would not be empty)
print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Sample#'])

# In[31]:

# drop repetitive/unnecessary columns:
dfout.drop(labels=['Date','Station','Sample#','rep','depth',],axis=1,inplace=True)

# In[32]:

# round phyto group values to 3 decimal places:
for col in ('Cyanobacteria','Prasinophytes','Cryptophytes','Diatoms-1','Diatoms-2',
            'Dinoflagellates-1','Haptophytes','Dictyo','Raphido','TchlA'):
    dfout[col]=[np.round(ii,decimals=3) for ii in dfout[col]]  # use a list comprehension to set values for the entire column

# In[33]:

dfout['Cyanobacteria']

# In[34]:

# check for unexpected depths
dfout.loc[((dfout['Pressure [decibar]']>10)|(dfout['Depth [metres]']>10))&(dfout['Diatoms-1']>=0)]#,
#          ['FIL:START TIME YYYY/MM/DD HH:MM:SS','Pressure [decibar]','Depth [metres]','Diatoms-1']]

# In[ ]:

# now write the output table to a .csv file:
dfout.to_csv(pathOut,index=False)

# In[ ]:

dfout.keys()
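# In[ ]:

# optional sanity check (a sketch, not part of the original notebook): read the
# .csv back in and confirm the round trip preserves the number of rows and columns
dfback=pd.read_csv(pathOut)
print(dfback.shape, dfout.shape)  # shapes should match (dtypes may differ after the round trip)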