#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np # this module handles arrays, but here we need it for its NaN value
import pandas as pd # this module contains a lot of tools for handling tabular data
import re


# In[2]:


# define paths to the source files and eventual output file
pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2015 SoG_bottle.xlsx'
pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2019 SoG abs values New ALLO.xlsx'
pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2015_NewALLO.csv'


# In[3]:


# reformat cruise IDs so the cruise number is always three digits,
# e.g. '2015-17' -> '2015-017'
def fmtCruise(istr):
    if re.fullmatch('[0-9]{4}-[0-9]{2}',istr):
        sp=re.split('-',istr)
        rstr=sp[0]+'-0'+sp[1]
    elif re.fullmatch('[0-9]{4}-[0-9]{3}',istr):
        rstr=istr
    else:
        raise ValueError(f'Input had unexpected format: {istr}')
    return rstr


# In[4]:


# load each sheet in the 2015 bottle Excel file and concatenate them together into one table
dfbotlist=list()
for sheet in ('2015-17','2015-20','2015-18','2015-21','2015-19'):
    df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,na_values=(-99,-99.9),engine='openpyxl',
                      dtype={'Cruise':str,'Sample_Number':np.int64}) # read each sheet
    df0['Cruise']=fmtCruise(sheet) # create and populate Cruise column based on sheet name
    dfbotlist.append(df0) # append the sheet to a list
dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False) # concatenate the list into a single table
# drop columns with no data in them
l1=set(dfbot.keys())
dfbot.dropna(axis=1,how='all',inplace=True)
print('removed empty columns:',l1-set(dfbot.keys()))


# In[5]:


# list the column names in the resulting table
print(dfbot.keys())


# In[6]:


def subval(idf,colList):
    # first value in colList should be the column you are going to keep
    # follow with other columns that will be used to fill in when that column is NaN
    # in order of precedence
    if len(colList)==2:
        idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \
            else r[colList[1]] for i,r in idf.iterrows()]
    elif len(colList)==3:
        idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \
            else r[colList[1]] if not pd.isna(r[colList[1]]) \
            else r[colList[2]] for i,r in idf.iterrows()]
    else:
        raise NotImplementedError('Add to code to handle this case')
    idf.drop(columns=list(colList[1:]),inplace=True)
    return idf
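

# In[ ]:


# Quick illustrative check of subval (an addition, not part of the original
# processing): run it on a toy table with made-up columns 'A', 'B', 'C' to show
# the fill precedence. Expected result: A == [1.0, 2.0, 3.0], with B and C dropped.
_toy=pd.DataFrame({'A':[1.0,np.nan,np.nan],
                   'B':[9.0,2.0,np.nan],
                   'C':[8.0,7.0,3.0]})
print(subval(_toy,('A','B','C')))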
# In[7]:


# merge duplicate columns, keeping the first listed column where it has data
# and filling in from the second where it does not
dfbot=subval(dfbot,('Oxygen:Dissolved','Oxygen:Dissolved.1'))
dfbot=subval(dfbot,('Temperature:Primary','Temperature:Secondary'))
dfbot=subval(dfbot,('Salinity:T0:C0','Salinity:T1:C1'))
dfbot=subval(dfbot,('Oxygen:Dissolved:SBE','Oxygen:Dissolved:SBE.1'))
dfbot.rename(columns={'LOC:LATITUDE':'Lat',
                      'LOC:LONGITUDE':'Lon',
                      'Temperature:Primary':'Temperature',
                      'Oxygen:Dissolved:SBE':'Oxygen:Dissolved:CTD',
                      'Salinity:T0:C0':'Salinity',},inplace=True)


# In[8]:


# define a function that will be applied to the values in the sample number column;
# non-numeric values are flagged with -999, which makes it easier to drop
# non-data rows later
def convertIndex(val):
    try:
        x=int(val)
    except ValueError:
        x=-999
    return x


# In[9]:


# load the 2015 phytoplankton data with the following options:
#   sheet_name='2015' -> choose the 2015 sheet
#   skiprows=0 -> the column headings are in the first row of the sheet
#   converters={'Sample#': convertIndex,} -> apply the function defined above
#                                            to the Sample# column
#   verbose=True -> print extra information/warnings/errors
dfPhyto=pd.read_excel(pathPhyto,sheet_name='2015',engine='openpyxl',
                      skiprows=0,converters={'Sample#': convertIndex,},
                      verbose=True)


# In[10]:


# display rows 48 to 59 of the resulting table
dfPhyto[48:60]


# In[11]:


# drop any rows from the table that have a NaN value in the 'TchlA' column;
# this removes non-data rows such as blank lines below the data
dfPhyto.dropna(subset=['TchlA'],how='any',inplace=True)


# In[12]:


# drop rows with non-numeric sample numbers; convertIndex (applied when the
# sheet was read) flagged these with -999, making them easy to identify here
dfPhyto.drop(dfPhyto[dfPhyto['Sample#']<0].index,inplace=True)


# In[13]:


# pandas creates its own index, and after dropping rows I like to reset it -
# this is just for convenience
dfPhyto.reset_index(drop=True,inplace=True)


# In[14]:


# display part of the table, confirming that non-data rows have been removed
dfPhyto.loc[:10,['Cruise','Sample#']]


# In[15]:


dfbot.loc[:20,['Cruise','Sample_Number']]


# In[16]:


# due to repeated column names in the original spreadsheet, '.1' was appended
# to the names of the phytoplankton columns;
# these lines correct the column names, removing the '.1':
renameDict=dict()
for colName in dfPhyto.keys():
    if colName.endswith('.1'):
        renameDict[colName]=colName.split('.1')[0]
dfPhyto.rename(columns=renameDict,inplace=True)


# In[17]:


# display the full phytoplankton table
dfPhyto


# In[18]:


# This is the important step: join the two tables ('left' and 'right'),
# matching the cruise IDs and sample numbers
# how='outer' -> all rows from both the left and the right tables will be
#                included, even if they cannot be matched; this makes it easy
#                to check for unmatched data later
# left_on specifies the names of the columns to match in the left table (dfbot)
# right_on specifies the names of the columns to match in the right table (dfPhyto)
dfout = pd.merge(dfbot, dfPhyto, how='outer',
                 left_on=['Cruise','Sample_Number'], right_on=['Cruise','Sample#'])
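

# In[ ]:


# Aside (a sketch, not part of the original notebook): pandas can also flag
# unmatched rows directly via the indicator option of merge, which adds a
# '_merge' column taking the values 'left_only', 'right_only', or 'both':
dfflag=pd.merge(dfbot, dfPhyto, how='outer',
                left_on=['Cruise','Sample_Number'], right_on=['Cruise','Sample#'],
                indicator=True)
print(dfflag['_merge'].value_counts())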
# In[19]:


# show the column names in the resulting table
dfout.keys()


# ### Checks

# In[20]:


# Identify cases where phytoplankton data were matched to multiple samples in bottle data:
dftest=pd.merge(dfbot, dfPhyto, how='right',
                left_on=['Cruise','Sample_Number'], right_on=['Cruise','Sample#'])


# In[21]:


temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})
temp.columns = ['icount']


# In[22]:


np.unique(temp.icount)


# In[23]:


# check for phyto samples matched to multiple bottle samples:
temp.loc[temp.icount>1]


# In[24]:


# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]


# In[25]:


temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})
temp2.columns = ['icount']
# this will catch phyto samples matched to multiple bottle samples, but also
# bottle rows with duplicate sample numbers per cruise:
temp2.loc[temp2.icount>1]


# In[27]:


# if the output table is longer than either of the input tables, some rows were not matched
len(dfout), len(dfPhyto), len(dfbot)


# In[28]:


dfout


# In[29]:


# Check that the number of cells with data in the 'Cyanobacteria' column is
# the same for the input and output tables:
np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)


# In[30]:


# If there were data rows from the phytoplankton table that were not matched to
# rows from the bottle table, their indices from the phytoplankton table would
# be displayed below (the printed series would not be empty)
print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Sample#'])


# In[31]:


dfout.keys()


# In[32]:


# drop repetitive/unnecessary columns:
dfout.drop(labels=['Date', 'Station', 'Sample#', 'rep',
                   'depth',],axis=1,inplace=True)


# In[33]:


# round phyto group values to 3 decimal places:
for col in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',
            'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',
            'TchlA'):
    dfout[col]=[np.round(ii,decimals=3) for ii in dfout[col]] # use list comprehension to set values for entire column


# In[36]:


# Check for unexpected depths
dfout.loc[((dfout['Pressure']>10))&(dfout['Diatoms-1']>=0)]#,
#          ['FIL:START TIME YYYY/MM/DD HH:MM:SS','Pressure [decibar]','Depth [metres]','Diatoms-1']]


# In[37]:


dfout.keys()


# In[38]:


# now write the output table to a .csv file:
dfout.to_csv(pathOut, index=False)


# In[ ]:
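
# optional sanity check (an addition, not part of the original notebook):
# re-read the csv just written and confirm the row count matches dfout
dfcheck=pd.read_csv(pathOut)
print(len(dfcheck)==len(dfout))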