#!/usr/bin/env python
# coding: utf-8

# In[1]:

import numpy as np  # this module handles arrays, but here we need it for its NaN value
import pandas as pd  # this module contains a lot of tools for handling tabular data
from matplotlib import pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# In[2]:

# define paths to the source files and eventual output file
pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC//All 2015 SoG_bottle.xlsx'
pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2018 Abs phyto groupsCorrected.xlsx'
pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2015.csv'

# In[3]:

# load each sheet in the 2015 bottle Excel file and concatenate them together into one table
dfbotlist=list()
for sheet in ('2015-17','2015-20','2015-18','2015-21','2015-19'):
    df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,
                      na_values=(-99,-99.9))  # read each sheet
    df0['Cruise']=sheet  # create and populate Cruise column based on sheet name
    dfbotlist.append(df0)  # append the sheet to a list
dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False)  # concatenate the list into a single table
# Drop columns with no data in them
l1=set(dfbot.keys())
dfbot.dropna(axis=1,how='all',inplace=True)
print('removed empty columns:',l1-set(dfbot.keys()))

# In[4]:

# list the column names in the resulting table
print(dfbot.keys())

# In[5]:

def subval(idf,colList):
    """Consolidate several columns into one, filling NaNs by precedence.

    colList[0] is the column to keep; each subsequent column is used, in
    order of precedence, to fill entries where all earlier columns are NaN.
    The fallback columns are then dropped from the table.

    Parameters
    ----------
    idf : pandas.DataFrame
        Table to modify (modified in place and also returned).
    colList : sequence of str
        Column names: the target column followed by one or more fallbacks.

    Returns
    -------
    pandas.DataFrame
        The modified input table.

    Raises
    ------
    ValueError
        If colList names fewer than two columns.
    """
    if len(colList)<2:
        raise ValueError('colList must contain a target column and at least one fallback')
    # chained fillna is the vectorized equivalent of the original row-by-row
    # precedence fill, and generalizes to any number of fallback columns
    # (the original raised NotImplementedError for more than two fallbacks)
    for fallback in colList[1:]:
        idf[colList[0]]=idf[colList[0]].fillna(idf[fallback])
    idf.drop(columns=list(colList[1:]),inplace=True)
    return idf
# In[6]:

# Where paired sensor columns exist, prefer the first and fall back to the
# second wherever the first is NaN, then discard the redundant column:
dfbot=subval(dfbot,('Oxygen:Dissolved','Oxygen:Dissolved.1'))
dfbot=subval(dfbot,('Temperature:Primary','Temperature:Secondary'))
dfbot=subval(dfbot,('Salinity:T0:C0','Salinity:T1:C1'))
dfbot=subval(dfbot,('Oxygen:Dissolved:SBE','Oxygen:Dissolved:SBE.1'))

# give the surviving columns shorter, clearer names:
dfbot.rename(columns={'LOC:LATITUDE':'Lat',
                      'LOC:LONGITUDE':'Lon',
                      'Temperature:Primary':'Temperature',
                      'Oxygen:Dissolved:SBE':'Oxygen:Dissolved:CTD',
                      'Salinity:T0:C0':'Salinity',},inplace=True)

# In[ ]:

# In[7]:

def convertIndex(val):
    """Coerce an Index-column entry to int, or NaN if it is not numeric.

    Applied while reading the spreadsheet so that non-data rows end up with
    a consistent NaN in the Index column, making them easy to drop later.
    """
    try:
        return int(val)
    except ValueError:
        return np.nan

# In[8]:

# load the 2015 phytoplankton data with the following options:
#   sheet_name='2015 CHEMTAX abs results' -> choose the 2015 sheet
#   usecols='A:I,T:AC' -> read only columns A:I and T:AC from the Excel sheet
#   skiprows=2 -> start reading at the 3rd row of the sheet,
#                 which contains the column headings
#   converters={'Index': convertIndex,} -> apply the function defined above
#                 to the Index column
#   verbose=True -> print extra information/warnings/errors
dfPhyto=pd.read_excel(pathPhyto,sheet_name='2015 CHEMTAX abs results',usecols='A:I,T:AC',
                      skiprows=2,converters={'Index': convertIndex,},
                      verbose=True)

# In[9]:

# display rows 48 to 59 of the resulting table
dfPhyto[48:60]

# In[10]:

# drop any rows that have NaN in either 'Index' or 'TchlA (ug/L)';
# convertIndex gave every non-numeric Index value a consistent NaN entry,
# so this removes the non-data rows as well
dfPhyto.dropna(subset=['Index', 'TchlA (ug/L)'],how='any',inplace=True)

# In[11]:

# pandas creates its own index; after dropping rows, reset it for convenience
dfPhyto.reset_index(drop=True,inplace=True)

# In[12]:

# display part of the table, confirming that non-data rows have been removed
dfPhyto[48:60]

# In[13]:

# the original spreadsheet repeats column names, so '.1' was appended to the
# names of the phytoplankton columns; strip that suffix back off:
renameDict={colName:colName.split('.1')[0]
            for colName in dfPhyto.keys() if colName.endswith('.1')}
dfPhyto.rename(columns=renameDict,inplace=True)

# In[14]:

dfPhyto

# In[15]:

# This is the important step - join the two tables ('left' and 'right'),
# matching the cruise IDs and sample numbers.
# how='outer' -> all rows from both tables are kept, even when unmatched,
#                which makes it easy to check for unmatched data later
# left_on  -> the columns to match in the left table (dfbot)
# right_on -> the columns to match in the right table (dfPhyto)
dfout = pd.merge(dfbot, dfPhyto, how='outer',
                 left_on=['Cruise','Sample_Number'], right_on = ['Cruise','Sample#'])

# In[16]:

# show the column names in the resulting table
dfout.keys()

# ### Checks

# In[17]:

# Identify cases where phytoplankton data were matched to multiple samples in bottle data:
dftest=pd.merge(dfbot, dfPhyto,how='right',
                left_on=['Cruise','Sample_Number'],right_on = ['Cruise','Sample#'])

# In[18]:

# count how many bottle rows each phyto sample was matched to
temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})
temp.columns = ['icount']

# In[19]:

np.unique(temp.icount)

# In[20]:

# check for Phyto samples matched to multiple bottle samples:
temp.loc[temp.icount>1]

# In[21]:

# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]

# In[22]:

temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})
temp2.columns = ['icount']
# this will catch phyto matched to multiple bottle rows, but also bottle rows
# with duplicate sample numbers per cruise:
temp2.loc[temp2.icount>1]

# In[23]:

# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]

# In[24]:

# if the output table is longer than either of the input
# tables, some rows were not matched
len(dfout), len(dfPhyto), len(dfbot)

# In[25]:

# Check that the number of cells with data in the 'Cyanobacteria' column is
# the same for the input and output tables:
np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)

# In[26]:

# If there were data rows from the phytoplankton table that were not matched to
# rows from the bottle table, their indices from the phytoplankton table would be
# displayed below (the series [] would not be empty)
print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Index'])

# In[27]:

# drop repetitive/unnecessary columns:
dfout.drop(labels=['Bin #', 'Index', 'Subgroup', 'Month', 'Station', 'Sample#',
                   'rep', 'depth',],axis=1,inplace=True)

# In[28]:

# truncate phyto group values to 3 decimal places:
for col in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',
            'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',
            'TchlA (ug/L)'):
    # Series.round is vectorized; the original rebuilt each column as a Python
    # list, calling np.round once per element
    dfout[col]=dfout[col].round(decimals=3)

# In[29]:

dfout.keys()

# In[30]:

# now write the output table to a .csv file:
dfout.to_csv(pathOut, index=False)

# In[ ]: