#!/usr/bin/env python
# coding: utf-8

# Merge the 2018 Strait of Georgia bottle data with the CHEMTAX absolute
# phytoplankton-composition data and prepare a single combined table.

# In[1]:

import numpy as np   # this module handles arrays, but here we need it for its NaN value
import pandas as pd  # this module contains a lot of tools for handling tabular data
import re

# In[2]:

# define paths to the source files and eventual output file
#pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2018 SoG bottle.xlsx'
#pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2018 Abs phyto groups.xlsx'
pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2018 SoG bottleCorrected.xlsx'
pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2018 Abs phyto groupsCorrected.xlsx'
pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2018.csv'

# In[3]:

# formatting function to get year-(3digit) format
def fmtCruise(istr):
    """Normalize a cruise ID to the year-NNN (3-digit) format.

    '2018-01' becomes '2018-001'; '2018-001' is returned unchanged.

    :param istr: cruise identifier string
    :returns: identifier in 'YYYY-NNN' form
    :raises ValueError: if istr matches neither expected pattern
    """
    if re.fullmatch(r'[0-9]{4}-[0-9]{2}', istr):
        sp = re.split('-', istr)
        rstr = sp[0] + '-0' + sp[1]
    elif re.fullmatch(r'[0-9]{4}-[0-9]{3}', istr):
        rstr = istr
    else:
        raise ValueError('Input had unexpected format:', istr)
    return rstr

# In[4]:

# get names of sheets in notebook
with pd.ExcelFile(pathBottle) as xl:
    sheets = xl.sheet_names
print(sheets)

# In[5]:

# load each sheet in the bottle Excel file and concatenate them together into one table
dfbotlist = list()
for sheet in sheets:
    # read each sheet; include additional na values
    df0 = pd.read_excel(pathBottle, sheet_name=sheet, verbose=True,
                        na_values=(-99, -99.9))
    # create and populate Cruise column based on sheet name
    df0['Cruise'] = fmtCruise(sheet)
    dfbotlist.append(df0)  # append the sheet to a list
# concatenate the list into a single table
dfbot = pd.concat(dfbotlist, ignore_index=True, sort=False)
# Drop columns with no data in them
l1 = set(dfbot.keys())
dfbot.dropna(axis=1, how='all', inplace=True)
print('removed empty columns:', l1 - set(dfbot.keys()))

# In[6]:

# list the column names in the resulting table
print(dfbot.keys())

# In[7]:

# no rows returned, so there are no rows with both primary and CTD depth values
print(np.sum(dfbot['Depth [metres]'] >= 0),
      np.sum(dfbot['Depth:CTD [metres]'] >= 0))
dfbot.loc[(dfbot['Depth [metres]'] >= 0) &
          (dfbot['Depth:CTD [metres]'] >= 0)]

# In[8]:

# no rows returned, so no row has more than one of the three temperature fields
print(np.sum(dfbot['Temperature:Primary [deg C (ITS90)]'] >= 0),
      np.sum(dfbot['Temperature:Secondary [deg C (ITS90)]'] >= 0),
      np.sum(dfbot['Temperature:CTD [deg C (ITS90)]'] >= 0))
dfbot.loc[((dfbot['Temperature:Primary [deg C (ITS90)]'] >= 0).astype(int) +
           (dfbot['Temperature:Secondary [deg C (ITS90)]'] >= 0).astype(int) +
           (dfbot['Temperature:CTD [deg C (ITS90)]'] >= 0).astype(int)) > 1]

# In[9]:

# no rows returned, so no row has more than one of the three fluorescence fields
print(np.sum(dfbot['Fluorescence:URU:Seapoint [mg/m^3]'] >= 0),
      np.sum(dfbot['Fluorescence:URU:CTD [mg/m^3]'] >= 0),
      np.sum(dfbot['Fluorescence:URU:Wetlabs [mg/m^3]'] >= 0))
dfbot.loc[((dfbot['Fluorescence:URU:Seapoint [mg/m^3]'] >= 0).astype(int) +
           (dfbot['Fluorescence:URU:CTD [mg/m^3]'] >= 0).astype(int) +
           (dfbot['Fluorescence:URU:Wetlabs [mg/m^3]'] >= 0).astype(int)) > 1]

# In[10]:

# no rows returned, so there are no rows with both Seapoint and CTD fluorescence values
print(np.sum(dfbot['Fluorescence:URU:Seapoint [mg/m^3]'] >= 0),
      np.sum(dfbot['Fluorescence:URU:CTD [mg/m^3]'] >= 0))
dfbot.loc[(dfbot['Fluorescence:URU:Seapoint [mg/m^3]'] >= 0) &
          (dfbot['Fluorescence:URU:CTD [mg/m^3]'] >= 0)]

# In[11]:

# no rows returned, so no row has more than one of the three salinity fields
print(np.sum(dfbot['Salinity:T0:C0 [PSS-78]'] >= 0),
      np.sum(dfbot['Salinity:T1:C1 [PSS-78]'] >= 0),
      np.sum(dfbot['Salinity:CTD [PSS-78]'] >= 0))
dfbot.loc[((dfbot['Salinity:T0:C0 [PSS-78]'] >= 0).astype(int) +
           (dfbot['Salinity:T1:C1 [PSS-78]'] >= 0).astype(int) +
           (dfbot['Salinity:CTD [PSS-78]'] >= 0).astype(int)) > 1]

# In[12]:

# no rows returned, so there are no rows with both primary and CTD pressure values
print(np.sum(dfbot['Pressure [decibar]'] >= 0),
      np.sum(dfbot['Pressure:CTD [decibar]'] >= 0))
dfbot.loc[(dfbot['Pressure [decibar]'] >= 0) &
          (dfbot['Pressure:CTD [decibar]'] >= 0)]

# In[13]:

# no rows returned, so there are no rows with both primary and CTD depth values
# (duplicate of the In[7] check, kept from the original notebook)
print(np.sum(dfbot['Depth [metres]'] >= 0),
      np.sum(dfbot['Depth:CTD [metres]'] >= 0))
dfbot.loc[(dfbot['Depth [metres]'] >= 0) &
          (dfbot['Depth:CTD [metres]'] >= 0)]

# In[14]:

# no rows returned, so there are no rows with both SBE and CTD oxygen volume values
print(np.sum(dfbot['Oxygen:Dissolved:SBE [mL/L]'] >= 0),
      np.sum(dfbot['Oxygen:Dissolved:Volume:CTD [mL/L]'] >= 0))
dfbot.loc[(dfbot['Oxygen:Dissolved:SBE [mL/L]'] >= 0) &
          (dfbot['Oxygen:Dissolved:Volume:CTD [mL/L]'] >= 0)]

# In[15]:

# no rows returned, so there are no rows with both SBE and CTD oxygen mass values
print(np.sum(dfbot['Oxygen:Dissolved:SBE [umol/kg]'] >= 0),
      np.sum(dfbot['Oxygen:Dissolved:Mass:CTD [umol/kg]'] >= 0))
dfbot.loc[(dfbot['Oxygen:Dissolved:SBE [umol/kg]'] >= 0) &
          (dfbot['Oxygen:Dissolved:Mass:CTD [umol/kg]'] >= 0)]

# In[16]:

def subval(idf, colList):
    """Collapse duplicate columns into a single column.

    The first entry in colList is the column to keep; the remaining
    columns, in order of precedence, supply values where the kept column
    is NaN.  The fallback columns are then dropped from the table.

    Generalized from the original 2/3-column implementation: any number
    of fallback columns is supported, and Series.fillna replaces the
    original per-row iterrows() loop (identical result, vectorized).

    :param idf: DataFrame to modify in place
    :param colList: sequence of column names, kept column first
    :returns: the modified DataFrame (same object as idf)
    """
    keep = colList[0]
    for fallback in colList[1:]:
        idf[keep] = idf[keep].fillna(idf[fallback])
    idf.drop(columns=list(colList[1:]), inplace=True)
    return idf

# In[17]:

# there are some duplicate columns here; handle them:
dfbot = subval(dfbot, ('FIL:START TIME YYYY/MM/DD HH:MM:SS',
                       'YYYY/MM/DD HH:MM:SS'))
dfbot = subval(dfbot, ('Temperature:Primary [deg C (ITS90)]',
                       'Temperature:Secondary [deg C (ITS90)]',
                       'Temperature:CTD [deg C (ITS90)]'))
dfbot = subval(dfbot, ('Salinity:T0:C0 [PSS-78]',
                       'Salinity:T1:C1 [PSS-78]',
                       'Salinity:CTD [PSS-78]'))
dfbot = subval(dfbot, ('Pressure [decibar]',
                       'Pressure:CTD [decibar]'))
dfbot = subval(dfbot, ('Depth [metres]',
                       'Depth:CTD [metres]'))
dfbot = subval(dfbot, ('Oxygen:Dissolved:SBE [mL/L]',
                       'Oxygen:Dissolved:Volume:CTD [mL/L]'))
dfbot = subval(dfbot, ('Oxygen:Dissolved:SBE [umol/kg]',
                       'Oxygen:Dissolved:Mass:CTD [umol/kg]'))
dfbot = subval(dfbot, ('Fluorescence:URU:Seapoint [mg/m^3]',
                       'Fluorescence:URU:CTD [mg/m^3]',
                       'Fluorescence:URU:Wetlabs [mg/m^3]'))
# rename the collapsed columns to sensor-neutral names
dfbot.rename(columns={
    'Temperature:Primary [deg C (ITS90)]': 'Temperature [deg C (ITS90)]',
    'Salinity:T0:C0 [PSS-78]': 'Salinity [PSS-78]',
    'Oxygen:Dissolved:SBE [mL/L]': 'Oxygen:Dissolved:CTD [mL/L]',
    'Oxygen:Dissolved:SBE [umol/kg]': 'Oxygen:Dissolved:CTD [umol/kg]',
    'Fluorescence:URU:Seapoint [mg/m^3]': 'Fluorescence [mg/m^3]',
}, inplace=True)

# In[18]:

# define a function that will be applied to the values in the index column;
# this makes it easier to drop non-data rows later
def convertIndex(val):
    """Coerce an Index cell to int; non-numeric entries become NaN."""
    try:
        x = int(val)
    except ValueError:
        x = np.nan
    return x

# In[19]:

# load the phytoplankton data with the following options:
# sheet_name='2018 CHEMTAX abs results ' -> choose the sheet
#     (NOTE: the trailing space in the sheet name is present in the workbook)
# usecols='A:I,T:AC' -> read only columns A:I and T:AC from the Excel sheet
# skiprows=2 -> start reading at the 3rd row of the sheet,
#               which contains the column headings
# converters={'Index': convertIndex,} -> apply the function defined above to the Index column
# verbose = True -> print extra information/ warnings/ errors
dfPhyto = pd.read_excel(pathPhyto, sheet_name='2018 CHEMTAX abs results ',
                        usecols='A:I,T:AC', skiprows=2,
                        converters={'Index': convertIndex,},
                        verbose=True)

# In[20]:

# display rows 48 to 59 of the resulting table
dfPhyto[48:60]

# In[21]:

# now, drop any rows from the table that have NaN values in either of the columns
# 'Index' or 'TchlA.1'
# This is why we applied a function to the Index column to make sure all
# non-numeric Index values would have a consistent NaN entry, making them easy
# to identify and remove
dfPhyto.dropna(subset=['Index', 'TchlA.1'], how='any', inplace=True)

# In[22]:

# pandas creates its own index, and after dropping rows I like to reset it -
# this is just for convenience
dfPhyto.reset_index(drop=True, inplace=True)

# In[23]:

# apply formatting function all rows in Cruise column to get year-3digit format
dfPhyto['Cruise'] = [fmtCruise(ii) for ii in dfPhyto['Cruise']]

# In[24]:

# display part of the table, confirming that non-data rows have been removed
dfPhyto[48:60]

# In[25]:

# due to repeated column names in the original spreadsheet, '.1' was appended
# to the names of the phytoplankton columns;
# these lines correct the column names, removing the '.1':
renameDict = dict()
for colName in dfPhyto.keys():
    if colName.endswith('.1'):
        # strip only the trailing '.1'; the original split('.1')[0] would also
        # have truncated at any earlier '.1' occurring inside the name
        renameDict[colName] = colName[:-2]
dfPhyto.rename(columns=renameDict, inplace=True)

# In[26]:

dfPhyto

# In[27]:

# This is the important step- join the two tables ('left' and 'right'),
# matching the cruise IDs and sample numbers
# how='outer' -> all rows from both the left and the right tables will be
#                included, even if they cannot be matched;
#                this makes it easy to check for unmatched data later
# left_on specifies the name of the column to match in the left table (dfbot)
# right_on specifies the name of the column to match in the right table (dfPhyto)
dfout = pd.merge(dfbot, dfPhyto, how='outer',
                 left_on=['Cruise', 'Sample_Number'],
                 right_on=['Cruise', 'Sample#'])

# In[28]:

# Identify cases where phytoplankton data were matched to multiple samples in bottle data:
dftest = pd.merge(dfbot, dfPhyto, how='right',
                  left_on=['Cruise', 'Sample_Number'],
                  right_on=['Cruise', 'Sample#'])

# In[29]:

# show the column names in the resulting table
dfout.keys()

# ### Checks

# In[30]:

# count how many bottle rows each phyto sample was matched to
temp = dftest.groupby(['Cruise', 'Sample#']).agg({'Cruise': ['count']})
temp.columns = ['icount']

# In[31]:

np.unique(temp.icount)

# In[32]:

# check for Phyto samples matched to multiple bottle samples:
temp.loc[temp.icount > 1]

# In[33]:

# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount == 0]

# In[34]:

temp2 = dfout.groupby(['Cruise', 'Sample_Number']).agg({'Cruise': ['count']})
temp2.columns = ['icount']
# this will catch phyto matched to multiple bottle but also bottle with
# duplicate sample numbers per cruise:
temp2.loc[temp2.icount > 1]

# In[35]:

# check for phyto samples not matched to bottle samples (repeat of In[33]):
temp.loc[temp.icount == 0]

# In[36]:

# if the output table is longer than either of the input tables, some rows were not matched
len(dfout), len(dfPhyto), len(dfbot)

# In[37]:

# Check that the number of cells with data in the 'Cyanobacteria' column is
# the same for the input and output tables to show that no rows are missing:
np.sum(dfPhyto['Cyanobacteria'] >= 0), np.sum(dfout['Cyanobacteria'] >= 0)

# In[38]:

# If there were data rows from the phytoplankton table that were not matched to
# rows from the bottle table, their indices from the phytoplankton table would
# be displayed below (the series [] would not be empty)
print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Index'])

# In[39]:
dfout.loc[dfout['ADM:SCIENTIST'].isna(),
          ['Index', 'Cruise', 'Diatoms-1', 'Prasinophytes']]

# In[40]:

# drop repetitive/unnecessary columns:
dfout.drop(labels=['Bin #', 'Index', 'Subgroup', 'Month', 'Station',
                   'Sample#', 'rep', 'depth',], axis=1, inplace=True)

# In[41]:

# round phyto group values to 3 decimal places:
for col in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',
            'Diatoms-2', 'Dinoflagellates', 'Haptophytes', 'Dictyochophytes',
            'Raphidophytes', 'TchlA'):
    # Series.round is the vectorized equivalent of the original
    # per-element np.round list comprehension
    dfout[col] = dfout[col].round(decimals=3)

# In[42]:

dfout['Cyanobacteria']

# In[43]:

# now write the output table to a .csv file:
dfout.to_csv(pathOut, index=False)

# In[44]:

dfout.keys()

# In[ ]: