#!/usr/bin/env python
# coding: utf-8

# In[1]:

import numpy as np  # this module handles arrays, but here we need it for its NaN value
import pandas as pd  # this module contains a lot of tools for handling tabular data
import re
import gsw

## note: 2015-2019 SoG abs values New ALLO.xlsx has an incorrect sample number --
## 2017-05 Station 28 sample 416 should be sample 9416 (the leading 9 is the treatment
## of duplicate sample numbers). It is corrected in the code below rather than in the
## file itself so that the correction can be reproduced in future versions.

# In[2]:

# define paths to the source files and the eventual output file
pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2017 SoG bottle.xlsx'
pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2019 SoG abs values New ALLO.xlsx'
pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2017_NewALLO.csv'

# In[3]:

# formatting function to convert cruise IDs to year-(3 digit) format
def fmtCruise(istr):
    if re.fullmatch('[0-9]{4}-[0-9]{2}',istr):
        sp=re.split('-',istr)
        rstr=sp[0]+'-0'+sp[1]
    elif re.fullmatch('[0-9]{4}-[0-9]{3}',istr):
        rstr=istr
    else:
        raise ValueError('Input had unexpected format:',istr)
    return rstr

# In[4]:

# get the names of the sheets in the bottle workbook
with pd.ExcelFile(pathBottle,engine='openpyxl') as xl:
    sheets=xl.sheet_names
print(sheets)

# In[5]:

# load each sheet in the bottle Excel file and concatenate them together into one table
dfbotlist=list()
for sheet in sheets:
    df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,engine='openpyxl',
                      na_values=(-99,-99.9))  # read each sheet; include additional na values
    df0['Cruise']=fmtCruise(sheet)  # create and populate Cruise column based on sheet name
    dfbotlist.append(df0)  # append the sheet to a list
dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False)  # concatenate the list into a single table

# Drop columns with no data in them
l1=set(dfbot.keys())
dfbot.dropna(axis=1,how='all',inplace=True)
print('removed empty columns:',l1-set(dfbot.keys()))

# In[6]:

# list the column names in the resulting table
print(dfbot.keys())

# In[7]:

# no rows returned, so there are no rows with multiple temperature values
print(np.sum(dfbot['Temperature:Primary [deg C (ITS90)]']>=0),
      np.sum(dfbot['Temperature:Secondary [deg C (ITS90)]']>=0),
      np.sum(dfbot['Temperature*']>=0))
dfbot.loc[(np.array([int(ii) for ii in (dfbot['Temperature:Primary [deg C (ITS90)]']>=0)])+\
           np.array([int(ii) for ii in (dfbot['Temperature:Secondary [deg C (ITS90)]']>=0)])+\
           np.array([int(ii) for ii in (dfbot['Temperature*']>=0)]))>1]

# In[8]:

# no rows returned, so there are no rows with values in both salinity fields
print(np.sum(dfbot['Salinity:T0:C0 [PSS-78]']>=0),
      np.sum(dfbot['Salinity:T1:C1 [PSS-78]']>=0))
dfbot.loc[(dfbot['Salinity:T0:C0 [PSS-78]']>=0)&\
          (dfbot['Salinity:T1:C1 [PSS-78]']>=0)]

# In[9]:

def subval(idf,colList):
    # first value in colList should be the column you are going to keep;
    # follow it with the other columns that will be used to fill in when that
    # column is NaN, in order of precedence
    if len(colList)==2:
        idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \
                         else r[colList[1]] for i,r in idf.iterrows()]
    elif len(colList)==3:
        idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \
                         else r[colList[1]] if not pd.isna(r[colList[1]]) \
                         else r[colList[2]] for i,r in idf.iterrows()]
    else:
        raise NotImplementedError('Add to code to handle this case')
    return idf
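# a minimal sanity check of subval's fill logic on a small hypothetical
# two-column table (illustrative values only, not data from this workflow):
# values from the second column fill in only where the first column is NaN
_demo=pd.DataFrame({'A':[1.0,np.nan,3.0],'B':[10.0,20.0,30.0]})
_demo=subval(_demo,('A','B'))
print(_demo['A'].tolist())  # expected: [1.0, 20.0, 3.0]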
# In[10]:

# there are some duplicate columns here; handle them:
dfbot=subval(dfbot,('Temperature:Primary [deg C (ITS90)]',
                    'Temperature:Secondary [deg C (ITS90)]',
                    'Temperature*'))
dfbot=subval(dfbot,('Salinity:T0:C0 [PSS-78]','Salinity:T1:C1 [PSS-78]'))
dfbot.rename(columns={'Temperature:Primary [deg C (ITS90)]':'Temperature [deg C (ITS90)]'},
             inplace=True)
dfbot.rename(columns={'Salinity:T0:C0 [PSS-78]':'Salinity [PSS-78]'},
             inplace=True)

# In[11]:

# drop repetitive/unnecessary columns:
dfbot.drop(labels=['Temperature:Secondary [deg C (ITS90)]',
                   'Salinity:T1:C1 [PSS-78]','Temperature*'],axis=1,inplace=True)

# In[12]:

# define a function that will be applied to the values in the sample number column;
# this makes it easier to drop non-data rows later
def convertIndex(val):
    try:
        x=int(val)
    except ValueError:
        x=np.nan
    return x

# In[13]:

# load the 2017 phytoplankton data with the following options:
#   sheet_name='2017' -> choose the 2017 sheet
#   skiprows=0 -> start reading at the 1st row of the sheet,
#                 which contains the column headings
#   converters={'Sample#': convertIndex,} -> apply the function defined above
#                 to the Sample# column
#   verbose=True -> print extra information/warnings/errors
dfPhyto=pd.read_excel(pathPhyto,sheet_name='2017',engine='openpyxl',
                      skiprows=0,converters={'Sample#': convertIndex,},
                      verbose=True)

# In[14]:

# display rows 48 to 59 of the resulting table
dfPhyto[48:60]

# In[15]:

# now, drop any rows from the table that have NaN values in either of the columns
# 'Sample#' or 'TchlA'.
# This is why we applied a function to the Sample# column: to make sure all
# non-numeric sample number values would have a consistent NaN entry, making them
# easy to identify and remove
dfPhyto.dropna(subset=['Sample#','TchlA'],how='any',inplace=True)

# In[16]:

# pandas creates its own index, and after dropping rows I like to reset it -
# this is just for convenience
dfPhyto.reset_index(drop=True,inplace=True)

# In[17]:

# apply the formatting function to all rows in the Cruise column to get year-(3 digit) format
dfPhyto['Cruise']=[fmtCruise(ii) for ii in dfPhyto['Cruise']]

# In[18]:

# display part of the table, confirming that non-data rows have been removed
dfPhyto[48:60]

# In[19]:

# due to repeated column names in the original spreadsheet, '.1' was appended to the
# names of the phytoplankton columns;
# these lines correct the column names, removing the '.1':
renameDict=dict()
for colName in dfPhyto.keys():
    if colName.endswith('.1'):
        renameDict[colName]=colName.split('.1')[0]
dfPhyto.rename(columns=renameDict,inplace=True)

# In[20]:

dfPhyto

# In[21]:

# sample number corrections
iii=(dfPhyto['Cruise']=='2017-005')&(dfPhyto['Station']==28)&(dfPhyto['Sample#']==416)
if len(dfPhyto.loc[iii])>0:
    dfPhyto.loc[iii,['Sample#']]=9416
iii=(dfPhyto['Cruise']=='2017-005')&(dfPhyto['Station']==28)&(dfPhyto['Sample#']==416)
print('9416 This number should be zero:',len(dfPhyto.loc[iii]))
iii=(dfPhyto['Cruise']=='2017-064')&(dfPhyto['Station']==2)&(dfPhyto['Sample#']==213)
if len(dfPhyto.loc[iii])>0:
    dfPhyto.loc[iii,['Sample#']]=9213
iii=(dfPhyto['Cruise']=='2017-064')&(dfPhyto['Station']==2)&(dfPhyto['Sample#']==213)
print('9213 This number should be zero:',len(dfPhyto.loc[iii]))
# this one was already correct, but test it just in case:
iii=(dfPhyto['Cruise']=='2017-064')&(dfPhyto['Station']=='CPF1')&(dfPhyto['Sample#']==217)
if len(dfPhyto.loc[iii])>0:
    dfPhyto.loc[iii,['Sample#']]=9217
iii=(dfPhyto['Cruise']=='2017-064')&(dfPhyto['Station']=='CPF1')&(dfPhyto['Sample#']==217)
print('9217 This number should be zero:',len(dfPhyto.loc[iii]))
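# before the real merge below, a minimal sketch of the outer-merge behaviour it
# relies on, using small hypothetical tables (illustrative values only): rows
# without a match survive the merge, with NaN in the other table's columns
_left=pd.DataFrame({'Cruise':['2017-005','2017-005'],'Sample_Number':[1,2],'T':[8.1,8.3]})
_right=pd.DataFrame({'Cruise':['2017-005'],'Sample#':[2],'TchlA':[1.5]})
print(pd.merge(_left,_right,how='outer',left_on=['Cruise','Sample_Number'],
               right_on=['Cruise','Sample#']))
# the Sample_Number==1 row appears with NaN in TchlA, so unmatched rows are easy to spot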
# In[22]:

# This is the important step - join the two tables ('left' and 'right'),
# matching the cruise IDs and sample numbers
#   how='outer' -> all rows from both the left and the right tables will be included,
#                  even if they cannot be matched; this makes it easy to check for
#                  unmatched data later
#   left_on specifies the names of the columns to match in the left table (dfbot)
#   right_on specifies the names of the columns to match in the right table (dfPhyto)
dfout=pd.merge(dfbot,dfPhyto,how='outer',
               left_on=['Cruise','Sample_Number'],right_on=['Cruise','Sample#'])

# In[23]:

# show the column names in the resulting table
dfout.keys()

# ### Checks

# In[24]:

# Identify cases where phytoplankton data were matched to multiple samples in the bottle data:
dftest=pd.merge(dfbot,dfPhyto,how='right',
                left_on=['Cruise','Sample_Number'],right_on=['Cruise','Sample#'])

# In[25]:

temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})
temp.columns=['icount']

# In[26]:

np.unique(temp.icount)

# In[27]:

# check for phyto samples matched to multiple bottle samples:
temp.loc[temp.icount>1]

# In[28]:

# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]

# In[29]:

temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})
temp2.columns=['icount']
# this will catch phyto samples matched to multiple bottle samples, but also bottle
# samples with duplicate sample numbers within a cruise:
temp2.loc[temp2.icount>1]

# In[30]:

dfbot.loc[(dfbot.Cruise=='2017-064')&(dfbot.Sample_Number<0)]

# In[31]:

dfout.loc[(dfout.Cruise=='2017-064')&(dfout.Sample_Number<0)]

# In[32]:

# check for phyto samples not matched to bottle samples:
temp.loc[temp.icount==0]

# In[33]:

# if the output table is longer than either of the input tables, some rows were not matched
len(dfout), len(dfPhyto), len(dfbot)

# In[34]:

# Check that the number of cells with data in the 'Cyanobacteria' column is
# the same for the input and output tables to show that no rows are missing:
np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)

# In[35]:

# If there were data rows from the phytoplankton table that were not matched to
# rows from the bottle table, their sample numbers from the phytoplankton table
# would be displayed below (the printed series would not be empty)
print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Sample#'])

# In[36]:

# drop repetitive/unnecessary columns:
dfout.drop(labels=['Date','Station','Sample#','rep','depth',],axis=1,inplace=True)

# In[37]:

# round phyto group values to 3 decimal places:
for col in ('Cyanobacteria','Prasinophytes','Cryptophytes','Diatoms-1','Diatoms-2',
            'Dinoflagellates-1','Haptophytes','Dictyo','Raphido','TchlA'):
    dfout[col]=[np.round(ii,decimals=3) for ii in dfout[col]]  # use a list comprehension to set values for the entire column

# In[44]:

# Check for unexpected depths
dfout.loc[((dfout['Pressure [decibar]']>10)|(dfout['Depth [metres]']>10))&(dfout['Diatoms-1']>=0)]#,
#          ['FIL:START TIME YYYY/MM/DD HH:MM:SS','Pressure [decibar]','Depth [metres]','Diatoms-1']]

# In[38]:

dfout['Cyanobacteria']

# In[39]:

# now write the output table to a .csv file:
dfout.to_csv(pathOut,index=False)

# In[40]:

dfout.keys()

# In[ ]:
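# an optional final sanity check one could add here (illustrative, beyond the
# checks above): re-read the .csv file just written and confirm that its row
# count matches dfout
_check=pd.read_csv(pathOut)
print('rows match:',len(_check)==len(dfout))  # expected: rows match: True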