Notebook

In [1]:

import re
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
plt.rcParams['image.cmap'] = 'jet'

In [2]:

def extract_data(file_path):
    file_description = []
    default_cols = ["STATION", "DATE", "TIME", "LAT", "LON"]
    time_loc_data = 5*["NaN"]
    data_list = []
    with open(file_path) as f:
            indata = False
            for line in f:
                if not indata:
                    if line.startswith("%"):
                        file_description.append(line)
                    elif line.startswith("*"):
                        split_line = line.split()
                        columns = split_line[1:len(split_line)]
                        num_cols = len(columns)
                        indata = True
                else:
                    split_line = line.split()
                    if not line.startswith("S") and not line.startswith("T") and not line.startswith("F") and not line.startswith("P") and len(split_line) == num_cols:
                        data_list.append(time_loc_data + split_line)
                    elif len(split_line) > 0:
                        try:
                            station = split_line[0]
                        except:
                            station = "NaN"
                        try:
                            date = split_line[2] + " " + split_line[3] + " " + split_line[4]
                            date = pd.to_datetime(date, infer_datetime_format=True)
                        except:
                            date = pd.to_datetime("NaN", infer_datetime_format=True, errors = "coerce")
                        try:
                            time = split_line[5]
                        except:
                            time = "NaN"
                        try:
                            lat = split_line[6] + " " + split_line[7]
                            lon = split_line[9] + " " + split_line[10]
                        except:
                            lat, lon = "NaN", "NaN"
                        time_loc_data = [station,date,time,lat,lon]
            
    return(pd.DataFrame(data_list, columns = default_cols + columns))

In [3]:

basedir = "/ocean/shared/SoG/btl/"
chl_dict = dict()

for subdir in os.listdir(basedir):
    if os.path.isdir(basedir + '/' + subdir):
        for file in os.listdir(basedir + '/' + subdir):
            if file.startswith("bottle") and file.endswith(".txt"):
                chl_dict[subdir] = extract_data(basedir + "/" + subdir + "/" + file)
                break
all_chl = pd.concat(chl_dict, join = "inner")

In [4]:

all_chl["DEPTH"] = pd.to_numeric(all_chl["depth"], errors='coerce')
all_chl["chl002"] = pd.to_numeric(all_chl["chl002"], errors='coerce')
all_chl["chl020"] = pd.to_numeric(all_chl["chl020"], errors='coerce')
all_chl["chl200"] = pd.to_numeric(all_chl["chl200"], errors='coerce')

all_chl = all_chl[["STATION", "DATE", "DEPTH","chl002", "chl020", "chl200"]]

In [5]:

df_list = []
for i in range(1,4):
    df = pd.read_excel('/ocean/shared/SoG/PHYTO/Profiles_by_station.xls',i)
    top_index = pd.Index(df["Vol"]).get_loc("Species")
    bottom_index = pd.Index(df["Vol"]).get_loc("Carbon (ng/l)")
    df = pd.concat([df.iloc[:top_index], df.iloc[(bottom_index + 1):]])
    df = df.transpose()
    df.columns = df.iloc[0]
    df.drop(df.index[0], inplace=True)
    df.reset_index(inplace = True)
    df_list.append(df)

In [6]:

all_plank = pd.concat(df_list,  axis=0, ignore_index=True)
all_plank["STATION"] = "S" + all_plank["Site"].astype(str).str.strip()
all_plank["DATE"] = pd.to_datetime(all_plank["Date"], format='%Y-%m-%d', errors = "coerce")
all_plank["DEPTH"] = pd.to_numeric(all_plank["Depth (m)"], errors = "coerce")

all_plank = all_plank[["STATION", "DATE", "DEPTH", "Flagellates", "Mesodinium rubrum", "Total diatoms", "Total flagellates", "Total Ps" ]]

In [7]:

merged = pd.merge(all_plank, all_chl, on = ["DATE", "STATION", "DEPTH"])

merged["chl020 - chl200"] = merged["chl020"] - merged["chl200"]
merged["chl002 - chl020"] = merged["chl002"] - merged["chl020"]

sns.set(font_scale = 2)
plot_size = 8
sns.lmplot(x ="chl002", y ="Total Ps", hue="DEPTH", data=merged,  fit_reg=False, size = plot_size)
sns.lmplot("chl200" , "Total diatoms", hue="DEPTH", data=merged,  fit_reg=False, size = plot_size )
sns.lmplot("chl020 - chl200" , "Mesodinium rubrum", hue="DEPTH", data=merged,  fit_reg=False, size = plot_size )
sns.lmplot("chl002 - chl020" , "Flagellates", hue="DEPTH", data=merged,  fit_reg=False, size = plot_size )

Out[7]:

<seaborn.axisgrid.FacetGrid at 0x7f7330b3ac88>

In [8]:

x = pd.melt(merged, id_vars = ["STATION", "DATE", "DEPTH","Flagellates", "Mesodinium rubrum", "Total diatoms", "Total flagellates", "Total Ps" ], var_name = "CHLOROPHYLL_TYPE", value_name = "CHLOROPHYLL_VALUE")
long_format = pd.melt(x, id_vars=["STATION", "DATE", "DEPTH", "CHLOROPHYLL_TYPE", "CHLOROPHYLL_VALUE"], var_name = "PLANKTON_TYPE", value_name = "PLANKTON_VALUE")
long_format = long_format.dropna(axis = 0, subset = ["PLANKTON_VALUE", "CHLOROPHYLL_VALUE"])

In [9]:

sns.set(font_scale = 2)

def R2(y, pred):
    return(1 - np.mean(np.square(y-pred))/np.var(y))

chlorophyll_types = np.unique(long_format["CHLOROPHYLL_TYPE"])
plankton_types = np.unique(long_format["PLANKTON_TYPE"])

for plk in plankton_types:
    fig, ax = plt.subplots(len(chlorophyll_types)//2 + 1, 2, figsize=(15, 15))
    
    for idx, chl in enumerate(chlorophyll_types):
        subset = (merged.dropna(axis = 0, subset = [plk, chl])).sort_values(chl)
        x = subset[chl].as_matrix()
        y = subset[plk].as_matrix()
        fit = np.polyfit(x, y, 1)
        fit_fn = np.poly1d(fit) 
        pred = fit_fn(x)
        R2_val = round(R2(y, pred),3)
        ax[idx//2, idx%2].plot(x,pred,c='r')
        ax[idx//2, idx%2].scatter(x, y, s=40)
        ax[idx//2, idx%2].set_title(plk + " vs. " + chl)
        ax[idx//2, idx%2].set_ylabel(plk + " (Carbon ng/l)")
        ax[idx//2, idx%2].set_xlabel(chl)
        
        
        plt.text(0.82, 0.92,'R2 = ' + str(R2_val)  ,
            horizontalalignment='center',
            verticalalignment='center',
            transform = ax[idx//2, idx%2].transAxes)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    fig.suptitle(plk)

In [14]:

subset = (merged.dropna(axis = 0, subset = ["Total diatoms", "chl200"])).sort_values(chl)
x = subset["chl200"].as_matrix()
y = subset["Total diatoms"].as_matrix()
fit = np.polyfit(x, y, 1)
fit_fn = np.poly1d(fit) 
fit_fn

Out[14]:

poly1d([ 28070.38412148,  64926.67728433])

In [17]:

fit_fn = np.poly1d([ 28070.38412148,  64926.67728433])

In [18]:

fit_fn

Out[18]:

poly1d([ 28070.38412148,  64926.67728433])