#!/usr/bin/env python # coding: utf-8 # In[1]: import re import os import glob import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns get_ipython().run_line_magic('matplotlib', 'inline') plt.rcParams['image.cmap'] = 'jet' # In[2]: def extract_data(file_path): file_description = [] default_cols = ["STATION", "DATE", "TIME", "LAT", "LON"] time_loc_data = 5*["NaN"] data_list = [] with open(file_path) as f: indata = False for line in f: if not indata: if line.startswith("%"): file_description.append(line) elif line.startswith("*"): split_line = line.split() columns = split_line[1:len(split_line)] num_cols = len(columns) indata = True else: split_line = line.split() if not line.startswith("S") and not line.startswith("T") and not line.startswith("F") and not line.startswith("P") and len(split_line) == num_cols: data_list.append(time_loc_data + split_line) elif len(split_line) > 0: try: station = split_line[0] except: station = "NaN" try: date = split_line[2] + " " + split_line[3] + " " + split_line[4] date = pd.to_datetime(date, infer_datetime_format=True) except: date = pd.to_datetime("NaN", infer_datetime_format=True, errors = "coerce") try: time = split_line[5] except: time = "NaN" try: lat = split_line[6] + " " + split_line[7] lon = split_line[9] + " " + split_line[10] except: lat, lon = "NaN", "NaN" time_loc_data = [station,date,time,lat,lon] return(pd.DataFrame(data_list, columns = default_cols + columns)) # In[3]: basedir = "/ocean/shared/SoG/btl/" chl_dict = dict() for subdir in os.listdir(basedir): if os.path.isdir(basedir + '/' + subdir): for file in os.listdir(basedir + '/' + subdir): if file.startswith("bottle") and file.endswith(".txt"): chl_dict[subdir] = extract_data(basedir + "/" + subdir + "/" + file) break all_chl = pd.concat(chl_dict, join = "inner") # In[4]: all_chl["DEPTH"] = pd.to_numeric(all_chl["depth"], errors='coerce') all_chl["chl002"] = pd.to_numeric(all_chl["chl002"], errors='coerce') all_chl["chl020"] = pd.to_numeric(all_chl["chl020"], errors='coerce') all_chl["chl200"] = pd.to_numeric(all_chl["chl200"], errors='coerce') all_chl = all_chl[["STATION", "DATE", "DEPTH","chl002", "chl020", "chl200"]] # In[5]: df_list = [] for i in range(1,4): df = pd.read_excel('/ocean/shared/SoG/PHYTO/Profiles_by_station.xls',i) top_index = pd.Index(df["Vol"]).get_loc("Species") bottom_index = pd.Index(df["Vol"]).get_loc("Carbon (ng/l)") df = pd.concat([df.iloc[:top_index], df.iloc[(bottom_index + 1):]]) df = df.transpose() df.columns = df.iloc[0] df.drop(df.index[0], inplace=True) df.reset_index(inplace = True) df_list.append(df) # In[6]: all_plank = pd.concat(df_list, axis=0, ignore_index=True) all_plank["STATION"] = "S" + all_plank["Site"].astype(str).str.strip() all_plank["DATE"] = pd.to_datetime(all_plank["Date"], format='%Y-%m-%d', errors = "coerce") all_plank["DEPTH"] = pd.to_numeric(all_plank["Depth (m)"], errors = "coerce") all_plank = all_plank[["STATION", "DATE", "DEPTH", "Flagellates", "Mesodinium rubrum", "Total diatoms", "Total flagellates", "Total Ps" ]] # In[7]: merged = pd.merge(all_plank, all_chl, on = ["DATE", "STATION", "DEPTH"]) merged["chl020 - chl200"] = merged["chl020"] - merged["chl200"] merged["chl002 - chl020"] = merged["chl002"] - merged["chl020"] sns.set(font_scale = 2) plot_size = 8 sns.lmplot(x ="chl002", y ="Total Ps", hue="DEPTH", data=merged, fit_reg=False, size = plot_size) sns.lmplot("chl200" , "Total diatoms", hue="DEPTH", data=merged, fit_reg=False, size = plot_size ) sns.lmplot("chl020 - chl200" , "Mesodinium rubrum", hue="DEPTH", data=merged, fit_reg=False, size = plot_size ) sns.lmplot("chl002 - chl020" , "Flagellates", hue="DEPTH", data=merged, fit_reg=False, size = plot_size ) # In[8]: x = pd.melt(merged, id_vars = ["STATION", "DATE", "DEPTH","Flagellates", "Mesodinium rubrum", "Total diatoms", "Total flagellates", "Total Ps" ], var_name = "CHLOROPHYLL_TYPE", value_name = "CHLOROPHYLL_VALUE") long_format = pd.melt(x, id_vars=["STATION", "DATE", "DEPTH", "CHLOROPHYLL_TYPE", "CHLOROPHYLL_VALUE"], var_name = "PLANKTON_TYPE", value_name = "PLANKTON_VALUE") long_format = long_format.dropna(axis = 0, subset = ["PLANKTON_VALUE", "CHLOROPHYLL_VALUE"]) # In[9]: sns.set(font_scale = 2) def R2(y, pred): return(1 - np.mean(np.square(y-pred))/np.var(y)) chlorophyll_types = np.unique(long_format["CHLOROPHYLL_TYPE"]) plankton_types = np.unique(long_format["PLANKTON_TYPE"]) for plk in plankton_types: fig, ax = plt.subplots(len(chlorophyll_types)//2 + 1, 2, figsize=(15, 15)) for idx, chl in enumerate(chlorophyll_types): subset = (merged.dropna(axis = 0, subset = [plk, chl])).sort_values(chl) x = subset[chl].as_matrix() y = subset[plk].as_matrix() fit = np.polyfit(x, y, 1) fit_fn = np.poly1d(fit) pred = fit_fn(x) R2_val = round(R2(y, pred),3) ax[idx//2, idx%2].plot(x,pred,c='r') ax[idx//2, idx%2].scatter(x, y, s=40) ax[idx//2, idx%2].set_title(plk + " vs. " + chl) ax[idx//2, idx%2].set_ylabel(plk + " (Carbon ng/l)") ax[idx//2, idx%2].set_xlabel(chl) plt.text(0.82, 0.92,'R2 = ' + str(R2_val) , horizontalalignment='center', verticalalignment='center', transform = ax[idx//2, idx%2].transAxes) plt.tight_layout() plt.subplots_adjust(top=0.9) fig.suptitle(plk) # In[14]: subset = (merged.dropna(axis = 0, subset = ["Total diatoms", "chl200"])).sort_values(chl) x = subset["chl200"].as_matrix() y = subset["Total diatoms"].as_matrix() fit = np.polyfit(x, y, 1) fit_fn = np.poly1d(fit) fit_fn # In[17]: fit_fn = np.poly1d([ 28070.38412148, 64926.67728433]) # In[18]: fit_fn