import re
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Make 'jet' the default matplotlib colormap for image-type plots in this session.
plt.rcParams['image.cmap'] = 'jet'
def _parse_station_line(tokens):
    """Turn a tokenised non-data line into [station, date, time, lat, lon].

    ``tokens`` must contain at least one element.  Fields are read by
    position: token 0 is the station, tokens 2-4 form the date, token 5 is
    the time, tokens 6-7 the latitude and tokens 9-10 the longitude.  Any
    field whose tokens are missing (or, for the date, unparseable) falls
    back to the string "NaN" (``pd.NaT`` for the date).
    """
    station = tokens[0]

    try:
        # Explicit indexing (not slicing) so a short line raises IndexError
        # instead of silently parsing a partial date.
        date = pd.to_datetime(" ".join((tokens[2], tokens[3], tokens[4])))
    except (IndexError, ValueError, OverflowError):
        date = pd.NaT

    try:
        time = tokens[5]
    except IndexError:
        time = "NaN"

    try:
        # Latitude and longitude are accepted only as a pair: if either one
        # is incomplete, both are reported as missing.
        lat = tokens[6] + " " + tokens[7]
        lon = tokens[9] + " " + tokens[10]
    except IndexError:
        lat, lon = "NaN", "NaN"

    return [station, date, time, lat, lon]


def extract_data(file_path):
    """Parse a whitespace-delimited text file into a DataFrame.

    Expected layout, as implemented below:

    * Everything before the first line starting with ``*`` is ignored
      (this includes the ``%`` description lines).
    * The ``*`` line names the data columns (its first token is dropped).
    * After that, a line is a data row when it does not start with
      "S", "T", "F" or "P" and has exactly as many tokens as there are
      data columns.  Any other non-blank line is treated as a station
      line and supplies the STATION/DATE/TIME/LAT/LON values that are
      prepended to every following data row.

    Parameters
    ----------
    file_path : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        One row per data line.  Columns are STATION, DATE, TIME, LAT, LON
        followed by the names from the ``*`` line; data values are left as
        strings.  If the file has no ``*`` line the frame is empty and
        only has the five default columns.
    """
    default_cols = ["STATION", "DATE", "TIME", "LAT", "LON"]
    non_data_prefixes = ("S", "T", "F", "P")

    # Defined up front so a file without a "*" header yields an empty frame
    # rather than an UnboundLocalError at the return statement.
    columns = []
    num_cols = None

    # Used for data rows that appear before any station line.
    time_loc_data = 5 * ["NaN"]
    data_list = []

    with open(file_path) as f:
        indata = False
        for line in f:
            if not indata:
                if line.startswith("*"):
                    columns = line.split()[1:]
                    num_cols = len(columns)
                    indata = True
                continue

            split_line = line.split()
            if not line.startswith(non_data_prefixes) and len(split_line) == num_cols:
                data_list.append(time_loc_data + split_line)
            elif split_line:
                time_loc_data = _parse_station_line(split_line)

    return pd.DataFrame(data_list, columns=default_cols + columns)
# Collect one parsed "bottle*.txt" file per sub-directory of basedir, keyed by
# the sub-directory name.
basedir = "/ocean/shared/SoG/btl/"
chl_dict = dict()
# os.listdir() returns entries in arbitrary order; sorting makes both the
# choice of file (only the first match per directory is used) and the
# concatenation order reproducible between runs.
for subdir in sorted(os.listdir(basedir)):
    subdir_path = os.path.join(basedir, subdir)
    if not os.path.isdir(subdir_path):
        continue
    for fname in sorted(os.listdir(subdir_path)):
        if fname.startswith("bottle") and fname.endswith(".txt"):
            chl_dict[subdir] = extract_data(os.path.join(subdir_path, fname))
            break

# join="inner" keeps only the columns present in every parsed file.
all_chl = pd.concat(chl_dict, join="inner")

# extract_data leaves values as strings; anything non-numeric becomes NaN.
all_chl["DEPTH"] = pd.to_numeric(all_chl["depth"], errors='coerce')
for col in ("chl002", "chl020", "chl200"):
    all_chl[col] = pd.to_numeric(all_chl[col], errors='coerce')

all_chl = all_chl[["STATION", "DATE", "DEPTH", "chl002", "chl020", "chl200"]]
# Load worksheets 1 through 3 of the profile workbook.  From each sheet keep
# the rows above the "Species" marker and below the "Carbon (ng/l)" marker
# (both located in the "Vol" column), then transpose so that the first
# original column supplies the column names.
df_list = []
for i in range(1, 4):
    df = pd.read_excel('/ocean/shared/SoG/PHYTO/Profiles_by_station.xls', i)

    vol_labels = pd.Index(df["Vol"])
    top_index = vol_labels.get_loc("Species")
    bottom_index = vol_labels.get_loc("Carbon (ng/l)")

    above_species = df.iloc[:top_index]
    below_carbon = df.iloc[(bottom_index + 1):]
    df = pd.concat([above_species, below_carbon]).transpose()

    # The first transposed row holds the labels: promote it to the header,
    # then remove it from the data and move the old index into a column.
    df.columns = df.iloc[0]
    df = df.drop(df.index[0]).reset_index()

    df_list.append(df)

all_plank = pd.concat(df_list, axis=0, ignore_index=True)

# Build the merge keys (STATION / DATE / DEPTH); unparseable dates and depths
# become NaT / NaN.
all_plank["STATION"] = "S" + all_plank["Site"].astype(str).str.strip()
all_plank["DATE"] = pd.to_datetime(
    all_plank["Date"], format='%Y-%m-%d', errors="coerce"
)
all_plank["DEPTH"] = pd.to_numeric(all_plank["Depth (m)"], errors="coerce")

all_plank = all_plank[
    [
        "STATION",
        "DATE",
        "DEPTH",
        "Flagellates",
        "Mesodinium rubrum",
        "Total diatoms",
        "Total flagellates",
        "Total Ps",
    ]
]
# Pair each plankton sample with the chlorophyll sample taken at the same
# station, date and depth (default inner join: unmatched rows are dropped).
merged = all_plank.merge(all_chl, on=["DATE", "STATION", "DEPTH"])

# Differences between the chlorophyll columns.
merged["chl020 - chl200"] = merged["chl020"].sub(merged["chl200"])
merged["chl002 - chl020"] = merged["chl002"].sub(merged["chl020"])
# Scatter plots (no regression line) of each plankton group against one
# chlorophyll column, coloured by sample depth.
sns.set(font_scale=2)
plot_size = 8
# x and y are passed by keyword in every call: recent seaborn releases take
# `data` as the first positional parameter, so positional x/y would fail.
# NOTE(review): `size` was renamed to `height` in seaborn 0.9 - switch to
# `height=plot_size` if the installed seaborn no longer accepts `size`.
sns.lmplot(x="chl002", y="Total Ps", hue="DEPTH", data=merged, fit_reg=False, size=plot_size)
sns.lmplot(x="chl200", y="Total diatoms", hue="DEPTH", data=merged, fit_reg=False, size=plot_size)
sns.lmplot(x="chl020 - chl200", y="Mesodinium rubrum", hue="DEPTH", data=merged, fit_reg=False, size=plot_size)
sns.lmplot(x="chl002 - chl020", y="Flagellates", hue="DEPTH", data=merged, fit_reg=False, size=plot_size)
# Out: <seaborn.axisgrid.FacetGrid at 0x7f7330b3ac88>
# Reshape `merged` to long format in two passes.
# Pass 1: every column that is not a key or a plankton column is stacked into
# CHLOROPHYLL_TYPE / CHLOROPHYLL_VALUE.
x = pd.melt(
    merged,
    id_vars=[
        "STATION",
        "DATE",
        "DEPTH",
        "Flagellates",
        "Mesodinium rubrum",
        "Total diatoms",
        "Total flagellates",
        "Total Ps",
    ],
    var_name="CHLOROPHYLL_TYPE",
    value_name="CHLOROPHYLL_VALUE",
)

# Pass 2: the remaining (plankton) columns are stacked into
# PLANKTON_TYPE / PLANKTON_VALUE.
long_format = pd.melt(
    x,
    id_vars=["STATION", "DATE", "DEPTH", "CHLOROPHYLL_TYPE", "CHLOROPHYLL_VALUE"],
    var_name="PLANKTON_TYPE",
    value_name="PLANKTON_VALUE",
)

# Keep only rows where both measurements are present.
long_format = long_format.dropna(
    axis=0, subset=["PLANKTON_VALUE", "CHLOROPHYLL_VALUE"]
)

sns.set(font_scale=2)
def R2(y, pred):
    """Return 1 minus the ratio of mean squared residual to the variance of y.

    y    : observed values (array-like supporting element-wise subtraction)
    pred : predicted values, same shape as y
    """
    mean_sq_residual = np.mean(np.square(y - pred))
    spread = np.var(y)
    return 1 - mean_sq_residual / spread
# For every plankton column draw one figure holding a grid of panels, one
# panel per chlorophyll column, each with a scatter plot, a first-degree
# least-squares fit and the fit's R2 value.
chlorophyll_types = np.unique(long_format["CHLOROPHYLL_TYPE"])
plankton_types = np.unique(long_format["PLANKTON_TYPE"])

# Two panels per row, rounding up so no completely empty row is created.
n_panel_rows = max(1, (len(chlorophyll_types) + 1) // 2)

for plk in plankton_types:
    # squeeze=False keeps `ax` two-dimensional even when there is one row.
    fig, ax = plt.subplots(n_panel_rows, 2, figsize=(15, 15), squeeze=False)

    for idx, chl in enumerate(chlorophyll_types):
        panel = ax[idx // 2, idx % 2]

        subset = merged.dropna(axis=0, subset=[plk, chl]).sort_values(chl)
        # `.values` replaces `as_matrix()`, which newer pandas no longer has.
        x = subset[chl].values
        y = subset[plk].values

        fit = np.polyfit(x, y, 1)
        fit_fn = np.poly1d(fit)
        pred = fit_fn(x)
        R2_val = round(R2(y, pred), 3)

        panel.plot(x, pred, c='r')
        panel.scatter(x, y, s=40)
        panel.set_title(plk + " vs. " + chl)
        panel.set_ylabel(plk + " (Carbon ng/l)")
        panel.set_xlabel(chl)
        # Attach the label to this panel itself; plt.text() would add it to
        # whichever axes happens to be current.
        panel.text(0.82, 0.92, 'R2 = ' + str(R2_val),
                   horizontalalignment='center',
                   verticalalignment='center',
                   transform=panel.transAxes)

    # Hide any grid cell left over when the number of panels is odd.
    for unused in ax.flat[len(chlorophyll_types):]:
        unused.set_visible(False)

    plt.tight_layout()
    plt.subplots_adjust(top=0.9)  # leave room for the figure title
    fig.suptitle(plk)
# Refit "Total diatoms" against "chl200" on its own and show the resulting
# first-degree polynomial.
# Sort by the column name directly instead of relying on the `chl` loop
# variable left over from an earlier cell.
subset = merged.dropna(axis=0, subset=["Total diatoms", "chl200"]).sort_values("chl200")
# `.values` replaces `as_matrix()`, which newer pandas no longer has.
x = subset["chl200"].values
y = subset["Total diatoms"].values
fit = np.polyfit(x, y, 1)
fit_fn = np.poly1d(fit)
fit_fn
# Out: poly1d([ 28070.38412148, 64926.67728433])

# Rebuild the same polynomial from the literal coefficients recorded above,
# so it can be used without the source data.
fit_fn = np.poly1d([ 28070.38412148, 64926.67728433])
fit_fn
# Out: poly1d([ 28070.38412148, 64926.67728433])