#!/usr/bin/env python
# coding: utf-8
# # Regression Confidence Experiments
#
# This notebook uses explores the computation and usage of Regression confidence metrics. Unlink many classification algorithms that provide a `predict_proba()` method that will give you probabilities, which can then be turned into a confidence, there's no direct equivalent for regression models
#
#
# ## Data
# AqSolDB: A curated reference set of aqueous solubility, created by the Autonomous Energy Materials Discovery [AMD] research group, consists of aqueous solubility values of 9,982 unique compounds curated from 9 different publicly available aqueous solubility datasets. AqSolDB also contains some relevant topological and physico-chemical 2D descriptors. Additionally, AqSolDB contains validated molecular representations of each of the compounds. This openly accessible dataset, which is the largest of its kind, and will not only serve as a useful reference source of measured and calculated solubility data, but also as a much improved and generalizable training data source for building data-driven models. (2019-04-10)
#
# Main Reference:
# https://www.nature.com/articles/s41597-019-0151-1
#
# Data Dowloaded from the Harvard DataVerse:
# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/OVHAW8
#
# ® Amazon Web Services, AWS, the Powered by AWS logo, are trademarks of Amazon.com, Inc. or its affiliates.
# In[ ]:
import sageworks
import logging
logging.getLogger("sageworks").setLevel(logging.WARNING)
# In[ ]:
# We've already created a FeatureSet so just grab it
from sageworks.api.feature_set import FeatureSet
# In[ ]:
fs = FeatureSet("aqsol_mol_descriptors")
# In[ ]:
full_df = fs.pull_dataframe()
full_df.head()
# In[ ]:
# Sanity check our solubility and solubility_class
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 12.0
plt.rcParams['figure.figsize'] = 14.0, 7.0
sns.set_theme(style='darkgrid')
# Create a box plot
sns.boxplot(x='solubility_class', y='solubility', data=full_df, order = ['high', 'medium', 'low'])
plt.title('Solubility by Solubility Class')
plt.xlabel('Solubility Class')
plt.ylabel('Solubility')
plt.show()
# In[ ]:
# Grab the Training View
table = fs.view("training").table
train_df = fs.query(f"SELECT * FROM {table} where training = TRUE")
hold_out_df = fs.query(f"SELECT * FROM {table} where training = FALSE")
# In[ ]:
train_df
# In[ ]:
# Here we're just grabbing the model to get the target and features
from sageworks.api.model import Model
model = Model("aqsol-mol-regression")
target = model.target()
features = model.features()
# In[ ]:
len(full_df)
# In[ ]:
import xgboost as xgb
X = train_df[features]
y = train_df[target]
# Train the main XGBRegressor model
model = xgb.XGBRegressor()
model.fit(X, y)
# Run Predictions on the the hold out
hold_out_df["predictions"] = model.predict(hold_out_df[features])
hold_out_df["confidence"] = 0
# In[ ]:
# Plot Stuff
plot_predictions(hold_out_df)
# In[ ]:
from sageworks.algorithms.dataframe.feature_spider import FeatureSpider
# Create the FeatureSpider class and run the various methods
f_spider = FeatureSpider(train_df, features, id_column="id", target_column=target, neighbors=5)
hold_out_df["predictions"] = f_spider.predict(hold_out_df)
plot_predictions(hold_out_df)
# In[ ]:
# Include Confidence metric
hold_out_df["predictions"] = model.predict(hold_out_df[features])
hold_out_df["confidence"] = f_spider.confidence_scores(hold_out_df, hold_out_df["predictions"])
# In[ ]:
plot_predictions(hold_out_df, color="confidence")
# In[ ]:
high_df = hold_out_df[hold_out_df["confidence"] >= 0.5]
plot_predictions(high_df)
# In[ ]:
# Classification Model
class_model = xgb.XGBClassifier(enable_categorical=True)
X = train_df[features]
y = train_df["solubility_class"]
y = y.astype("category")
# Train the main XGBClassifier model
class_model.fit(X, y)
# Run Predictions on the the hold out
# hold_out_df["predictions"] = class_model.predict(hold_out_df[features])
# In[ ]:
hold_out_df.head()
# In[ ]:
high_df = df[df["confidence"] >= 0.0]
# In[ ]:
plot_predictions(high_df, color="confidence")
# In[ ]:
df["predictions"] = f_spider.predict(df)
plot_predictions(df)
# In[ ]:
# Use XGBRegressor and Quartiles
import xgboost as xgb
import numpy as np
from sklearn.linear_model import QuantileRegressor
X = df[features]
y = df[target]
# Train the main XGBRegressor model
model = xgb.XGBRegressor()
model.fit(X, y)
# Train QuantileRegressor models for lower and upper bounds
# Specify a different solver, such as 'highs'
lower_model = QuantileRegressor(quantile=0.05, solver='highs')
upper_model = QuantileRegressor(quantile=0.95, solver='highs')
lower_model.fit(X, y)
upper_model.fit(X, y)
# Make predictions
predictions = model.predict(X)
lower_bounds = lower_model.predict(X)
upper_bounds = upper_model.predict(X)
# Calculate confidence metric
confidence = 1 - (upper_bounds - lower_bounds) / (np.max(y) - np.min(y))
# Combine predictions and confidence into the existing dataframe
df["predictions"] = predictions
df["confidence"] = confidence
df["lower_bound"] = lower_bounds
df["upper_bound"] = upper_bounds
print(df.head())
# In[ ]:
df["confidence"].hist()
# # Helper Methods
# In[ ]:
# Helper to look at predictions vs target
from math import sqrt
import pandas as pd
def plot_predictions(df, color="error", line=True):
# Dataframe of the targets and predictions
target = 'Actual Solubility'
pred = 'Predicted Solubility'
df_plot = pd.DataFrame({target: df['solubility'], pred: df['predictions'], 'confidence': df['confidence']})
# Compute Error per prediction
# df_plot['RMSError'] = df_plot.apply(lambda x: sqrt((x[pred] - x[target])**2), axis=1)
df_plot['PredError'] = df_plot.apply(lambda x: abs(x[pred] - x[target]), axis=1)
if color == "error":
ax = df_plot.plot.scatter(x=target, y=pred, c='PredError', cmap='coolwarm', sharex=False)
else:
ax = df_plot.plot.scatter(x=target, y=pred, c='confidence', cmap='coolwarm', sharex=False)
# Just a diagonal line
if line:
ax.axline((1, 1), slope=1, linewidth=2, c='black')
x_pad = (df_plot[target].max() - df_plot[target].min())/10.0
y_pad = (df_plot[pred].max() - df_plot[pred].min())/10.0
plt.xlim(df_plot[target].min()-x_pad, df_plot[target].max()+x_pad)
plt.ylim(df_plot[pred].min()-y_pad, df_plot[pred].max()+y_pad)
# In[ ]: