#!/usr/bin/env python # coding: utf-8 # # Regression Confidence Experiments # # This notebook uses explores the computation and usage of Regression confidence metrics. Unlink many classification algorithms that provide a `predict_proba()` method that will give you probabilities, which can then be turned into a confidence, there's no direct equivalent for regression models #

# # ## Data # AqSolDB: A curated reference set of aqueous solubility, created by the Autonomous Energy Materials Discovery [AMD] research group, consists of aqueous solubility values of 9,982 unique compounds curated from 9 different publicly available aqueous solubility datasets. AqSolDB also contains some relevant topological and physico-chemical 2D descriptors. Additionally, AqSolDB contains validated molecular representations of each of the compounds. This openly accessible dataset, which is the largest of its kind, and will not only serve as a useful reference source of measured and calculated solubility data, but also as a much improved and generalizable training data source for building data-driven models. (2019-04-10) # # Main Reference: # https://www.nature.com/articles/s41597-019-0151-1 # # Data Dowloaded from the Harvard DataVerse: # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/OVHAW8 # # ® Amazon Web Services, AWS, the Powered by AWS logo, are trademarks of Amazon.com, Inc. or its affiliates. # In[ ]: import sageworks import logging logging.getLogger("sageworks").setLevel(logging.WARNING) # In[ ]: # We've already created a FeatureSet so just grab it from sageworks.api.feature_set import FeatureSet # In[ ]: fs = FeatureSet("aqsol_mol_descriptors") # In[ ]: full_df = fs.pull_dataframe() full_df.head() # In[ ]: # Sanity check our solubility and solubility_class import pandas as pd import seaborn as sns import matplotlib.pyplot as plt plt.rcParams['font.size'] = 12.0 plt.rcParams['figure.figsize'] = 14.0, 7.0 sns.set_theme(style='darkgrid') # Create a box plot sns.boxplot(x='solubility_class', y='solubility', data=full_df, order = ['high', 'medium', 'low']) plt.title('Solubility by Solubility Class') plt.xlabel('Solubility Class') plt.ylabel('Solubility') plt.show() # In[ ]: # Grab the Training View table = fs.view("training").table train_df = fs.query(f"SELECT * FROM {table} where training = TRUE") hold_out_df = fs.query(f"SELECT * FROM {table} where training = FALSE") # In[ ]: train_df # In[ ]: # Here we're just grabbing the model to get the target and features from sageworks.api.model import Model model = Model("aqsol-mol-regression") target = model.target() features = model.features() # In[ ]: len(full_df) # In[ ]: import xgboost as xgb X = train_df[features] y = train_df[target] # Train the main XGBRegressor model model = xgb.XGBRegressor() model.fit(X, y) # Run Predictions on the the hold out hold_out_df["predictions"] = model.predict(hold_out_df[features]) hold_out_df["confidence"] = 0 # In[ ]: # Plot Stuff plot_predictions(hold_out_df) # In[ ]: from sageworks.algorithms.dataframe.feature_spider import FeatureSpider # Create the FeatureSpider class and run the various methods f_spider = FeatureSpider(train_df, features, id_column="id", target_column=target, neighbors=5) hold_out_df["predictions"] = f_spider.predict(hold_out_df) plot_predictions(hold_out_df) # In[ ]: # Include Confidence metric hold_out_df["predictions"] = model.predict(hold_out_df[features]) hold_out_df["confidence"] = f_spider.confidence_scores(hold_out_df, hold_out_df["predictions"]) # In[ ]: plot_predictions(hold_out_df, color="confidence") # In[ ]: high_df = hold_out_df[hold_out_df["confidence"] >= 0.5] plot_predictions(high_df) # In[ ]: # Classification Model class_model = xgb.XGBClassifier(enable_categorical=True) X = train_df[features] y = train_df["solubility_class"] y = y.astype("category") # Train the main XGBClassifier model class_model.fit(X, y) # Run Predictions on the the hold out # hold_out_df["predictions"] = class_model.predict(hold_out_df[features]) # In[ ]: hold_out_df.head() # In[ ]: high_df = df[df["confidence"] >= 0.0] # In[ ]: plot_predictions(high_df, color="confidence") # In[ ]: df["predictions"] = f_spider.predict(df) plot_predictions(df) # In[ ]: # Use XGBRegressor and Quartiles import xgboost as xgb import numpy as np from sklearn.linear_model import QuantileRegressor X = df[features] y = df[target] # Train the main XGBRegressor model model = xgb.XGBRegressor() model.fit(X, y) # Train QuantileRegressor models for lower and upper bounds # Specify a different solver, such as 'highs' lower_model = QuantileRegressor(quantile=0.05, solver='highs') upper_model = QuantileRegressor(quantile=0.95, solver='highs') lower_model.fit(X, y) upper_model.fit(X, y) # Make predictions predictions = model.predict(X) lower_bounds = lower_model.predict(X) upper_bounds = upper_model.predict(X) # Calculate confidence metric confidence = 1 - (upper_bounds - lower_bounds) / (np.max(y) - np.min(y)) # Combine predictions and confidence into the existing dataframe df["predictions"] = predictions df["confidence"] = confidence df["lower_bound"] = lower_bounds df["upper_bound"] = upper_bounds print(df.head()) # In[ ]: df["confidence"].hist() # # Helper Methods # In[ ]: # Helper to look at predictions vs target from math import sqrt import pandas as pd def plot_predictions(df, color="error", line=True): # Dataframe of the targets and predictions target = 'Actual Solubility' pred = 'Predicted Solubility' df_plot = pd.DataFrame({target: df['solubility'], pred: df['predictions'], 'confidence': df['confidence']}) # Compute Error per prediction # df_plot['RMSError'] = df_plot.apply(lambda x: sqrt((x[pred] - x[target])**2), axis=1) df_plot['PredError'] = df_plot.apply(lambda x: abs(x[pred] - x[target]), axis=1) if color == "error": ax = df_plot.plot.scatter(x=target, y=pred, c='PredError', cmap='coolwarm', sharex=False) else: ax = df_plot.plot.scatter(x=target, y=pred, c='confidence', cmap='coolwarm', sharex=False) # Just a diagonal line if line: ax.axline((1, 1), slope=1, linewidth=2, c='black') x_pad = (df_plot[target].max() - df_plot[target].min())/10.0 y_pad = (df_plot[pred].max() - df_plot[pred].min())/10.0 plt.xlim(df_plot[target].min()-x_pad, df_plot[target].max()+x_pad) plt.ylim(df_plot[pred].min()-y_pad, df_plot[pred].max()+y_pad) # In[ ]: