import operator
import numpy as np
import sklearn.preprocessing
import sklearn.utils
from sklearn.decomposition import PCA
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_absolute_error, accuracy_score
import sklearn.metrics as sklm
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from functools import partial
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import pickle
import os
from matplotlib import rc
import matplotlib
import pandas as pd
fs = 10 # font size
fs_label = 10 # tick label size
fs_lgd = 10 # legend font size
ss = 20 # symbol size
ts = 3 # tick size
slw = 1 # symbol line width
framelw = 1 # line width of frame
lw = 2 # line width of the bar box
rc('axes', linewidth=framelw)
plt.rcParams.update({
"text.usetex": False,
"font.weight":"bold",
"axes.labelweight":"bold",
"font.size":fs,
'pdf.fonttype':'truetype'
})
plt.rcParams['mathtext.fontset']='stix'
We will use the Supporting Information data of the following reference:
The Journal of Physical Chemistry Letters 2020, 11 (19), 8067-8076. DOI: 10.1021/acs.jpclett.0c02288
The Supporting Information data is free of charge and available for research use. Execute the next cell to download and unzip the dataset. Alternatively, you can manually download and extract the dataset from the following URL, https://pubs.acs.org/doi/suppl/10.1021/acs.jpclett.0c02288/suppl_file/jz0c02288_si_002.zip, and then upload it to the mybinder notebook.
%%capture
!rm -rf models
!mkdir models
!wget --keep-session-cookies --save-cookies=cookie.txt "https://pubs.acs.org/doi/10.1021/acs.jpclett.0c02288"
!wget --referer="https://pubs.acs.org/doi/10.1021/acs.jpclett.0c02288" --cookies=on --keep-session-cookies --load-cookies=cookie.txt https://pubs.acs.org/doi/suppl/10.1021/acs.jpclett.0c02288/suppl_file/jz0c02288_si_002.zip -P models
!unzip models/jz0c02288_si_002.zip -d models
datapath = '' # path to your data folder
modelpath = os.path.join(datapath,'models')
filein=modelpath+'/Data/refined_datasets/refined_features.csv' # read in the CSV file containing the features (used here as an example)
df_features = pd.read_csv(filein)
filein2=modelpath+'/Data/refined_datasets/refined_properties.csv' # read in the CSV file containing the properties (used here as an example)
df_props = pd.read_csv(filein2)
df_features.shape
df_features.head()
df_props.shape
df_props.head()
df_props['rND'].values
len(df_props['rND'].values)
df_props['energeticGap (eV)'].values
fig = plt.figure(figsize=(3.3,3.3))
gap = df_props['energeticGap (eV)'].values
rnd = df_props['rND'].values
plt.scatter(gap,rnd,edgecolors=None,c='b',alpha=0.2)
plt.ylabel('$r_{ND}$ (unitless)')
plt.xlabel('gap (eV)')
plt.show()
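Since `pearsonr` is already imported above, we can also quantify how linearly correlated the gap and $r_{ND}$ are. A minimal sketch (the exact values depend on the dataset):
# Pearson correlation between the energetic gap and r_ND (sketch; values depend on the dataset)
r_corr, p_value = pearsonr(gap, rnd)
print('Pearson r = %.3f, p-value = %.2e' % (r_corr, p_value))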
Extract $r_{ND}$ and the gap (eV) from the database.
X = df_props['energeticGap (eV)'].values.reshape(-1, 1)
y = df_props['rND'].values
Split into training and test datasets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
Train a linear regression model.
reg=linear_model.LinearRegression()
reg=reg.fit(X_train, y_train)
Use the linear regression model to predict the test data points.
y_predict = reg.predict(X_test)
Mean squared error for linear regression with one feature:
np.mean(np.square(y_test-y_predict))
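The manual computation above should agree with scikit-learn's built-in metric; as a quick cross-check (a small sketch using sklearn.metrics.mean_squared_error):
# Equivalent computation with the built-in metric (should match the manual mean of squared errors)
from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_test, y_predict))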
fig = plt.figure(figsize=(3.3,3.3))
plt.scatter(y_test,y_predict,edgecolors=None,c='b',alpha=0.2)
xmin=0.1; xmax=0.6
diag=np.linspace(xmin, xmax,100)
plt.plot(diag,diag, color='gray')
plt.xlim((xmin,xmax))
plt.ylim((xmin,xmax))
plt.ylabel('$r_{ND}$ (predicted)')
plt.xlabel('$r_{ND}$ (real)')
plt.show()
We expect that using more features will improve the result compared to the single feature above.
First, figure out which features are available in the original database.
def return_train_columns(df, cols_selected=False):
    # Select the feature columns used for training and drop near-constant ones.
    keys = ['RACs']
    removed_columns = []
    if not cols_selected:
        return_columns = ['ligcharge', 'ox', 'spin']
        for col in df.columns:
            for key in keys:
                if key in col and 'init' not in col and 'misc' not in col:
                    if 'Zeff' not in col and '-O-' not in col:
                        return_columns.append(col)
    else:
        print("Using input columns.")
        return_columns = cols_selected
    print("initial: ", len(return_columns))
    df = df.dropna(subset=return_columns)
    thre = 1e-4  # remove features whose standard deviation falls below this threshold
    final_cols = []
    for col in return_columns:
        std = np.std(df[col].values)
        if std < thre:
            removed_columns.append(col)
        else:
            final_cols.append(col)
    print("removed: ", removed_columns, len(removed_columns))
    print("feature_used:", final_cols, len(final_cols))
    return final_cols, df
Extract all features from the database.
cols_use, df = return_train_columns(df_features, cols_selected=False)
X = np.array(df_features[cols_use].values)
y= df_props['rND'].values
Train-test split and linear regression.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
reg=linear_model.LinearRegression()
reg.fit(X_train, y_train)
y_predict = reg.predict(X_test)
Using all features improves the result compared to using only one feature. Mean squared error for all features:
np.mean(np.square(y_test-y_predict))
With ordinary linear regression, the number of features with a non-zero contribution is high:
np.where(reg.coef_!=0)[0].shape
fig = plt.figure(figsize=(3.3,3.3))
plt.scatter(y_test,y_predict,edgecolors=None,c='b',alpha=0.2)
xmin=0.1; xmax=0.6
diag=np.linspace(xmin, xmax,100)
plt.plot(diag,diag, color='gray')
plt.xlim((xmin,xmax))
plt.ylim((xmin,xmax))
plt.ylabel('$r_{ND}$ (predicted)')
plt.xlabel('$r_{ND}$ (real)')
plt.show()
from sklearn.linear_model import ElasticNet
alpha = 0.001
# l1_ratio=1 corresponds to a pure L1 (Lasso-type) penalty, which drives many coefficients to exactly zero
reg = ElasticNet(alpha=alpha, l1_ratio=1)
reg.fit(X_train, y_train)
y_predict = reg.predict(X_test)
After adding elastic net regularization, the error in this case study increases, but the number of features with a non-zero contribution is significantly lower.
np.mean(np.square(y_test-y_predict))
np.where(reg.coef_!=0)[0].shape
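To see which features survive the sparsity-inducing penalty, one can list the column names whose coefficients are non-zero (a minimal sketch using the cols_use list defined earlier):
# Names of the features with non-zero coefficients after regularization
surviving_features = [c for c, w in zip(cols_use, reg.coef_) if w != 0]
print(len(surviving_features), surviving_features)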
fig = plt.figure(figsize=(3.3,3.3))
plt.scatter(y_test,y_predict,edgecolors=None,c='b',alpha=0.2)
xmin=0.1; xmax=0.6
diag=np.linspace(xmin, xmax,100)
plt.plot(diag,diag, color='gray')
plt.xlim((xmin,xmax))
plt.ylim((xmin,xmax))
plt.ylabel('$r_{ND}$ (predicted)')
plt.xlabel('$r_{ND}$ (real)')
plt.show()
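The penalty strength alpha controls the trade-off between sparsity and test error. A minimal sketch of a sweep over a few alpha values (the specific values are illustrative, not tuned):
# Sweep the regularization strength and report test error vs. sparsity (illustrative values)
for a in [1e-4, 1e-3, 1e-2, 1e-1]:
    reg_a = ElasticNet(alpha=a, l1_ratio=1)
    reg_a.fit(X_train, y_train)
    mse_a = np.mean(np.square(y_test - reg_a.predict(X_test)))
    n_nonzero = np.sum(reg_a.coef_ != 0)
    print('alpha=%g  MSE=%.4f  non-zero coefficients=%d' % (a, mse_a, n_nonzero))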
rfa_features = ["ligcharge","ox","spin","RACs.mc-Z-0-all","RACs.mc-S-0-all","RACs.f-chi-3-all","RACs.D_mc-Z-1-all","RACs.D_mc-S-1-all","RACs.f-Z-1-all","RACs.f-Z-0-all","RACs.f-chi-1-all","RACs.f-chi-0-all","RACs.f-chi-2-all","RACs.f-Z-2-all","RACs.D_mc-chi-2-all","RACs.mc-chi-2-all","RACs.mc-chi-1-all","RACs.f-S-0-all","RACs.f-S-2-all","RACs.D_mc-chi-1-all","RACs.mc-Z-1-all","RACs.f-S-1-all","RACs.f-S-3-all","RACs.f-Z-3-all","RACs.D_mc-S-2-all","RACs.D_mc-Z-2-all","RACs.f-I-2-all","RACs.f-I-0-all","RACs.mc-Z-2-all","RACs.f-chi-0-eq","RACs.f-I-3-all","RACs.f-T-3-all","RACs.lc-S-3-eq","RACs.mc-chi-3-all","RACs.f-T-0-all","RACs.D_mc-chi-3-all","RACs.D_mc-S-3-all","RACs.D_mc-Z-3-all","RACs.f-I-1-all","RACs.lc-chi-2-eq","RACs.D_lc-S-2-ax","RACs.mc-T-2-all","RACs.D_lc-S-2-eq","RACs.D_lc-chi-2-ax","RACs.D_lc-Z-2-ax","RACs.D_lc-Z-2-eq","RACs.D_lc-chi-2-eq","RACs.f-Z-3-eq","RACs.lc-T-3-eq","RACs.mc-I-3-all","RACs.f-Z-3-ax","RACs.D_lc-T-3-ax","RACs.mc-T-3-all","RACs.mc-chi-0-all","RACs.mc-S-1-all","RACs.f-T-2-all","RACs.f-T-1-all","RACs.mc-Z-3-all","RACs.f-T-3-ax","RACs.mc-S-2-all","RACs.f-Z-1-ax","RACs.mc-S-3-all","RACs.D_mc-T-3-all","RACs.f-Z-2-ax","RACs.f-T-3-eq"]
cols_use, df = return_train_columns(df_features, cols_selected=rfa_features)
X = np.array(df_features[cols_use].values)
y= df_props['rND'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
reg=linear_model.LinearRegression()
reg.fit(X_train, y_train)
y_predict = reg.predict(X_test)
Using the optimal (selected) features improves the result compared to using all features. Mean squared error for the optimal features:
np.mean(np.square(y_test-y_predict))
fig = plt.figure(figsize=(3.3,3.3))
plt.scatter(y_test,y_predict,edgecolors=None,c='b',alpha=0.2)
xmin=0.1; xmax=0.6
diag=np.linspace(xmin, xmax,100)
plt.plot(diag,diag, color='gray')
plt.xlim((xmin,xmax))
plt.ylim((xmin,xmax))
plt.ylabel('$r_{ND}$ (predicted)')
plt.xlabel('$r_{ND}$ (real)')
plt.show()
def prepare_data(df, x_scaler, cols_selected=False):
    # Select the feature columns and apply the pre-fitted feature scaler.
    np.random.seed(0)
    cols_use, df = return_train_columns(df, cols_selected)
    X = np.array(df[cols_use].values)
    X_scaled = x_scaler.transform(X)
    return X_scaled, cols_use

def predict(model_filename, df, x_scaler_filename, y_scaler_filename, y, cols_selected=False):
    # Load the pickled scalers and trained KRR model, then predict on the held-out test set.
    x_scaler = pickle.load(open(x_scaler_filename, 'rb'))
    y_scaler = pickle.load(open(y_scaler_filename, 'rb'))
    krr = pickle.load(open(model_filename, 'rb'))
    X_scaled, cols_use = prepare_data(df, x_scaler, cols_selected=cols_selected)
    X_scaled_train, X_scaled_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=1)
    y_predict_scaled = krr.predict(X_scaled_test)
    y_predict = y_scaler.inverse_transform(y_predict_scaled)[:, 0]  # map predictions back to the original scale
    return y_predict, y_test
modelfile = os.path.join(modelpath,'Data/models/KRR/MD2_rND_krr_model/MD2_rND_krr.pkl')
x_scaler_filename = os.path.join(modelpath,'Data/models/KRR/MD2_rND_krr_model/x_scaler.pkl')
y_scaler_filename = os.path.join(modelpath,'Data/models/KRR/MD2_rND_krr_model/y_scaler.pkl')
# Note: we are not using all features; we use the selected features verified to be optimal
rfa_features = ["ligcharge","ox","spin","RACs.mc-Z-0-all","RACs.mc-S-0-all","RACs.f-chi-3-all","RACs.D_mc-Z-1-all","RACs.D_mc-S-1-all","RACs.f-Z-1-all","RACs.f-Z-0-all","RACs.f-chi-1-all","RACs.f-chi-0-all","RACs.f-chi-2-all","RACs.f-Z-2-all","RACs.D_mc-chi-2-all","RACs.mc-chi-2-all","RACs.mc-chi-1-all","RACs.f-S-0-all","RACs.f-S-2-all","RACs.D_mc-chi-1-all","RACs.mc-Z-1-all","RACs.f-S-1-all","RACs.f-S-3-all","RACs.f-Z-3-all","RACs.D_mc-S-2-all","RACs.D_mc-Z-2-all","RACs.f-I-2-all","RACs.f-I-0-all","RACs.mc-Z-2-all","RACs.f-chi-0-eq","RACs.f-I-3-all","RACs.f-T-3-all","RACs.lc-S-3-eq","RACs.mc-chi-3-all","RACs.f-T-0-all","RACs.D_mc-chi-3-all","RACs.D_mc-S-3-all","RACs.D_mc-Z-3-all","RACs.f-I-1-all","RACs.lc-chi-2-eq","RACs.D_lc-S-2-ax","RACs.mc-T-2-all","RACs.D_lc-S-2-eq","RACs.D_lc-chi-2-ax","RACs.D_lc-Z-2-ax","RACs.D_lc-Z-2-eq","RACs.D_lc-chi-2-eq","RACs.f-Z-3-eq","RACs.lc-T-3-eq","RACs.mc-I-3-all","RACs.f-Z-3-ax","RACs.D_lc-T-3-ax","RACs.mc-T-3-all","RACs.mc-chi-0-all","RACs.mc-S-1-all","RACs.f-T-2-all","RACs.f-T-1-all","RACs.mc-Z-3-all","RACs.f-T-3-ax","RACs.mc-S-2-all","RACs.f-Z-1-ax","RACs.mc-S-3-all","RACs.D_mc-T-3-all","RACs.f-Z-2-ax","RACs.f-T-3-eq"]
y_predict, y_test = predict(modelfile, df_features, x_scaler_filename, y_scaler_filename,y, cols_selected=rfa_features)
Using kernel ridge regression (KRR) improves the result compared to linear regression. Mean squared error for the KRR model:
np.mean(np.square(y_test-y_predict))
The improved result is visible in the parity plot.
plt.scatter(y_test,y_predict,edgecolors=None,c='b',alpha=0.2)
xmin=0.1; xmax=0.6
diag=np.linspace(xmin, xmax,100)
plt.plot(diag,diag, color='gray')
plt.xlim((xmin,xmax))
plt.ylim((xmin,xmax))
plt.ylabel('$r_{ND}$ (predicted)')
plt.xlabel('$r_{ND}$ (real)')
plt.show()
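The KRR model above was loaded from the pre-trained pickle files that ship with the dataset. As a minimal sketch (not the authors' exact training procedure, with illustrative, untuned hyperparameters, and reusing the X_train/X_test split from the optimal-feature section), a similar model could also be fitted from scratch with scikit-learn's KernelRidge and StandardScaler:
# Sketch: fit a KRR model from scratch on scaled features (hyperparameters are assumptions, not the published ones)
x_sc = sklearn.preprocessing.StandardScaler().fit(X_train)
y_sc = sklearn.preprocessing.StandardScaler().fit(y_train.reshape(-1, 1))
krr_scratch = KernelRidge(kernel='rbf', alpha=0.01, gamma=0.01)  # illustrative hyperparameter values
krr_scratch.fit(x_sc.transform(X_train), y_sc.transform(y_train.reshape(-1, 1)))
y_pred_scratch = y_sc.inverse_transform(krr_scratch.predict(x_sc.transform(X_test)))[:, 0]
print(np.mean(np.square(y_test - y_pred_scratch)))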