import operator
import numpy as np
import sklearn.preprocessing
import sklearn.utils
from sklearn.decomposition import PCA
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_absolute_error, accuracy_score
import sklearn.metrics as sklm
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from functools import partial
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import pickle
import os
from matplotlib import rc
import matplotlib
import pandas as pd
fs = 10 # font size
fs_label = 10 # tick label size
fs_lgd = 10 # legend font size
ss = 20 # symbol size
ts = 3 # tick size
slw = 1 # symbol line width
framelw = 1 # line width of frame
lw = 2 # line width of the bar box
rc('axes', linewidth=framelw)
plt.rcParams.update({
"text.usetex": False,
"font.weight":"bold",
"axes.labelweight":"bold",
"font.size":fs,
'pdf.fonttype':'truetype'
})
plt.rcParams['mathtext.fontset']='stix'
We will use the Supporting Information data of the following reference:
The Journal of Physical Chemistry Letters 2020, 11 (19), 8067-8076. DOI: 10.1021/acs.jpclett.0c02288
The Supporting Information data is free of charge and available for research use. Execute the next cell to download and unzip the dataset. Alternatively, you can manually download and extract the dataset from the following URL, https://pubs.acs.org/doi/suppl/10.1021/acs.jpclett.0c02288/suppl_file/jz0c02288_si_002.zip, and then upload it to the mybinder notebook.
%%capture
!rm -rf models
!mkdir models
!wget --keep-session-cookies --save-cookies=cookie.txt "https://pubs.acs.org/doi/10.1021/acs.jpclett.0c02288"
!wget --referer="https://pubs.acs.org/doi/10.1021/acs.jpclett.0c02288" --cookies=on --keep-session-cookies --load-cookies=cookie.txt https://pubs.acs.org/doi/suppl/10.1021/acs.jpclett.0c02288/suppl_file/jz0c02288_si_002.zip -P models
!unzip models/jz0c02288_si_002.zip -d models
datapath = '' # path to your data folder
modelpath = os.path.join(datapath,'models')
filein=modelpath+'/Data/refined_datasets/refined_features.csv' # read in the CSV file containing the features (used here as an example)
df_features = pd.read_csv(filein)
filein2=modelpath+'/Data/refined_datasets/refined_properties.csv' # read in the CSV file containing the properties (used here as an example)
df_props = pd.read_csv(filein2)
df_features.shape
df_features.head()
df_props.shape
df_props.head()
df_props['rND'].values
len(df_props['rND'].values)
df_props['energeticGap (eV)'].values
fig = plt.figure(figsize=(3.3,3.3))
gap = df_props['energeticGap (eV)'].values
rnd = df_props['rND'].values
plt.scatter(gap,rnd,edgecolors=None,c='b',alpha=0.2)
plt.ylabel('$r_{ND}$ (unitless)')
plt.xlabel('gap (eV)')
plt.show()
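Since `pearsonr` is already imported above, we can also quantify how linearly correlated the gap and $r_{ND}$ are. A minimal sketch (the exact values depend on the dataset):
# Pearson correlation between the energetic gap and r_ND (sketch; values depend on the dataset)
r_corr, p_value = pearsonr(gap, rnd)
print('Pearson r = %.3f, p-value = %.2e' % (r_corr, p_value))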
Extract $r_{ND}$ and the gap (eV) from the database.
X = df_props['energeticGap (eV)'].values.reshape(-1, 1)
y = df_props['rND'].values
Split into training and test datasets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
Train a linear regression model.
reg=linear_model.LinearRegression()
reg=reg.fit(X_train, y_train)
Use the linear regression model to predict the test data points.
y_predict = reg.predict(X_test)
Mean squared error for linear regression with one feature:
np.mean(np.square(y_test-y_predict))
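The manual computation above should agree with scikit-learn's built-in metric; as a quick cross-check (a small sketch using sklearn.metrics.mean_squared_error):
# Equivalent computation with the built-in metric (should match the manual mean of squared errors)
from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_test, y_predict))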
fig = plt.figure(figsize=(3.3,3.3))
plt.scatter(y_test,y_predict,edgecolors=None,c='b',alpha=0.2)
xmin=0.1; xmax=0.6
diag=np.linspace(xmin, xmax,100)
plt.plot(diag,diag, color='gray')
plt.xlim((xmin,xmax))
plt.ylim((xmin,xmax))
plt.ylabel('$r_{ND}$ (predicted)')
plt.xlabel('$r_{ND}$ (real)')
plt.show()
We expect that using more features will improve the result compared to the single feature above.
First, figure out which features are available in the original database.
def return_train_columns(df, cols_selected=False):
    # Select the feature columns used for training and drop near-constant ones.
    keys = ['RACs']
    removed_columns = []
    if not cols_selected:
        return_columns = ['ligcharge', 'ox', 'spin']
        for col in df.columns:
            for key in keys:
                if key in col and 'init' not in col and 'misc' not in col:
                    if 'Zeff' not in col and '-O-' not in col:
                        return_columns.append(col)
    else:
        print("Using input columns.")
        return_columns = cols_selected
    print("initial: ", len(return_columns))
    df = df.dropna(subset=return_columns)
    thre = 1e-4  # remove features whose standard deviation falls below this threshold
    final_cols = []
    for col in return_columns:
        std = np.std(df[col].values)
        if std < thre:
            removed_columns.append(col)
        else:
            final_cols.append(col)
    print("removed: ", removed_columns, len(removed_columns))
    print("feature_used:", final_cols, len(final_cols))
    return final_cols, df
Extract all features from the database.
cols_use, df = return_train_columns(df_features, cols_selected=False)
X = np.array(df_features[cols_use].values)
y= df_props['rND'].values
Train-test split and linear regression.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
reg=linear_model.LinearRegression()
reg.fit(X_train, y_train)
y_predict = reg.predict(X_test)
Using all features improves the result compared to using only one feature. Mean squared error for all features:
np.mean(np.square(y_test-y_predict))
With ordinary linear regression, the number of features with a non-zero contribution is high:
np.where(reg.coef_!=0)[0].shape
fig = plt.figure(figsize=(3.3,3.3))
plt.scatter(y_test,y_predict,edgecolors=None,c='b',alpha=0.2)
xmin=0.1; xmax=0.6
diag=np.linspace(xmin, xmax,100)
plt.plot(diag,diag, color='gray')
plt.xlim((xmin,xmax))
plt.ylim((xmin,xmax))
plt.ylabel('$r_{ND}$ (predicted)')
plt.xlabel('$r_{ND}$ (real)')
plt.show()
from sklearn.linear_model import ElasticNet
alpha = 0.001
# l1_ratio=1 corresponds to a pure L1 (Lasso-type) penalty, which drives many coefficients to exactly zero
reg = ElasticNet(alpha=alpha, l1_ratio=1)
reg.fit(X_train, y_train)
y_predict = reg.predict(X_test)
After adding elastic net regularization, the error in this case study increases, but the number of features with a non-zero contribution is significantly lower.
np.mean(np.square(y_test-y_predict))
np.where(reg.coef_!=0)[0].shape
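To see which features survive the sparsity-inducing penalty, one can list the column names whose coefficients are non-zero (a minimal sketch using the cols_use list defined earlier):
# Names of the features with non-zero coefficients after regularization
surviving_features = [c for c, w in zip(cols_use, reg.coef_) if w != 0]
print(len(surviving_features), surviving_features)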
fig = plt.figure(figsize=(3.3,3.3))
plt.scatter(y_test,y_predict,edgecolors=None,c='b',alpha=0.2)
xmin=0.1; xmax=0.6
diag=np.linspace(xmin, xmax,100)
plt.plot(diag,diag, color='gray')
plt.xlim((xmin,xmax))
plt.ylim((xmin,xmax))
plt.ylabel('$r_{ND}$ (predicted)')
plt.xlabel('$r_{ND}$ (real)')
plt.show()
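The penalty strength alpha controls the trade-off between sparsity and test error. A minimal sketch of a sweep over a few alpha values (the specific values are illustrative, not tuned):
# Sweep the regularization strength and report test error vs. sparsity (illustrative values)
for a in [1e-4, 1e-3, 1e-2, 1e-1]:
    reg_a = ElasticNet(alpha=a, l1_ratio=1)
    reg_a.fit(X_train, y_train)
    mse_a = np.mean(np.square(y_test - reg_a.predict(X_test)))
    n_nonzero = np.sum(reg_a.coef_ != 0)
    print('alpha=%g  MSE=%.4f  non-zero coefficients=%d' % (a, mse_a, n_nonzero))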
rfa_features = ["ligcharge","ox","spin","RACs.mc-Z-0-all","RACs.mc-S-0-all","RACs.f-chi-3-all","RACs.D_mc-Z-1-all","RACs.D_mc-S-1-all","RACs.f-Z-1-all","RACs.f-Z-0-all","RACs.f-chi-1-all","RACs.f-chi-0-all","RACs.f-chi-2-all","RACs.f-Z-2-all","RACs.D_mc-chi-2-all","RACs.mc-chi-2-all","RACs.mc-chi-1-all","RACs.f-S-0-all","RACs.f-S-2-all","RACs.D_mc-chi-1-all","RACs.mc-Z-1-all","RACs.f-S-1-all","RACs.f-S-3-all","RACs.f-Z-3-all","RACs.D_mc-S-2-all","RACs.D_mc-Z-2-all","RACs.f-I-2-all","RACs.f-I-0-all","RACs.mc-Z-2-all","RACs.f-chi-0-eq","RACs.f-I-3-all","RACs.f-T-3-all","RACs.lc-S-3-eq","RACs.mc-chi-3-all","RACs.f-T-0-all","RACs.D_mc-chi-3-all","RACs.D_mc-S-3-all","RACs.D_mc-Z-3-all","RACs.f-I-1-all","RACs.lc-chi-2-eq","RACs.D_lc-S-2-ax","RACs.mc-T-2-all","RACs.D_lc-S-2-eq","RACs.D_lc-chi-2-ax","RACs.D_lc-Z-2-ax","RACs.D_lc-Z-2-eq","RACs.D_lc-chi-2-eq","RACs.f-Z-3-eq","RACs.lc-T-3-eq","RACs.mc-I-3-all","RACs.f-Z-3-ax","RACs.D_lc-T-3-ax","RACs.mc-T-3-all","RACs.mc-chi-0-all","RACs.mc-S-1-all","RACs.f-T-2-all","RACs.f-T-1-all","RACs.mc-Z-3-all","RACs.f-T-3-ax","RACs.mc-S-2-all","RACs.f-Z-1-ax","RACs.mc-S-3-all","RACs.D_mc-T-3-all","RACs.f-Z-2-ax","RACs.f-T-3-eq"]
cols_use, df = return_train_columns(df_features, cols_selected=rfa_features)
X = np.array(df_features[cols_use].values)
y= df_props['rND'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
reg=linear_model.LinearRegression()
reg.fit(X_train, y_train)
y_predict = reg.predict(X_test)
Using the optimal (selected) features improves the result compared to using all features. Mean squared error for the optimal features:
np.mean(np.square(y_test-y_predict))
fig = plt.figure(figsize=(3.3,3.3))
plt.scatter(y_test,y_predict,edgecolors=None,c='b',alpha=0.2)
xmin=0.1; xmax=0.6
diag=np.linspace(xmin, xmax,100)
plt.plot(diag,diag, color='gray')
plt.xlim((xmin,xmax))
plt.ylim((xmin,xmax))
plt.ylabel('$r_{ND}$ (predicted)')
plt.xlabel('$r_{ND}$ (real)')
plt.show()
def prepare_data(df, x_scaler, cols_selected=False):
    # Select the feature columns and apply the pre-fitted feature scaler.
    np.random.seed(0)
    cols_use, df = return_train_columns(df, cols_selected)
    X = np.array(df[cols_use].values)
    X_scaled = x_scaler.transform(X)
    return X_scaled, cols_use

def predict(model_filename, df, x_scaler_filename, y_scaler_filename, y, cols_selected=False):
    # Load the pickled scalers and trained KRR model, then predict on the held-out test set.
    x_scaler = pickle.load(open(x_scaler_filename, 'rb'))
    y_scaler = pickle.load(open(y_scaler_filename, 'rb'))
    krr = pickle.load(open(model_filename, 'rb'))
    X_scaled, cols_use = prepare_data(df, x_scaler, cols_selected=cols_selected)
    X_scaled_train, X_scaled_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=1)
    y_predict_scaled = krr.predict(X_scaled_test)
    y_predict = y_scaler.inverse_transform(y_predict_scaled)[:, 0]  # map predictions back to the original scale
    return y_predict, y_test
modelfile = os.path.join(modelpath,'Data/models/KRR/MD2_rND_krr_model/MD2_rND_krr.pkl')
x_scaler_filename = os.path.join(modelpath,'Data/models/KRR/MD2_rND_krr_model/x_scaler.pkl')
y_scaler_filename = os.path.join(modelpath,'Data/models/KRR/MD2_rND_krr_model/y_scaler.pkl')
# Note: we are not using all features; we use the selected features verified to be optimal
rfa_features = ["ligcharge","ox","spin","RACs.mc-Z-0-all","RACs.mc-S-0-all","RACs.f-chi-3-all","RACs.D_mc-Z-1-all","RACs.D_mc-S-1-all","RACs.f-Z-1-all","RACs.f-Z-0-all","RACs.f-chi-1-all","RACs.f-chi-0-all","RACs.f-chi-2-all","RACs.f-Z-2-all","RACs.D_mc-chi-2-all","RACs.mc-chi-2-all","RACs.mc-chi-1-all","RACs.f-S-0-all","RACs.f-S-2-all","RACs.D_mc-chi-1-all","RACs.mc-Z-1-all","RACs.f-S-1-all","RACs.f-S-3-all","RACs.f-Z-3-all","RACs.D_mc-S-2-all","RACs.D_mc-Z-2-all","RACs.f-I-2-all","RACs.f-I-0-all","RACs.mc-Z-2-all","RACs.f-chi-0-eq","RACs.f-I-3-all","RACs.f-T-3-all","RACs.lc-S-3-eq","RACs.mc-chi-3-all","RACs.f-T-0-all","RACs.D_mc-chi-3-all","RACs.D_mc-S-3-all","RACs.D_mc-Z-3-all","RACs.f-I-1-all","RACs.lc-chi-2-eq","RACs.D_lc-S-2-ax","RACs.mc-T-2-all","RACs.D_lc-S-2-eq","RACs.D_lc-chi-2-ax","RACs.D_lc-Z-2-ax","RACs.D_lc-Z-2-eq","RACs.D_lc-chi-2-eq","RACs.f-Z-3-eq","RACs.lc-T-3-eq","RACs.mc-I-3-all","RACs.f-Z-3-ax","RACs.D_lc-T-3-ax","RACs.mc-T-3-all","RACs.mc-chi-0-all","RACs.mc-S-1-all","RACs.f-T-2-all","RACs.f-T-1-all","RACs.mc-Z-3-all","RACs.f-T-3-ax","RACs.mc-S-2-all","RACs.f-Z-1-ax","RACs.mc-S-3-all","RACs.D_mc-T-3-all","RACs.f-Z-2-ax","RACs.f-T-3-eq"]
y_predict, y_test = predict(modelfile, df_features, x_scaler_filename, y_scaler_filename,y, cols_selected=rfa_features)
Using kernel ridge regression (KRR) improves the result compared to linear regression. Mean squared error for the KRR model:
np.mean(np.square(y_test-y_predict))
The improved result is visible in the parity plot.
plt.scatter(y_test,y_predict,edgecolors=None,c='b',alpha=0.2)
xmin=0.1; xmax=0.6
diag=np.linspace(xmin, xmax,100)
plt.plot(diag,diag, color='gray')
plt.xlim((xmin,xmax))
plt.ylim((xmin,xmax))
plt.ylabel('$r_{ND}$ (predicted)')
plt.xlabel('$r_{ND}$ (real)')
plt.show()
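The KRR model above was loaded from the pre-trained pickle files that ship with the dataset. As a minimal sketch (not the authors' exact training procedure, with illustrative, untuned hyperparameters, and reusing the X_train/X_test split from the optimal-feature section), a similar model could also be fitted from scratch with scikit-learn's KernelRidge and StandardScaler:
# Sketch: fit a KRR model from scratch on scaled features (hyperparameters are assumptions, not the published ones)
x_sc = sklearn.preprocessing.StandardScaler().fit(X_train)
y_sc = sklearn.preprocessing.StandardScaler().fit(y_train.reshape(-1, 1))
krr_scratch = KernelRidge(kernel='rbf', alpha=0.01, gamma=0.01)  # illustrative hyperparameter values
krr_scratch.fit(x_sc.transform(X_train), y_sc.transform(y_train.reshape(-1, 1)))
y_pred_scratch = y_sc.inverse_transform(krr_scratch.predict(x_sc.transform(X_test)))[:, 0]
print(np.mean(np.square(y_test - y_pred_scratch)))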