import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from interpret.glassbox import LogisticRegression
from sklearn.preprocessing import StandardScaler
from interpret import perf
from interpret.blackbox import ShapKernel
from interpret.blackbox import LimeTabular
from interpret import show
from interpret.provider import InlineProvider
from interpret import set_visualize_provider
set_visualize_provider(InlineProvider())
from interpret.data import ClassHistogram
When visualization doesn't work (see https://github.com/interpretml/interpret/issues/259), use the fallback imports below:
# from interpret.data import Marginal
# from interpret import preserve
# Load the heart-failure clinical records dataset (299 rows, 13 columns per
# the info() output below) and preview the first rows.
data = pd.read_csv("heart_failure_clinical_records_dataset.csv")
data.head()
age | anaemia | creatinine_phosphokinase | diabetes | ejection_fraction | high_blood_pressure | platelets | serum_creatinine | serum_sodium | sex | smoking | time | DEATH_EVENT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 75.0 | 0 | 582 | 0 | 20 | 1 | 265000.00 | 1.9 | 130 | 1 | 0 | 4 | 1 |
1 | 55.0 | 0 | 7861 | 0 | 38 | 0 | 263358.03 | 1.1 | 136 | 1 | 0 | 6 | 1 |
2 | 65.0 | 0 | 146 | 0 | 20 | 0 | 162000.00 | 1.3 | 129 | 1 | 1 | 7 | 1 |
3 | 50.0 | 1 | 111 | 0 | 20 | 0 | 210000.00 | 1.9 | 137 | 1 | 0 | 7 | 1 |
4 | 65.0 | 1 | 160 | 1 | 20 | 0 | 327000.00 | 2.7 | 116 | 0 | 0 | 8 | 1 |
# Dtypes and null counts; every column is fully populated (299 non-null).
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 299 entries, 0 to 298 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 299 non-null float64 1 anaemia 299 non-null int64 2 creatinine_phosphokinase 299 non-null int64 3 diabetes 299 non-null int64 4 ejection_fraction 299 non-null int64 5 high_blood_pressure 299 non-null int64 6 platelets 299 non-null float64 7 serum_creatinine 299 non-null float64 8 serum_sodium 299 non-null int64 9 sex 299 non-null int64 10 smoking 299 non-null int64 11 time 299 non-null int64 12 DEATH_EVENT 299 non-null int64 dtypes: float64(3), int64(10) memory usage: 30.5 KB
# Target is DEATH_EVENT; "time" (follow-up period) is dropped together with
# the label so the model cannot lean on an outcome-correlated feature.
y = data["DEATH_EVENT"]
X = data.drop(columns=["time", "DEATH_EVENT"])

# Stratified split keeps the class ratio identical across train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)
InterpretML also supports data exploration:
# Interactive class-conditional histograms of the training features.
hist = ClassHistogram().explain_data(X_train, y_train, name="Train Data")
show(hist)
# Standardize only the continuous columns; the 0/1 indicator features
# (anaemia, diabetes, sex, ...) are left untouched.
features_to_scale = [
    "age",
    "creatinine_phosphokinase",
    "ejection_fraction",
    "platelets",
    "serum_creatinine",
    "serum_sodium",
]
scaler = StandardScaler()

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Fit the scaler on the training split only, then reuse it on the test split
# so no test-set statistics leak into training.
X_train_scaled[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])
Note: InterpretML's LogisticRegression currently cannot be used with RandomizedSearchCV, since it does not implement the 'get_params' method that scikit-learn's search utilities require.
# Glassbox logistic regression (interpret.glassbox wrapper).
# L1 penalty with the liblinear solver; class_weight="balanced" compensates
# for the minority positive (DEATH_EVENT) class. Trained on the scaled data.
LR_clf = LogisticRegression(solver="liblinear", class_weight="balanced", penalty="l1")
LR_clf.fit(X_train_scaled, y_train)
<interpret.glassbox.linear.LogisticRegression at 0x7efe9b091970>
# Global explanation of the fitted model (overall per-feature contributions).
lr_global = LR_clf.explain_global()
show(lr_global)
# Local explanations for five held-out samples (scaled, to match training).
lr_local = LR_clf.explain_local(X_test_scaled[10:15], y_test[10:15])
show(lr_local)
# Randomized hyper-parameter search for the Explainable Boosting Machine,
# optimizing 3-fold cross-validated ROC-AUC over 10 sampled configurations.
ebm_search_space = {
    "learning_rate": [0.001, 0.005, 0.01, 0.03],
    "interactions": [5, 10, 15],
    "max_interaction_bins": [10, 15, 20],
    "max_rounds": [5000, 10000, 15000, 20000],
    "min_samples_leaf": [2, 3, 5],
    "max_leaves": [3, 5, 10],
}

EBM_gs = RandomizedSearchCV(
    estimator=ExplainableBoostingClassifier(feature_names=X_train.columns),
    param_distributions=ebm_search_space,
    n_iter=10,  # number of hyper-parameter combinations to try
    scoring="roc_auc",
    cv=3,
    refit=True,  # retrain on the full training set with the best params
    random_state=314,
    verbose=False,
)
EBM_gs.fit(X_train, y_train)
RandomizedSearchCV(cv=3, estimator=ExplainableBoostingClassifier(feature_names=Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes', 'ejection_fraction', 'high_blood_pressure', 'platelets', 'serum_creatinine', 'serum_sodium', 'sex', 'smoking'], dtype='object')), param_distributions={'interactions': [5, 10, 15], 'learning_rate': [0.001, 0.005, 0.01, 0.03], 'max_interaction_bins': [10, 15, 20], 'max_leaves': [3, 5, 10], 'max_rounds': [5000, 10000, 15000, 20000], 'min_samples_leaf': [2, 3, 5]}, random_state=314, scoring='roc_auc', verbose=False)
# ROC curve of the refit best EBM on the held-out test set.
ebm_roc = perf.ROC(EBM_gs.best_estimator_.predict_proba, feature_names=X_train.columns)
show(ebm_roc.explain_perf(X_test, y_test))
best_ebm = EBM_gs.best_estimator_

# Global explanation of the tuned EBM (per-term contributions).
show(best_ebm.explain_global())

# Local explanations for the same five test samples as above.
show(best_ebm.explain_local(X_test[10:15], y_test[10:15]))
# Randomized hyper-parameter search for LightGBM with the same protocol as
# the EBM search above: 10 sampled configurations, 3-fold CV, ROC-AUC scoring.
lgbm_search_space = {
    "num_leaves": [20, 30, 40, 50, 60],
    "max_depth": [-1, 5, 10, 15],
    "learning_rate": [0.01, 0.03, 0.05, 0.07],
    "n_estimators": [200, 500, 700, 1000],
    "is_unbalance": [True, False],  # handles the skewed DEATH_EVENT classes
}

LGBM_gs = RandomizedSearchCV(
    estimator=LGBMClassifier(random_state=314, n_jobs=-1),
    param_distributions=lgbm_search_space,
    n_iter=10,
    scoring="roc_auc",
    cv=3,
    refit=True,
    random_state=314,
    verbose=False,
)
LGBM_gs.fit(X_train, y_train)
RandomizedSearchCV(cv=3, estimator=LGBMClassifier(random_state=314), param_distributions={'is_unbalance': [True, False], 'learning_rate': [0.01, 0.03, 0.05, 0.07], 'max_depth': [-1, 5, 10, 15], 'n_estimators': [200, 500, 700, 1000], 'num_leaves': [20, 30, 40, 50, 60]}, random_state=314, scoring='roc_auc', verbose=False)
# ROC curve of the refit best LightGBM model on the held-out test set.
lgbm_roc = perf.ROC(LGBM_gs.best_estimator_.predict_proba, feature_names=X_train.columns)
show(lgbm_roc.explain_perf(X_test, y_test))
# Kernel SHAP explanations for the black-box LightGBM model.
# Fix: use the refit best estimator's predict_proba directly — consistent
# with the perf.ROC calls — instead of going through the RandomizedSearchCV
# wrapper (equivalent here, but clearer and avoids the delegation layer).
# NOTE(review): X_train is passed whole as SHAP background data, which
# triggers the "200 background samples" slowness warning below; consider
# shap.kmeans(X_train, K) to summarize it if runtime matters.
shap = ShapKernel(predict_fn=LGBM_gs.best_estimator_.predict_proba, data=X_train)
shap_local = shap.explain_local(X_test[10:15], y_test[10:15])
show(shap_local)
Using 200 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
0%| | 0/5 [00:00<?, ?it/s]
# LIME explanations for the same five test samples.
# Fix: use the refit best estimator's predict_proba directly — consistent
# with the perf.ROC calls — instead of the RandomizedSearchCV wrapper.
lime = LimeTabular(predict_fn=LGBM_gs.best_estimator_.predict_proba, data=X_train)
lime_local = lime.explain_local(X_test[10:15], y_test[10:15])
show(lime_local)