import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from interpret.glassbox import LogisticRegression
from sklearn.preprocessing import StandardScaler
from interpret import perf
from interpret.blackbox import ShapKernel
from interpret.blackbox import LimeTabular
from interpret import show
from interpret.provider import InlineProvider
from interpret import set_visualize_provider
set_visualize_provider(InlineProvider())
from interpret.data import ClassHistogram
When visualization doesn't work (see https://github.com/interpretml/interpret/issues/259), use the fallback imports below:
# from interpret.data import Marginal
# from interpret import preserve
# Load the heart-failure clinical records dataset (299 rows, 13 columns per
# the info() output below) and preview the first rows.
data = pd.read_csv("heart_failure_clinical_records_dataset.csv")
data.head()
age | anaemia | creatinine_phosphokinase | diabetes | ejection_fraction | high_blood_pressure | platelets | serum_creatinine | serum_sodium | sex | smoking | time | DEATH_EVENT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 75.0 | 0 | 582 | 0 | 20 | 1 | 265000.00 | 1.9 | 130 | 1 | 0 | 4 | 1 |
1 | 55.0 | 0 | 7861 | 0 | 38 | 0 | 263358.03 | 1.1 | 136 | 1 | 0 | 6 | 1 |
2 | 65.0 | 0 | 146 | 0 | 20 | 0 | 162000.00 | 1.3 | 129 | 1 | 1 | 7 | 1 |
3 | 50.0 | 1 | 111 | 0 | 20 | 0 | 210000.00 | 1.9 | 137 | 1 | 0 | 7 | 1 |
4 | 65.0 | 1 | 160 | 1 | 20 | 0 | 327000.00 | 2.7 | 116 | 0 | 0 | 8 | 1 |
# Dtypes and null counts; every column is fully populated (299 non-null).
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 299 entries, 0 to 298 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 299 non-null float64 1 anaemia 299 non-null int64 2 creatinine_phosphokinase 299 non-null int64 3 diabetes 299 non-null int64 4 ejection_fraction 299 non-null int64 5 high_blood_pressure 299 non-null int64 6 platelets 299 non-null float64 7 serum_creatinine 299 non-null float64 8 serum_sodium 299 non-null int64 9 sex 299 non-null int64 10 smoking 299 non-null int64 11 time 299 non-null int64 12 DEATH_EVENT 299 non-null int64 dtypes: float64(3), int64(10) memory usage: 30.5 KB
# Target is DEATH_EVENT; "time" (follow-up period) is dropped together with
# the label so the model cannot lean on an outcome-correlated feature.
y = data["DEATH_EVENT"]
X = data.drop(columns=["time", "DEATH_EVENT"])

# Stratified split keeps the class ratio identical across train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)
InterpretML also supports data exploration:
# Interactive class-conditional histograms of the training features.
hist = ClassHistogram().explain_data(X_train, y_train, name="Train Data")
show(hist)
# Standardize only the continuous columns; the 0/1 indicator features
# (anaemia, diabetes, sex, ...) are left untouched.
features_to_scale = [
    "age",
    "creatinine_phosphokinase",
    "ejection_fraction",
    "platelets",
    "serum_creatinine",
    "serum_sodium",
]
scaler = StandardScaler()

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Fit the scaler on the training split only, then reuse it on the test split
# so no test-set statistics leak into training.
X_train_scaled[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])
Note: InterpretML's LogisticRegression currently cannot be used with RandomizedSearchCV, since it does not implement the 'get_params' method that scikit-learn's search utilities require.
# Glassbox logistic regression (interpret.glassbox wrapper).
# L1 penalty with the liblinear solver; class_weight="balanced" compensates
# for the minority positive (DEATH_EVENT) class. Trained on the scaled data.
LR_clf = LogisticRegression(solver="liblinear", class_weight="balanced", penalty="l1")
LR_clf.fit(X_train_scaled, y_train)
<interpret.glassbox.linear.LogisticRegression at 0x7efe9b091970>
# Global explanation of the fitted model (overall per-feature contributions).
lr_global = LR_clf.explain_global()
show(lr_global)
# Local explanations for five held-out samples (scaled, to match training).
lr_local = LR_clf.explain_local(X_test_scaled[10:15], y_test[10:15])
show(lr_local)
# Randomized hyper-parameter search for the Explainable Boosting Machine,
# optimizing 3-fold cross-validated ROC-AUC over 10 sampled configurations.
ebm_search_space = {
    "learning_rate": [0.001, 0.005, 0.01, 0.03],
    "interactions": [5, 10, 15],
    "max_interaction_bins": [10, 15, 20],
    "max_rounds": [5000, 10000, 15000, 20000],
    "min_samples_leaf": [2, 3, 5],
    "max_leaves": [3, 5, 10],
}

EBM_gs = RandomizedSearchCV(
    estimator=ExplainableBoostingClassifier(feature_names=X_train.columns),
    param_distributions=ebm_search_space,
    n_iter=10,  # number of hyper-parameter combinations to try
    scoring="roc_auc",
    cv=3,
    refit=True,  # retrain on the full training set with the best params
    random_state=314,
    verbose=False,
)
EBM_gs.fit(X_train, y_train)
RandomizedSearchCV(cv=3, estimator=ExplainableBoostingClassifier(feature_names=Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes', 'ejection_fraction', 'high_blood_pressure', 'platelets', 'serum_creatinine', 'serum_sodium', 'sex', 'smoking'], dtype='object')), param_distributions={'interactions': [5, 10, 15], 'learning_rate': [0.001, 0.005, 0.01, 0.03], 'max_interaction_bins': [10, 15, 20], 'max_leaves': [3, 5, 10], 'max_rounds': [5000, 10000, 15000, 20000], 'min_samples_leaf': [2, 3, 5]}, random_state=314, scoring='roc_auc', verbose=False)
# ROC curve of the refit best EBM on the held-out test set.
ebm_roc = perf.ROC(EBM_gs.best_estimator_.predict_proba, feature_names=X_train.columns)
show(ebm_roc.explain_perf(X_test, y_test))
best_ebm = EBM_gs.best_estimator_

# Global explanation of the tuned EBM (per-term contributions).
show(best_ebm.explain_global())

# Local explanations for the same five test samples as above.
show(best_ebm.explain_local(X_test[10:15], y_test[10:15]))
# Randomized hyper-parameter search for LightGBM with the same protocol as
# the EBM search above: 10 sampled configurations, 3-fold CV, ROC-AUC scoring.
lgbm_search_space = {
    "num_leaves": [20, 30, 40, 50, 60],
    "max_depth": [-1, 5, 10, 15],
    "learning_rate": [0.01, 0.03, 0.05, 0.07],
    "n_estimators": [200, 500, 700, 1000],
    "is_unbalance": [True, False],  # handles the skewed DEATH_EVENT classes
}

LGBM_gs = RandomizedSearchCV(
    estimator=LGBMClassifier(random_state=314, n_jobs=-1),
    param_distributions=lgbm_search_space,
    n_iter=10,
    scoring="roc_auc",
    cv=3,
    refit=True,
    random_state=314,
    verbose=False,
)
LGBM_gs.fit(X_train, y_train)
RandomizedSearchCV(cv=3, estimator=LGBMClassifier(random_state=314), param_distributions={'is_unbalance': [True, False], 'learning_rate': [0.01, 0.03, 0.05, 0.07], 'max_depth': [-1, 5, 10, 15], 'n_estimators': [200, 500, 700, 1000], 'num_leaves': [20, 30, 40, 50, 60]}, random_state=314, scoring='roc_auc', verbose=False)
# ROC curve of the refit best LightGBM model on the held-out test set.
lgbm_roc = perf.ROC(LGBM_gs.best_estimator_.predict_proba, feature_names=X_train.columns)
show(lgbm_roc.explain_perf(X_test, y_test))
# Kernel SHAP explanations for the black-box LightGBM model.
# Fix: use the refit best estimator's predict_proba directly — consistent
# with the perf.ROC calls — instead of going through the RandomizedSearchCV
# wrapper (equivalent here, but clearer and avoids the delegation layer).
# NOTE(review): X_train is passed whole as SHAP background data, which
# triggers the "200 background samples" slowness warning below; consider
# shap.kmeans(X_train, K) to summarize it if runtime matters.
shap = ShapKernel(predict_fn=LGBM_gs.best_estimator_.predict_proba, data=X_train)
shap_local = shap.explain_local(X_test[10:15], y_test[10:15])
show(shap_local)
Using 200 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
0%| | 0/5 [00:00<?, ?it/s]
# LIME explanations for the same five test samples.
# Fix: use the refit best estimator's predict_proba directly — consistent
# with the perf.ROC calls — instead of the RandomizedSearchCV wrapper.
lime = LimeTabular(predict_fn=LGBM_gs.best_estimator_.predict_proba, data=X_train)
lime_local = lime.explain_local(X_test[10:15], y_test[10:15])
show(lime_local)