#!/usr/bin/env python
# coding: utf-8
"""Interpretable modeling of the heart-failure clinical-records dataset.

Notebook-export script that trains and explains three classifiers with
InterpretML:

1. Glassbox logistic regression (on standardized continuous features).
2. Explainable Boosting Machine (EBM), tuned with RandomizedSearchCV.
3. LightGBM (blackbox), tuned with RandomizedSearchCV and explained
   post-hoc with SHAP (KernelSHAP) and LIME.

Requires ``heart_failure_clinical_records_dataset.csv`` in the working
directory.  Running this module executes the full pipeline top to bottom
(it was converted from a Jupyter notebook, so every step runs at import).
"""

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from interpret.glassbox import LogisticRegression
from sklearn.preprocessing import StandardScaler
from interpret import perf
from interpret.blackbox import ShapKernel
from interpret.blackbox import LimeTabular
from interpret import show
from interpret.provider import InlineProvider
from interpret import set_visualize_provider

# Force inline rendering — workaround for cases where the default
# visualization provider does not display
# (https://github.com/interpretml/interpret/issues/259).
set_visualize_provider(InlineProvider())

from interpret.data import ClassHistogram

# Alternative data explainer / static rendering, kept for reference:
# from interpret.data import Marginal
# from interpret import preserve


# --------------------------------------------------------------------------
# Import data
# --------------------------------------------------------------------------
data = pd.read_csv("heart_failure_clinical_records_dataset.csv")
data.head()
data.info()


# --------------------------------------------------------------------------
# Train / test split
# --------------------------------------------------------------------------
# Drop the label and the "time" column from the feature matrix; stratify the
# split so both partitions keep the same DEATH_EVENT class balance.
X = data.drop(["time", "DEATH_EVENT"], axis=1)
y = data["DEATH_EVENT"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)


# --------------------------------------------------------------------------
# Explore the data
# --------------------------------------------------------------------------
# InterpretML also supports data exploration: per-feature histograms split
# by class label.
hist = ClassHistogram().explain_data(X_train, y_train, name="Train Data")
show(hist)


# --------------------------------------------------------------------------
# Logistic Regression (glassbox)
# --------------------------------------------------------------------------
# Standardize only the continuous columns; the remaining features are
# binary indicators and are left untouched.  The scaler is fit on the
# training split only to avoid test-set leakage.
scaler = StandardScaler()
features_to_scale = [
    "age",
    "creatinine_phosphokinase",
    "ejection_fraction",
    "platelets",
    "serum_creatinine",
    "serum_sodium",
]
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
X_train_scaled[features_to_scale] = scaler.fit_transform(
    X_train_scaled[features_to_scale]
)
X_test_scaled[features_to_scale] = scaler.transform(X_test_scaled[features_to_scale])

# NOTE: InterpretML's LogisticRegression currently cannot run inside
# RandomizedSearchCV since it does not implement a 'get_params' method,
# so it is fit once with fixed hyperparameters instead of being tuned.
LR_clf = LogisticRegression(solver="liblinear", class_weight="balanced", penalty="l1")
LR_clf.fit(X_train_scaled, y_train)

# Global explanation: overall feature weights.
lr_global = LR_clf.explain_global()
show(lr_global)

# Local explanation: per-sample contributions for five test rows
# (positional slice 10:15 of the test split).
lr_local = LR_clf.explain_local(X_test_scaled[10:15], y_test[10:15])
show(lr_local)


# --------------------------------------------------------------------------
# EBM — hyperparameter optimization
# --------------------------------------------------------------------------
param_test = {
    "learning_rate": [0.001, 0.005, 0.01, 0.03],
    "interactions": [5, 10, 15],
    "max_interaction_bins": [10, 15, 20],
    "max_rounds": [5000, 10000, 15000, 20000],
    "min_samples_leaf": [2, 3, 5],
    "max_leaves": [3, 5, 10],
}
n_HP_points_to_test = 10

EBM_clf = ExplainableBoostingClassifier(feature_names=X_train.columns)
EBM_gs = RandomizedSearchCV(
    estimator=EBM_clf,
    param_distributions=param_test,
    n_iter=n_HP_points_to_test,
    scoring="roc_auc",
    cv=3,
    refit=True,
    random_state=314,
    verbose=False,
)
EBM_gs.fit(X_train, y_train)

# EBM — model performance (ROC on the held-out test split).
roc = perf.ROC(EBM_gs.best_estimator_.predict_proba, feature_names=X_train.columns)
roc_explanation = roc.explain_perf(X_test, y_test)
show(roc_explanation)

# EBM — global explanation (learned shape functions / importances).
ebm_global = EBM_gs.best_estimator_.explain_global()
show(ebm_global)

# EBM — local explanation for the same five test rows used above.
ebm_local = EBM_gs.best_estimator_.explain_local(X_test[10:15], y_test[10:15])
show(ebm_local)


# --------------------------------------------------------------------------
# LightGBM — hyperparameter optimization
# --------------------------------------------------------------------------
# Note: param_test / n_HP_points_to_test are intentionally rebound here;
# the EBM search above is already finished with them.
param_test = {
    "num_leaves": [20, 30, 40, 50, 60],
    "max_depth": [-1, 5, 10, 15],
    "learning_rate": [0.01, 0.03, 0.05, 0.07],
    "n_estimators": [200, 500, 700, 1000],
    "is_unbalance": [True, False],
}
n_HP_points_to_test = 10

LGBM_clf = LGBMClassifier(random_state=314, n_jobs=-1)
LGBM_gs = RandomizedSearchCV(
    estimator=LGBM_clf,
    param_distributions=param_test,
    n_iter=n_HP_points_to_test,
    scoring="roc_auc",
    cv=3,
    refit=True,
    random_state=314,
    verbose=False,
)
LGBM_gs.fit(X_train, y_train)

# LightGBM — model performance.
roc = perf.ROC(LGBM_gs.best_estimator_.predict_proba, feature_names=X_train.columns)
roc_explanation = roc.explain_perf(X_test, y_test)
show(roc_explanation)


# --------------------------------------------------------------------------
# LightGBM — local explanations (post-hoc, blackbox)
# --------------------------------------------------------------------------
# Use best_estimator_.predict_proba explicitly for consistency with the
# perf.ROC call above (with refit=True the search object delegates to the
# refit best estimator anyway, so behavior is unchanged).
shap = ShapKernel(predict_fn=LGBM_gs.best_estimator_.predict_proba, data=X_train)
shap_local = shap.explain_local(X_test[10:15], y_test[10:15])
show(shap_local)

lime = LimeTabular(predict_fn=LGBM_gs.best_estimator_.predict_proba, data=X_train)
lime_local = lime.explain_local(X_test[10:15], y_test[10:15])
show(lime_local)