This notebook contains the code for the accompanying blogpost titled "Interpretable or Accurate? Why not both?"
Interpret is supported across Windows, Mac and Linux on Python 3.5+. Please refer to the documentation for more details.
pip install interpret
conda install -c interpretml interpret
git clone https://github.com/interpretml/interpret.git && cd interpret/scripts && make install
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from interpret import show
from interpret import set_visualize_provider
from interpret.provider import InlineProvider
from interpret.data import ClassHistogram
set_visualize_provider(InlineProvider())
from interpret.glassbox import (
LogisticRegression,
ClassificationTree,
ExplainableBoostingClassifier,
)
# Fixed seed so every train/test split and model below is reproducible.
seed = 42

# Load the IBM HR Employee Attrition dataset and preview the first rows.
csv_path = "WA_Fn-UseC_-HR-Employee-Attrition.csv"
df = pd.read_csv(csv_path)
df.head()
Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 35 columns
# Encode the target variable (Attrition): 'Yes' -> 1, 'No' -> 0.
target_map = {'Yes': 1, 'No': 0}
# Series.map is the idiomatic (and vectorized) way to apply a dict lookup
# element-wise; note an unexpected label would become NaN instead of
# raising a KeyError mid-apply — this dataset contains only Yes/No.
target = df["Attrition"].map(target_map)
print(target[:10])
0 1 1 0 2 1 3 0 4 0 5 0 6 0 7 0 8 0 9 0 Name: Attrition, dtype: int64
# Delete columns that are not useful for the predictions: constant or
# identifier columns, plus the raw target column itself.
useless_cols = ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours', 'Attrition']
df.drop(useless_cols, axis="columns", inplace=True)

# Hold out 20% as a test set, stratified on the target so both splits
# preserve the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    df, target, test_size=0.2, random_state=seed, stratify=target
)
# Visualize the per-class distribution of each training feature.
hist = ClassHistogram().explain_data(X_train, y_train, name='Train Data')
show(hist)

# Fit an Explainable Boosting Machine; the extra inner/outer bagging
# rounds smooth the learned shape functions at the cost of training time.
ebm = ExplainableBoostingClassifier(
    random_state=seed,
    n_jobs=-1,
    inner_bags=100,
    outer_bags=100,
)
ebm.fit(X_train, y_train)
ExplainableBoostingClassifier(feature_names=['Age', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike', 'Perfor... 'categorical', 'continuous', 'categorical', 'continuous', 'continuous', 'continuous', 'categorical', 'continuous', 'categorical', 'continuous', 'continuous', 'continuous', 'categorical', 'continuous', 'continuous', 'continuous', 'continuous', 'continuous', 'continuous', 'continuous', 'continuous', 'continuous', 'continuous', 'continuous', ...], inner_bags=100, n_jobs=-1, outer_bags=100)
Global Explanations help us gain a better understanding of the model's overall behavior — what the model learnt across the entire dataset.
# Global explanation: the EBM's overall term importances and the
# per-feature shape functions it learned.
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)
Local Explanations help us understand the reasons behind individual predictions — how and why an individual prediction was made.
# Local explanations for the first five held-out employees: the
# per-feature contributions to each individual prediction.
ebm_local = ebm.explain_local(X_test[:5], y_test[:5], name='EBM')
show(ebm_local)

from interpret.perf import ROC

# ROC-based performance summary of the EBM on the test set.
roc_explainer = ROC(ebm.predict_proba)
ebm_perf = roc_explainer.explain_perf(X_test, y_test, name='EBM')
show(ebm_perf)
# We have to one-hot encode categorical variables to use Logistic
# Regression and the Decision Tree.
X_enc = pd.get_dummies(df, prefix_sep='.')
feature_names = list(X_enc.columns)

# Re-split with the SAME seed AND stratification as the EBM split (L43-style)
# so every model is evaluated on an identically-drawn test set.
# (Previously stratify=target was omitted here, so the glassbox models
# were scored on a different test sample than the EBM.)
X_train_enc, X_test_enc, y_train, y_test = train_test_split(
    X_enc, target, test_size=0.20, random_state=seed, stratify=target
)

# L1-regularized logistic regression; liblinear is one of the solvers
# that supports the l1 penalty.
lr = LogisticRegression(random_state=seed, feature_names=feature_names, penalty='l1', solver='liblinear')
lr.fit(X_train_enc, y_train)

# A single decision tree as a second glassbox baseline.
tree = ClassificationTree()
tree.fit(X_train_enc, y_train)
<interpret.glassbox.decisiontree.ClassificationTree at 0x7fc4adbc37b8>
# Score both glassbox baselines with ROC explainers and render the three
# performance dashboards for side-by-side comparison.
lr_perf = ROC(lr.predict_proba).explain_perf(X_test_enc, y_test, name='Logistic Regression')
tree_perf = ROC(tree.predict_proba).explain_perf(X_test_enc, y_test, name='Classification Tree')
for perf_explanation in (lr_perf, tree_perf, ebm_perf):
    show(perf_explanation)
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Blackbox system can include preprocessing, not just a classifier!
pca = PCA()
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)

# One-hot encode and split exactly as for the glassbox models. Adding
# stratify=target (previously omitted here) makes this split identical to
# the stratified ones, so all models are compared on the same test rows.
X_enc = pd.get_dummies(df, prefix_sep='.')
feature_names = list(X_enc.columns)
X_train_enc, X_test_enc, y_train, y_test = train_test_split(
    X_enc, target, test_size=0.20, random_state=seed, stratify=target
)

# PCA -> random forest; the whole pipeline is treated as one blackbox.
blackbox_model = Pipeline([('pca', pca), ('rf', rf)])
blackbox_model.fit(X_train_enc, y_train)
Pipeline(steps=[('pca', PCA()), ('rf', RandomForestClassifier(n_jobs=-1))])
from interpret import show
from interpret.perf import ROC

# ROC performance of the full blackbox pipeline on the held-out data.
blackbox_roc = ROC(blackbox_model.predict_proba)
blackbox_perf = blackbox_roc.explain_perf(X_test_enc, y_test, name='Blackbox')
show(blackbox_perf)
from interpret.blackbox import LimeTabular
from interpret import show

# Blackbox explainers need a predict function, and optionally a dataset.
lime = LimeTabular(predict_fn=blackbox_model.predict_proba, data=X_train_enc, random_state=1)

# Pick the instances to explain; labels are optional but are displayed
# in the dashboard when provided.
instances, labels = X_test_enc[:5], y_test[:5]
lime_local = lime.explain_local(instances, labels, name='LIME')
show(lime_local)
from interpret.blackbox import PartialDependence

# Partial dependence: the average model response as each feature is
# varied, marginalizing over the training data.
pdp = PartialDependence(predict_fn=blackbox_model.predict_proba, data=X_train_enc)
pdp_global = pdp.explain_global(name='Partial Dependence')
show(pdp_global)