TODO: clean up this exploratory SHAP notebook and move the verified logic into tested module code.
import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor, plot_tree
import test_tree_shap
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
get_ipython().magic("load_ext autoreload")
get_ipython().magic("autoreload 2")
data, tree = test_tree_shap.toy_tree()
import tree_shap
# Sanity checks for naive_tree_shap: it returns the expected tree prediction
# conditioned on knowing only the listed feature values (feature index -> value).
assert tree_shap.naive_tree_shap(tree, current_node=0, features={0: 150}) == 20
assert tree_shap.naive_tree_shap(tree, current_node=0, features={1: 75}) == 27
# given feature 0, having feature 1 doesn't make a difference
assert (
tree_shap.naive_tree_shap(tree, current_node=0, features={0: 150, 1: 75})
== 20
)
import itertools
features_tuple = ((0, 150), (1, 75), (2, 200))
basis = data['y'].mean()
basis
23.0
TODO: rename the variables in the loop below so the Shapley-value computation is easier to read.
# Brute-force Shapley values: for every ordering of the features, add them to
# the coalition one at a time and credit each feature with the marginal change
# in the conditional expectation v(S) = naive_tree_shap(tree, features=S).
# Averaging the per-permutation phis over all orderings (done in a later cell)
# gives the Shapley values.
all_phis = []
for permutation in itertools.permutations(features_tuple):
    # Start from the empty-coalition value (mean prediction).
    phis = {'basis': basis}
    for position, (feature_id, _feature_value) in enumerate(permutation):
        # v(S) for the coalition of the first `position + 1` features.
        coalition_value = tree_shap.naive_tree_shap(
            tree, current_node=0, features=dict(permutation[:position + 1])
        )
        # sum(phis.values()) telescopes to v(S without this feature),
        # so the difference is this feature's marginal contribution.
        phis[feature_id] = coalition_value - sum(phis.values())
    all_phis.append(phis)
all_phis
pd.DataFrame(all_phis).mean(axis=0)
basis 23.0 0 -5.0 1 2.0 2 0.0 dtype: float64
from collections import OrderedDict
OrderedDict(features_tuple)[:2]
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-43-dafde3bde0d2> in <module> ----> 1 OrderedDict(features_tuple)[:2] TypeError: unhashable type: 'slice'
list(itertools.permutations(OrderedDict(features_tuple)))
permutation
((2, 200), (1, 75), (0, 150))
phis
permutation
((2, 200), (1, 75), (0, 150))
np.array(all_phis)[:, 1:]
array([[-3., 0., 0.], [-3., 0., 0.], [ 4., -7., 0.], [ 4., 0., -7.], [ 0., -3., 0.], [ 0., 4., -7.]])
np.array(all_phis)[:, 1:].mean(axis=0)
array([ 0.33333333, -1. , -2.33333333])
itertools.subpermutation
(0, 1, 2)
tree.feature
array([ 0, 1, -2, -2, 0, -2, -2])
tree.tree_.value
array([[[15.]], [[40.]], [[50.]], [[30.]], [[10.]], [[20.]], [[10.]]])
tree.decision_path([[150, 75, 200]]).todense()
matrix([[1, 0, 0, 0, 1, 1, 0]])
tree.tree_.children_left
array([ 1, 2, -1, -1, 5, -1, -1])
tree.tree_.children_right
array([ 4, 3, -1, -1, 6, -1, -1])
tree.tree_.
4
tree.tree_.threshold
array([100., 300., -2., -2., 200., -2., -2.])
_ = plot_tree(tree, filled=True, proportion=False)
Explain the prediction for the example [x=150, y=75, z=200]
import sklearn
tree.tree_.threshold
array([100., 300., -2., -2., 200., -2., -2.])
tree.tree_.feature
array([ 0, 1, -2, -2, 0, -2, -2])
tree.tree_.children_left
array([ 1, 2, -1, -1, 5, -1, -1])
tree.tree_.children_right
array([ 4, 3, -1, -1, 6, -1, -1])
tree.tree_.children_left
array([ 1, 2, -1, -1, 5, -1, -1])
from collections import OrderedDict
features = [[1, 75], [2, 200], [0, 150]]
features[:2]
tree.tree_.n_node_samples
array([10, 4, 2, 2, 6, 1, 5])
# NOTE(review): unfinished stub — the `if` below is missing a colon and a body,
# so this cell does not run as-is. It was presumably meant to walk the tree via
# tree.tree_.feature / children_left / children_right — TODO confirm or delete.
def decision_path(i):
"""
i: index of features
"""
if i == tree.tree_.feature[0]
Permutation order: x first, then y, then z
# Hand-computed marginal contributions for the ordering x, y, z.
# 20 is the tree's prediction for [150, 75, 200] (see tree.predict below);
# phi_null is presumably the empty-coalition base value (mean prediction, 23)
# from an earlier cell — TODO confirm.
phi_x = 20 - phi_null
# Once x is known, y and z add nothing: the prediction stays 20,
# so the remaining contributions telescope to zero.
phi_y = 20 - phi_x - phi_null
phi_z = 20 - phi_y - phi_x - phi_null
phi_z
0.0
Permutation order: y first, then z, then x
# Hand-computed contributions for the ordering y, z, x: v({y}) is the
# sample-weighted expectation over leaves reachable when only y=75 is known
# (node sample counts 4/10 and 6/10, then 1/6 and 5/6 within the right subtree).
phi_y = (4 / 10) * 50 + (6 / 10) * (1 / 6 * 20 + 5 / 6 * 10) - phi_null
phi_y
4.0
# Adding z after y changes nothing for this example, so its marginal is zero.
phi_z = 0
# x gets whatever is left to reach the prediction of 20.
phi_x = 20 - phi_y - phi_null
phi_x
-7.0
tree.predict([[150, 75, 200]])
array([20.])
# Cross-check against the reference implementation: shap's TreeExplainer
# (tree_path_dependent mode, since no background data is passed) should
# reproduce the hand-computed permutation averages — basis 23, phis [-5, 2, 0].
X_test = pd.DataFrame({'x': [150], 'y': [75], 'z': [200]})
explainer = shap.TreeExplainer(tree)
shap_values = explainer.shap_values(X_test)
shap_values
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
array([[-5., 2., 0.]])