TODO:
import itertools
import pickle
import math
import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor, plot_tree
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
get_ipython().magic("load_ext autoreload")
get_ipython().magic("autoreload 2")
with open('./tree_model.pkl', 'rb') as opened:
tree_model = pickle.load(opened)
Trying to unpickle estimator DecisionTreeRegressor from version 1.0.1 when using version 0.23.2. This might lead to breaking code or invalid results. Use at your own risk.
tree_model
From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.
DecisionTreeRegressor(criterion='mae', max_depth=2, presort=None, random_state=100)
tree = tree_model.tree_
import tree_shap
tree.predict(np.array([[150, 75, 200]], dtype=np.float32))
array([[20.]])
x = [150, 75, 200]
assert tree_shap.exp_value(x, S={}, tree=tree) == 23
assert tree_shap.exp_value(x, S={0}, tree=tree) == 20
assert tree_shap.exp_value(x, S={1}, tree=tree) == 27
assert tree_shap.exp_value(x, S={0, 1}, tree=tree) == 20
x2 = np.array([100, 25, 400])
shap.TreeExplainer?
explainer = shap.TreeExplainer(tree_model)
explainer.shap_values(x2)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-11-7ef351b65faf> in <module> 1 explainer = shap.TreeExplainer(tree_model) ----> 2 explainer.shap_values(x2) NameError: name 'x2' is not defined
x = x2
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-12-cb64638f1b76> in <module> ----> 1 x = x2 NameError: name 'x2' is not defined
# ϕ0 = tree_shap.exp_value(x, {}, tree)
# ϕs = []
preds = {}
for card in range(0, len(x) + 1):
for perm in itertools.permutations(range(len(x)), r=card):
preds[tuple(perm)] = tree_shap.exp_value(x, S=set(perm), tree=tree)
preds
{(): 23.0, (0,): 40.0, (1,): 27.0, (2,): 23.0, (0, 1): 50.0, (0, 2): 40.0, (1, 0): 50.0, (1, 2): 27.0, (2, 0): 40.0, (2, 1): 27.0, (0, 1, 2): 50.0, (0, 2, 1): 50.0, (1, 0, 2): 50.0, (1, 2, 0): 50.0, (2, 0, 1): 50.0, (2, 1, 0): 50.0}
deltas = [0] * len(x)
for perm in itertools.permutations(range(len(x)), r=len(x)):
for index, i in enumerate(perm):
deltas[i] += preds[perm[:index+1]] - preds[perm[:index]]
np.array(deltas) / math.factorial(len(x))
array([20., 7., 0.])
ϕ0 = preds[()]
ϕs = []
for i in range(len(x)):
delta = 0
for key in preds:
if i in key:
delta += preds[key] - preds[tuple(_ for _ in key if _ != i)]
# ϕs.append(delta / math.factorial(len(x)))
ϕs.append(delta)
ϕ0
23.0
ϕs
[-65.0, 12.0, 0.0]
features_tuple = ((0, 150), (1, 75), (2, 200))
# follows the calculation at https://medium.com/analytics-vidhya/shap-part-3-tree-shap-3af9bcd7cd9b closely
basis = tree_shap.exp_value(x, {}, tree)
all_phis = []
for perm in itertools.permutations(features_tuple):
phis = {"basis": basis}
for i in range(len(perm)):
phi_raw = tree_shap.exp_value(x, S={_[0] for _ in perm[: i + 1]}, tree=tree)
phi = phi_raw - sum(phis.values())
phis[perm[i][0]] = phi
all_phis.append(phis)
perm
((2, 200), (1, 75), (0, 150))
{_[0] for _ in perm}
{0, 1, 2}
all_phis
[{'basis': 23.0, 0: -3.0, 1: 0.0, 2: 0.0}, {'basis': 23.0, 0: -3.0, 2: 0.0, 1: 0.0}, {'basis': 23.0, 1: 4.0, 0: -7.0, 2: 0.0}, {'basis': 23.0, 1: 4.0, 2: 0.0, 0: -7.0}, {'basis': 23.0, 2: 0.0, 0: -3.0, 1: 0.0}, {'basis': 23.0, 2: 0.0, 1: 4.0, 0: -7.0}]
pd.DataFrame(all_phis)
basis | 0 | 1 | 2 | |
---|---|---|---|---|
0 | 23.0 | -3.0 | 0.0 | 0.0 |
1 | 23.0 | -3.0 | 0.0 | 0.0 |
2 | 23.0 | -7.0 | 4.0 | 0.0 |
3 | 23.0 | -7.0 | 4.0 | 0.0 |
4 | 23.0 | -3.0 | 0.0 | 0.0 |
5 | 23.0 | -7.0 | 4.0 | 0.0 |
pd.DataFrame(all_phis).mean()
basis 23.0 0 -5.0 1 2.0 2 0.0 dtype: float64
math.factorial(3)
6
preds = {}
for card in range(len(x) + 1): # cardinality of S
for perm in itertools.permutations(range(len(x)), r=card):
S = set(perm)
preds[S] = tree_shap.exp_value(x, S, tree)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-41-181e585d6128> in <module> 4 for perm in itertools.permutations(range(len(x)), r=card): 5 S = set(perm) ----> 6 preds[S] = tree_shap.exp_value(x, S, tree) TypeError: unhashable type: 'set'
pd.DataFrame(all_phis).mean(axis=0)
basis 23.0 0 -1.0 1 -1.0 2 -1.0 dtype: float64
all_phis = []
for permutation in itertools.permutations(features_tuple):
phis = {'basis': basis}
for i in range(len(permutation)):
phi_raw = tree_shap.naive_tree_shap(tree, current_node=0, features=dict(permutation[:i + 1]))
phi = phi_raw - sum(phis.values())
phis[permutation[i][0]] = phi
all_phis.append(phis)
pd.DataFrame(all_phis).mean(axis=0)
basis 23.0 0 -5.0 1 2.0 2 0.0 dtype: float64
tree_shap.exp_value(x, S={1, 2}, tree=tree)
27.0
tree_shap.naive_tree_shap(
tree,
current_node=0,
features={0: 150},
)
20.0
assert (
tree_shap.naive_tree_shap(
tree,
current_node=0,
features={0: 150},
)
== 20
)
assert (
tree_shap.naive_tree_shap(
tree,
current_node=0,
features={1: 75},
)
== 27
)
# given feature 0, having feature 1 doesn't make a difference
assert (
tree_shap.naive_tree_shap(
tree,
current_node=0,
features={0: 150, 1: 75},
)
== 20
)
X_test = pd.DataFrame(
{
"x0": [150],
"x1": [75],
"x2": [200],
}
)
tree.n_node_samples
array([10, 4, 2, 2, 6, 1, 5], dtype=int64)
tree.children_left
array([ 1, 2, -1, -1, 5, -1, -1], dtype=int64)
tree.children_right
array([ 4, 3, -1, -1, 6, -1, -1], dtype=int64)
tree.feature
array([ 0, 1, -2, -2, 0, -2, -2], dtype=int64)
tree.threshold
array([100., 300., -2., -2., 200., -2., -2.])
tree_model.predict(X_test)
array([20.])
import sklearn.tree
tree.value?
tree.value.shape
(7, 1, 1)
tree.value
array([[[15.]], [[40.]], [[50.]], [[30.]], [[10.]], [[20.]], [[10.]]])
tree.n_node_samples
array([10, 4, 2, 2, 6, 1, 5], dtype=int64)
tree.n_outputs
1
tree_model.feature_names_in_
array(['x0', 'x1', 'x2'], dtype=object)
tree_model.tree_.feature
array([ 0, 1, -2, -2, 0, -2, -2], dtype=int64)
tree.max_n_classes
1
tree_model.max_features_
3
tree.predict(np.array([[150, 75, 200]], dtype=np.float32))
array([[20.]])
tree.children_left?
tree.feature
array([ 0, 1, -2, -2, 0, -2, -2], dtype=int64)
tree.node_count
7
import tree_shap
assert (
tree_shap.naive_tree_shap(
tree,
current_node=0,
features={0: 150},
)
== 20
)
assert (
tree_shap.naive_tree_shap(
tree,
current_node=0,
features={1: 75},
)
== 27
)
# given feature 0, having feature 1 doesn't make a difference
assert (
tree_shap.naive_tree_shap(
tree,
current_node=0,
features={0: 150, 1: 75},
)
== 20
)
import itertools
features_tuple = ((0, 150), (1, 75), (2, 200))
Need to rename the variables here to make it more readable
all_phis = []
for permutation in itertools.permutations(features_tuple):
phis = {'basis': basis}
for i in range(len(permutation)):
phi_raw = tree_shap.naive_tree_shap(tree, current_node=0, features=dict(permutation[:i + 1]))
phi = phi_raw - sum(phis.values())
phis[permutation[i][0]] = phi
all_phis.append(phis)
all_phis
pd.DataFrame(all_phis).mean(axis=0)
basis 23.0 0 -5.0 1 2.0 2 0.0 dtype: float64
from collections import OrderedDict
OrderedDict(features_tuple)[:2]
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-43-dafde3bde0d2> in <module> ----> 1 OrderedDict(features_tuple)[:2] TypeError: unhashable type: 'slice'
list(itertools.permutations(OrderedDict(features_tuple)))
permutation
((2, 200), (1, 75), (0, 150))
phis
permutation
((2, 200), (1, 75), (0, 150))
np.array(all_phis)[:, 1:]
array([[-3., 0., 0.], [-3., 0., 0.], [ 4., -7., 0.], [ 4., 0., -7.], [ 0., -3., 0.], [ 0., 4., -7.]])
np.array(all_phis)[:, 1:].mean(axis=0)
array([ 0.33333333, -1. , -2.33333333])
itertools.subpermutation
(0, 1, 2)
tree.feature
array([ 0, 1, -2, -2, 0, -2, -2])
tree.tree_.value
array([[[15.]], [[40.]], [[50.]], [[30.]], [[10.]], [[20.]], [[10.]]])
tree.decision_path([[150, 75, 200]]).todense()
matrix([[1, 0, 0, 0, 1, 1, 0]])
tree.tree_.children_left
array([ 1, 2, -1, -1, 5, -1, -1])
tree.tree_.children_right
array([ 4, 3, -1, -1, 6, -1, -1])
tree.tree_.
4
tree.tree_.threshold
array([100., 300., -2., -2., 200., -2., -2.])
_ = plot_tree(tree, filled=True, proportion=False)
Explain for the example [x=150, y=75, z=200]
import sklearn
tree.tree_.threshold
array([100., 300., -2., -2., 200., -2., -2.])
tree.tree_.feature
array([ 0, 1, -2, -2, 0, -2, -2])
tree.tree_.children_left
array([ 1, 2, -1, -1, 5, -1, -1])
tree.tree_.children_right
array([ 4, 3, -1, -1, 6, -1, -1])
tree.tree_.children_left
array([ 1, 2, -1, -1, 5, -1, -1])
from collections import OrderedDict
features = [[1, 75], [2, 200], [0, 150]]
features[:2]
tree.tree_.n_node_samples
array([10, 4, 2, 2, 6, 1, 5])
def decision_path(i):
"""
i: index of features
"""
if i == tree.tree_.feature[0]
x > y > z
phi_x = 20 - phi_null
phi_y = 20 - phi_x - phi_null
phi_z = 20 - phi_y - phi_x - phi_null
phi_z
0.0
y > z > x
phi_y = (4 / 10) * 50 + (6 / 10) * (1 / 6 * 20 + 5 / 6 * 10) - phi_null
phi_y
4.0
phi_z = 0
phi_x = 20 - phi_y - phi_null
phi_x
-7.0
tree.predict([[150, 75, 200]])
array([20.])
X_test = pd.DataFrame({'x': [150], 'y': [75], 'z': [200]})
explainer = shap.TreeExplainer(tree)
shap_values = explainer.shap_values(X_test)
shap_values
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
array([[-5., 2., 0.]])