import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier as skDecisionTreeClassifier
class TreeNode():
def __init__(self):
self.left_child = -1
self.right_child = -1
self.feature = None
self.threshold = None
self.impurity = None
self.n_node = None
self.value = None
class DecisionTreeClassifier():
def __init__(self, max_depth=2):
self.max_depth = max_depth
def _gini(self, y_cnt):
prob = y_cnt / np.sum(y_cnt)
return 1 - np.sum(np.square(prob))
def _build_tree(self, X, y, cur_depth, parent, is_left):
if cur_depth == self.max_depth:
cur_node = TreeNode()
cur_node.n_node = X.shape[0]
cur_node.value = np.bincount(y, minlength=self.n_classes_)
cur_node.impurity = self._gini(cur_node.value)
cur_id = len(self.tree_)
self.tree_.append(cur_node)
if parent is not None:
if is_left:
self.tree_[parent].left_child = cur_id
else:
self.tree_[parent].right_child = cur_id
return
best_improvement = -np.inf
best_feature = None
best_threshold = None
best_left_ind = None
best_right_ind = None
y_cnt = np.bincount(y, minlength=self.n_classes_)
for i in range(X.shape[1]):
ind = np.argsort(X[:, i])
y_cnt_left = np.bincount([], minlength=self.n_classes_)
y_cnt_right = y_cnt.copy()
n_left = 0
n_right = X.shape[0]
for j in range(ind.shape[0] - 1):
y_cnt_left[y[ind[j]]] += 1
y_cnt_right[y[ind[j]]] -= 1
n_left += 1
n_right -= 1
if j + 1 < ind.shape[0] - 1 and np.isclose(X[ind[j], i], X[ind[j + 1], i]):
continue
cur_improvement = -n_left * self._gini(y_cnt_left) - n_right * self._gini(y_cnt_right)
if cur_improvement > best_improvement:
best_improvement = cur_improvement
best_feature = i
best_threshold = X[ind[j], i]
best_left_ind = ind[:j + 1]
best_right_ind = ind[j + 1:]
cur_node = TreeNode()
cur_node.feature = best_feature
cur_node.threshold = best_threshold
cur_node.n_node = X.shape[0]
cur_node.value = y_cnt
cur_node.impurity = self._gini(y_cnt)
cur_id = len(self.tree_)
self.tree_.append(cur_node)
if parent is not None:
if is_left:
self.tree_[parent].left_child = cur_id
else:
self.tree_[parent].right_child = cur_id
if cur_depth < self.max_depth:
self._build_tree(X[best_left_ind], y[best_left_ind], cur_depth + 1, cur_id, True)
self._build_tree(X[best_right_ind], y[best_right_ind], cur_depth + 1, cur_id, False)
def fit(self, X, y):
self.n_features = X.shape[1]
self.classes_, y_train = np.unique(y, return_inverse=True)
self.n_classes_ = len(self.classes_)
self.tree_ = []
self._build_tree(X, y_train, 0, None, None)
return self
def apply(self, X):
pred = np.zeros(X.shape[0], dtype=np.int)
for i in range(X.shape[0]):
cur_node = 0
while self.tree_[cur_node].left_child != -1:
if X[i][self.tree_[cur_node].feature] <= self.tree_[cur_node].threshold:
cur_node = self.tree_[cur_node].left_child
else:
cur_node = self.tree_[cur_node].right_child
pred[i] = cur_node
return pred
def predict_proba(self, X):
pred = self.apply(X)
prob = np.array([self.tree_[p].value for p in pred])
return prob / np.sum(prob, axis=1)[:, np.newaxis]
def predict(self, X):
pred = self.apply(X)
return np.array([self.classes_[np.argmax(self.tree_[p].value)] for p in pred])
@property
def feature_importances_(self):
importances = np.zeros(self.n_features)
for node in self.tree_:
if node.left_child != -1:
left_child = self.tree_[node.left_child]
right_child = self.tree_[node.right_child]
importances[node.feature] += (node.n_node * node.impurity
- left_child.n_node * left_child.impurity
- right_child.n_node * right_child.impurity)
return importances / np.sum(importances)
X, y = load_breast_cancer(return_X_y=True)
clf1 = DecisionTreeClassifier(max_depth=1).fit(X, y)
clf2 = skDecisionTreeClassifier(max_depth=1, random_state=0).fit(X, y)
assert np.array_equal([node.left_child for node in clf1.tree_],
clf2.tree_.children_left)
assert np.array_equal([node.right_child for node in clf1.tree_],
clf2.tree_.children_right)
clf1_leaf = [node.left_child != -1 for node in clf1.tree_]
clf2_leaf = clf2.tree_.children_left != -1
assert np.array_equal(clf1_leaf, clf2_leaf)
assert np.array_equal(np.array([node.feature for node in clf1.tree_])[clf1_leaf],
clf2.tree_.feature[clf2_leaf])
# threshold is slightly different
# because scikit-learn users the average value between current point and the next point
assert np.allclose([node.impurity for node in clf1.tree_],
clf2.tree_.impurity)
assert np.array_equal([node.n_node for node in clf1.tree_],
clf2.tree_.n_node_samples)
assert np.allclose(np.array([node.value for node in clf1.tree_]),
clf2.tree_.value[:, 0, :])
assert np.allclose(clf1.feature_importances_, clf2.feature_importances_)
pred1 = clf1.apply(X)
pred2 = clf2.apply(X)
assert np.array_equal(pred1, pred2)
prob1 = clf1.predict_proba(X)
prob2 = clf2.predict_proba(X)
assert np.allclose(prob1, prob2)
pred1 = clf1.predict(X)
pred2 = clf2.predict(X)
assert np.array_equal(pred1, pred2)
class DecisionTreeClassifier():
def __init__(self, max_depth=2):
self.max_depth = max_depth
def _entropy(self, y_cnt):
prob = y_cnt / np.sum(y_cnt)
prob = prob[prob > 0]
return -np.sum(prob * np.log2(prob))
def _build_tree(self, X, y, cur_depth, parent, is_left):
if cur_depth == self.max_depth:
cur_node = TreeNode()
cur_node.n_node = X.shape[0]
cur_node.value = np.bincount(y, minlength=self.n_classes_)
cur_node.impurity = self._entropy(cur_node.value)
cur_id = len(self.tree_)
self.tree_.append(cur_node)
if parent is not None:
if is_left:
self.tree_[parent].left_child = cur_id
else:
self.tree_[parent].right_child = cur_id
return
best_improvement = -np.inf
best_feature = None
best_threshold = None
best_left_ind = None
best_right_ind = None
y_cnt = np.bincount(y, minlength=self.n_classes_)
#for i in range(X.shape[1]):
for i in range(22, 23):
ind = np.argsort(X[:, i])
y_cnt_left = np.bincount([], minlength=self.n_classes_)
y_cnt_right = y_cnt.copy()
n_left = 0
n_right = X.shape[0]
for j in range(ind.shape[0] - 1):
y_cnt_left[y[ind[j]]] += 1
y_cnt_right[y[ind[j]]] -= 1
n_left += 1
n_right -= 1
if j + 1 < ind.shape[0] - 1 and np.isclose(X[ind[j], i], X[ind[j + 1], i]):
continue
cur_improvement = -n_left * self._entropy(y_cnt_left) - n_right * self._entropy(y_cnt_right)
if cur_improvement > best_improvement:
best_improvement = cur_improvement
best_feature = i
best_threshold = X[ind[j], i]
best_left_ind = ind[:j + 1]
best_right_ind = ind[j + 1:]
cur_node = TreeNode()
cur_node.feature = best_feature
cur_node.threshold = best_threshold
cur_node.n_node = X.shape[0]
cur_node.value = y_cnt
cur_node.impurity = self._entropy(y_cnt)
cur_id = len(self.tree_)
self.tree_.append(cur_node)
if parent is not None:
if is_left:
self.tree_[parent].left_child = cur_id
else:
self.tree_[parent].right_child = cur_id
if cur_depth < self.max_depth:
self._build_tree(X[best_left_ind], y[best_left_ind], cur_depth + 1, cur_id, True)
self._build_tree(X[best_right_ind], y[best_right_ind], cur_depth + 1, cur_id, False)
def fit(self, X, y):
self.n_features = X.shape[1]
self.classes_, y_train = np.unique(y, return_inverse=True)
self.n_classes_ = len(self.classes_)
self.tree_ = []
self._build_tree(X, y_train, 0, None, None)
return self
def apply(self, X):
pred = np.zeros(X.shape[0], dtype=np.int)
for i in range(X.shape[0]):
cur_node = 0
while self.tree_[cur_node].left_child != -1:
if X[i][self.tree_[cur_node].feature] <= self.tree_[cur_node].threshold:
cur_node = self.tree_[cur_node].left_child
else:
cur_node = self.tree_[cur_node].right_child
pred[i] = cur_node
return pred
def predict_proba(self, X):
pred = self.apply(X)
prob = np.array([self.tree_[p].value for p in pred])
return prob / np.sum(prob, axis=1)[:, np.newaxis]
def predict(self, X):
pred = self.apply(X)
return np.array([self.classes_[np.argmax(self.tree_[p].value)] for p in pred])
@property
def feature_importances_(self):
importances = np.zeros(self.n_features)
for node in self.tree_:
if node.left_child != -1:
left_child = self.tree_[node.left_child]
right_child = self.tree_[node.right_child]
importances[node.feature] += (node.n_node * node.impurity
- left_child.n_node * left_child.impurity
- right_child.n_node * right_child.impurity)
return importances / np.sum(importances)
X, y = load_breast_cancer(return_X_y=True)
clf1 = DecisionTreeClassifier(max_depth=1).fit(X, y)
clf2 = skDecisionTreeClassifier(criterion="entropy", max_depth=1, random_state=0).fit(X, y)
assert np.array_equal([node.left_child for node in clf1.tree_],
clf2.tree_.children_left)
assert np.array_equal([node.right_child for node in clf1.tree_],
clf2.tree_.children_right)
clf1_leaf = [node.left_child != -1 for node in clf1.tree_]
clf2_leaf = clf2.tree_.children_left != -1
assert np.array_equal(clf1_leaf, clf2_leaf)
assert np.array_equal(np.array([node.feature for node in clf1.tree_])[clf1_leaf],
clf2.tree_.feature[clf2_leaf])
# threshold is slightly different
# because scikit-learn users the average value between current point and the next point
assert np.allclose([node.impurity for node in clf1.tree_],
clf2.tree_.impurity)
assert np.array_equal([node.n_node for node in clf1.tree_],
clf2.tree_.n_node_samples)
assert np.allclose(np.array([node.value for node in clf1.tree_]),
clf2.tree_.value[:, 0, :])
assert np.allclose(clf1.feature_importances_, clf2.feature_importances_)
pred1 = clf1.apply(X)
pred2 = clf2.apply(X)
assert np.array_equal(pred1, pred2)
prob1 = clf1.predict_proba(X)
prob2 = clf2.predict_proba(X)
assert np.allclose(prob1, prob2)
pred1 = clf1.predict(X)
pred2 = clf2.predict(X)
assert np.array_equal(pred1, pred2)