Notebook

In [1]:

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier as skDecisionTreeClassifier

Implementation 1¶

similat to scikit-learn criterion="gini"

In [2]:

class TreeNode():
    def __init__(self):
        self.left_child = -1
        self.right_child = -1
        self.feature = None
        self.threshold = None
        self.impurity = None
        self.n_node = None
        self.value = None

In [3]:

class DecisionTreeClassifier():
    def __init__(self, max_depth=2):
        self.max_depth = max_depth

    def _gini(self, y_cnt):
        prob = y_cnt / np.sum(y_cnt)
        return 1 - np.sum(np.square(prob))

    def _build_tree(self, X, y, cur_depth, parent, is_left):
        if cur_depth == self.max_depth:
            cur_node = TreeNode()
            cur_node.n_node = X.shape[0]
            cur_node.value = np.bincount(y, minlength=self.n_classes_)
            cur_node.impurity = self._gini(cur_node.value)
            cur_id = len(self.tree_)
            self.tree_.append(cur_node)
            if parent is not None:
                if is_left:
                    self.tree_[parent].left_child = cur_id
                else:
                    self.tree_[parent].right_child = cur_id
            return
        best_improvement = -np.inf
        best_feature = None
        best_threshold = None
        best_left_ind = None
        best_right_ind = None
        y_cnt = np.bincount(y, minlength=self.n_classes_)
        for i in range(X.shape[1]):
            ind = np.argsort(X[:, i])
            y_cnt_left = np.bincount([], minlength=self.n_classes_)
            y_cnt_right = y_cnt.copy()
            n_left = 0
            n_right = X.shape[0]
            for j in range(ind.shape[0] - 1):
                y_cnt_left[y[ind[j]]] += 1
                y_cnt_right[y[ind[j]]] -= 1
                n_left += 1
                n_right -= 1
                if j + 1 < ind.shape[0] - 1 and np.isclose(X[ind[j], i], X[ind[j + 1], i]):
                    continue
                cur_improvement = -n_left * self._gini(y_cnt_left) - n_right * self._gini(y_cnt_right)
                if cur_improvement > best_improvement:
                    best_improvement = cur_improvement
                    best_feature = i
                    best_threshold = X[ind[j], i]
                    best_left_ind = ind[:j + 1]
                    best_right_ind = ind[j + 1:]
        cur_node = TreeNode()
        cur_node.feature = best_feature
        cur_node.threshold = best_threshold
        cur_node.n_node = X.shape[0]
        cur_node.value = y_cnt
        cur_node.impurity = self._gini(y_cnt)
        cur_id = len(self.tree_)
        self.tree_.append(cur_node)
        if parent is not None:
            if is_left:
                self.tree_[parent].left_child = cur_id
            else:
                self.tree_[parent].right_child = cur_id
        if cur_depth < self.max_depth:
            self._build_tree(X[best_left_ind], y[best_left_ind], cur_depth + 1, cur_id, True)
            self._build_tree(X[best_right_ind], y[best_right_ind], cur_depth + 1, cur_id, False)

    def fit(self, X, y):
        self.n_features = X.shape[1]
        self.classes_, y_train = np.unique(y, return_inverse=True)
        self.n_classes_ = len(self.classes_)
        self.tree_ = []
        self._build_tree(X, y_train, 0, None, None)
        return self

    def apply(self, X):
        pred = np.zeros(X.shape[0], dtype=np.int)
        for i in range(X.shape[0]):
            cur_node = 0
            while self.tree_[cur_node].left_child != -1:
                if X[i][self.tree_[cur_node].feature] <= self.tree_[cur_node].threshold:
                    cur_node = self.tree_[cur_node].left_child
                else:
                    cur_node = self.tree_[cur_node].right_child
            pred[i] = cur_node
        return pred

    def predict_proba(self, X):
        pred = self.apply(X)
        prob = np.array([self.tree_[p].value for p in pred])
        return prob / np.sum(prob, axis=1)[:, np.newaxis]

    def predict(self, X):
        pred = self.apply(X)
        return np.array([self.classes_[np.argmax(self.tree_[p].value)] for p in pred])

    @property
    def feature_importances_(self):
        importances = np.zeros(self.n_features)
        for node in self.tree_:
            if node.left_child != -1:
                left_child = self.tree_[node.left_child]
                right_child = self.tree_[node.right_child]
                importances[node.feature] += (node.n_node * node.impurity
                                              - left_child.n_node * left_child.impurity
                                              - right_child.n_node * right_child.impurity)
        return importances / np.sum(importances)

In [4]:

X, y = load_breast_cancer(return_X_y=True)
clf1 = DecisionTreeClassifier(max_depth=1).fit(X, y)
clf2 = skDecisionTreeClassifier(max_depth=1, random_state=0).fit(X, y)
assert np.array_equal([node.left_child for node in clf1.tree_],
                      clf2.tree_.children_left)
assert np.array_equal([node.right_child for node in clf1.tree_],
                      clf2.tree_.children_right)
clf1_leaf = [node.left_child != -1 for node in clf1.tree_]
clf2_leaf = clf2.tree_.children_left != -1
assert np.array_equal(clf1_leaf, clf2_leaf)
assert np.array_equal(np.array([node.feature for node in clf1.tree_])[clf1_leaf],
                      clf2.tree_.feature[clf2_leaf])
# threshold is slightly different
# because scikit-learn users the average value between current point and the next point
assert np.allclose([node.impurity for node in clf1.tree_],
                    clf2.tree_.impurity)
assert np.array_equal([node.n_node for node in clf1.tree_],
                      clf2.tree_.n_node_samples)
assert np.allclose(np.array([node.value for node in clf1.tree_]),
                   clf2.tree_.value[:, 0, :])
assert np.allclose(clf1.feature_importances_, clf2.feature_importances_)
pred1 = clf1.apply(X)
pred2 = clf2.apply(X)
assert np.array_equal(pred1, pred2)
prob1 = clf1.predict_proba(X)
prob2 = clf2.predict_proba(X)
assert np.allclose(prob1, prob2)
pred1 = clf1.predict(X)
pred2 = clf2.predict(X)
assert np.array_equal(pred1, pred2)

Implementation 2¶

similat to scikit-learn criterion="entropy"

In [5]:

class DecisionTreeClassifier():
    def __init__(self, max_depth=2):
        self.max_depth = max_depth

    def _entropy(self, y_cnt):
        prob = y_cnt / np.sum(y_cnt)
        prob = prob[prob > 0]
        return -np.sum(prob * np.log2(prob))

    def _build_tree(self, X, y, cur_depth, parent, is_left):
        if cur_depth == self.max_depth:
            cur_node = TreeNode()
            cur_node.n_node = X.shape[0]
            cur_node.value = np.bincount(y, minlength=self.n_classes_)
            cur_node.impurity = self._entropy(cur_node.value)
            cur_id = len(self.tree_)
            self.tree_.append(cur_node)
            if parent is not None:
                if is_left:
                    self.tree_[parent].left_child = cur_id
                else:
                    self.tree_[parent].right_child = cur_id
            return
        best_improvement = -np.inf
        best_feature = None
        best_threshold = None
        best_left_ind = None
        best_right_ind = None
        y_cnt = np.bincount(y, minlength=self.n_classes_)
        #for i in range(X.shape[1]):
        for i in range(22, 23):
            ind = np.argsort(X[:, i])
            y_cnt_left = np.bincount([], minlength=self.n_classes_)
            y_cnt_right = y_cnt.copy()
            n_left = 0
            n_right = X.shape[0]
            for j in range(ind.shape[0] - 1):
                y_cnt_left[y[ind[j]]] += 1
                y_cnt_right[y[ind[j]]] -= 1
                n_left += 1
                n_right -= 1
                if j + 1 < ind.shape[0] - 1 and np.isclose(X[ind[j], i], X[ind[j + 1], i]):
                    continue
                cur_improvement = -n_left * self._entropy(y_cnt_left) - n_right * self._entropy(y_cnt_right)
                if cur_improvement > best_improvement:
                    best_improvement = cur_improvement
                    best_feature = i
                    best_threshold = X[ind[j], i]
                    best_left_ind = ind[:j + 1]
                    best_right_ind = ind[j + 1:]
        cur_node = TreeNode()
        cur_node.feature = best_feature
        cur_node.threshold = best_threshold
        cur_node.n_node = X.shape[0]
        cur_node.value = y_cnt
        cur_node.impurity = self._entropy(y_cnt)
        cur_id = len(self.tree_)
        self.tree_.append(cur_node)
        if parent is not None:
            if is_left:
                self.tree_[parent].left_child = cur_id
            else:
                self.tree_[parent].right_child = cur_id
        if cur_depth < self.max_depth:
            self._build_tree(X[best_left_ind], y[best_left_ind], cur_depth + 1, cur_id, True)
            self._build_tree(X[best_right_ind], y[best_right_ind], cur_depth + 1, cur_id, False)

    def fit(self, X, y):
        self.n_features = X.shape[1]
        self.classes_, y_train = np.unique(y, return_inverse=True)
        self.n_classes_ = len(self.classes_)
        self.tree_ = []
        self._build_tree(X, y_train, 0, None, None)
        return self

    def apply(self, X):
        pred = np.zeros(X.shape[0], dtype=np.int)
        for i in range(X.shape[0]):
            cur_node = 0
            while self.tree_[cur_node].left_child != -1:
                if X[i][self.tree_[cur_node].feature] <= self.tree_[cur_node].threshold:
                    cur_node = self.tree_[cur_node].left_child
                else:
                    cur_node = self.tree_[cur_node].right_child
            pred[i] = cur_node
        return pred

    def predict_proba(self, X):
        pred = self.apply(X)
        prob = np.array([self.tree_[p].value for p in pred])
        return prob / np.sum(prob, axis=1)[:, np.newaxis]

    def predict(self, X):
        pred = self.apply(X)
        return np.array([self.classes_[np.argmax(self.tree_[p].value)] for p in pred])

    @property
    def feature_importances_(self):
        importances = np.zeros(self.n_features)
        for node in self.tree_:
            if node.left_child != -1:
                left_child = self.tree_[node.left_child]
                right_child = self.tree_[node.right_child]
                importances[node.feature] += (node.n_node * node.impurity
                                              - left_child.n_node * left_child.impurity
                                              - right_child.n_node * right_child.impurity)
        return importances / np.sum(importances)

In [6]:

X, y = load_breast_cancer(return_X_y=True)
clf1 = DecisionTreeClassifier(max_depth=1).fit(X, y)
clf2 = skDecisionTreeClassifier(criterion="entropy", max_depth=1, random_state=0).fit(X, y)
assert np.array_equal([node.left_child for node in clf1.tree_],
                      clf2.tree_.children_left)
assert np.array_equal([node.right_child for node in clf1.tree_],
                      clf2.tree_.children_right)
clf1_leaf = [node.left_child != -1 for node in clf1.tree_]
clf2_leaf = clf2.tree_.children_left != -1
assert np.array_equal(clf1_leaf, clf2_leaf)
assert np.array_equal(np.array([node.feature for node in clf1.tree_])[clf1_leaf],
                      clf2.tree_.feature[clf2_leaf])
# threshold is slightly different
# because scikit-learn users the average value between current point and the next point
assert np.allclose([node.impurity for node in clf1.tree_],
                    clf2.tree_.impurity)
assert np.array_equal([node.n_node for node in clf1.tree_],
                      clf2.tree_.n_node_samples)
assert np.allclose(np.array([node.value for node in clf1.tree_]),
                   clf2.tree_.value[:, 0, :])
assert np.allclose(clf1.feature_importances_, clf2.feature_importances_)
pred1 = clf1.apply(X)
pred2 = clf2.apply(X)
assert np.array_equal(pred1, pred2)
prob1 = clf1.predict_proba(X)
prob2 = clf2.predict_proba(X)
assert np.allclose(prob1, prob2)
pred1 = clf1.predict(X)
pred2 = clf2.predict(X)
assert np.array_equal(pred1, pred2)