#!/usr/bin/env python # coding: utf-8 # In[1]: import numpy as np from sklearn.datasets import load_breast_cancer from sklearn.tree import DecisionTreeClassifier as skDecisionTreeClassifier # ### Implementation 1 # - similat to scikit-learn criterion="gini" # In[2]: class TreeNode(): def __init__(self): self.left_child = -1 self.right_child = -1 self.feature = None self.threshold = None self.impurity = None self.n_node = None self.value = None # In[3]: class DecisionTreeClassifier(): def __init__(self, max_depth=2): self.max_depth = max_depth def _gini(self, y_cnt): prob = y_cnt / np.sum(y_cnt) return 1 - np.sum(np.square(prob)) def _build_tree(self, X, y, cur_depth, parent, is_left): if cur_depth == self.max_depth: cur_node = TreeNode() cur_node.n_node = X.shape[0] cur_node.value = np.bincount(y, minlength=self.n_classes_) cur_node.impurity = self._gini(cur_node.value) cur_id = len(self.tree_) self.tree_.append(cur_node) if parent is not None: if is_left: self.tree_[parent].left_child = cur_id else: self.tree_[parent].right_child = cur_id return best_improvement = -np.inf best_feature = None best_threshold = None best_left_ind = None best_right_ind = None y_cnt = np.bincount(y, minlength=self.n_classes_) for i in range(X.shape[1]): ind = np.argsort(X[:, i]) y_cnt_left = np.bincount([], minlength=self.n_classes_) y_cnt_right = y_cnt.copy() n_left = 0 n_right = X.shape[0] for j in range(ind.shape[0] - 1): y_cnt_left[y[ind[j]]] += 1 y_cnt_right[y[ind[j]]] -= 1 n_left += 1 n_right -= 1 if j + 1 < ind.shape[0] - 1 and np.isclose(X[ind[j], i], X[ind[j + 1], i]): continue cur_improvement = -n_left * self._gini(y_cnt_left) - n_right * self._gini(y_cnt_right) if cur_improvement > best_improvement: best_improvement = cur_improvement best_feature = i best_threshold = X[ind[j], i] best_left_ind = ind[:j + 1] best_right_ind = ind[j + 1:] cur_node = TreeNode() cur_node.feature = best_feature cur_node.threshold = best_threshold cur_node.n_node = X.shape[0] cur_node.value = y_cnt cur_node.impurity = self._gini(y_cnt) cur_id = len(self.tree_) self.tree_.append(cur_node) if parent is not None: if is_left: self.tree_[parent].left_child = cur_id else: self.tree_[parent].right_child = cur_id if cur_depth < self.max_depth: self._build_tree(X[best_left_ind], y[best_left_ind], cur_depth + 1, cur_id, True) self._build_tree(X[best_right_ind], y[best_right_ind], cur_depth + 1, cur_id, False) def fit(self, X, y): self.n_features = X.shape[1] self.classes_, y_train = np.unique(y, return_inverse=True) self.n_classes_ = len(self.classes_) self.tree_ = [] self._build_tree(X, y_train, 0, None, None) return self def apply(self, X): pred = np.zeros(X.shape[0], dtype=np.int) for i in range(X.shape[0]): cur_node = 0 while self.tree_[cur_node].left_child != -1: if X[i][self.tree_[cur_node].feature] <= self.tree_[cur_node].threshold: cur_node = self.tree_[cur_node].left_child else: cur_node = self.tree_[cur_node].right_child pred[i] = cur_node return pred def predict_proba(self, X): pred = self.apply(X) prob = np.array([self.tree_[p].value for p in pred]) return prob / np.sum(prob, axis=1)[:, np.newaxis] def predict(self, X): pred = self.apply(X) return np.array([self.classes_[np.argmax(self.tree_[p].value)] for p in pred]) @property def feature_importances_(self): importances = np.zeros(self.n_features) for node in self.tree_: if node.left_child != -1: left_child = self.tree_[node.left_child] right_child = self.tree_[node.right_child] importances[node.feature] += (node.n_node * node.impurity - left_child.n_node * left_child.impurity - right_child.n_node * right_child.impurity) return importances / np.sum(importances) # In[4]: X, y = load_breast_cancer(return_X_y=True) clf1 = DecisionTreeClassifier(max_depth=1).fit(X, y) clf2 = skDecisionTreeClassifier(max_depth=1, random_state=0).fit(X, y) assert np.array_equal([node.left_child for node in clf1.tree_], clf2.tree_.children_left) assert np.array_equal([node.right_child for node in clf1.tree_], clf2.tree_.children_right) clf1_leaf = [node.left_child != -1 for node in clf1.tree_] clf2_leaf = clf2.tree_.children_left != -1 assert np.array_equal(clf1_leaf, clf2_leaf) assert np.array_equal(np.array([node.feature for node in clf1.tree_])[clf1_leaf], clf2.tree_.feature[clf2_leaf]) # threshold is slightly different # because scikit-learn users the average value between current point and the next point assert np.allclose([node.impurity for node in clf1.tree_], clf2.tree_.impurity) assert np.array_equal([node.n_node for node in clf1.tree_], clf2.tree_.n_node_samples) assert np.allclose(np.array([node.value for node in clf1.tree_]), clf2.tree_.value[:, 0, :]) assert np.allclose(clf1.feature_importances_, clf2.feature_importances_) pred1 = clf1.apply(X) pred2 = clf2.apply(X) assert np.array_equal(pred1, pred2) prob1 = clf1.predict_proba(X) prob2 = clf2.predict_proba(X) assert np.allclose(prob1, prob2) pred1 = clf1.predict(X) pred2 = clf2.predict(X) assert np.array_equal(pred1, pred2) # ### Implementation 2 # - similat to scikit-learn criterion="entropy" # In[5]: class DecisionTreeClassifier(): def __init__(self, max_depth=2): self.max_depth = max_depth def _entropy(self, y_cnt): prob = y_cnt / np.sum(y_cnt) prob = prob[prob > 0] return -np.sum(prob * np.log2(prob)) def _build_tree(self, X, y, cur_depth, parent, is_left): if cur_depth == self.max_depth: cur_node = TreeNode() cur_node.n_node = X.shape[0] cur_node.value = np.bincount(y, minlength=self.n_classes_) cur_node.impurity = self._entropy(cur_node.value) cur_id = len(self.tree_) self.tree_.append(cur_node) if parent is not None: if is_left: self.tree_[parent].left_child = cur_id else: self.tree_[parent].right_child = cur_id return best_improvement = -np.inf best_feature = None best_threshold = None best_left_ind = None best_right_ind = None y_cnt = np.bincount(y, minlength=self.n_classes_) #for i in range(X.shape[1]): for i in range(22, 23): ind = np.argsort(X[:, i]) y_cnt_left = np.bincount([], minlength=self.n_classes_) y_cnt_right = y_cnt.copy() n_left = 0 n_right = X.shape[0] for j in range(ind.shape[0] - 1): y_cnt_left[y[ind[j]]] += 1 y_cnt_right[y[ind[j]]] -= 1 n_left += 1 n_right -= 1 if j + 1 < ind.shape[0] - 1 and np.isclose(X[ind[j], i], X[ind[j + 1], i]): continue cur_improvement = -n_left * self._entropy(y_cnt_left) - n_right * self._entropy(y_cnt_right) if cur_improvement > best_improvement: best_improvement = cur_improvement best_feature = i best_threshold = X[ind[j], i] best_left_ind = ind[:j + 1] best_right_ind = ind[j + 1:] cur_node = TreeNode() cur_node.feature = best_feature cur_node.threshold = best_threshold cur_node.n_node = X.shape[0] cur_node.value = y_cnt cur_node.impurity = self._entropy(y_cnt) cur_id = len(self.tree_) self.tree_.append(cur_node) if parent is not None: if is_left: self.tree_[parent].left_child = cur_id else: self.tree_[parent].right_child = cur_id if cur_depth < self.max_depth: self._build_tree(X[best_left_ind], y[best_left_ind], cur_depth + 1, cur_id, True) self._build_tree(X[best_right_ind], y[best_right_ind], cur_depth + 1, cur_id, False) def fit(self, X, y): self.n_features = X.shape[1] self.classes_, y_train = np.unique(y, return_inverse=True) self.n_classes_ = len(self.classes_) self.tree_ = [] self._build_tree(X, y_train, 0, None, None) return self def apply(self, X): pred = np.zeros(X.shape[0], dtype=np.int) for i in range(X.shape[0]): cur_node = 0 while self.tree_[cur_node].left_child != -1: if X[i][self.tree_[cur_node].feature] <= self.tree_[cur_node].threshold: cur_node = self.tree_[cur_node].left_child else: cur_node = self.tree_[cur_node].right_child pred[i] = cur_node return pred def predict_proba(self, X): pred = self.apply(X) prob = np.array([self.tree_[p].value for p in pred]) return prob / np.sum(prob, axis=1)[:, np.newaxis] def predict(self, X): pred = self.apply(X) return np.array([self.classes_[np.argmax(self.tree_[p].value)] for p in pred]) @property def feature_importances_(self): importances = np.zeros(self.n_features) for node in self.tree_: if node.left_child != -1: left_child = self.tree_[node.left_child] right_child = self.tree_[node.right_child] importances[node.feature] += (node.n_node * node.impurity - left_child.n_node * left_child.impurity - right_child.n_node * right_child.impurity) return importances / np.sum(importances) # In[6]: X, y = load_breast_cancer(return_X_y=True) clf1 = DecisionTreeClassifier(max_depth=1).fit(X, y) clf2 = skDecisionTreeClassifier(criterion="entropy", max_depth=1, random_state=0).fit(X, y) assert np.array_equal([node.left_child for node in clf1.tree_], clf2.tree_.children_left) assert np.array_equal([node.right_child for node in clf1.tree_], clf2.tree_.children_right) clf1_leaf = [node.left_child != -1 for node in clf1.tree_] clf2_leaf = clf2.tree_.children_left != -1 assert np.array_equal(clf1_leaf, clf2_leaf) assert np.array_equal(np.array([node.feature for node in clf1.tree_])[clf1_leaf], clf2.tree_.feature[clf2_leaf]) # threshold is slightly different # because scikit-learn users the average value between current point and the next point assert np.allclose([node.impurity for node in clf1.tree_], clf2.tree_.impurity) assert np.array_equal([node.n_node for node in clf1.tree_], clf2.tree_.n_node_samples) assert np.allclose(np.array([node.value for node in clf1.tree_]), clf2.tree_.value[:, 0, :]) assert np.allclose(clf1.feature_importances_, clf2.feature_importances_) pred1 = clf1.apply(X) pred2 = clf2.apply(X) assert np.array_equal(pred1, pred2) prob1 = clf1.predict_proba(X) prob2 = clf2.predict_proba(X) assert np.allclose(prob1, prob2) pred1 = clf1.predict(X) pred2 = clf2.predict(X) assert np.array_equal(pred1, pred2)