Homepage: https://spkit.github.io
Nikesh Bajaj : http://nikeshbajaj.in
Note: This notebook covers the use of (1) the Classification Tree and (2) the Regression Tree from the *spkit* library, showing the different verbosity modes available during training and how to plot the resulting decision tree after training. We use two datasets, Iris and Breast Cancer, for classification, and the Boston Housing price dataset for regression.
import numpy as np
import matplotlib.pyplot as plt
import spkit
# check the spkit version (this notebook was run with version 0.0.9)
spkit.__version__
'0.0.9'
np.random.seed(11) # just to make sure we get the same results on every run
# import Classification and Regression Tree from spkit
from spkit.ml import ClassificationTree, RegressionTree
# import dataset and train-test split from sklearn or use your own dataset
from sklearn import datasets
from sklearn.model_selection import train_test_split
Loading the data and splitting it into training and testing sets
data = datasets.load_iris()
X = data.data
y = data.target
feature_names = data.feature_names #Optional
Xt,Xs, yt, ys = train_test_split(X,y,test_size=0.3)
print(X.shape,y.shape, Xt.shape, yt.shape, Xs.shape, ys.shape)
(150, 4) (150,) (105, 4) (105,) (45, 4) (45,)
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=0,feature_names=feature_names)
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=1,feature_names=feature_names)
Number of features:: 4
Number of samples :: 105
---------------------------------------
|Building the tree.....................
|subtrees::|100%|-------------------->||
|.........................tree is buit!
---------------------------------------
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=2,feature_names=feature_names)
Number of features:: 4
Number of samples :: 105
---------------------------------------
|Building the tree.....................
|-Feature::3_petal length (cm) Gain::0.93 thr::_Depth = 1
|->False branch (<<<)..
|->{Leaf Node:: value: 0 }_Depth =2
|
|->True branch (>>>)..
|--Feature::4_petal width (cm) Gain::0.81 thr::_Depth = 2
|-->False branch (<<<)..
|--Feature::3_petal length (cm) Gain::0.18 thr::_Depth = 3
|-->False branch (<<<)..
|-->{Leaf Node:: value: 1 }_Depth =4
|
|-->True branch (>>>)..
|--->{Leaf Node:: value: 2 }_Depth =4
|
|-->True branch (>>>)..
|---Feature::3_petal length (cm) Gain::0.1 thr::_Depth = 3
|--->False branch (<<<)..
|---Feature::2_sepal width (cm) Gain::0.81 thr::_Depth = 4
|--->False branch (<<<)..
|--->{Leaf Node:: value: 2 }_Depth =5
|
|--->True branch (>>>)..
|---->{Leaf Node:: value: 1 }_Depth =5
|
|--->True branch (>>>)..
|---->{Leaf Node:: value: 2 }_Depth =4
|
|.........................tree is buit!
---------------------------------------
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=3,feature_names=feature_names)
Number of features:: 4
Number of samples :: 105
---------------------------------------
|Building the tree.....................
None 1
| True 2
| T True 3
| TT True 4
| TTT False 4
| TTTF True 5
| TTTFT False 5
| TTTFF False 3
| TTF True 4
| TTFT False 4
| TTFF False 2
| TF
|
|.........................tree is buit!
---------------------------------------
%matplotlib notebook
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=4,feature_names=feature_names)
Number of features:: 4
Number of samples :: 105
---------------------------------------
|Building the tree.....................
|
|.........................tree is buit!
---------------------------------------
%matplotlib inline
plt.figure(figsize=(10,6))
clf.plotTree(show=True,scale=False)
plt.figure(figsize=(8,6))
clf.plotTree(DiffBranchColor=False)
ytp = clf.predict(Xt)
ysp = clf.predict(Xs)
ytpr = clf.predict_proba(Xt)[:,1]
yspr = clf.predict_proba(Xs)[:,1]
print('Depth of trained Tree ', clf.getTreeDepth())
print('Accuracy')
print('- Training : ',np.mean(ytp==yt))
print('- Testing : ',np.mean(ysp==ys))
print('Logloss')
Trloss = -np.mean(yt*np.log(ytpr+1e-10)+(1-yt)*np.log(1-ytpr+1e-10))
Tsloss = -np.mean(ys*np.log(yspr+1e-10)+(1-ys)*np.log(1-yspr+1e-10))
print('- Training : ',Trloss)
print('- Testing : ',Tsloss)
Depth of trained Tree  4
Accuracy
- Training :  1.0
- Testing :  0.9111111111111111
Logloss
- Training :  14.473392013068288
- Testing :  15.350567286593632
clf = ClassificationTree(max_depth=3)
clf.fit(Xt,yt,verbose=1,feature_names=feature_names)
plt.figure(figsize=(5,5))
clf.plotTree(show=True,DiffBranchColor=True)
ytp = clf.predict(Xt)
ysp = clf.predict(Xs)
ytpr = clf.predict_proba(Xt)[:,1]
yspr = clf.predict_proba(Xs)[:,1]
print('Depth of trained Tree ', clf.getTreeDepth())
print('Accuracy')
print('- Training : ',np.mean(ytp==yt))
print('- Testing : ',np.mean(ysp==ys))
print('Logloss')
Trloss = -np.mean(yt*np.log(ytpr+1e-10)+(1-yt)*np.log(1-ytpr+1e-10))
Tsloss = -np.mean(ys*np.log(yspr+1e-10)+(1-ys)*np.log(1-yspr+1e-10))
print('- Training : ',Trloss)
print('- Testing : ',Tsloss)
Number of features:: 4
Number of samples :: 105
---------------------------------------
|Building the tree.....................
|subtrees::|100%|-------------------->||
|.........................tree is buit!
---------------------------------------
Depth of trained Tree  3
Accuracy
- Training :  0.9904761904761905
- Testing :  0.8666666666666667
Logloss
- Training :  2.5937142862825335
- Testing :  1.6081773385894438
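To see how max_depth affects the fit, one can sweep a few values and compare training and testing accuracy. Below is a minimal sketch (not part of the original notebook), using only the fit/predict API shown above:

# sweep max_depth and compare training vs testing accuracy (illustrative sketch)
for d in range(1, 6):
    clf_d = ClassificationTree(max_depth=d)
    clf_d.fit(Xt, yt, verbose=0, feature_names=feature_names)
    acc_tr = np.mean(clf_d.predict(Xt) == yt)
    acc_ts = np.mean(clf_d.predict(Xs) == ys)
    print('max_depth =', d, '| train acc =', round(acc_tr, 3), '| test acc =', round(acc_ts, 3))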
data = datasets.load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names #Optional
Xt,Xs, yt, ys = train_test_split(X,y,test_size=0.3)
print(X.shape,y.shape, Xt.shape, yt.shape, Xs.shape, ys.shape)
(569, 30) (569,) (398, 30) (398,) (171, 30) (171,)
While building the tree, to always expand the True branch first and then the False branch, set randomBranch=False
%matplotlib notebook
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=4,feature_names=feature_names,randomBranch=False)
plt.close(clf.fig)
Number of features:: 30
Number of samples :: 398
---------------------------------------
|Building the tree.....................
|
|.........................tree is buit!
---------------------------------------
To randomly select whether the True or False branch is expanded first, set randomBranch=True
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=4,feature_names=feature_names,randomBranch=True)
plt.close(clf.fig)
Number of features:: 30
Number of samples :: 398
---------------------------------------
|Building the tree.....................
|
|.........................tree is buit!
---------------------------------------
%matplotlib inline
plt.figure(figsize=(10,6))
clf.plotTree(show=True,DiffBranchColor=True,scale=False)
plt.close(clf.fig)
#%matplotlib inline
clf = ClassificationTree()
clf.fit(Xt,yt,verbose=1,feature_names=feature_names)
plt.figure(figsize=(6,6))
clf.plotTree()
Number of features:: 30
Number of samples :: 398
---------------------------------------
|Building the tree.....................
|subtrees::|100%|-------------------->|-
|.........................tree is buit!
---------------------------------------
ytp = clf.predict(Xt)
ysp = clf.predict(Xs)
ytpr = clf.predict_proba(Xt)[:,1]
yspr = clf.predict_proba(Xs)[:,1]
print('Depth of trained Tree ', clf.getTreeDepth())
print('Accuracy')
print('- Training : ',np.mean(ytp==yt))
print('- Testing : ',np.mean(ysp==ys))
print('Logloss')
Trloss = -np.mean(yt*np.log(ytpr+1e-10)+(1-yt)*np.log(1-ytpr+1e-10))
Tsloss = -np.mean(ys*np.log(yspr+1e-10)+(1-ys)*np.log(1-yspr+1e-10))
print('- Training : ',Trloss)
print('- Testing : ',Tsloss)
Depth of trained Tree  6
Accuracy
- Training :  1.0
- Testing :  0.9298245614035088
Logloss
- Training :  -1.000000082690371e-10
- Testing :  1.6158491879730155
The tree is overfitting; try smaller trees by decreasing the max_depth of the classifier.
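For example, a shallower tree can be trained and re-evaluated on the same split. A minimal sketch (max_depth=4 here is an arbitrary choice, not from the original notebook):

clf_small = ClassificationTree(max_depth=4)  # arbitrary depth, chosen for illustration
clf_small.fit(Xt, yt, verbose=1, feature_names=feature_names)
ytp = clf_small.predict(Xt)
ysp = clf_small.predict(Xs)
print('Depth of trained Tree ', clf_small.getTreeDepth())
print('Accuracy')
print('- Training : ', np.mean(ytp == yt))
print('- Testing  : ', np.mean(ysp == ys))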
data = datasets.load_boston()  # note: load_boston is deprecated and was removed in scikit-learn 1.2
X = data.data
y = data.target
feature_names = data.feature_names #Optional
Xt,Xs, yt, ys = train_test_split(X,y,test_size=0.3)
print(X.shape,y.shape, Xt.shape, yt.shape, Xs.shape, ys.shape)
(506, 13) (506,) (354, 13) (354,) (152, 13) (152,)
rgr = RegressionTree()
rgr.fit(Xt,yt,verbose=1,feature_names = feature_names)
Number of features:: 13
Number of samples :: 354
---------------------------------------
|Building the tree.....................
|subtrees::|100%|-------------------->|\
|.........................tree is buit!
---------------------------------------
%matplotlib inline
plt.style.use('default')
plt.figure(figsize=(15,15))
rgr.plotTree(show=True,scale=True, showtitle =False, showDirection=False)
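To mirror the evaluation done for the classification trees, the trained regression tree can be scored on the train/test split. A minimal sketch, assuming RegressionTree exposes the same predict and getTreeDepth methods used above for ClassificationTree, with mean squared error as the metric:

ytp = rgr.predict(Xt)  # predictions on training data
ysp = rgr.predict(Xs)  # predictions on testing data
print('Depth of trained Tree ', rgr.getTreeDepth())
print('MSE')
print('- Training : ', np.mean((yt - ytp)**2))
print('- Testing  : ', np.mean((ys - ysp)**2))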