Homepage: https://spkit.github.io
Nikesh Bajaj : http://nikeshbajaj.in
Note: In this notebook, we show the capability of the decision tree from *spkit* to analyse the training and testing performance at each depth of a trained tree, after which the trained tree can be shrunk to any smaller depth, *without retraining it*. So, by using a Decision Tree from spkit, you can choose a very high number for max_depth (or just choose -1, for infinity), and analyse the performance (accuracy, mse, loss) on the training and testing sets (practically, a validation set) at each depth level. Once you decide which depth is right, you can shrink your trained tree to that depth, without explicitly training it again with a new depth parameter.
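In short, the workflow demonstrated below is (a minimal outline of the spkit calls used in this notebook; chosen_depth is a placeholder):

# 1) Train once with a large (or infinite) depth:
#        clf = ClassificationTree(max_depth=-1)   # -1 => no depth limit
#        clf.fit(Xt, yt)
# 2) Inspect train/test performance at every depth:
#        Lcurve = clf.getLcurve(Xt=Xt, yt=yt, Xs=Xs, ys=ys, measure='acc')
# 3) Shrink the trained tree to the chosen depth, without retraining:
#        clf.updateTree(shrink=True, max_depth=chosen_depth)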
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import copy
np.random.seed(100) # just to ensure the reproducible results
import spkit
spkit.__version__
'0.0.9'
from spkit.ml import ClassificationTree
from sklearn.datasets import load_diabetes
data = load_diabetes()
X = data.data
y = 1*(data.target>np.mean(data.target))   # binarize the target: 1 if above the mean, else 0
feature_names = data.feature_names
print(X.shape, y.shape)
Xt,Xs,yt,ys = train_test_split(X,y,test_size=0.3)
print(Xt.shape, Xs.shape,yt.shape, ys.shape)
(442, 10) (442,)
(309, 10) (133, 10) (309,) (133,)
clf = ClassificationTree(max_depth=15)
clf.fit(Xt,yt,feature_names=feature_names)
ytp = clf.predict(Xt)
ysp = clf.predict(Xs)
ytpr = clf.predict_proba(Xt)[:,1]
yspr = clf.predict_proba(Xs)[:,1]
print('Depth of trained Tree ', clf.getTreeDepth())
print('Accuracy')
print('- Training : ',np.mean(ytp==yt))
print('- Testing : ',np.mean(ysp==ys))
print('Logloss')
# binary cross-entropy (logloss); the small constant 1e-10 guards against log(0)
Trloss = -np.mean(yt*np.log(ytpr+1e-10)+(1-yt)*np.log(1-ytpr+1e-10))
Tsloss = -np.mean(ys*np.log(yspr+1e-10)+(1-ys)*np.log(1-yspr+1e-10))
print('- Training : ',Trloss)
print('- Testing : ',Tsloss)
Depth of trained Tree  12
Accuracy
- Training :  1.0
- Testing :  0.7218045112781954
Logloss
- Training :  -1.0000000826903709e-10
- Testing :  6.405687852618023

(The slightly negative training logloss is an artifact of the 1e-10 added inside the log: with predicted probabilities of exactly 1.0, log(1 + 1e-10) is a tiny positive number.)
plt.figure(figsize=(15,12))
clf.plotTree()
Lcurve = clf.getLcurve(Xt=Xt,yt=yt,Xs=Xs,ys=ys,measure='acc')
Lcurve
{'measure': 'acc',
 1: {'train': 0.7378640776699029, 'test': 0.7142857142857143},
 2: {'train': 0.7378640776699029, 'test': 0.7142857142857143},
 3: {'train': 0.7831715210355987, 'test': 0.7218045112781954},
 4: {'train': 0.8252427184466019, 'test': 0.6842105263157895},
 5: {'train': 0.8543689320388349, 'test': 0.7142857142857143},
 6: {'train': 0.8867313915857605, 'test': 0.706766917293233},
 7: {'train': 0.9158576051779935, 'test': 0.7518796992481203},
 8: {'train': 0.9385113268608414, 'test': 0.7518796992481203},
 9: {'train': 0.9676375404530745, 'test': 0.7218045112781954},
 10: {'train': 0.9838187702265372, 'test': 0.7218045112781954},
 11: {'train': 0.9935275080906149, 'test': 0.7218045112781954},
 12: {'train': 1.0, 'test': 0.7218045112781954}}
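Test accuracy peaks at depth 7 (tied with depth 8) while training accuracy keeps climbing to 1.0, so deeper levels only overfit. To pick the depth programmatically, here is a minimal sketch over the Lcurve dict above (best_depth is a name introduced here; ties resolve to the shallowest depth):

depths = [d for d in Lcurve if isinstance(d, int)]        # skip the 'measure' key
best_depth = max(depths, key=lambda d: Lcurve[d]['test']) # depth with highest test accuracy
print(best_depth)                                         # -> 7 for the run above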
clf.plotLcurve()
plt.xlim([1,clf.getTreeDepth()])
plt.xticks(np.arange(1,clf.getTreeDepth()+1))
plt.show()
plt.figure(figsize=(10,8))
clf.plotTree(show=False,Measures=True,showNodevalues=True,showThreshold=False)
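Note that updateTree(shrink=True, ...) modifies the fitted tree in place, as the depth printed below confirms. To keep the full-depth tree around for comparison, a deep copy beforehand is a simple safeguard (a sketch using the copy module imported above; clf_full is a name introduced here):

clf_full = copy.deepcopy(clf)   # preserve the full-depth tree before shrinking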
clf.updateTree(shrink=True,max_depth=7)
ytp = clf.predict(Xt)
ysp = clf.predict(Xs)
ytpr = clf.predict_proba(Xt)[:,1]
yspr = clf.predict_proba(Xs)[:,1]
print('Depth of trained Tree ', clf.getTreeDepth())
print('Accuracy')
print('- Training : ',np.mean(ytp==yt))
print('- Testing : ',np.mean(ysp==ys))
print('Logloss')
Trloss = -np.mean(yt*np.log(ytpr+1e-10)+(1-yt)*np.log(1-ytpr+1e-10))
Tsloss = -np.mean(ys*np.log(yspr+1e-10)+(1-ys)*np.log(1-yspr+1e-10))
print('- Training : ',Trloss)
print('- Testing : ',Tsloss)
Depth of trained Tree  7
Accuracy
- Training :  0.9158576051779935
- Testing :  0.7518796992481203
Logloss
- Training :  0.175708775711661
- Testing :  4.172100873454743
plt.figure(figsize=(10,6))
clf.plotTree(show=False,Measures=True,showNodevalues=True,showThreshold=True)
from spkit.ml import RegressionTree
from sklearn.datasets import load_diabetes
data = load_diabetes()
X = data.data
y = data.target
feature_names = data.feature_names
print(X.shape, y.shape)
Xt,Xs,yt,ys = train_test_split(X,y,test_size=0.3)
print(Xt.shape, Xs.shape,yt.shape, ys.shape)
(442, 10) (442,)
(309, 10) (133, 10) (309,) (133,)
rgs = RegressionTree(max_depth=15)
rgs.fit(Xt,yt,feature_names=feature_names)
ytp = rgs.predict(Xt)
ysp = rgs.predict(Xs)
print('Depth of trained Tree ', rgs.getTreeDepth())
print('MSE')
print('- Training : ',np.mean((ytp-yt)**2))
print('- Testing : ',np.mean((ysp-ys)**2))
print('MAE')
print('- Training : ',np.mean(np.abs(ytp-yt)))
print('- Testing : ',np.mean(np.abs(ysp-ys)))
Depth of trained Tree  15
MSE
- Training :  0.0
- Testing :  6644.248120300752
MAE
- Training :  0.0
- Testing :  64.203007518797
plt.figure(figsize=(15,12))
rgs.plotTree()
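The same depth-wise analysis and shrinking apply to the regression tree: training MSE hits 0.0 at depth 15, so a shallower depth will almost certainly generalise better. A sketch of the same workflow follows (this assumes getLcurve accepts measure='mse' for regression, mirroring measure='acc' used for classification above; the depth 5 passed to updateTree is an illustrative placeholder):

Lcurve_rgs = rgs.getLcurve(Xt=Xt, yt=yt, Xs=Xs, ys=ys, measure='mse')  # 'mse' measure assumed
rgs.plotLcurve()
plt.xticks(np.arange(1, rgs.getTreeDepth()+1))
plt.show()
rgs.updateTree(shrink=True, max_depth=5)   # illustrative depth; pick it from the curve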