#!/usr/bin/env python
# coding: utf-8

# Homepage: https://spkit.github.io
# Nikesh Bajaj : http://nikeshbajaj.in

# # Decision Trees with shrinking capability from SpKit

# **Note**: In this notebook, we show the capability of the decision tree from ***spkit*** to analyse the training and testing performance at each depth of a trained tree. A trained tree can then be shrunk to any smaller depth, ***without retraining it***. So, with the Decision Tree from ***spkit***, you can choose a very high number for **max_depth** (or just -1, for no limit), analyse the performance (accuracy, mse, loss) on the training and testing (practically, a validation) sets at each depth level, and once you have decided which depth is right, shrink the trained tree to that depth without explicitly training it again with a new depth parameter.
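# The workflow in a nutshell, as a sketch (it uses only the calls demonstrated
# in this notebook; the depth 7 chosen in the last step is just an example):
#
# ```python
# clf = ClassificationTree(max_depth=-1)      # -1: grow with no depth limit
# clf.fit(Xt, yt, feature_names=feature_names)
# clf.getLcurve(Xt=Xt, yt=yt, Xs=Xs, ys=ys, measure='acc')  # per-depth train/test accuracy
# clf.plotLcurve()                            # inspect where testing accuracy peaks
# clf.updateTree(shrink=True, max_depth=7)    # shrink the trained tree, no retraining
# ```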

# In[1]:


import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import copy


# In[2]:


np.random.seed(100)  # just to ensure reproducible results


# In[3]:


import spkit
spkit.__version__


# ## Classification - Diabetes Dataset - binary class

# In[4]:


from spkit.ml import ClassificationTree


# In[5]:


from sklearn.datasets import load_diabetes

data = load_diabetes()
X = data.data
y = 1*(data.target > np.mean(data.target))   # binarise the target at its mean
feature_names = data.feature_names
print(X.shape, y.shape)

Xt, Xs, yt, ys = train_test_split(X, y, test_size=0.3)
print(Xt.shape, Xs.shape, yt.shape, ys.shape)


# ### Train with max_depth=15, Accuracy, Logloss

# In[6]:


clf = ClassificationTree(max_depth=15)
clf.fit(Xt, yt, feature_names=feature_names)

ytp = clf.predict(Xt)
ysp = clf.predict(Xs)
ytpr = clf.predict_proba(Xt)[:, 1]
yspr = clf.predict_proba(Xs)[:, 1]

print('Depth of trained Tree ', clf.getTreeDepth())
print('Accuracy')
print('- Training : ', np.mean(ytp == yt))
print('- Testing  : ', np.mean(ysp == ys))
print('Logloss')
# binary cross-entropy: -mean(y*log(p) + (1-y)*log(1-p)); 1e-10 guards against log(0)
Trloss = -np.mean(yt*np.log(ytpr + 1e-10) + (1 - yt)*np.log(1 - ytpr + 1e-10))
Tsloss = -np.mean(ys*np.log(yspr + 1e-10) + (1 - ys)*np.log(1 - yspr + 1e-10))
print('- Training : ', Trloss)
print('- Testing  : ', Tsloss)


# ### Plot Trained Tree

# In[7]:


plt.figure(figsize=(15, 12))
clf.plotTree()


# ### Analysing the Learning Curve with test data (validation set)

# In[8]:


Lcurve = clf.getLcurve(Xt=Xt, yt=yt, Xs=Xs, ys=ys, measure='acc')
Lcurve


# In[9]:


clf.plotLcurve()
plt.xlim([1, clf.getTreeDepth()])
plt.xticks(np.arange(1, clf.getTreeDepth() + 1))
plt.show()


# ### Learning curve with tree

# In[10]:


plt.figure(figsize=(10, 8))
clf.plotTree(show=False, Measures=True, showNodevalues=True, showThreshold=False)


# ### Shrinking the trained tree to depth=7

# In[11]:


clf.updateTree(shrink=True, max_depth=7)


# In[12]:


ytp = clf.predict(Xt)
ysp = clf.predict(Xs)
ytpr = clf.predict_proba(Xt)[:, 1]
yspr = clf.predict_proba(Xs)[:, 1]

print('Depth of trained Tree ', clf.getTreeDepth())
print('Accuracy')
print('- Training : ', np.mean(ytp == yt))
print('- Testing  : ', np.mean(ysp == ys))
print('Logloss')
Trloss = -np.mean(yt*np.log(ytpr + 1e-10) + (1 - yt)*np.log(1 - ytpr + 1e-10))
Tsloss = -np.mean(ys*np.log(yspr + 1e-10) + (1 - ys)*np.log(1 - yspr + 1e-10))
print('- Training : ', Trloss)
print('- Testing  : ', Tsloss)


# ### Plotting final tree

# In[13]:


plt.figure(figsize=(10, 6))
clf.plotTree(show=False, Measures=True, showNodevalues=True, showThreshold=True)


# ## Regression - Diabetes Dataset - score

# In[14]:


from spkit.ml import RegressionTree


# In[15]:


from sklearn.datasets import load_diabetes

data = load_diabetes()
X = data.data
y = data.target
feature_names = data.feature_names
print(X.shape, y.shape)

Xt, Xs, yt, ys = train_test_split(X, y, test_size=0.3)
print(Xt.shape, Xs.shape, yt.shape, ys.shape)


# ### Train with max_depth=15, MSE, MAE

# In[16]:


rgs = RegressionTree(max_depth=15)
rgs.fit(Xt, yt, feature_names=feature_names)

ytp = rgs.predict(Xt)
ysp = rgs.predict(Xs)

print('Depth of trained Tree ', rgs.getTreeDepth())
print('MSE')
print('- Training : ', np.mean((ytp - yt)**2))
print('- Testing  : ', np.mean((ysp - ys)**2))
print('MAE')
print('- Training : ', np.mean(np.abs(ytp - yt)))
print('- Testing  : ', np.mean(np.abs(ysp - ys)))


# ### Plot trained Tree

# In[17]:


plt.figure(figsize=(15, 12))
rgs.plotTree()


# ### Analysing the Learning Curve and Tree with MAE

# In[18]:


Lcurve = rgs.getLcurve(Xt=Xt, yt=yt, Xs=Xs, ys=ys, measure='mae')
Lcurve


# In[19]:


rgs.plotLcurve()
plt.xlim([1, rgs.getTreeDepth()])
plt.xticks(np.arange(1, rgs.getTreeDepth() + 1))
plt.show()
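# Before committing to a shrink depth, it can help to score a few candidate
# depths explicitly. Since `updateTree(shrink=True, ...)` prunes the stored tree
# in place, the sketch below shrinks a deep copy at each depth so the fully
# grown tree stays intact (relying on `copy.deepcopy` working on the tree
# object is an assumption here, not a documented part of the spkit API):

mae_at_depth = {}
for d in range(1, rgs.getTreeDepth() + 1):
    rgs_d = copy.deepcopy(rgs)                  # keep the original tree untouched
    rgs_d.updateTree(shrink=True, max_depth=d)  # prune the copy to depth d
    mae_at_depth[d] = np.mean(np.abs(rgs_d.predict(Xs) - ys))

print(mae_at_depth)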
# In[20]:


plt.figure(figsize=(10, 8))
rgs.plotTree(show=False, Measures=True, showNodevalues=True, showThreshold=False)


# ### Shrinking a trained Tree to depth=2

# In[21]:


rgs.updateTree(shrink=True, max_depth=2)


# In[22]:


ytp = rgs.predict(Xt)
ysp = rgs.predict(Xs)

print('Depth of trained Tree ', rgs.getTreeDepth())
print('MSE')
print('- Training : ', np.mean((ytp - yt)**2))
print('- Testing  : ', np.mean((ysp - ys)**2))
print('MAE')
print('- Training : ', np.mean(np.abs(ytp - yt)))
print('- Testing  : ', np.mean(np.abs(ysp - ys)))


# In[23]:


plt.figure(figsize=(10, 5))
rgs.plotTree(show=False, Measures=True, showNodevalues=True, showThreshold=True)