#!/usr/bin/env python
# coding: utf-8

# Homepage: https://spkit.github.io
# Nikesh Bajaj : http://nikeshbajaj.in

# # Decision Trees with visualization using SpKit

# **Note**: This notebook covers the use of the (1) Classification and (2) Regression Trees from the ***spkit*** library, with different verbosity modes while training and with plotting of the resulting decision tree after training. We use two datasets, Iris and Breast Cancer, for classification, and the Boston housing price dataset for regression.
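# If spkit is not already installed, it is available from PyPI; these examples
# were written against version 0.0.9.1:
#
#     pip install spkit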

# ### Import libraries

# In[2]:

import numpy as np
import matplotlib.pyplot as plt

import spkit
spkit.__version__  # 0.0.9.1 at the time of writing

# In[3]:

np.random.seed(11)  # fix the seed, just to make sure the results are reproducible

# In[4]:

# import the Classification and Regression Tree from spkit
from spkit.ml import ClassificationTree, RegressionTree

# import datasets and train-test split from sklearn, or use your own dataset
from sklearn import datasets
from sklearn.model_selection import train_test_split

# ## Classification Tree

# ### Iris Dataset

# Loading and splitting for training and testing

# In[5]:

data = datasets.load_iris()
X = data.data
y = data.target
feature_names = data.feature_names  # optional
Xt, Xs, yt, ys = train_test_split(X, y, test_size=0.3)
print(X.shape, y.shape, Xt.shape, yt.shape, Xs.shape, ys.shape)

# #### Fitting a model (displaying the tree building) with different verbosity modes

# ##### verbose=0 (silent mode)

# In[6]:

clf = ClassificationTree()
clf.fit(Xt, yt, verbose=0, feature_names=feature_names)

# ##### verbose=1 (progress bar)

# In[50]:

clf = ClassificationTree()
clf.fit(Xt, yt, verbose=1, feature_names=feature_names)

# ##### verbose=2 (printing tree info)

# In[7]:

clf = ClassificationTree()
clf.fit(Xt, yt, verbose=2, feature_names=feature_names)

# ##### verbose=3 (printing branches only)

# In[8]:

clf = ClassificationTree()
clf.fit(Xt, yt, verbose=3, feature_names=feature_names)

# ##### verbose=4 (plotting the tree while building)

# In[9]:

get_ipython().run_line_magic('matplotlib', 'notebook')

# In[10]:

clf = ClassificationTree()
clf.fit(Xt, yt, verbose=4, feature_names=feature_names)

# #### Plotting the resulting tree

# In[11]:

get_ipython().run_line_magic('matplotlib', 'inline')

# In[12]:

plt.figure(figsize=(10, 6))
clf.plotTree(show=True, scale=False)

# #### Plotting the tree with same-color branches

# In[13]:

plt.figure(figsize=(8, 6))
clf.plotTree(DiffBranchColor=False)

# #### Predicting

# In[14]:

ytp = clf.predict(Xt)
ysp = clf.predict(Xs)
# Iris has three classes, so use the full probability matrix
# (assuming predict_proba returns one column per class, indexed by label)
ytpr = clf.predict_proba(Xt)
yspr = clf.predict_proba(Xs)
print('Depth of trained tree ', clf.getTreeDepth())
print('Accuracy')
print('- Training : ', np.mean(ytp == yt))
print('- Testing  : ', np.mean(ysp == ys))
print('Logloss')
# multiclass log-loss: mean negative log-probability assigned to the true class
Trloss = -np.mean(np.log(ytpr[np.arange(len(yt)), yt] + 1e-10))
Tsloss = -np.mean(np.log(yspr[np.arange(len(ys)), ys] + 1e-10))
print('- Training : ', Trloss)
print('- Testing  : ', Tsloss)

# ### Iris data with a smaller tree

# In[15]:

clf = ClassificationTree(max_depth=3)
clf.fit(Xt, yt, verbose=1, feature_names=feature_names)
plt.figure(figsize=(5, 5))
clf.plotTree(show=True, DiffBranchColor=True)
ytp = clf.predict(Xt)
ysp = clf.predict(Xs)
ytpr = clf.predict_proba(Xt)
yspr = clf.predict_proba(Xs)
print('Depth of trained tree ', clf.getTreeDepth())
print('Accuracy')
print('- Training : ', np.mean(ytp == yt))
print('- Testing  : ', np.mean(ysp == ys))
print('Logloss')
Trloss = -np.mean(np.log(ytpr[np.arange(len(yt)), yt] + 1e-10))
Tsloss = -np.mean(np.log(yspr[np.arange(len(ys)), ys] + 1e-10))
print('- Training : ', Trloss)
print('- Testing  : ', Tsloss)
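# A quick way to pick max_depth: a minimal sketch (using only the ClassificationTree
# API shown above) that sweeps max_depth on the same Iris split and compares training
# and testing accuracy; the variable names are illustrative only.

# In[ ]:

for depth in range(1, 7):
    clf_d = ClassificationTree(max_depth=depth)
    clf_d.fit(Xt, yt, verbose=0, feature_names=feature_names)
    tr_acc = np.mean(clf_d.predict(Xt) == yt)  # training accuracy
    ts_acc = np.mean(clf_d.predict(Xs) == ys)  # testing accuracy
    print('max_depth = %d : train %.3f, test %.3f' % (depth, tr_acc, ts_acc))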
# ### Breast Cancer data

# In[16]:

data = datasets.load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names  # optional
Xt, Xs, yt, ys = train_test_split(X, y, test_size=0.3)
print(X.shape, y.shape, Xt.shape, yt.shape, Xs.shape, ys.shape)

# #### Fitting a model while displaying the details of the tree in process (verbose=4)

# **While building the tree, to expand the True branch first and then the False branch, set randomBranch=False**

# In[17]:

get_ipython().run_line_magic('matplotlib', 'notebook')
clf = ClassificationTree()
clf.fit(Xt, yt, verbose=4, feature_names=feature_names, randomBranch=False)
plt.close(clf.fig)

# **To randomly select the True or False branch first, set randomBranch=True**

# In[18]:

clf = ClassificationTree()
clf.fit(Xt, yt, verbose=4, feature_names=feature_names, randomBranch=True)
plt.close(clf.fig)

# #### Resulting tree

# In[19]:

get_ipython().run_line_magic('matplotlib', 'inline')
plt.figure(figsize=(10, 6))
clf.plotTree(show=True, DiffBranchColor=True, scale=False)
plt.close(clf.fig)

# #### Fitting a model while displaying the progress only (verbose=1)

# In[20]:

#%matplotlib inline
clf = ClassificationTree()
clf.fit(Xt, yt, verbose=1, feature_names=feature_names)
plt.figure(figsize=(6, 6))
clf.plotTree()

# #### Predicting

# In[21]:

ytp = clf.predict(Xt)
ysp = clf.predict(Xs)
# binary problem: probability of the positive class
ytpr = clf.predict_proba(Xt)[:, 1]
yspr = clf.predict_proba(Xs)[:, 1]
print('Depth of trained tree ', clf.getTreeDepth())
print('Accuracy')
print('- Training : ', np.mean(ytp == yt))
print('- Testing  : ', np.mean(ysp == ys))
print('Logloss')
# binary cross-entropy
Trloss = -np.mean(yt*np.log(ytpr + 1e-10) + (1 - yt)*np.log(1 - ytpr + 1e-10))
Tsloss = -np.mean(ys*np.log(yspr + 1e-10) + (1 - ys)*np.log(1 - yspr + 1e-10))
print('- Training : ', Trloss)
print('- Testing  : ', Tsloss)

# **The model is overfitting; try smaller trees by decreasing the max_depth of the classifier, as in the max_depth sweep shown earlier for Iris.**

# ## Regression Tree

# ### Boston house price

# In[22]:

# Note: load_boston was removed in scikit-learn 1.2, so this cell needs an older
# scikit-learn; see the California housing alternative at the end of this notebook.
data = datasets.load_boston()
X = data.data
y = data.target
feature_names = data.feature_names  # optional
Xt, Xs, yt, ys = train_test_split(X, y, test_size=0.3)
print(X.shape, y.shape, Xt.shape, yt.shape, Xs.shape, ys.shape)

# In[23]:

rgr = RegressionTree()
rgr.fit(Xt, yt, verbose=1, feature_names=feature_names)

# ### Plotting the resulting tree

# In[24]:

get_ipython().run_line_magic('matplotlib', 'inline')
plt.style.use('default')
plt.figure(figsize=(15, 15))
rgr.plotTree(show=True, scale=True, showtitle=False, showDirection=False)

# ### Prediction

# In[25]:

ytp = rgr.predict(Xt)
ysp = rgr.predict(Xs)
print('Training MSE: ', np.mean((ytp - yt)**2))
print('Testing MSE : ', np.mean((ysp - ys)**2))

# ### Boston data with a smaller tree

# In[26]:

rgr = RegressionTree(max_depth=4)
rgr.fit(Xt, yt, verbose=1, feature_names=feature_names)

# In[27]:

get_ipython().run_line_magic('matplotlib', 'inline')
plt.style.use('default')
plt.figure(figsize=(6, 5))
rgr.plotTree(show=True, scale=True, showtitle=True, showDirection=False, DiffBranchColor=True)
ytp = rgr.predict(Xt)
ysp = rgr.predict(Xs)
print('Training MSE: ', np.mean((ytp - yt)**2))
print('Testing MSE : ', np.mean((ysp - ys)**2))
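# ### Alternative for newer scikit-learn

# load_boston was removed in scikit-learn 1.2. A minimal sketch of the same
# regression workflow on the California housing data (standard scikit-learn
# loader; the spkit calls are unchanged). Note that this dataset is much larger
# (~20k rows), so fitting the tree may take noticeably longer.

# In[ ]:

from sklearn.datasets import fetch_california_housing

data = fetch_california_housing()
X, y = data.data, data.target
feature_names = data.feature_names
Xt, Xs, yt, ys = train_test_split(X, y, test_size=0.3)

rgr = RegressionTree(max_depth=4)
rgr.fit(Xt, yt, verbose=1, feature_names=feature_names)
print('Training MSE: ', np.mean((rgr.predict(Xt) - yt)**2))
print('Testing MSE : ', np.mean((rgr.predict(Xs) - ys)**2))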