In [0]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd

Obteniendo los datos

In [39]:
!wget https://raw.githubusercontent.com/susanli2016/Machine-Learning-with-Python/master/diabetes.csv
--2018-10-14 23:42:51--  https://raw.githubusercontent.com/susanli2016/Machine-Learning-with-Python/master/diabetes.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23875 (23K) [text/plain]
Saving to: ‘diabetes.csv’

diabetes.csv        100%[===================>]  23.32K  --.-KB/s    in 0.008s  

2018-10-14 23:42:51 (2.91 MB/s) - ‘diabetes.csv’ saved [23875/23875]

In [0]:
dataset = pd.read_csv("diabetes.csv")
In [42]:
dataset.head()
Out[42]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
In [47]:
dataset.shape
Out[47]:
(768, 9)
In [0]:
features = dataset.drop(["Outcome"], axis=1)
X = np.array(features)
y = np.array(dataset["Outcome"])
In [0]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, test_size=0.20)

Creando el modelo

In [0]:
tree = DecisionTreeClassifier()
In [141]:
tree.fit(X_train, y_train)
Out[141]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
In [142]:
tree.tree_.max_depth
Out[142]:
15
In [0]:
validation_prediction = tree.predict(X_val)
training_prediction = tree.predict(X_train)
In [144]:
print('Exactitud training data: ', accuracy_score(y_true=y_train, y_pred=training_prediction))
print('Exactitud validation data: ', accuracy_score(y_true=y_val, y_pred=validation_prediction))
Exactitud training data:  1.0
Exactitud validation data:  0.7922077922077922

Mostrando el arbol de forma visual

In [0]:
!apt-get install graphviz
In [57]:
!pip install graphviz
Collecting graphviz
  Downloading https://files.pythonhosted.org/packages/47/87/313cd4ea4f75472826acb74c57f94fc83e04ba93e4ccf35656f6b7f502e2/graphviz-0.9-py2.py3-none-any.whl
Installing collected packages: graphviz
Successfully installed graphviz-0.9
In [0]:
import graphviz 
from sklearn.tree import export_graphviz
In [0]:
feature_names = features.columns
In [0]:
dot_data = export_graphviz(tree, out_file=None, 
                         feature_names=feature_names,  
                         class_names=True,  
                         filled=True, rounded=True,  
                         special_characters=True)  
graph = graphviz.Source(dot_data)
In [147]:
graph
Out[147]:
Tree 0 Glucose ≤ 123.5 gini = 0.461 samples = 614 value = [393, 221] class = y 0 1 Age ≤ 28.5 gini = 0.301 samples = 352 value = [287, 65] class = y 0 0->1 True 100 BMI ≤ 30.05 gini = 0.482 samples = 262 value = [106, 156] class = y 1 0->100 False 2 BMI ≤ 30.95 gini = 0.162 samples = 202 value = [184, 18] class = y 0 1->2 39 BMI ≤ 26.35 gini = 0.43 samples = 150 value = [103, 47] class = y 0 1->39 3 Pregnancies ≤ 7.0 gini = 0.036 samples = 110 value = [108, 2] class = y 0 2->3 10 BloodPressure ≤ 53.0 gini = 0.287 samples = 92 value = [76, 16] class = y 0 2->10 4 DiabetesPedigreeFunction ≤ 0.672 gini = 0.018 samples = 109 value = [108, 1] class = y 0 3->4 9 gini = 0.0 samples = 1 value = [0, 1] class = y 1 3->9 5 gini = 0.0 samples = 99 value = [99, 0] class = y 0 4->5 6 DiabetesPedigreeFunction ≤ 0.697 gini = 0.18 samples = 10 value = [9, 1] class = y 0 4->6 7 gini = 0.0 samples = 1 value = [0, 1] class = y 1 6->7 8 gini = 0.0 samples = 9 value = [9, 0] class = y 0 6->8 11 DiabetesPedigreeFunction ≤ 0.508 gini = 0.444 samples = 6 value = [2, 4] class = y 1 10->11 14 DiabetesPedigreeFunction ≤ 1.272 gini = 0.24 samples = 86 value = [74, 12] class = y 0 10->14 12 gini = 0.0 samples = 4 value = [0, 4] class = y 1 11->12 13 gini = 0.0 samples = 2 value = [2, 0] class = y 0 11->13 15 DiabetesPedigreeFunction ≤ 0.501 gini = 0.225 samples = 85 value = [74, 11] class = y 0 14->15 38 gini = 0.0 samples = 1 value = [0, 1] class = y 1 14->38 16 BMI ≤ 45.35 gini = 0.135 samples = 55 value = [51, 4] class = y 0 15->16 27 BloodPressure ≤ 69.0 gini = 0.358 samples = 30 value = [23, 7] class = y 0 15->27 17 Insulin ≤ 36.5 gini = 0.105 samples = 54 value = [51, 3] class = y 0 16->17 26 gini = 0.0 samples = 1 value = [0, 1] class = y 1 16->26 18 Insulin ≤ 34.0 gini = 0.266 samples = 19 value = [16, 3] class = y 0 17->18 25 gini = 0.0 samples = 35 value = [35, 0] class = y 0 17->25 19 Glucose ≤ 111.5 gini = 0.198 samples = 18 value = [16, 2] class = y 0 18->19 24 gini = 0.0 samples = 1 value = [0, 1] class = y 1 18->24 20 gini = 0.0 samples = 13 value = [13, 0] class = y 0 19->20 21 BMI ≤ 34.5 gini = 0.48 samples = 5 value = [3, 2] class = y 0 19->21 22 gini = 0.0 samples = 2 value = [0, 2] class = y 1 21->22 23 gini = 0.0 samples = 3 value = [3, 0] class = y 0 21->23 28 Glucose ≤ 88.5 gini = 0.492 samples = 16 value = [9, 7] class = y 0 27->28 37 gini = 0.0 samples = 14 value = [14, 0] class = y 0 27->37 29 gini = 0.0 samples = 7 value = [7, 0] class = y 0 28->29 30 DiabetesPedigreeFunction ≤ 0.908 gini = 0.346 samples = 9 value = [2, 7] class = y 1 28->30 31 Glucose ≤ 98.5 gini = 0.219 samples = 8 value = [1, 7] class = y 1 30->31 36 gini = 0.0 samples = 1 value = [1, 0] class = y 0 30->36 32 BMI ≤ 37.4 gini = 0.444 samples = 3 value = [1, 2] class = y 1 31->32 35 gini = 0.0 samples = 5 value = [0, 5] class = y 1 31->35 33 gini = 0.0 samples = 1 value = [1, 0] class = y 0 32->33 34 gini = 0.0 samples = 2 value = [0, 2] class = y 1 32->34 40 gini = 0.0 samples = 29 value = [29, 0] class = y 0 39->40 41 Glucose ≤ 99.5 gini = 0.475 samples = 121 value = [74, 47] class = y 0 39->41 42 Glucose ≤ 28.5 gini = 0.337 samples = 42 value = [33, 9] class = y 0 41->42 59 BMI ≤ 27.55 gini = 0.499 samples = 79 value = [41, 38] class = y 0 41->59 43 gini = 0.0 samples = 2 value = [0, 2] class = y 1 42->43 44 Age ≤ 42.5 gini = 0.289 samples = 40 value = [33, 7] class = y 0 42->44 45 DiabetesPedigreeFunction ≤ 1.16 gini = 0.137 samples = 27 value = [25, 2] class = y 0 44->45 52 BMI ≤ 30.85 gini = 0.473 samples = 13 value = [8, 5] class = y 0 44->52 46 DiabetesPedigreeFunction ≤ 0.171 gini = 0.074 samples = 26 value = [25, 1] class = y 0 45->46 51 gini = 0.0 samples = 1 value = [0, 1] class = y 1 45->51 47 Age ≤ 32.0 gini = 0.375 samples = 4 value = [3, 1] class = y 0 46->47 50 gini = 0.0 samples = 22 value = [22, 0] class = y 0 46->50 48 gini = 0.0 samples = 1 value = [0, 1] class = y 1 47->48 49 gini = 0.0 samples = 3 value = [3, 0] class = y 0 47->49 53 gini = 0.0 samples = 2 value = [0, 2] class = y 1 52->53 54 SkinThickness ≤ 21.5 gini = 0.397 samples = 11 value = [8, 3] class = y 0 52->54 55 gini = 0.0 samples = 5 value = [5, 0] class = y 0 54->55 56 BloodPressure ≤ 75.0 gini = 0.5 samples = 6 value = [3, 3] class = y 0 54->56 57 gini = 0.0 samples = 3 value = [3, 0] class = y 0 56->57 58 gini = 0.0 samples = 3 value = [0, 3] class = y 1 56->58 60 gini = 0.0 samples = 5 value = [0, 5] class = y 1 59->60 61 DiabetesPedigreeFunction ≤ 0.179 gini = 0.494 samples = 74 value = [41, 33] class = y 0 59->61 62 gini = 0.0 samples = 8 value = [8, 0] class = y 0 61->62 63 Pregnancies ≤ 6.5 gini = 0.5 samples = 66 value = [33, 33] class = y 0 61->63 64 Pregnancies ≤ 1.5 gini = 0.483 samples = 44 value = [26, 18] class = y 0 63->64 87 Age ≤ 39.0 gini = 0.434 samples = 22 value = [7, 15] class = y 1 63->87 65 DiabetesPedigreeFunction ≤ 0.893 gini = 0.475 samples = 18 value = [7, 11] class = y 1 64->65 74 Age ≤ 34.5 gini = 0.393 samples = 26 value = [19, 7] class = y 0 64->74 66 DiabetesPedigreeFunction ≤ 0.2 gini = 0.391 samples = 15 value = [4, 11] class = y 1 65->66 73 gini = 0.0 samples = 3 value = [3, 0] class = y 0 65->73 67 gini = 0.0 samples = 2 value = [2, 0] class = y 0 66->67 68 BloodPressure ≤ 92.0 gini = 0.26 samples = 13 value = [2, 11] class = y 1 66->68 69 Glucose ≤ 101.0 gini = 0.153 samples = 12 value = [1, 11] class = y 1 68->69 72 gini = 0.0 samples = 1 value = [1, 0] class = y 0 68->72 70 gini = 0.0 samples = 1 value = [1, 0] class = y 0 69->70 71 gini = 0.0 samples = 11 value = [0, 11] class = y 1 69->71 75 BloodPressure ≤ 63.0 gini = 0.133 samples = 14 value = [13, 1] class = y 0 74->75 80 BloodPressure ≤ 74.5 gini = 0.5 samples = 12 value = [6, 6] class = y 0 74->80 76 SkinThickness ≤ 25.5 gini = 0.5 samples = 2 value = [1, 1] class = y 0 75->76 79 gini = 0.0 samples = 12 value = [12, 0] class = y 0 75->79 77 gini = 0.0 samples = 1 value = [0, 1] class = y 1 76->77 78 gini = 0.0 samples = 1 value = [1, 0] class = y 0 76->78 81 gini = 0.0 samples = 4 value = [0, 4] class = y 1 80->81 82 BMI ≤ 32.65 gini = 0.375 samples = 8 value = [6, 2] class = y 0 80->82 83 BMI ≤ 29.3 gini = 0.444 samples = 3 value = [1, 2] class = y 1 82->83 86 gini = 0.0 samples = 5 value = [5, 0] class = y 0 82->86 84 gini = 0.0 samples = 1 value = [1, 0] class = y 0 83->84 85 gini = 0.0 samples = 2 value = [0, 2] class = y 1 83->85 88 gini = 0.0 samples = 8 value = [0, 8] class = y 1 87->88 89 DiabetesPedigreeFunction ≤ 0.587 gini = 0.5 samples = 14 value = [7, 7] class = y 0 87->89 90 Glucose ≤ 109.0 gini = 0.42 samples = 10 value = [7, 3] class = y 0 89->90 99 gini = 0.0 samples = 4 value = [0, 4] class = y 1 89->99 91 Glucose ≤ 107.0 gini = 0.5 samples = 6 value = [3, 3] class = y 0 90->91 98 gini = 0.0 samples = 4 value = [4, 0] class = y 0 90->98 92 BMI ≤ 35.4 gini = 0.48 samples = 5 value = [3, 2] class = y 0 91->92 97 gini = 0.0 samples = 1 value = [0, 1] class = y 1 91->97 93 gini = 0.0 samples = 2 value = [2, 0] class = y 0 92->93 94 DiabetesPedigreeFunction ≤ 0.221 gini = 0.444 samples = 3 value = [1, 2] class = y 1 92->94 95 gini = 0.0 samples = 1 value = [1, 0] class = y 0 94->95 96 gini = 0.0 samples = 2 value = [0, 2] class = y 1 94->96 101 Age ≤ 26.0 gini = 0.438 samples = 74 value = [50, 24] class = y 0 100->101 132 Glucose ≤ 157.5 gini = 0.418 samples = 188 value = [56, 132] class = y 1 100->132 102 gini = 0.0 samples = 19 value = [19, 0] class = y 0 101->102 103 Age ≤ 60.5 gini = 0.492 samples = 55 value = [31, 24] class = y 0 101->103 104 Glucose ≤ 151.5 gini = 0.5 samples = 47 value = [23, 24] class = y 1 103->104 131 gini = 0.0 samples = 8 value = [8, 0] class = y 0 103->131 105 Glucose ≤ 125.5 gini = 0.469 samples = 32 value = [20, 12] class = y 0 104->105 124 BMI ≤ 27.1 gini = 0.32 samples = 15 value = [3, 12] class = y 1 104->124 106 SkinThickness ≤ 27.0 gini = 0.375 samples = 8 value = [2, 6] class = y 1 105->106 111 BloodPressure ≤ 73.0 gini = 0.375 samples = 24 value = [18, 6] class = y 0 105->111 107 gini = 0.0 samples = 5 value = [0, 5] class = y 1 106->107 108 Age ≤ 43.0 gini = 0.444 samples = 3 value = [2, 1] class = y 0 106->108 109 gini = 0.0 samples = 2 value = [2, 0] class = y 0 108->109 110 gini = 0.0 samples = 1 value = [0, 1] class = y 1 108->110 112 BloodPressure ≤ 64.5 gini = 0.5 samples = 8 value = [4, 4] class = y 0 111->112 117 BMI ≤ 28.0 gini = 0.219 samples = 16 value = [14, 2] class = y 0 111->117 113 Age ≤ 28.5 gini = 0.32 samples = 5 value = [4, 1] class = y 0 112->113 116 gini = 0.0 samples = 3 value = [0, 3] class = y 1 112->116 114 gini = 0.0 samples = 1 value = [0, 1] class = y 1 113->114 115 gini = 0.0 samples = 4 value = [4, 0] class = y 0 113->115 118 gini = 0.0 samples = 9 value = [9, 0] class = y 0 117->118 119 BMI ≤ 29.55 gini = 0.408 samples = 7 value = [5, 2] class = y 0 117->119 120 Age ≤ 30.5 gini = 0.444 samples = 3 value = [1, 2] class = y 1 119->120 123 gini = 0.0 samples = 4 value = [4, 0] class = y 0 119->123 121 gini = 0.0 samples = 1 value = [1, 0] class = y 0 120->121 122 gini = 0.0 samples = 2 value = [0, 2] class = y 1 120->122 125 gini = 0.0 samples = 8 value = [0, 8] class = y 1 124->125 126 BMI ≤ 29.1 gini = 0.49 samples = 7 value = [3, 4] class = y 1 124->126 127 Age ≤ 36.5 gini = 0.375 samples = 4 value = [3, 1] class = y 0 126->127 130 gini = 0.0 samples = 3 value = [0, 3] class = y 1 126->130 128 gini = 0.0 samples = 1 value = [0, 1] class = y 1 127->128 129 gini = 0.0 samples = 3 value = [3, 0] class = y 0 127->129 133 Age ≤ 28.5 gini = 0.482 samples = 116 value = [47, 69] class = y 1 132->133 202 Insulin ≤ 595.0 gini = 0.219 samples = 72 value = [9, 63] class = y 1 132->202 134 BloodPressure ≤ 73.0 gini = 0.49 samples = 42 value = [24, 18] class = y 0 133->134 157 DiabetesPedigreeFunction ≤ 0.429 gini = 0.428 samples = 74 value = [23, 51] class = y 1 133->157 135 DiabetesPedigreeFunction ≤ 0.186 gini = 0.463 samples = 22 value = [8, 14] class = y 1 134->135 148 BloodPressure ≤ 89.0 gini = 0.32 samples = 20 value = [16, 4] class = y 0 134->148 136 gini = 0.0 samples = 3 value = [3, 0] class = y 0 135->136 137 Glucose ≤ 147.0 gini = 0.388 samples = 19 value = [5, 14] class = y 1 135->137 138 Insulin ≤ 365.0 gini = 0.231 samples = 15 value = [2, 13] class = y 1 137->138 145 Glucose ≤ 154.5 gini = 0.375 samples = 4 value = [3, 1] class = y 0 137->145 139 Pregnancies ≤ 3.5 gini = 0.133 samples = 14 value = [1, 13] class = y 1 138->139 144 gini = 0.0 samples = 1 value = [1, 0] class = y 0 138->144 140 gini = 0.0 samples = 11 value = [0, 11] class = y 1 139->140 141 BMI ≤ 33.6 gini = 0.444 samples = 3 value = [1, 2] class = y 1 139->141 142 gini = 0.0 samples = 2 value = [0, 2] class = y 1 141->142 143 gini = 0.0 samples = 1 value = [1, 0] class = y 0 141->143 146 gini = 0.0 samples = 3 value = [3, 0] class = y 0 145->146 147 gini = 0.0 samples = 1 value = [0, 1] class = y 1 145->147 149 Pregnancies ≤ 4.5 gini = 0.117 samples = 16 value = [15, 1] class = y 0 148->149 154 DiabetesPedigreeFunction ≤ 0.302 gini = 0.375 samples = 4 value = [1, 3] class = y 1 148->154 150 gini = 0.0 samples = 14 value = [14, 0] class = y 0 149->150 151 Glucose ≤ 146.5 gini = 0.5 samples = 2 value = [1, 1] class = y 0 149->151 152 gini = 0.0 samples = 1 value = [0, 1] class = y 1 151->152 153 gini = 0.0 samples = 1 value = [1, 0] class = y 0 151->153 155 gini = 0.0 samples = 1 value = [1, 0] class = y 0 154->155 156 gini = 0.0 samples = 3 value = [0, 3] class = y 1 154->156 158 BMI ≤ 45.55 gini = 0.499 samples = 38 value = [18, 20] class = y 1 157->158 187 Insulin ≤ 333.5 gini = 0.239 samples = 36 value = [5, 31] class = y 1 157->187 159 Pregnancies ≤ 1.5 gini = 0.496 samples = 33 value = [18, 15] class = y 0 158->159 186 gini = 0.0 samples = 5 value = [0, 5] class = y 1 158->186 160 gini = 0.0 samples = 3 value = [0, 3] class = y 1 159->160 161 BMI ≤ 37.25 gini = 0.48 samples = 30 value = [18, 12] class = y 0 159->161 162 BMI ≤ 36.25 gini = 0.499 samples = 21 value = [10, 11] class = y 1 161->162 181 Glucose ≤ 146.5 gini = 0.198 samples = 9 value = [8, 1] class = y 0 161->181 163 Pregnancies ≤ 8.5 gini = 0.499 samples = 19 value = [10, 9] class = y 0 162->163 180 gini = 0.0 samples = 2 value = [0, 2] class = y 1 162->180 164 DiabetesPedigreeFunction ≤ 0.266 gini = 0.49 samples = 14 value = [6, 8] class = y 1 163->164 175 BloodPressure ≤ 73.0 gini = 0.32 samples = 5 value = [4, 1] class = y 0 163->175 165 BMI ≤ 31.65 gini = 0.496 samples = 11 value = [6, 5] class = y 0 164->165 174 gini = 0.0 samples = 3 value = [0, 3] class = y 1 164->174 166 gini = 0.0 samples = 2 value = [0, 2] class = y 1 165->166 167 Age ≤ 67.5 gini = 0.444 samples = 9 value = [6, 3] class = y 0 165->167 168 Glucose ≤ 126.5 gini = 0.375 samples = 8 value = [6, 2] class = y 0 167->168 173 gini = 0.0 samples = 1 value = [0, 1] class = y 1 167->173 169 Age ≤ 34.5 gini = 0.444 samples = 3 value = [1, 2] class = y 1 168->169 172 gini = 0.0 samples = 5 value = [5, 0] class = y 0 168->172 170 gini = 0.0 samples = 1 value = [1, 0] class = y 0 169->170 171 gini = 0.0 samples = 2 value = [0, 2] class = y 1 169->171 176 Pregnancies ≤ 9.5 gini = 0.5 samples = 2 value = [1, 1] class = y 0 175->176 179 gini = 0.0 samples = 3 value = [3, 0] class = y 0 175->179 177 gini = 0.0 samples = 1 value = [1, 0] class = y 0 176->177 178 gini = 0.0 samples = 1 value = [0, 1] class = y 1 176->178 182 gini = 0.0 samples = 7 value = [7, 0] class = y 0 181->182 183 SkinThickness ≤ 17.5 gini = 0.5 samples = 2 value = [1, 1] class = y 0 181->183 184 gini = 0.0 samples = 1 value = [0, 1] class = y 1 183->184 185 gini = 0.0 samples = 1 value = [1, 0] class = y 0 183->185 188 Pregnancies ≤ 0.5 gini = 0.165 samples = 33 value = [3, 30] class = y 1 187->188 199 DiabetesPedigreeFunction ≤ 0.581 gini = 0.444 samples = 3 value = [2, 1] class = y 0 187->199 189 BloodPressure ≤ 60.0 gini = 0.48 samples = 5 value = [2, 3] class = y 1 188->189 194 BMI ≤ 40.05 gini = 0.069 samples = 28 value = [1, 27] class = y 1 188->194 190 gini = 0.0 samples = 2 value = [0, 2] class = y 1 189->190 191 BloodPressure ≤ 85.0 gini = 0.444 samples = 3 value = [2, 1] class = y 0 189->191 192 gini = 0.0 samples = 2 value = [2, 0] class = y 0 191->192 193 gini = 0.0 samples = 1 value = [0, 1] class = y 1 191->193 195 gini = 0.0 samples = 22 value = [0, 22] class = y 1 194->195 196 BMI ≤ 40.7 gini = 0.278 samples = 6 value = [1, 5] class = y 1 194->196 197 gini = 0.0 samples = 1 value = [1, 0] class = y 0 196->197 198 gini = 0.0 samples = 5 value = [0, 5] class = y 1 196->198 200 gini = 0.0 samples = 1 value = [0, 1] class = y 1 199->200 201 gini = 0.0 samples = 2 value = [2, 0] class = y 0 199->201 203 DiabetesPedigreeFunction ≤ 0.307 gini = 0.182 samples = 69 value = [7, 62] class = y 1 202->203 220 DiabetesPedigreeFunction ≤ 0.412 gini = 0.444 samples = 3 value = [2, 1] class = y 0 202->220 204 BMI ≤ 31.4 gini = 0.401 samples = 18 value = [5, 13] class = y 1 203->204 213 Age ≤ 48.0 gini = 0.075 samples = 51 value = [2, 49] class = y 1 203->213 205 gini = 0.0 samples = 2 value = [2, 0] class = y 0 204->205 206 Glucose ≤ 179.5 gini = 0.305 samples = 16 value = [3, 13] class = y 1 204->206 207 Glucose ≤ 177.0 gini = 0.42 samples = 10 value = [3, 7] class = y 1 206->207 212 gini = 0.0 samples = 6 value = [0, 6] class = y 1 206->212 208 BMI ≤ 45.1 gini = 0.219 samples = 8 value = [1, 7] class = y 1 207->208 211 gini = 0.0 samples = 2 value = [2, 0] class = y 0 207->211 209 gini = 0.0 samples = 7 value = [0, 7] class = y 1 208->209 210 gini = 0.0 samples = 1 value = [1, 0] class = y 0 208->210 214 gini = 0.0 samples = 42 value = [0, 42] class = y 1 213->214 215 Age ≤ 50.5 gini = 0.346 samples = 9 value = [2, 7] class = y 1 213->215 216 gini = 0.0 samples = 1 value = [1, 0] class = y 0 215->216 217 BMI ≤ 45.95 gini = 0.219 samples = 8 value = [1, 7] class = y 1 215->217 218 gini = 0.0 samples = 7 value = [0, 7] class = y 1 217->218 219 gini = 0.0 samples = 1 value = [1, 0] class = y 0 217->219 221 gini = 0.0 samples = 1 value = [0, 1] class = y 1 220->221 222 gini = 0.0 samples = 2 value = [2, 0] class = y 0 220->222

Creando el segundo modelo

In [0]:
tree = DecisionTreeClassifier(min_samples_leaf=10, max_depth=8, min_samples_split=50)
In [149]:
tree.fit(X_train, y_train)
Out[149]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=50,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
In [0]:
validation_prediction = tree.predict(X_val)
training_prediction = tree.predict(X_train)
In [151]:
print('Exactitud training data: ', accuracy_score(y_true=y_train, y_pred=training_prediction))
print('Exactitud validation data: ', accuracy_score(y_true=y_val, y_pred=validation_prediction))
Exactitud training data:  0.7964169381107492
Exactitud validation data:  0.8116883116883117
In [0]:
dot_data = export_graphviz(tree, out_file=None, 
                         feature_names=feature_names,  
                         class_names=True,  
                         filled=True, rounded=True,  
                         special_characters=True)  
graph = graphviz.Source(dot_data)
In [153]:
graph
Out[153]:
Tree 0 Glucose ≤ 123.5 gini = 0.461 samples = 614 value = [393, 221] class = y 0 1 Age ≤ 28.5 gini = 0.301 samples = 352 value = [287, 65] class = y 0 0->1 True 22 BMI ≤ 30.05 gini = 0.482 samples = 262 value = [106, 156] class = y 1 0->22 False 2 BMI ≤ 30.95 gini = 0.162 samples = 202 value = [184, 18] class = y 0 1->2 13 BMI ≤ 26.35 gini = 0.43 samples = 150 value = [103, 47] class = y 0 1->13 3 Glucose ≤ 106.5 gini = 0.036 samples = 110 value = [108, 2] class = y 0 2->3 6 BloodPressure ≤ 64.5 gini = 0.287 samples = 92 value = [76, 16] class = y 0 2->6 4 gini = 0.0 samples = 83 value = [83, 0] class = y 0 3->4 5 gini = 0.137 samples = 27 value = [25, 2] class = y 0 3->5 7 gini = 0.404 samples = 32 value = [23, 9] class = y 0 6->7 8 BloodPressure ≤ 81.0 gini = 0.206 samples = 60 value = [53, 7] class = y 0 6->8 9 BMI ≤ 33.7 gini = 0.147 samples = 50 value = [46, 4] class = y 0 8->9 12 gini = 0.42 samples = 10 value = [7, 3] class = y 0 8->12 10 gini = 0.305 samples = 16 value = [13, 3] class = y 0 9->10 11 gini = 0.057 samples = 34 value = [33, 1] class = y 0 9->11 14 gini = 0.0 samples = 29 value = [29, 0] class = y 0 13->14 15 Glucose ≤ 99.5 gini = 0.475 samples = 121 value = [74, 47] class = y 0 13->15 16 gini = 0.337 samples = 42 value = [33, 9] class = y 0 15->16 17 DiabetesPedigreeFunction ≤ 0.22 gini = 0.499 samples = 79 value = [41, 38] class = y 0 15->17 18 gini = 0.375 samples = 16 value = [12, 4] class = y 0 17->18 19 Pregnancies ≤ 6.5 gini = 0.497 samples = 63 value = [29, 34] class = y 1 17->19 20 gini = 0.493 samples = 41 value = [23, 18] class = y 0 19->20 21 gini = 0.397 samples = 22 value = [6, 16] class = y 1 19->21 23 Age ≤ 26.0 gini = 0.438 samples = 74 value = [50, 24] class = y 0 22->23 28 Glucose ≤ 157.5 gini = 0.418 samples = 188 value = [56, 132] class = y 1 22->28 24 gini = 0.0 samples = 19 value = [19, 0] class = y 0 23->24 25 Age ≤ 54.5 gini = 0.492 samples = 55 value = [31, 24] class = y 0 23->25 26 gini = 0.494 samples = 38 value = [17, 21] class = y 1 25->26 27 gini = 0.291 samples = 17 value = [14, 3] class = y 0 25->27 29 Age ≤ 28.5 gini = 0.482 samples = 116 value = [47, 69] class = y 1 28->29 34 DiabetesPedigreeFunction ≤ 0.307 gini = 0.219 samples = 72 value = [9, 63] class = y 1 28->34 30 gini = 0.49 samples = 42 value = [24, 18] class = y 0 29->30 31 DiabetesPedigreeFunction ≤ 0.429 gini = 0.428 samples = 74 value = [23, 51] class = y 1 29->31 32 gini = 0.499 samples = 38 value = [18, 20] class = y 1 31->32 33 gini = 0.239 samples = 36 value = [5, 31] class = y 1 31->33 35 gini = 0.401 samples = 18 value = [5, 13] class = y 1 34->35 36 Insulin ≤ 257.0 gini = 0.137 samples = 54 value = [4, 50] class = y 1 34->36 37 gini = 0.05 samples = 39 value = [1, 38] class = y 1 36->37 38 gini = 0.32 samples = 15 value = [3, 12] class = y 1 36->38
In [0]: