We are going to need a lot of Python packages, so let's start by importing all of them.
# Import the libraries we will be using
import os
import numpy as np
import pandas as pd
import math
import matplotlib.patches as patches
import matplotlib.pylab as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import metrics
from sklearn import datasets
from IPython.display import Image
%matplotlib inline
We are also going to do a lot of repetitive stuff, so let's predefine some useful functions.
# A function that gives a visual representation of the decision tree
def Decision_Tree_Image(decision_tree, feature_names, name="temp"):
# Export our decision tree to graphviz format
    tree.export_graphviz(decision_tree, out_file='images/' + name + '.dot', feature_names=feature_names)
# Call graphviz to make an image file from our decision tree
os.system("dot -T png images/" + name + ".dot -o images/" + name + ".png")
# Return the .png image so we can see it
return Image(filename='images/' + name + '.png')
# A function to plot the data
def Plot_Data(data, v1, v2, tv):
    # Set the figure size
plt.rcParams['figure.figsize'] = [12.0, 8.0]
# Color
color = ["red" if x == 0 else "blue" for x in data[tv]]
# Plot and label
plt.scatter(data[v1], data[v2], c=color, s=50)
plt.xlabel(v1)
plt.ylabel(v2)
plt.xlim([min(data[v1]) - 1, max(data[v1]) + 1])
plt.ylim([min(data[v2]) - .05, max(data[v2]) + .05])
def Decision_Surface(x, y, model, cell_size=.01):
# Get blob sizes for shading
x = (min(x), max(x))
y = (min(y), max(y))
x_step = (x[1] - x[0]) * cell_size
y_step = (y[1] - y[0]) * cell_size
# Create blobs
x_values = []
y_values = []
for i in np.arange(x[0], x[1], x_step):
for j in np.arange(y[0], y[1], y_step):
y_values.append(float(i))
x_values.append(float(j))
data_blob = pd.DataFrame({"x": x_values, "y": y_values})
# Predict the blob labels
    label = model.predict(data_blob)
# Color and plot them
color = ["red" if l == 0 else "blue" for l in label]
    plt.scatter(data_blob['y'], data_blob['x'], marker='o', edgecolor='black', linewidth=0, c=color, alpha=0.3)
# Get the raw decision tree rules
decision_tree_raw = []
    for feature, left_c, right_c, threshold, value in zip(model.tree_.feature,
                                                          model.tree_.children_left,
                                                          model.tree_.children_right,
                                                          model.tree_.threshold,
                                                          model.tree_.value):
decision_tree_raw.append([feature, left_c, right_c, threshold, value])
# Plot the data
Plot_Data(data, "humor", "number_pets", "success")
    # Used for formatting the boundary lines
currentAxis = plt.gca()
line_color = "black"
line_width = 3
# For each rule
for row in decision_tree_raw:
feature, left_c, right_c, threshold, value = row
if threshold != -2:
if feature == 0:
plt.plot([20, 100], [threshold, threshold], c=line_color, linewidth=line_width)
else:
plt.plot([threshold, threshold], [0, 5], c=line_color, linewidth=line_width)
plt.xlim([min(x) - 1, max(x) + 1])
plt.ylim([min(y) - .05, max(y) + .05])
plt.show()
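Note that Decision_Tree_Image writes its intermediate files to an images/ directory and shells out to graphviz's dot command, so both need to be available. A small guard like the following (an addition, not part of the original functions) avoids an error on the first run:
# Create the output directory for the tree images if it does not already exist
if not os.path.exists("images"):
    os.makedirs("images")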
We also need some data, so let's create a dataset consisting of 500 people.
# Set the randomness
np.random.seed(36)
# Number of users
n_users = 500
# Relationships
variable_names = ["age", "humor", "number_pets"]
variables_keep = ["number_pets", "humor"]
target_name = "success"
# Generate data
predictors, target = datasets.make_classification(n_features=3, n_redundant=0,
n_informative=2, n_clusters_per_class=2,
n_samples=n_users)
data = pd.DataFrame(predictors, columns=variable_names)
data['age'] = data['age'] * 10 + 50
data['humor'] = data['humor'] * 10 + 50
data['number_pets'] = (data['number_pets'] + 6)/2
data[target_name] = target
X = data[[variables_keep[0], variables_keep[1]]]
Y = data[target_name]
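Before going further, it can be worth a quick peek at the generated data (a simple check, not part of the original walkthrough):
# Inspect the first few rows and the class balance of the target
print(data.head())
print(data["success"].value_counts())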
Let's take a look at one of our features -- "number_pets". Is this feature useful? Let's plot the possible values of "number_pets" and color code our target variable, which in this case is "success".
# Make the plot long
plt.rcParams['figure.figsize'] = [20.0, 4.0]
color = ["red" if x == 0 else "blue" for x in data["success"]]
plt.scatter(X['number_pets'], [1] * n_users, c=color, s=50)
Is "number_pets"
actually useful? Let's quantify it.
Entropy ($H$) and information gain ($IG$) are crucial in determining which features are the most informative. Given the data, it is fairly straightforward to calculate both of them.
$$H = -\sum_{i} p_i \log_2(p_i)$$

$$IG(\text{parent}, \text{children}) = H(\text{parent}) - \left[ p(c_1) \cdot H(c_1) + p(c_2) \cdot H(c_2) + \ldots \right]$$
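For instance, a group with three positive and one negative example has $H = -\left(\tfrac{3}{4}\log_2\tfrac{3}{4} + \tfrac{1}{4}\log_2\tfrac{1}{4}\right) \approx 0.81$, while a perfectly pure group has $H = 0$ and a perfectly mixed two-class group has $H = 1$.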
def entropy(target):
# Get the number of users
n = len(target)
# Count how frequently each unique value occurs
counts = np.bincount(target).astype(float)
# Initialize entropy
entropy = 0
# If the split is perfect, return 0
if len(counts) <= 1 or 0 in counts:
return entropy
# Otherwise, for each possible value, update entropy
for count in counts:
entropy += math.log(count/n, len(counts)) * count/n
# Return entropy
return -1 * entropy
def information_gain(feature, threshold, target):
# Dealing with numpy arrays makes this slightly easier
target = np.array(target)
feature = np.array(feature)
# Cut the feature vector on the threshold
feature = (feature < threshold)
# Initialize information gain with the parent entropy
ig = entropy(target)
# For both sides of the threshold, update information gain
for level, count in zip([0, 1], np.bincount(feature).astype(float)):
ig -= count/len(feature) * entropy(target[feature == level])
# Return information gain
return ig
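As a quick sanity check of these two functions (an illustrative addition, not part of the original notebook), a pure group should have zero entropy, a perfectly mixed two-class group should have entropy 1, and a threshold that separates the classes cleanly should recover all of the parent's entropy as information gain:
# A pure group has entropy 0; a 50/50 two-class group has entropy 1
print(entropy(np.array([1, 1, 1, 1])))
print(entropy(np.array([0, 0, 1, 1])))
# A perfect split recovers the full parent entropy as information gain
print(information_gain(np.array([1, 2, 3, 4]), 2.5, np.array([0, 0, 1, 1])))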
Now that we have a way of calculating $H$ and $IG$, let's pick a threshold, split "number_pets", and calculate $IG$.
threshold = 3.2
print "IG = %.4f with a thresholding of %.2f." % (information_gain(X['number_pets'], threshold, np.array(Y)), threshold)
To be more precise, we can iterate through all values and find the best split.
def best_threshold():
maximum_ig = 0
maximum_threshold = 0
for threshold in X['number_pets']:
ig = information_gain(X['number_pets'], threshold, np.array(Y))
if ig > maximum_ig:
maximum_ig = ig
maximum_threshold = threshold
return "The maximum IG = %.3f and it occured by splitting on %.4f." % (maximum_ig, maximum_threshold)
print(best_threshold())
Let's see how we can do this with just sklearn!
decision_tree = DecisionTreeClassifier(max_depth=1, criterion="entropy")
decision_tree.fit(X, Y)
Decision_Tree_Image(decision_tree, X.columns)
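To confirm that sklearn found essentially the same split we did by hand, we can inspect the fitted tree's root node directly (a quick check using the tree_.feature and tree_.threshold arrays; not part of the original notebook):
# Node 0 is the root: which feature index it splits on and at what threshold
print("Root split: feature index %d, threshold %.4f"
      % (decision_tree.tree_.feature[0], decision_tree.tree_.threshold[0]))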
Let's look at another one of our features, "humor", and see how it relates to "number_pets".
Plot_Data(data, "humor", "number_pets", "success")
Decision_Surface(data['humor'], data['number_pets'], decision_tree)
What does our tree look like when we overlay its decision boundary on the plot we just made?
If we use this as our decision tree, how accurate is it?
print "Accuracy = %.3f" % (metrics.accuracy_score(decision_tree.predict(X), Y))
Let's add one more level to our decision tree.
decision_tree = DecisionTreeClassifier(max_depth=2, criterion="entropy")
decision_tree.fit(X, Y)
Decision_Tree_Image(decision_tree, X.columns)
Decision_Surface(data['humor'], data['number_pets'], decision_tree)
print "Accuracy = %.3f" % (metrics.accuracy_score(decision_tree.predict(X), Y))
Not much of a change? Can you see why? Try experimenting with different values of max_depth and find which one is the most accurate. Is there a point where the tree stops growing?
# Your code here!
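One possible sketch for this exercise (an illustrative solution that simply compares training accuracy across depths; not the only way to do it):
# Fit trees of increasing depth and report training accuracy and the depth actually reached
for depth in range(1, 11):
    fitted_tree = DecisionTreeClassifier(max_depth=depth, criterion="entropy")
    fitted_tree.fit(X, Y)
    accuracy = metrics.accuracy_score(Y, fitted_tree.predict(X))
    print("max_depth = %2d: accuracy = %.3f, actual depth = %d"
          % (depth, accuracy, fitted_tree.tree_.max_depth))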