We are going to need a lot of Python packages, so let's start by importing all of them.
# Import the libraries we will be using
import os
import numpy as np
import pandas as pd
import math
import matplotlib.patches as patches
import matplotlib.pylab as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import metrics
from sklearn import datasets
from IPython.display import Image
%matplotlib inline
We are also going to do a lot of repetitive stuff, so let's predefine some useful functions.
# A function that gives a visual representation of the decision tree
def Decision_Tree_Image(decision_tree, feature_names, name="temp"):
# Export our decision tree to graphviz format
    tree.export_graphviz(decision_tree, out_file='images/' + name + '.dot', feature_names=feature_names)
# Call graphviz to make an image file from our decision tree
os.system("dot -T png images/" + name + ".dot -o images/" + name + ".png")
# Return the .png image so we can see it
return Image(filename='images/' + name + '.png')
# A function to plot the data
def Plot_Data(data, v1, v2, tv):
    # Set the figure size
plt.rcParams['figure.figsize'] = [12.0, 8.0]
# Color
color = ["red" if x == 0 else "blue" for x in data[tv]]
# Plot and label
plt.scatter(data[v1], data[v2], c=color, s=50)
plt.xlabel(v1)
plt.ylabel(v2)
plt.xlim([min(data[v1]) - 1, max(data[v1]) + 1])
plt.ylim([min(data[v2]) - .05, max(data[v2]) + .05])
def Decision_Surface(x, y, model, cell_size=.01):
# Get blob sizes for shading
x = (min(x), max(x))
y = (min(y), max(y))
x_step = (x[1] - x[0]) * cell_size
y_step = (y[1] - y[0]) * cell_size
# Create blobs
x_values = []
y_values = []
for i in np.arange(x[0], x[1], x_step):
for j in np.arange(y[0], y[1], y_step):
y_values.append(float(i))
x_values.append(float(j))
data_blob = pd.DataFrame({"x": x_values, "y": y_values})
# Predict the blob labels
    label = model.predict(data_blob)
# Color and plot them
color = ["red" if l == 0 else "blue" for l in label]
    plt.scatter(data_blob['y'], data_blob['x'], marker='o', edgecolor='black', linewidth=0, c=color, alpha=0.3)
# Get the raw decision tree rules
decision_tree_raw = []
    for feature, left_c, right_c, threshold, value in zip(model.tree_.feature,
                                                          model.tree_.children_left,
                                                          model.tree_.children_right,
                                                          model.tree_.threshold,
                                                          model.tree_.value):
decision_tree_raw.append([feature, left_c, right_c, threshold, value])
# Plot the data
Plot_Data(data, "humor", "number_pets", "success")
    # Used for formatting the boundary lines
currentAxis = plt.gca()
line_color = "black"
line_width = 3
# For each rule
for row in decision_tree_raw:
feature, left_c, right_c, threshold, value = row
if threshold != -2:
if feature == 0:
plt.plot([20, 100], [threshold, threshold], c=line_color, linewidth=line_width)
else:
plt.plot([threshold, threshold], [0, 5], c=line_color, linewidth=line_width)
plt.xlim([min(x) - 1, max(x) + 1])
plt.ylim([min(y) - .05, max(y) + .05])
plt.show()
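Note that Decision_Tree_Image writes its intermediate files to an images/ directory and shells out to graphviz's dot command, so both need to be available. A small guard like the following (an addition, not part of the original functions) avoids an error on the first run:
# Create the output directory for the tree images if it does not already exist
if not os.path.exists("images"):
    os.makedirs("images")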
We also need some data, so let's create a dataset consisting of 500 people.
# Set the randomness
np.random.seed(36)
# Number of users
n_users = 500
# Relationships
variable_names = ["age", "humor", "number_pets"]
variables_keep = ["number_pets", "humor"]
target_name = "success"
# Generate data
predictors, target = datasets.make_classification(n_features=3, n_redundant=0,
n_informative=2, n_clusters_per_class=2,
n_samples=n_users)
data = pd.DataFrame(predictors, columns=variable_names)
data['age'] = data['age'] * 10 + 50
data['humor'] = data['humor'] * 10 + 50
data['number_pets'] = (data['number_pets'] + 6)/2
data[target_name] = target
X = data[[variables_keep[0], variables_keep[1]]]
Y = data[target_name]
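Before going further, it can be worth a quick peek at the generated data (a simple check, not part of the original walkthrough):
# Inspect the first few rows and the class balance of the target
print(data.head())
print(data["success"].value_counts())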
Let's take a look at one of our features -- "number_pets". Is this feature useful? Let's plot the possible values of "number_pets" and color code our target variable, which in this case is "success".
# Make the plot long
plt.rcParams['figure.figsize'] = [20.0, 4.0]
color = ["red" if x == 0 else "blue" for x in data["success"]]
plt.scatter(X['number_pets'], [1] * n_users, c=color, s=50)
Is "number_pets"
actually useful? Let's quantify it.
Entropy ($H$) and information gain ($IG$) are crucial in determining which features are the most informative. Given the data, it is fairly straightforward to calculate both of them.
$$H = -\sum_{i} p_i \log_2(p_i)$$

$$IG(\text{parent}, \text{children}) = H(\text{parent}) - \left[ p(c_1) \cdot H(c_1) + p(c_2) \cdot H(c_2) + \ldots \right]$$
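For instance, a group with three positive and one negative example has $H = -\left(\tfrac{3}{4}\log_2\tfrac{3}{4} + \tfrac{1}{4}\log_2\tfrac{1}{4}\right) \approx 0.81$, while a perfectly pure group has $H = 0$ and a perfectly mixed two-class group has $H = 1$.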
def entropy(target):
# Get the number of users
n = len(target)
# Count how frequently each unique value occurs
counts = np.bincount(target).astype(float)
# Initialize entropy
entropy = 0
# If the split is perfect, return 0
if len(counts) <= 1 or 0 in counts:
return entropy
# Otherwise, for each possible value, update entropy
for count in counts:
entropy += math.log(count/n, len(counts)) * count/n
# Return entropy
return -1 * entropy
def information_gain(feature, threshold, target):
# Dealing with numpy arrays makes this slightly easier
target = np.array(target)
feature = np.array(feature)
# Cut the feature vector on the threshold
feature = (feature < threshold)
# Initialize information gain with the parent entropy
ig = entropy(target)
# For both sides of the threshold, update information gain
for level, count in zip([0, 1], np.bincount(feature).astype(float)):
ig -= count/len(feature) * entropy(target[feature == level])
# Return information gain
return ig
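As a quick sanity check of these two functions (an illustrative addition, not part of the original notebook), a pure group should have zero entropy, a perfectly mixed two-class group should have entropy 1, and a threshold that separates the classes cleanly should recover all of the parent's entropy as information gain:
# A pure group has entropy 0; a 50/50 two-class group has entropy 1
print(entropy(np.array([1, 1, 1, 1])))
print(entropy(np.array([0, 0, 1, 1])))
# A perfect split recovers the full parent entropy as information gain
print(information_gain(np.array([1, 2, 3, 4]), 2.5, np.array([0, 0, 1, 1])))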
Now that we have a way of calculating $H$ and $IG$, let's pick a threshold, split "number_pets", and calculate $IG$.
threshold = 3.2
print "IG = %.4f with a thresholding of %.2f." % (information_gain(X['number_pets'], threshold, np.array(Y)), threshold)
To be more precise, we can iterate through all values and find the best split.
def best_threshold():
maximum_ig = 0
maximum_threshold = 0
for threshold in X['number_pets']:
ig = information_gain(X['number_pets'], threshold, np.array(Y))
if ig > maximum_ig:
maximum_ig = ig
maximum_threshold = threshold
return "The maximum IG = %.3f and it occured by splitting on %.4f." % (maximum_ig, maximum_threshold)
print(best_threshold())
Let's see how we can do this with just sklearn!
decision_tree = DecisionTreeClassifier(max_depth=1, criterion="entropy")
decision_tree.fit(X, Y)
Decision_Tree_Image(decision_tree, X.columns)
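To confirm that sklearn found essentially the same split we did by hand, we can inspect the fitted tree's root node directly (a quick check using the tree_.feature and tree_.threshold arrays; not part of the original notebook):
# Node 0 is the root: which feature index it splits on and at what threshold
print("Root split: feature index %d, threshold %.4f"
      % (decision_tree.tree_.feature[0], decision_tree.tree_.threshold[0]))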
Let's look at another one of our features, "humor", and see how it relates to "number_pets".
Plot_Data(data, "humor", "number_pets", "success")
Decision_Surface(data['humor'], data['number_pets'], decision_tree)
What does our tree look like when we overlay its decision boundary on the plot we just made?
If we use this as our decision tree, how accurate is it?
print "Accuracy = %.3f" % (metrics.accuracy_score(decision_tree.predict(X), Y))
Let's add one more level to our decision tree.
decision_tree = DecisionTreeClassifier(max_depth=2, criterion="entropy")
decision_tree.fit(X, Y)
Decision_Tree_Image(decision_tree, X.columns)
Decision_Surface(data['humor'], data['number_pets'], decision_tree)
print "Accuracy = %.3f" % (metrics.accuracy_score(decision_tree.predict(X), Y))
Not much of a change? Can you see why? Try experimenting with different values of max_depth and find which one is the most accurate. Is there a point where the tree stops growing?
# Your code here!
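One possible sketch for this exercise (an illustrative solution that simply compares training accuracy across depths; not the only way to do it):
# Fit trees of increasing depth and report training accuracy and the depth actually reached
for depth in range(1, 11):
    fitted_tree = DecisionTreeClassifier(max_depth=depth, criterion="entropy")
    fitted_tree.fit(X, Y)
    accuracy = metrics.accuracy_score(Y, fitted_tree.predict(X))
    print("max_depth = %2d: accuracy = %.3f, actual depth = %d"
          % (depth, accuracy, fitted_tree.tree_.max_depth))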