#!/usr/bin/env python
# coding: utf-8

# # Introduction to Data Science
# ## From correlation to supervised segmentation and tree-structured models

# We are going to need a lot of Python packages, so let's start by importing all of them.

# In[ ]:

# Import the libraries we will be using
import os
import numpy as np
import pandas as pd
import math
import matplotlib.patches as patches
import matplotlib.pylab as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import metrics
from sklearn import datasets
from IPython.display import Image

get_ipython().run_line_magic('matplotlib', 'inline')


# We are also going to do a lot of repetitive stuff, so let's predefine some useful functions.

# In[ ]:

# A function that gives a visual representation of the decision tree
def Decision_Tree_Image(decision_tree, feature_names, name="temp"):
    # Export our decision tree to graphviz format
    dot_file = tree.export_graphviz(decision_tree, out_file='images/' + name + '.dot',
                                    feature_names=feature_names)

    # Call graphviz to make an image file from our decision tree
    os.system("dot -T png images/" + name + ".dot -o images/" + name + ".png")

    # Return the .png image so we can see it
    return Image(filename='images/' + name + '.png')


# A function to plot the data
def Plot_Data(data, v1, v2, tv):
    # Set the figure size
    plt.rcParams['figure.figsize'] = [12.0, 8.0]

    # Color the points by the target variable
    color = ["red" if x == 0 else "blue" for x in data[tv]]

    # Plot and label
    plt.scatter(data[v1], data[v2], c=color, s=50)
    plt.xlabel(v1)
    plt.ylabel(v2)
    plt.xlim([min(data[v1]) - 1, max(data[v1]) + 1])
    plt.ylim([min(data[v2]) - .05, max(data[v2]) + .05])


# A function to plot the decision surface of a fitted tree model
# (it also uses the global `data` frame and its "humor"/"number_pets"/"success" columns)
def Decision_Surface(x, y, model, cell_size=.01):
    # Get the grid cell sizes for shading
    x = (min(x), max(x))
    y = (min(y), max(y))
    x_step = (x[1] - x[0]) * cell_size
    y_step = (y[1] - y[0]) * cell_size

    # Create a grid of points covering the plotting area
    # (the "x" column holds the model's first feature, which is plotted on the vertical axis)
    x_values = []
    y_values = []
    for i in np.arange(x[0], x[1], x_step):
        for j in np.arange(y[0], y[1], y_step):
            y_values.append(float(i))
            x_values.append(float(j))
    data_blob = pd.DataFrame({"x": x_values, "y": y_values})

    # Predict the grid labels with the model that was passed in
    label = model.predict(data_blob)

    # Color and plot them
    color = ["red" if l == 0 else "blue" for l in label]
    plt.scatter(data_blob['y'], data_blob['x'], marker='o', edgecolor='black', linewidth=0, c=color, alpha=0.3)

    # Get the raw decision tree rules
    decision_tree_raw = []
    for feature, left_c, right_c, threshold, value in zip(model.tree_.feature,
                                                          model.tree_.children_left,
                                                          model.tree_.children_right,
                                                          model.tree_.threshold,
                                                          model.tree_.value):
        decision_tree_raw.append([feature, left_c, right_c, threshold, value])

    # Plot the data
    Plot_Data(data, "humor", "number_pets", "success")

    # Used for formatting the boundary lines
    currentAxis = plt.gca()
    line_color = "black"
    line_width = 3

    # For each rule (a threshold of -2 marks a leaf, which has no boundary line)
    for row in decision_tree_raw:
        feature, left_c, right_c, threshold, value = row

        if threshold != -2:
            if feature == 0:
                # The first feature ("number_pets") is on the vertical axis: horizontal line
                plt.plot([20, 100], [threshold, threshold], c=line_color, linewidth=line_width)
            else:
                # The second feature ("humor") is on the horizontal axis: vertical line
                plt.plot([threshold, threshold], [0, 5], c=line_color, linewidth=line_width)

    plt.xlim([min(x) - 1, max(x) + 1])
    plt.ylim([min(y) - .05, max(y) + .05])
    plt.show()
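# Note that `Decision_Tree_Image` writes `.dot` and `.png` files into an `images/` folder and shells out to Graphviz's `dot` command. A minimal setup cell, assuming Graphviz is installed and on the `PATH`, might look like this (the folder name just has to match the one used in the helper):

# In[ ]:

# Make sure the folder the tree images are written to actually exists
if not os.path.exists('images'):
    os.makedirs('images')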
# We also need some data, so let's create a dataset consisting of 500 people.

# In[ ]:

# Set the random seed so the data is reproducible
np.random.seed(36)

# Number of users
n_users = 500

# Feature and target names
variable_names = ["age", "humor", "number_pets"]
variables_keep = ["number_pets", "humor"]
target_name = "success"

# Generate data
predictors, target = datasets.make_classification(n_features=3, n_redundant=0,
                                                  n_informative=2, n_clusters_per_class=2,
                                                  n_samples=n_users)
data = pd.DataFrame(predictors, columns=variable_names)

# Rescale the raw features into more realistic ranges
data['age'] = data['age'] * 10 + 50
data['humor'] = data['humor'] * 10 + 50
data['number_pets'] = (data['number_pets'] + 6) / 2
data[target_name] = target

X = data[[variables_keep[0], variables_keep[1]]]
Y = data[target_name]


# ### Useful features
# Let's take a look at one of our features -- `"number_pets"`. Is this feature useful? Let's plot the possible values of `"number_pets"` and color code our target variable, which is, in this case, `"success"`.

# In[ ]:

# Make the plot wide
plt.rcParams['figure.figsize'] = [20.0, 4.0]

color = ["red" if x == 0 else "blue" for x in data["success"]]
plt.scatter(X['number_pets'], [1] * n_users, c=color, s=50)


# Is `"number_pets"` actually useful? Let's quantify it.
# 
# **Entropy** ($H$) and **information gain** ($IG$) are crucial in determining which features are the most informative. Given the data, it is fairly straightforward to calculate both of these.
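# Concretely, for a segment $S$ in which class $i$ occurs with proportion $p_i$,
# 
# $$H(S) = -\sum_i p_i \log p_i,$$
# 
# and the information gain of splitting $S$ into children $S_1, \ldots, S_k$ is
# 
# $$IG(S) = H(S) - \sum_{j=1}^{k} \frac{|S_j|}{|S|} \, H(S_j).$$
# 
# (The `entropy()` implementation below uses the number of distinct target values as the base of the logarithm, so for our binary target this is the usual $\log_2$ and $H$ ranges from 0 to 1.)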
# Figure 3-4. Splitting the "write-off" sample into two segments, based on splitting the Balance attribute (account balance) at 50K.
# 
# Figure 3-5. A classification tree split on the three-valued Residence attribute.
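# As a quick sanity check with made-up counts (not taken from the dataset above): a segment with 7 positive and 3 negative examples has
# 
# $$H = -(0.7 \log_2 0.7 + 0.3 \log_2 0.3) \approx 0.88.$$
# 
# If a split sends 6 positives and 1 negative to one child ($H \approx 0.59$) and 1 positive and 2 negatives to the other ($H \approx 0.92$), then
# 
# $$IG \approx 0.88 - (0.7 \times 0.59 + 0.3 \times 0.92) \approx 0.19.$$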
# In[ ]:

def entropy(target):
    # Get the number of users
    n = len(target)

    # Count how frequently each unique value occurs
    counts = np.bincount(target).astype(float)

    # Initialize entropy
    entropy = 0

    # If the segment is pure (only one value present), entropy is 0
    if len(counts) <= 1 or 0 in counts:
        return entropy

    # Otherwise, for each possible value, update entropy
    # (the log base is the number of distinct values, so entropy is scaled to [0, 1])
    for count in counts:
        entropy += math.log(count/n, len(counts)) * count/n

    # Return entropy
    return -1 * entropy


def information_gain(feature, threshold, target):
    # Dealing with numpy arrays makes this slightly easier
    target = np.array(target)
    feature = np.array(feature)

    # Cut the feature vector on the threshold
    feature = (feature < threshold)

    # Initialize information gain with the parent entropy
    ig = entropy(target)

    # For both sides of the threshold, update information gain
    for level, count in zip([0, 1], np.bincount(feature).astype(float)):
        ig -= count/len(feature) * entropy(target[feature == level])

    # Return information gain
    return ig


# Now that we have a way of calculating $H$ and $IG$, let's pick a threshold, split `"number_pets"`, and calculate $IG$.

# In[ ]:

threshold = 3.2
print("IG = %.4f with a threshold of %.2f." % (information_gain(X['number_pets'], threshold, np.array(Y)), threshold))


# To be more precise, we can iterate through all values and find the best split.

# In[ ]:

def best_threshold():
    maximum_ig = 0
    maximum_threshold = 0

    for threshold in X['number_pets']:
        ig = information_gain(X['number_pets'], threshold, np.array(Y))
        if ig > maximum_ig:
            maximum_ig = ig
            maximum_threshold = threshold

    return "The maximum IG = %.3f and it occurred by splitting on %.4f." % (maximum_ig, maximum_threshold)

print(best_threshold())


# Let's see how we can do this with just sklearn!

# In[ ]:

decision_tree = DecisionTreeClassifier(max_depth=1, criterion="entropy")
decision_tree.fit(X, Y)
Decision_Tree_Image(decision_tree, X.columns)


# Let's look at another one of our features, `"humor"`, and see how it relates to `"number_pets"`.

# In[ ]:

Plot_Data(data, "humor", "number_pets", "success")


# What does our tree look like when we look at it along with the plot we just made?

# In[ ]:

Decision_Surface(data['humor'], data['number_pets'], decision_tree)


# If we use this as our decision tree, how accurate is it?

# In[ ]:

print("Accuracy = %.3f" % (metrics.accuracy_score(decision_tree.predict(X), Y)))


# Let's add one more level to our decision tree.

# In[ ]:

decision_tree = DecisionTreeClassifier(max_depth=2, criterion="entropy")
decision_tree.fit(X, Y)
Decision_Tree_Image(decision_tree, X.columns)


# In[ ]:

Decision_Surface(data['humor'], data['number_pets'], decision_tree)


# In[ ]:

print("Accuracy = %.3f" % (metrics.accuracy_score(decision_tree.predict(X), Y)))


# Not much of a change? Can you see why? Try experimenting with different values of `max_depth` and find which one is the most accurate. Is there a point where the tree stops growing?

# In[ ]:

# Your code here!
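# One possible sketch for the exercise above. It reuses `X` and `Y` and reports training-set accuracy, the same score used earlier; a proper evaluation would use held-out data.

# In[ ]:

# Train trees of increasing depth and watch the accuracy and tree size
for depth in range(1, 11):
    dt = DecisionTreeClassifier(max_depth=depth, criterion="entropy")
    dt.fit(X, Y)
    accuracy = metrics.accuracy_score(dt.predict(X), Y)
    print("max_depth = %2d, accuracy = %.3f, nodes = %d" % (depth, accuracy, dt.tree_.node_count))

# Once every leaf is pure (or can no longer be split), the node count stops changing -- that is the point where the tree stops growing.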