#!/usr/bin/env python
# coding: utf-8

# # Introduction to Data Science
# ## From correlation to supervised segmentation and tree-structured models

# We are going to need a lot of Python packages, so let's start by importing all of them.

# In[ ]:

# Import the libraries we will be using
import os
import numpy as np
import pandas as pd
import math
import matplotlib.patches as patches
import matplotlib.pylab as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import metrics
from sklearn import datasets
from IPython.display import Image

get_ipython().run_line_magic('matplotlib', 'inline')


# We are also going to do a lot of repetitive work, so let's predefine some useful functions.

# In[ ]:

# A function that gives a visual representation of the decision tree
def Decision_Tree_Image(decision_tree, feature_names, name="temp"):
    # Export our fitted decision tree to graphviz format
    dot_file = tree.export_graphviz(decision_tree, out_file='images/' + name + '.dot',
                                    feature_names=feature_names)
    # Call graphviz to make an image file from our decision tree
    os.system("dot -Tpng images/" + name + ".dot -o images/" + name + ".png")
    # Return the .png image so we can see it
    return Image(filename='images/' + name + '.png')


# A function to plot the data
def Plot_Data(data, v1, v2, tv):
    # Set the figure size
    plt.rcParams['figure.figsize'] = [12.0, 8.0]
    # Color the points by the target value
    color = ["red" if x == 0 else "blue" for x in data[tv]]
    # Plot and label
    plt.scatter(data[v1], data[v2], c=color, s=50)
    plt.xlabel(v1)
    plt.ylabel(v2)
    plt.xlim([min(data[v1]) - 1, max(data[v1]) + 1])
    plt.ylim([min(data[v2]) - .05, max(data[v2]) + .05])


# A function to plot the decision surface of a fitted tree model
def Decision_Surface(x, y, model, cell_size=.01):
    # Get blob sizes for shading
    x = (min(x), max(x))
    y = (min(y), max(y))
    x_step = (x[1] - x[0]) * cell_size
    y_step = (y[1] - y[0]) * cell_size

    # Create blobs covering the feature space
    x_values = []
    y_values = []
    for i in np.arange(x[0], x[1], x_step):
        for j in np.arange(y[0], y[1], y_step):
            y_values.append(float(i))
            x_values.append(float(j))
    data_blob = pd.DataFrame({"x": x_values, "y": y_values})

    # Predict the blob labels with the fitted model
    label = model.predict(data_blob)

    # Color and plot them
    color = ["red" if l == 0 else "blue" for l in label]
    plt.scatter(data_blob['y'], data_blob['x'], marker='o', edgecolor='black',
                linewidth=0, c=color, alpha=0.3)

    # Get the raw decision tree rules from the fitted model
    decision_tree_raw = []
    for feature, left_c, right_c, threshold, value in zip(model.tree_.feature,
                                                          model.tree_.children_left,
                                                          model.tree_.children_right,
                                                          model.tree_.threshold,
                                                          model.tree_.value):
        decision_tree_raw.append([feature, left_c, right_c, threshold, value])

    # Plot the data (uses the global `data` frame)
    Plot_Data(data, "humor", "number_pets", "success")

    # Used for formatting the boundary lines
    currentAxis = plt.gca()
    line_color = "black"
    line_width = 3

    # For each rule (a threshold of -2 marks a leaf node, so skip it)
    for row in decision_tree_raw:
        feature, left_c, right_c, threshold, value = row
        if threshold != -2:
            if feature == 0:
                plt.plot([20, 100], [threshold, threshold], c=line_color, linewidth=line_width)
            else:
                plt.plot([threshold, threshold], [0, 5], c=line_color, linewidth=line_width)

    plt.xlim([min(x) - 1, max(x) + 1])
    plt.ylim([min(y) - .05, max(y) + .05])
    plt.show()


# We also need some data, so let's create a dataset consisting of 500 people.
# In[ ]:

# Set the randomness
np.random.seed(36)

# Number of users
n_users = 500

# Relationships
variable_names = ["age", "humor", "number_pets"]
variables_keep = ["number_pets", "humor"]
target_name = "success"

# Generate data
predictors, target = datasets.make_classification(n_features=3, n_redundant=0,
                                                  n_informative=2, n_clusters_per_class=2,
                                                  n_samples=n_users)
data = pd.DataFrame(predictors, columns=variable_names)
data['age'] = data['age'] * 10 + 50
data['humor'] = data['humor'] * 10 + 50
data['number_pets'] = (data['number_pets'] + 6) / 2
data[target_name] = target

X = data[[variables_keep[0], variables_keep[1]]]
Y = data[target_name]


# ### Useful features
# Let's take a look at one of our features -- `"number_pets"`. Is this feature useful? Let's plot the possible values of `"number_pets"` and color-code our target variable, which in this case is `"success"`.

# In[ ]:

# Make the plot wide
plt.rcParams['figure.figsize'] = [20.0, 4.0]

color = ["red" if x == 0 else "blue" for x in data["success"]]
plt.scatter(X['number_pets'], [1] * n_users, c=color, s=50)


# Is `"number_pets"` actually useful? Let's quantify it.
#
# **Entropy** ($H$) and **information gain** ($IG$) are crucial in determining which features are the most informative. Given the data, it is fairly straightforward to calculate both of these.
#
# $$ entropy(S) = - \sum_i p_i \log_2(p_i) $$
#
# $$ IG(parent, children) = entropy(parent) - \left[ p(c_1) \cdot entropy(c_1) + p(c_2) \cdot entropy(c_2) + \ldots \right] $$
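#
# Below is a minimal, illustrative sketch of how these two quantities could be computed for this dataset. The helper names `entropy` and `information_gain`, and the split of `"number_pets"` at a threshold of 3, are assumptions for illustration only, not part of the original notebook.

# In[ ]:

# Illustrative sketch (hypothetical helpers, not the notebook's own functions)
def entropy(target_column):
    # Proportion of each class in the segment
    proportions = np.bincount(target_column) / float(len(target_column))
    # Sum -p * log2(p) over the classes that actually appear
    return -np.sum([p * np.log2(p) for p in proportions if p > 0])


def information_gain(data, feature, threshold, target):
    # Split the records into two child segments at the threshold
    left = data[data[feature] <= threshold][target]
    right = data[data[feature] > threshold][target]
    # Weight each child's entropy by its share of the records
    children_entropy = (len(left) * entropy(left) + len(right) * entropy(right)) / float(len(data))
    # Information gain is the parent's entropy minus the weighted children's entropy
    return entropy(data[target]) - children_entropy


# Example: how much does splitting on "number_pets" at 3 reduce entropy?
print(entropy(data["success"]))
print(information_gain(data, "number_pets", 3, "success"))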