#!/usr/bin/env python
# coding: utf-8

# # **Decision Tree Classification using Python**

# In this case study, we'll look at how a Decision Tree classifier deals with different shapes of data and the kind of decision regions it produces in 2-D.

# In[1]:

import pandas as pd
from sklearn.tree import DecisionTreeClassifier

# ## Linearly separable data
#
# Let's look at the angle data (`angle_train.csv`). It is linearly separable, meaning the two classes can be separated with just a line!

# In[2]:

angle_train = pd.read_csv("angle_train.csv")
test_data = pd.read_csv("test_data.csv")

# plot the training data
angle_train.plot.scatter(x="feat1", y="feat2", c="label", cmap="viridis", colorbar=False, figsize=(12, 8));

# In[3]:

# train the classifier
mx_depth = 1
clf = DecisionTreeClassifier(random_state=1, max_depth=mx_depth)
clf.fit(angle_train[["feat1", "feat2"]], angle_train["label"])

# predict the labels on the test data
test_data["pred"] = clf.predict(test_data[["feat1", "feat2"]])

# plot both datasets in one figure;
# the test points are made 40% transparent using alpha=0.4,
# so the dark points are the training points
ax1 = angle_train.plot.scatter(x="feat1", y="feat2", c="label", cmap="viridis", colorbar=False, figsize=(12, 8))
test_data.plot.scatter(x="feat1", y="feat2", c="pred", cmap="viridis", colorbar=False, figsize=(12, 8), ax=ax1, alpha=0.4);
print("Using max_depth =", mx_depth)

# ### What did we obtain in the graph above?
# - `test_data` contains all the points of a grid covering the feature space
# - We predict a label for each grid point and visualize them in the plot above
# - This way we can visualize the **decision region** obtained by the Decision Tree
#
# What are some of the decision rules learned by this model?

# ## Linearly non-separable data
# Now let's look at how a Decision Tree classifier performs when the data is not linearly separable.

# In[4]:

circles_train = pd.read_csv("circles_train.csv")
test_data = pd.read_csv("test_data.csv")

circles_train.plot.scatter(x="feat1", y="feat2", c="label", cmap="viridis", colorbar=False, figsize=(12, 8));

# In[5]:

mx_depth = 2
clf = DecisionTreeClassifier(random_state=1, max_depth=mx_depth)
clf.fit(circles_train[["feat1", "feat2"]], circles_train["label"])
test_data["pred"] = clf.predict(test_data[["feat1", "feat2"]])
ax1 = circles_train.plot.scatter(x="feat1", y="feat2", c="label", cmap="viridis", colorbar=False, figsize=(12, 8))
test_data.plot.scatter(x="feat1", y="feat2", c="pred", cmap="viridis", colorbar=False, figsize=(12, 8), ax=ax1, alpha=0.4);
print("Using max_depth =", mx_depth)

# Great! Even with non-linear data, the decision tree finds good decision rules!
#
# Notice that we'd need a `max_depth` of at least 4 to fully enclose circular data like the one above: a leaf that captures the inner circle needs both a lower and an upper bound on each of the two features, i.e. four splits on the path to that leaf.
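
# To actually read off the decision rules asked about above, `sklearn.tree.export_text` prints the fitted tree as nested if/else splits. The cell below is a minimal sketch: it assumes `clf` is the classifier fitted in the previous cell (here, the depth-2 tree on `circles_train`) and reuses the same feature names.

# In[ ]:

from sklearn.tree import export_text

# Print the fitted tree as nested if/else splits, one line per node.
# `clf` is assumed to be the DecisionTreeClassifier fitted above.
print(export_text(clf, feature_names=["feat1", "feat2"]))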
# ## Understanding Feature Importance
#
# Now let's understand feature importance visually.
#
# Which of the two features do you think is more important in the plot below?

# In[6]:

vertical_train = pd.read_csv("vertical_train.csv")
test_data = pd.read_csv("test_data.csv")

vertical_train.plot.scatter(x="feat1", y="feat2", c="label", cmap="viridis", colorbar=False, figsize=(12, 8));

# `feat2` seems to be much more important for classifying the above data: with the single rule `if feat2 > 0, class_blue, otherwise class_yellow`, we'd already do a pretty good job!
#
# Notice that we cannot find any such decision line with respect to `feat1`, hence it is not very important for this classification.
#
# Let's visualize the decision tree classifier's predictions to see what rule it learns.

# In[7]:

mx_depth = 1
clf = DecisionTreeClassifier(random_state=1, max_depth=mx_depth)
clf.fit(vertical_train[["feat1", "feat2"]], vertical_train["label"])
test_data["pred"] = clf.predict(test_data[["feat1", "feat2"]])
ax1 = vertical_train.plot.scatter(x="feat1", y="feat2", c="label", cmap="viridis", colorbar=False, figsize=(12, 8))
test_data.plot.scatter(x="feat1", y="feat2", c="pred", cmap="viridis", colorbar=False, figsize=(12, 8), ax=ax1, alpha=0.4);
print("Using max_depth =", mx_depth)

# It does learn the decision region we hypothesized above!
#
# Now let's see what the decision tree classifier thinks about the feature importances.

# In[8]:

clf.feature_importances_

# A low importance is assigned to `feat1` and a much higher importance to `feat2`.

# ---
#
# # **Quiz Problems**

# ## Question 1
#
# > Read the `blobs_train.csv` and `test_data.csv` datasets. Fit a DecisionTreeClassifier (`random_state=1, max_depth=2`) model on the training data. Predict on the testing data and plot the results. What rules does the decision tree learn for `feat1` and `feat2`? Choose the correct answer from the options.
#
# > Please use `cmap="viridis"` as we have above.

# In[ ]:

blobs_train = pd.read_csv("blobs_train.csv")
test_data = pd.read_csv("test_data.csv")

# your code here

# ## Question 2
#
# > Read the `moons_train.csv` and `test_data.csv` train and test datasets. Fit a DecisionTreeClassifier (`random_state=1, max_depth=10`) model on the training data. Plot the training data, then predict on the testing data.
# >
# > Now plot both the training data and the testing data together and reason about the importance of the two features by looking at the plots.
# > Find the feature importances from the classifier and choose the correct answer(s).

# In[ ]:

moons_train = pd.read_csv("moons_train.csv")
test_data = pd.read_csv("test_data.csv")

# your code here
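
# While working through the questions above, it can also help to inspect the fitted tree directly rather than only its decision region. The cell below is a general-purpose sketch, not a model answer: it assumes `clf` is whichever DecisionTreeClassifier you have just fitted and that the features are named `feat1` and `feat2`, as everywhere else in this notebook.

# In[ ]:

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Draw the fitted tree: each node shows its split rule, impurity and class counts.
# `clf` is assumed to be a fitted DecisionTreeClassifier.
plot_tree(clf, feature_names=["feat1", "feat2"], filled=True, ax=axes[0])

# Bar-plot the importances the classifier assigns to the two features.
axes[1].bar(["feat1", "feat2"], clf.feature_importances_)
axes[1].set_ylabel("feature importance")
plt.show()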