In this exercise, we'll use decision trees to predict whether students get admitted to college based on their application data.
A decision tree is like a flowchart that asks yes/no questions about your data to make predictions. Unlike logistic regression, which produces a smooth probability curve, a decision tree produces clear "if-then" rules.
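For example, a tree might boil down to a rule like the one below. The thresholds here are invented, just to show the shape of the idea:
# A made-up example of the kind of rule a decision tree can learn
def toy_admission_rule(cgpa, gre_score):
    if cgpa <= 8.5:
        return "Not Admitted"
    elif gre_score <= 320:
        return "Not Admitted"
    else:
        return "Admitted"

print(toy_admission_rule(cgpa=9.1, gre_score=330))  # prints: Admitted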
# JUST RUN THIS
from google.colab import drive
import pandas as pd
drive.mount('/content/gdrive')
# Load the data
df = pd.read_csv('/content/gdrive/MyDrive/datasets/admission_predict.csv')
# Convert "Chance of Admit" to a True/False "Admitted" column
df["Admitted"] = df["Chance of Admit "] > 0.75
df.drop("Chance of Admit ", axis=1, inplace=True)
df.rename(columns={"LOR ": "LOR"}, inplace=True)
# Explore the data
print(f"Total applicants: {len(df)}")
print(f"Admitted: {df['Admitted'].sum()}")
print(f"Not admitted: {(~df['Admitted']).sum()}")
print("\nColumns:")
print(df.columns.tolist())
# Look at a random sample of the data
print("\nRandom sample of 5 applicants:")
df.sample(5)
Just like with logistic regression, we need to split our data for training and testing.
# JUST RUN THIS
from sklearn.model_selection import train_test_split
# Split the data
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
print(f"Training set: {len(df_train)} applicants")
print(f"Test set: {len(df_test)} applicants")
Let's separate our features (X) from our labels (y).
This is exactly like the logistic regression assignment! We need to pick out the feature columns (everything except 'Serial No.' and 'Admitted') for X, and pull out the 'Admitted' column for y.
Remember: double brackets give you a DataFrame (what we want for features), while single brackets give you a Series (what we want for labels).
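Here's a quick illustration of the bracket difference on a tiny made-up DataFrame (not our admissions data):
# JUST RUN THIS (optional) - single vs. double brackets on a toy DataFrame
import pandas as pd
toy = pd.DataFrame({"GRE Score": [320, 300], "CGPA": [9.0, 7.5], "Admitted": [True, False]})
print(type(toy["Admitted"]))              # single brackets -> pandas Series (good for labels)
print(type(toy[["GRE Score", "CGPA"]]))   # double brackets -> pandas DataFrame (good for features)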
def prepare_features(df_train, df_test):
# Input: df_train and df_test are DataFrames with all columns
# Output: Returns X_train, X_test (features only, no Serial No. or Admitted)
# TODO: Your code here!
# 1. First, list all the feature columns you want to use
# Hint: All columns except 'Serial No.' and 'Admitted'
# You can type them out: feature_cols = ['GRE Score', 'TOEFL Score', ...]
# Or use: feature_cols = df_train.columns.drop(['Serial No.', 'Admitted']).tolist()
# 2. Create X_train using double brackets
# X_train = df_train[feature_cols]
# 3. Create X_test the same way
# X_test = df_test[feature_cols]
# 4. Return both (yes, functions can return multiple values!)
# return X_train, X_test
pass
def prepare_labels(df_train, df_test):
# Input: df_train and df_test are DataFrames
# Output: Returns y_train, y_test (just the Admitted column)
# TODO: Your code here!
# 1. Extract the 'Admitted' column from df_train
# y_train = df_train['Admitted'] # Single brackets for a Series!
# 2. Extract the 'Admitted' column from df_test
# y_test = df_test['Admitted']
# 3. Return both values
# return y_train, y_test
pass
# Test your function
X_train, X_test = prepare_features(df_train, df_test)
y_train, y_test = prepare_labels(df_train, df_test)
print(f"Features shape - Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Features used: {X_train.columns.tolist()}")
print(f"Training: {y_train.sum()} admitted out of {len(y_train)}")
print(f"Testing: {y_test.sum()} admitted out of {len(y_test)}")
Time to train our decision tree! The pattern is almost identical to logistic regression: create the model, fit it on the training data, and return it. The only difference is that we're using DecisionTreeClassifier() instead of LogisticRegression().
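As a refresher, here is roughly what that same pattern looked like in the logistic regression assignment (a sketch from memory; your earlier notebook may differ slightly):
# FOR COMPARISON ONLY - the same create/fit/return pattern with LogisticRegression
from sklearn.linear_model import LogisticRegression
def train_logistic_regression(X_train, y_train):
    model = LogisticRegression(max_iter=1000)  # 1. create the model
    model.fit(X_train, y_train)                # 2. train it on the data
    return model                               # 3. return the trained model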
from sklearn.tree import DecisionTreeClassifier
def train_decision_tree(X_train, y_train):
# Input: X_train (features), y_train (labels)
# Output: Returns trained model
# TODO: Your code here!
# 1. Create a DecisionTreeClassifier model
# model = DecisionTreeClassifier(random_state=42)
# Note: random_state ensures consistent results
# 2. Train the model using the fit method
# model.fit(X_train, y_train)
# 3. Return the trained model
# return model
pass
# Train the model
model = train_decision_tree(X_train, y_train)
print("Model trained!")
print(f"Tree depth: {model.get_depth()}")
print(f"Number of leaves: {model.get_n_leaves()}")
One of the best things about decision trees is that we can see exactly how they make decisions!
# JUST RUN THIS
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
plt.figure(figsize=(20, 10))
plot_tree(model,
feature_names=X_train.columns,
class_names=['Not Admitted', 'Admitted'],
filled=True,
max_depth=3) # Only show first 3 levels for clarity
plt.title("Decision Tree for College Admissions (First 3 Levels)")
plt.show()
# What do you notice about which features appear at the top of the tree?
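If the plot is hard to read, scikit-learn can also print the tree as plain text with export_text (optional):
# JUST RUN THIS (optional) - a plain-text view of the same tree
from sklearn.tree import export_text
print(export_text(model, feature_names=list(X_train.columns), max_depth=3))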
def make_predictions(model, X_test):
# Input: model (trained), X_test (features to predict)
# Output: Returns predictions as a pandas Series
# This one is already filled in for you:
# 1. Use the model's predict method, wrapping the result in a pandas Series
#    so it keeps the same index as X_test
# 2. Return the Series
y_pred = pd.Series(model.predict(X_test), index=X_test.index)
return y_pred
# Make predictions
y_pred = make_predictions(model, X_test)
print(f"Predicted {y_pred.sum()} admissions out of {len(y_pred)} applicants")
Let's evaluate the model. Accuracy alone can be misleading when one class is much more common than the other, so the following code calculates the full confusion matrix along with accuracy, precision, and recall.
# JUST RUN THIS
def calculate_confusion_matrix(y_test, y_pred):
# Input: y_test (actual labels), y_pred (predicted labels)
# Output: Returns tp, tn, fp, fn
tp = ((y_test == True) & (y_pred == True)).sum() # True Positive
tn = ((y_test == False) & (y_pred == False)).sum() # True Negative
fp = ((y_test == False) & (y_pred == True)).sum() # False Positive
fn = ((y_test == True) & (y_pred == False)).sum() # False Negative
return tp, tn, fp, fn
# Calculate confusion matrix
tp, tn, fp, fn = calculate_confusion_matrix(y_test, y_pred)
print(" Predicted Positive | Predicted Negative")
print(f"Actual Positive |{tp:>19d} |{fn:>19d} ")
print(f"Actual Negative |{fp:>19d} |{tn:>19d} ")
print("")
# Calculate accuracy, precision, and recall
total = len(y_test)
accuracy = (tp + tn) / total
precision = tp / (tp + fp)
recall = tp / (tp + fn)
print(f"Accuracy: {accuracy:>6.2%} (Correctly classified {tp + tn} out of {total})")
print(f"Precision: {precision:>6.2%} (When predicted positive, correct {precision:.0%} of the time)")
print(f"Recall: {recall:>6.2%} (Found {recall:.0%} of all positive cases)")
Unlike logistic regression coefficients, which are hard to compare when features are on different scales, decision trees directly report how important each feature was for making splits!
# JUST RUN THIS
# Get feature importances
importances = pd.DataFrame({
'Feature': X_train.columns,
'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)
print("Feature Importances (higher = more important):")
print(importances)
# Visualize feature importances
plt.figure(figsize=(10, 6))
plt.bar(importances['Feature'], importances['Importance'])
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importances in Decision Tree')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
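One detail worth knowing: scikit-learn normalizes these importances so they sum to 1, which is why each value reads like a fraction of the tree's total "decision power". You can check:
# JUST RUN THIS (optional)
print(f"Sum of importances: {importances['Importance'].sum():.2f}")  # should print 1.00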
Decision trees can grow very deep and memorize the training data. Let's create a simpler tree that generalizes better.
We can control tree complexity with parameters:
- max_depth: the maximum depth of the tree
- min_samples_split: the minimum number of samples needed to split a node
- min_samples_leaf: the minimum number of samples allowed in a leaf node
The exercise below uses max_depth; an optional sketch after the next code cell shows the other two.
def train_pruned_tree(X_train, y_train, max_depth=3):
# Input: X_train, y_train, and max_depth parameter
# Output: Returns a simpler, pruned decision tree
# TODO: Your code here!
# 1. Create a DecisionTreeClassifier with max_depth limit
# model = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
# 2. Train the model
# model.fit(X_train, y_train)
# 3. Return the model
# return model
pass
# Train a simpler tree
simple_model = train_pruned_tree(X_train, y_train, max_depth=3)
# Evaluate the simpler model
simple_predictions = make_predictions(simple_model, X_test)  # already returns a Series
tp, tn, fp, fn = calculate_confusion_matrix(y_test, simple_predictions)
total = len(y_test)
simple_accuracy = (tp + tn) / (tp + tn + fp + fn)
print(f"Simple Tree (max_depth=3) Test Accuracy: {simple_accuracy:.2%}")
print(f"Original Tree Test Accuracy: {accuracy:.2%}")
print(f"\nSimple tree has {simple_model.get_n_leaves()} leaves vs {model.get_n_leaves()} in original")
Let's see how much simpler our pruned tree is:
# JUST RUN THIS
plt.figure(figsize=(15, 8))
plot_tree(simple_model,
feature_names=X_train.columns,
class_names=['Not Admitted', 'Admitted'],
filled=True,
fontsize=10)
plt.title("Simplified Decision Tree (max_depth=3)")
plt.show()
# This tree is much easier to interpret!
Decision trees can use different criteria to decide how to split: the default is "gini" (Gini impurity), and the main alternative is "entropy" (information gain).
Try creating a model with criterion="entropy" and see if it performs differently!
entropy_model = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=42)
entropy_model.fit(X_train, y_train)
# Evaluate and compare
# BONUS CODE HERE
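If you want a starting point, here is one possible way to finish the comparison, reusing the make_predictions and calculate_confusion_matrix helpers defined earlier in this notebook:
# One possible comparison (assumes the cells above have been run)
entropy_pred = make_predictions(entropy_model, X_test)
tp_e, tn_e, fp_e, fn_e = calculate_confusion_matrix(y_test, entropy_pred)
entropy_accuracy = (tp_e + tn_e) / len(y_test)
print(f"Entropy tree (max_depth=3) accuracy: {entropy_accuracy:.2%}")
print(f"Gini tree (max_depth=3) accuracy:    {simple_accuracy:.2%}")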
One amazing thing about decision trees is we can convert them to simple if-else statements!
This code converts your trained tree into a Python function:
# JUST RUN THIS (if you're curious)
from sklearn.tree import _tree
def tree_to_code(tree, feature_names):
tree_ = tree.tree_
# Column names like "GRE Score" contain spaces, which aren't valid Python
# variable names, so replace spaces with underscores in the generated code
feature_names = [name.replace(" ", "_") for name in feature_names]
feature_name = [
feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
for i in tree_.feature
]
print("def predict_admission({}):".format(", ".join(feature_names)))
def recurse(node, depth):
indent = " " * depth
if tree_.feature[node] != _tree.TREE_UNDEFINED:
name = feature_name[node]
threshold = tree_.threshold[node]
print("{}if {} <= {:.2f}:".format(indent, name, threshold))
recurse(tree_.children_left[node], depth + 1)
print("{}else: # if {} > {:.2f}".format(indent, name, threshold))
recurse(tree_.children_right[node], depth + 1)
else:
# Get the class prediction
values = tree_.value[node][0]
class_idx = values.argmax()
class_name = "'Admitted'" if class_idx == 1 else "'Not Admitted'"
print("{}return {}".format(indent, class_name))
recurse(0, 1)
print("\nYour simple decision tree as Python code:")
print("=" * 50)
tree_to_code(simple_model, X_train.columns)
print("\n" + "=" * 50)
print("You could copy this function and use it anywhere!")