# JUST RUN THIS
from google.colab import drive
import pandas as pd

drive.mount('/content/gdrive')

# Load the data
df = pd.read_csv('/content/gdrive/MyDrive/datasets/admission_predict.csv')

# Convert "Chance of Admit" to a True/False "Admitted" column
# (note the trailing spaces in the original column names)
df["Admitted"] = df["Chance of Admit "] > 0.75
df.drop("Chance of Admit ", axis=1, inplace=True)
df.rename(columns={"LOR ": "LOR"}, inplace=True)

# Explore the data
print(f"Total applicants: {len(df)}")
print(f"Admitted: {df['Admitted'].sum()}")
print(f"Not admitted: {(~df['Admitted']).sum()}")
print("\nColumns:")
print(df.columns.tolist())

# Look at a random sample of the data
print("\nRandom sample of 5 applicants:")
df.sample(5)

# JUST RUN THIS
from sklearn.model_selection import train_test_split

# Split the data
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
print(f"Training set: {len(df_train)} applicants")
print(f"Test set: {len(df_test)} applicants")

def prepare_features(df_train, df_test):
    features = ['GRE Score', 'TOEFL Score', 'University Rating',
                'SOP', 'LOR', 'CGPA', 'Research']
    X_train = df_train[features]
    X_test = df_test[features]
    return X_train, X_test

def prepare_labels(df_train, df_test):
    y_train = df_train['Admitted']
    y_test = df_test['Admitted']
    return y_train, y_test

# Test your functions
X_train, X_test = prepare_features(df_train, df_test)
y_train, y_test = prepare_labels(df_train, df_test)

print(f"Features shape - Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Features used: {X_train.columns.tolist()}")
print(f"Training: {y_train.sum()} admitted out of {len(y_train)}")
print(f"Testing: {y_test.sum()} admitted out of {len(y_test)}")

from sklearn.tree import DecisionTreeClassifier

def train_decision_tree(X_train, y_train):
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    return model

# Train the model
model = train_decision_tree(X_train, y_train)
print("Model trained!")
print(f"Tree depth: {model.get_depth()}")
print(f"Number of leaves: {model.get_n_leaves()}")

# JUST RUN THIS
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

plt.figure(figsize=(20, 10))
plot_tree(model, feature_names=X_train.columns,
          class_names=['Not Admitted', 'Admitted'],  # classes in sorted label order: False, True
          filled=True, max_depth=3)  # Only show first 3 levels for clarity
plt.title("Decision Tree for College Admissions (First 3 Levels)")
plt.show()

# What do you notice about which features appear at the top of the tree?
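# OPTIONAL: a quick overfitting check (an added sketch, assuming the cells
# above have run). An unpruned tree can memorize its training data, so a large
# gap between training and test accuracy hints at overfitting; pruning in the
# next part addresses exactly this.
print(f"Training accuracy: {model.score(X_train, y_train):.2%}")
print(f"Test accuracy:     {model.score(X_test, y_test):.2%}")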
def make_predictions(model, X_test):
    y_pred = pd.Series(model.predict(X_test), index=X_test.index)
    return y_pred

# Make predictions
y_pred = make_predictions(model, X_test)
print(f"Predicted {y_pred.sum()} admissions out of {len(y_pred)} applicants")

# JUST RUN THIS
def calculate_confusion_matrix(y_test, y_pred):
    # Input: y_test (actual labels) and y_pred (predicted labels), both boolean Series
    # Output: returns tp, tn, fp, fn
    tp = ((y_test == True) & (y_pred == True)).sum()    # True Positive
    tn = ((y_test == False) & (y_pred == False)).sum()  # True Negative
    fp = ((y_test == False) & (y_pred == True)).sum()   # False Positive
    fn = ((y_test == True) & (y_pred == False)).sum()   # False Negative
    return tp, tn, fp, fn

# Calculate confusion matrix
tp, tn, fp, fn = calculate_confusion_matrix(y_test, y_pred)
print("                 Predicted Positive | Predicted Negative")
print(f"Actual Positive |{tp:>19d} |{fn:>19d} ")
print(f"Actual Negative |{fp:>19d} |{tn:>19d} ")
print("")

# Calculate accuracy, precision, and recall
total = len(y_test)
accuracy = (tp + tn) / total
precision = tp / (tp + fp)
recall = tp / (tp + fn)
print(f"Accuracy:  {accuracy:>6.2%} (Correctly classified {tp + tn} out of {total})")
print(f"Precision: {precision:>6.2%} (When predicted positive, correct {precision:.0%} of the time)")
print(f"Recall:    {recall:>6.2%} (Found {recall:.0%} of all positive cases)")

# JUST RUN THIS
# Get feature importances
importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)

print("Feature Importances (higher = more important):")
print(importances)

# Visualize feature importances
plt.figure(figsize=(10, 6))
plt.bar(importances['Feature'], importances['Importance'])
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importances in Decision Tree')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

def train_pruned_tree(X_train, y_train, max_depth=3):
    model = DecisionTreeClassifier(max_depth=max_depth)
    model.fit(X_train, y_train)
    return model

# Train a simpler tree
simple_model = train_pruned_tree(X_train, y_train, max_depth=3)

# Evaluate the simpler model (make_predictions already returns a Series)
simple_predictions = make_predictions(simple_model, X_test)
tp, tn, fp, fn = calculate_confusion_matrix(y_test, simple_predictions)
simple_accuracy = (tp + tn) / (tp + tn + fp + fn)

print(f"Simple Tree (max_depth=3) Test Accuracy: {simple_accuracy:.2%}")
print(f"Original Tree Test Accuracy:             {accuracy:.2%}")
print(f"\nSimple tree has {simple_model.get_n_leaves()} leaves vs {model.get_n_leaves()} in original")

# JUST RUN THIS
plt.figure(figsize=(15, 8))
plot_tree(simple_model, feature_names=X_train.columns,
          class_names=['Not Admitted', 'Admitted'],
          filled=True, fontsize=10)
plt.title("Simplified Decision Tree (max_depth=3)")
plt.show()

# This tree is much easier to interpret!
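# OPTIONAL: a depth sweep (an added sketch, assuming the cells above have run).
# Rather than fixing max_depth=3 up front, we can train trees at several depths
# and compare test accuracy to see where pruning stops helping.
for depth in [1, 2, 3, 5, 10, None]:
    candidate = DecisionTreeClassifier(max_depth=depth)
    candidate.fit(X_train, y_train)
    print(f"max_depth={depth}: test accuracy {candidate.score(X_test, y_test):.2%}, "
          f"leaves: {candidate.get_n_leaves()}")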
# BONUS CODE HERE
def train_entropy_tree(X_train, y_train, criterion="entropy"):
    model = DecisionTreeClassifier(criterion=criterion)
    model.fit(X_train, y_train)
    return model

# Train a tree that splits on information gain (entropy) instead of Gini impurity
alt_model = train_entropy_tree(X_train, y_train, "entropy")

# Evaluate the alternative model
alt_predictions = make_predictions(alt_model, X_test)
tp, tn, fp, fn = calculate_confusion_matrix(y_test, alt_predictions)
alt_accuracy = (tp + tn) / (tp + tn + fp + fn)

print(f"Entropy Tree Test Accuracy:  {alt_accuracy:.2%}")
print(f"Original Tree Test Accuracy: {accuracy:.2%}")

# JUST RUN THIS (if you're curious)
from sklearn.tree import _tree

def tree_to_code(tree, feature_names):
    # Walk the fitted tree and print it as an equivalent Python function.
    # Spaces in feature names are replaced with underscores so the
    # generated code is valid Python.
    tree_ = tree.tree_
    safe_names = [name.replace(" ", "_") for name in feature_names]
    feature_name = [
        safe_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print("def predict_admission({}):".format(", ".join(safe_names)))

    def recurse(node, depth):
        indent = "    " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print("{}if {} <= {:.2f}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print("{}else:  # if {} > {:.2f}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            # Leaf node: return the majority class
            values = tree_.value[node][0]
            class_idx = values.argmax()
            class_name = "'Admitted'" if class_idx == 1 else "'Not Admitted'"
            print("{}return {}".format(indent, class_name))

    recurse(0, 1)

print("\nYour simple decision tree as Python code:")
print("=" * 50)
tree_to_code(simple_model, X_train.columns)
print("\n" + "=" * 50)
print("You could copy this function and use it anywhere!")
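# OPTIONAL: a cross-check on tree_to_code (an added sketch, assuming the cells
# above have run). scikit-learn ships its own text exporter, export_text, which
# prints the same splits as indented if/else-style rules.
from sklearn.tree import export_text

print(export_text(simple_model, feature_names=list(X_train.columns)))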