# JUST RUN THIS
from google.colab import drive
import pandas as pd

drive.mount('/content/gdrive')

# Load the data
df = pd.read_csv('/content/gdrive/MyDrive/datasets/admission_predict.csv')

# Convert "Chance of Admit" to a True/False "Admitted" column.
# The trailing spaces in "Chance of Admit " and "LOR " are part of the CSV headers.
df["Admitted"] = df["Chance of Admit "] > 0.75
df.drop("Chance of Admit ", axis=1, inplace=True)
df.rename(columns={"LOR ": "LOR"}, inplace=True)

# Explore the data
print(f"Total applicants: {len(df)}")
print(f"Admitted: {df['Admitted'].sum()}")
print(f"Not admitted: {(~df['Admitted']).sum()}")
print("\nColumns:")
print(df.columns.tolist())

# Look at a random sample of the data
print("\nRandom sample of 5 applicants:")
# Bare expression: a notebook renders it when it is the last statement of a cell.
df.sample(5)


# JUST RUN THIS
from sklearn.model_selection import train_test_split

# Split the data
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
print(f"Training set: {len(df_train)} applicants")
print(f"Test set: {len(df_test)} applicants")


def prepare_features(df_train, df_test):
    """Select the feature columns from the train and test DataFrames.

    Input: df_train and df_test are DataFrames with all columns.
    Output: Returns X_train, X_test (features only, no 'Serial No.' or 'Admitted').
    """
    # Every column except the row identifier and the label is a feature.
    feature_cols = df_train.columns.drop(['Serial No.', 'Admitted']).tolist()
    # Indexing with a list of column names yields a DataFrame.
    X_train = df_train[feature_cols]
    X_test = df_test[feature_cols]
    return X_train, X_test


def prepare_labels(df_train, df_test):
    """Extract the label column from the train and test DataFrames.

    Input: df_train and df_test are DataFrames containing an 'Admitted' column.
    Output: Returns y_train, y_test (just the 'Admitted' column, as Series).
    """
    # Single brackets give a Series rather than a one-column DataFrame.
    y_train = df_train['Admitted']
    y_test = df_test['Admitted']
    return y_train, y_test


# Test your function
X_train, X_test = prepare_features(df_train, df_test)
y_train, y_test = prepare_labels(df_train, df_test)
print(f"Features shape - Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Features used: {X_train.columns.tolist()}")
print(f"Training: {y_train.sum()} admitted out of {len(y_train)}")
print(f"Testing: {y_test.sum()} admitted out of {len(y_test)}")


from sklearn.tree import DecisionTreeClassifier


def train_decision_tree(X_train, y_train):
    """Fit an unconstrained decision tree.

    Input: X_train (features), y_train (labels).
    Output: Returns the trained DecisionTreeClassifier.
    """
    # random_state ensures consistent results between runs.
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, y_train)
    return model


# Train the model
model = train_decision_tree(X_train, y_train)
print("Model trained!")
print(f"Tree depth: {model.get_depth()}")
print(f"Number of leaves: {model.get_n_leaves()}")


# JUST RUN THIS
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

plt.figure(figsize=(20, 10))
plot_tree(model,
          # Pass a plain list: some scikit-learn releases reject a pandas Index here.
          feature_names=X_train.columns.tolist(),
          class_names=['Not Admitted', 'Admitted'],
          filled=True,
          max_depth=3)  # Only show first 3 levels for clarity
plt.title("Decision Tree for College Admissions (First 3 Levels)")
plt.show()
# What do you notice about which features appear at the top of the tree?


def make_predictions(model, X_test):
    """Predict labels for X_test.

    Input: model (trained), X_test (features to predict).
    Output: Returns predictions as a pandas Series.
    """
    # Re-use X_test's index so the predictions align row-by-row with y_test
    # in the element-wise comparisons done by calculate_confusion_matrix.
    y_pred = pd.Series(model.predict(X_test), index=X_test.index)
    return y_pred


# Make predictions
y_pred = make_predictions(model, X_test)
print(f"Predicted {y_pred.sum()} admissions out of {len(y_pred)} applicants")


# JUST RUN THIS
def calculate_confusion_matrix(y_test, y_pred):
    """Count the four confusion-matrix cells.

    Input: y_test (actual labels) and y_pred (predicted labels), two boolean
           Series; they are combined element-wise, so they should share an index.
    Output: Returns tp, tn, fp, fn.
    """
    tp = ((y_test == True) & (y_pred == True)).sum()    # True Positive
    tn = ((y_test == False) & (y_pred == False)).sum()  # True Negative
    fp = ((y_test == False) & (y_pred == True)).sum()   # False Positive
    fn = ((y_test == True) & (y_pred == False)).sum()   # False Negative
    return tp, tn, fp, fn


# Calculate confusion matrix
tp, tn, fp, fn = calculate_confusion_matrix(y_test, y_pred)
print(" Predicted Positive | Predicted Negative")
print(f"Actual Positive |{tp:>19d} |{fn:>19d} ")
print(f"Actual Negative |{fp:>19d} |{tn:>19d} ")
print("")

# Calculate accuracy, precision, and recall
total = len(y_test)
accuracy = (tp + tn) / total
precision = tp / (tp + fp)
recall = tp / (tp + fn)
print(f"Accuracy: {accuracy:>6.2%} (Correctly classified {tp + tn} out of {total})")
print(f"Precision: {precision:>6.2%} (When predicted positive, correct {precision:.0%} of the time)")
print(f"Recall: {recall:>6.2%} (Found {recall:.0%} of all positive cases)")


# JUST RUN THIS
# Get feature importances
importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)
print("Feature Importances (higher = more important):")
print(importances)

# Visualize feature importances
plt.figure(figsize=(10, 6))
plt.bar(importances['Feature'], importances['Importance'])
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importances in Decision Tree')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


def train_pruned_tree(X_train, y_train, max_depth=3):
    """Fit a depth-limited decision tree.

    Input: X_train, y_train, and max_depth parameter.
    Output: Returns a simpler, pruned decision tree.
    """
    model = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    model.fit(X_train, y_train)
    return model


# Train a simpler tree
simple_model = train_pruned_tree(X_train, y_train, max_depth=3)

# Evaluate the simpler model (make_predictions already returns a Series)
simple_predictions = make_predictions(simple_model, X_test)
tp, tn, fp, fn = calculate_confusion_matrix(y_test, simple_predictions)
simple_accuracy = (tp + tn) / (tp + tn + fp + fn)
print(f"Simple Tree (max_depth=3) Test Accuracy: {simple_accuracy:.2%}")
print(f"Original Tree Test Accuracy: {accuracy:.2%}")
print(f"\nSimple tree has {simple_model.get_n_leaves()} leaves vs {model.get_n_leaves()} in original")


# JUST RUN THIS
plt.figure(figsize=(15, 8))
plot_tree(simple_model,
          feature_names=X_train.columns.tolist(),
          class_names=['Not Admitted', 'Admitted'],
          filled=True,
          fontsize=10)
plt.title("Simplified Decision Tree (max_depth=3)")
plt.show()
# This tree is much easier to interpret!


# BONUS CODE HERE
# JUST RUN THIS (if you're curious)
import keyword
import re

from sklearn.tree import _tree


def _to_identifier(name):
    """Turn a column label such as 'GRE Score' into a valid Python identifier."""
    ident = re.sub(r"\W+", "_", str(name)).strip("_")
    if not ident or ident[0].isdigit():
        ident = f"_{ident}"
    if keyword.iskeyword(ident):
        ident += "_"
    return ident


def tree_to_code(tree, feature_names, decimals=2):
    """Print a fitted decision tree as a nested if/else Python function.

    Input: tree (fitted classifier exposing .tree_), feature_names (one label
           per feature column), decimals (digits shown for each threshold).
    Output: Nothing is returned; the generated source is printed.

    Feature labels are converted to valid identifiers so the printed function
    can actually be pasted and run. Thresholds are rounded to `decimals` digits
    for readability, so the printed code may disagree with the model for values
    that fall between the true and the rounded threshold.
    """
    tree_ = tree.tree_
    arg_names = [_to_identifier(label) for label in feature_names]
    # Leaves carry TREE_UNDEFINED instead of a feature index.
    feature_name = [
        arg_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print("def predict_admission({}):".format(", ".join(arg_names)))

    def recurse(node, depth):
        indent = " " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = f"{tree_.threshold[node]:.{decimals}f}"
            print("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print("{}else: # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            # Get the class prediction: the class with the largest value at this leaf
            values = tree_.value[node][0]
            class_idx = values.argmax()
            class_name = "'Admitted'" if class_idx == 1 else "'Not Admitted'"
            print("{}return {}".format(indent, class_name))

    recurse(0, 1)


print("\nYour simple decision tree as Python code:")
print("=" * 50)
tree_to_code(simple_model, X_train.columns)
print("\n" + "=" * 50)
print("You could copy this function and use it anywhere!")