# JUST RUN THIS
from google.colab import drive
import pandas as pd

drive.mount('/content/gdrive')

# Load the data
df = pd.read_csv('/content/gdrive/MyDrive/datasets/admission_predict.csv')

# Convert "Chance of Admit" to a True/False "Admitted" column
# (note the trailing spaces in the original column names)
df["Admitted"] = df["Chance of Admit "] > 0.75
df.drop("Chance of Admit ", axis=1, inplace=True)
df.rename(columns={"LOR ": "LOR"}, inplace=True)

# Explore the data
print(f"Total applicants: {len(df)}")
print(f"Admitted: {df['Admitted'].sum()}")
print(f"Not admitted: {(~df['Admitted']).sum()}")
print("\nColumns:")
print(df.columns.tolist())

# Look at a random sample of the data
print("\nRandom sample of 5 applicants:")
df.sample(5)

# JUST RUN THIS
from sklearn.model_selection import train_test_split

# Split the data
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
print(f"Training set: {len(df_train)} applicants")
print(f"Test set: {len(df_test)} applicants")

def prepare_features(df_train, df_test):
    features = ['GRE Score', 'TOEFL Score', 'University Rating',
                'SOP', 'LOR', 'CGPA', 'Research']
    X_train = df_train[features]
    X_test = df_test[features]
    return X_train, X_test

def prepare_labels(df_train, df_test):
    y_train = df_train['Admitted']
    y_test = df_test['Admitted']
    return y_train, y_test

# Test your functions
X_train, X_test = prepare_features(df_train, df_test)
y_train, y_test = prepare_labels(df_train, df_test)

print(f"Features shape - Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Features used: {X_train.columns.tolist()}")
print(f"Training: {y_train.sum()} admitted out of {len(y_train)}")
print(f"Testing: {y_test.sum()} admitted out of {len(y_test)}")

from sklearn.tree import DecisionTreeClassifier

def train_decision_tree(X_train, y_train):
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    return model

# Train the model
model = train_decision_tree(X_train, y_train)
print("Model trained!")
print(f"Tree depth: {model.get_depth()}")
print(f"Number of leaves: {model.get_n_leaves()}")

# JUST RUN THIS
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

plt.figure(figsize=(20, 10))
plot_tree(model, feature_names=X_train.columns,
          class_names=['Not Admitted', 'Admitted'],  # classes in sorted label order: False, True
          filled=True, max_depth=3)  # Only show first 3 levels for clarity
plt.title("Decision Tree for College Admissions (First 3 Levels)")
plt.show()

# What do you notice about which features appear at the top of the tree?
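# OPTIONAL: a quick overfitting check (an added sketch, assuming the cells
# above have run). An unpruned tree can memorize its training data, so a large
# gap between training and test accuracy hints at overfitting; pruning in the
# next part addresses exactly this.
print(f"Training accuracy: {model.score(X_train, y_train):.2%}")
print(f"Test accuracy:     {model.score(X_test, y_test):.2%}")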
def make_predictions(model, X_test):
    y_pred = pd.Series(model.predict(X_test), index=X_test.index)
    return y_pred

# Make predictions
y_pred = make_predictions(model, X_test)
print(f"Predicted {y_pred.sum()} admissions out of {len(y_pred)} applicants")

# JUST RUN THIS
def calculate_confusion_matrix(y_test, y_pred):
    # Input: y_test (actual labels) and y_pred (predicted labels), both boolean Series
    # Output: returns tp, tn, fp, fn
    tp = ((y_test == True) & (y_pred == True)).sum()    # True Positive
    tn = ((y_test == False) & (y_pred == False)).sum()  # True Negative
    fp = ((y_test == False) & (y_pred == True)).sum()   # False Positive
    fn = ((y_test == True) & (y_pred == False)).sum()   # False Negative
    return tp, tn, fp, fn

# Calculate confusion matrix
tp, tn, fp, fn = calculate_confusion_matrix(y_test, y_pred)
print("                 Predicted Positive | Predicted Negative")
print(f"Actual Positive |{tp:>19d} |{fn:>19d} ")
print(f"Actual Negative |{fp:>19d} |{tn:>19d} ")
print("")

# Calculate accuracy, precision, and recall
total = len(y_test)
accuracy = (tp + tn) / total
precision = tp / (tp + fp)
recall = tp / (tp + fn)
print(f"Accuracy:  {accuracy:>6.2%} (Correctly classified {tp + tn} out of {total})")
print(f"Precision: {precision:>6.2%} (When predicted positive, correct {precision:.0%} of the time)")
print(f"Recall:    {recall:>6.2%} (Found {recall:.0%} of all positive cases)")

# JUST RUN THIS
# Get feature importances
importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)

print("Feature Importances (higher = more important):")
print(importances)

# Visualize feature importances
plt.figure(figsize=(10, 6))
plt.bar(importances['Feature'], importances['Importance'])
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importances in Decision Tree')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

def train_pruned_tree(X_train, y_train, max_depth=3):
    model = DecisionTreeClassifier(max_depth=max_depth)
    model.fit(X_train, y_train)
    return model

# Train a simpler tree
simple_model = train_pruned_tree(X_train, y_train, max_depth=3)

# Evaluate the simpler model (make_predictions already returns a Series)
simple_predictions = make_predictions(simple_model, X_test)
tp, tn, fp, fn = calculate_confusion_matrix(y_test, simple_predictions)
simple_accuracy = (tp + tn) / (tp + tn + fp + fn)

print(f"Simple Tree (max_depth=3) Test Accuracy: {simple_accuracy:.2%}")
print(f"Original Tree Test Accuracy:             {accuracy:.2%}")
print(f"\nSimple tree has {simple_model.get_n_leaves()} leaves vs {model.get_n_leaves()} in original")

# JUST RUN THIS
plt.figure(figsize=(15, 8))
plot_tree(simple_model, feature_names=X_train.columns,
          class_names=['Not Admitted', 'Admitted'],
          filled=True, fontsize=10)
plt.title("Simplified Decision Tree (max_depth=3)")
plt.show()

# This tree is much easier to interpret!
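# OPTIONAL: a depth sweep (an added sketch, assuming the cells above have run).
# Rather than fixing max_depth=3 up front, we can train trees at several depths
# and compare test accuracy to see where pruning stops helping.
for depth in [1, 2, 3, 5, 10, None]:
    candidate = DecisionTreeClassifier(max_depth=depth)
    candidate.fit(X_train, y_train)
    print(f"max_depth={depth}: test accuracy {candidate.score(X_test, y_test):.2%}, "
          f"leaves: {candidate.get_n_leaves()}")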
# BONUS CODE HERE
def train_entropy_tree(X_train, y_train, criterion="entropy"):
    model = DecisionTreeClassifier(criterion=criterion)
    model.fit(X_train, y_train)
    return model

# Train a tree that splits on information gain (entropy) instead of Gini impurity
alt_model = train_entropy_tree(X_train, y_train, "entropy")

# Evaluate the alternative model
alt_predictions = make_predictions(alt_model, X_test)
tp, tn, fp, fn = calculate_confusion_matrix(y_test, alt_predictions)
alt_accuracy = (tp + tn) / (tp + tn + fp + fn)

print(f"Entropy Tree Test Accuracy:  {alt_accuracy:.2%}")
print(f"Original Tree Test Accuracy: {accuracy:.2%}")

# JUST RUN THIS (if you're curious)
from sklearn.tree import _tree

def tree_to_code(tree, feature_names):
    # Walk the fitted tree and print it as an equivalent Python function.
    # Spaces in feature names are replaced with underscores so the
    # generated code is valid Python.
    tree_ = tree.tree_
    safe_names = [name.replace(" ", "_") for name in feature_names]
    feature_name = [
        safe_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print("def predict_admission({}):".format(", ".join(safe_names)))

    def recurse(node, depth):
        indent = "    " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print("{}if {} <= {:.2f}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print("{}else:  # if {} > {:.2f}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            # Leaf node: return the majority class
            values = tree_.value[node][0]
            class_idx = values.argmax()
            class_name = "'Admitted'" if class_idx == 1 else "'Not Admitted'"
            print("{}return {}".format(indent, class_name))

    recurse(0, 1)

print("\nYour simple decision tree as Python code:")
print("=" * 50)
tree_to_code(simple_model, X_train.columns)
print("\n" + "=" * 50)
print("You could copy this function and use it anywhere!")
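# OPTIONAL: a cross-check on tree_to_code (an added sketch, assuming the cells
# above have run). scikit-learn ships its own text exporter, export_text, which
# prints the same splits as indented if/else-style rules.
from sklearn.tree import export_text

print(export_text(simple_model, feature_names=list(X_train.columns)))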