In this exercise, we'll use logistic regression to predict whether students get admitted to college based on their application data.
While linear regression predicts continuous values (like passenger numbers), logistic regression predicts categories (like Yes/No, True/False). Perfect for admission decisions!
# JUST RUN THIS
from google.colab import drive
import pandas as pd
# Make Google Drive visible to this Colab runtime.
drive.mount('/content/gdrive')

# Read the admissions dataset from the mounted drive.
df = pd.read_csv('/content/gdrive/MyDrive/datasets/admission_predict.csv')

# Build the True/False "Admitted" label from the numeric chance, then tidy
# the columns. Note the trailing space in the raw names "Chance of Admit "
# and "LOR ".
admitted_flags = df["Chance of Admit "] > 0.75
df["Admitted"] = admitted_flags
df = df.drop(columns=["Chance of Admit "])
df = df.rename(columns={"LOR ": "LOR"})

# Summarise the dataset.
n_admitted = df['Admitted'].sum()
n_rejected = (~df['Admitted']).sum()
print(f"Total applicants: {len(df)}")
print(f"Admitted: {n_admitted}")
print(f"Not admitted: {n_rejected}")

column_names = df.columns.tolist()
print("\nColumns:")
print(column_names)

# Show a handful of randomly chosen rows (rendered as the cell output).
print("\nRandom sample of 5 applicants:")
df.sample(5)
First, we need to split our data into two sets:
- `df_train`: This is the data we'll train the model on.
- `df_test`: This is a special set we'll hold out so we can test how well our model did later.

# JUST RUN THIS
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set (done for you!).
# random_state is fixed so the split is the same on every run.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

for split_label, split_frame in (("Training", df_train), ("Test", df_test)):
    print(f"{split_label} set: {len(split_frame)} applicants")
Now we need to separate our features (X) from our labels (y).
Features (X): The information we use to make predictions (test scores, GPA, etc.)
Labels (y): What we're trying to predict (Admitted: True/False)
When selecting columns in pandas:
- `df['column']` returns a Series (1D)
- `df[['column']]` returns a DataFrame (2D)

sklearn needs features as a DataFrame (2D), so we use double brackets!
# Single vs double brackets: compare the type each selection produces.
print("Single brackets - Series:")
one_dimensional = df['GRE Score']
print(type(one_dimensional))  # pandas Series

print("\nDouble brackets - DataFrame:")
two_dimensional = df[['GRE Score']]
print(type(two_dimensional))  # pandas DataFrame
This function will need to return two values at once and, YES, THAT'S POSSIBLE IN PYTHON!
This is done by your return having two values, comma-separated, and assigning the results to two variables, comma-separated.
def my_function_that_returns_two():
    """Return a string and an integer together (as one comma-separated pair)."""
    greeting = "hello world"
    answer = 42
    return greeting, answer


# Unpack the pair into two separate variables.
my_str, my_number = my_function_that_returns_two()
In our case, we'll want our function to return both `X_train` and `X_test`:
def prepare_features(df_train, df_test):
...
return X_train, X_test
Once you pick the features (look at the list printed out in Part 1) that you think will predict admission, you can select them all at once to build the new DataFrame.
X_train = df_train[['GRE Score', 'University Rating', 'CGPA']]
X_test = df_test[['GRE Score', 'University Rating', 'CGPA']]
return X_train, X_test
Now let's select our features:
def prepare_features(df_train, df_test):
    """Select the feature columns from the training and test DataFrames.

    Every column of ``df_train`` except the row identifier ('Serial No.')
    and the label ('Admitted') is used as a feature. The same column list
    is applied to ``df_test`` so both results line up column-for-column.

    Args:
        df_train: DataFrame with all columns for the training rows.
        df_test: DataFrame with all columns for the test rows.

    Returns:
        A pair ``(X_train, X_test)`` of DataFrames holding only the
        feature columns.

    Raises:
        KeyError: If ``df_test`` lacks a feature column found in ``df_train``.
    """
    non_features = {'Serial No.', 'Admitted'}
    # Filtering (rather than dropping by name) keeps this working even when
    # one of the excluded columns is not present.
    feature_cols = [col for col in df_train.columns if col not in non_features]

    # Selecting with a list of names yields a 2D DataFrame, which is what
    # sklearn needs for features.
    X_train = df_train[feature_cols]
    X_test = df_test[feature_cols]
    return X_train, X_test
# Test your function
X_train, X_test = prepare_features(df_train, df_test)
print(f"Features shape - Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Features used: {X_train.columns.tolist()}")

# Check to make sure the same features were picked for X_train and X_test.
# (Compare the feature frames themselves, not the unsplit-column frames.)
assert X_train.columns.equals(X_test.columns), \
    "X_train and X_test must use the same feature columns"
Now let's get our labels (what we're trying to predict):
This is similar to Part 3, but now we're returning two Series instead of two DataFrames.
def prepare_labels(df_train, df_test):
    """Extract the 'Admitted' label column from both DataFrames.

    Args:
        df_train: DataFrame containing an 'Admitted' column (training rows).
        df_test: DataFrame containing an 'Admitted' column (test rows).

    Returns:
        A pair ``(y_train, y_test)`` of Series holding just the labels.

    Raises:
        KeyError: If either DataFrame has no 'Admitted' column.
    """
    # Single brackets on purpose: labels are a 1D Series, not a DataFrame.
    y_train = df_train['Admitted']
    y_test = df_test['Admitted']
    return y_train, y_test
# Try the function out and report how many applicants were admitted per split.
y_train, y_test = prepare_labels(df_train, df_test)

for split_name, split_labels in (("Training", y_train), ("Testing", y_test)):
    print(f"{split_name}: {split_labels.sum()} admitted out of {len(split_labels)}")
Time to train our logistic regression model! This is very similar to linear regression:
model = LogisticRegression()
model.fit(X_train, y_train)
from sklearn.linear_model import LogisticRegression
def train_logistic_model(X_train, y_train):
    """Create a logistic regression model and fit it to the training data.

    Args:
        X_train: Features to train on.
        y_train: Labels matching the rows of ``X_train``.

    Returns:
        The trained ``LogisticRegression`` model.
    """
    # max_iter=1000 gives the model more iterations to converge.
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    return model


# Train the model
model = train_logistic_model(X_train, y_train)
print("Model trained!")
Now let's use our trained model to make predictions on the test set:
Use `model.predict(X_test)`.
def make_predictions(model, X_test):
    """Predict a label for every row of ``X_test``.

    Args:
        model: A trained model exposing a ``predict`` method.
        X_test: Features to predict on.

    Returns:
        A pandas Series of predictions. When ``X_test`` has an ``index``
        (e.g. it is a DataFrame), the Series carries that same index so the
        predictions line up row-for-row with the matching labels.
    """
    predictions = model.predict(X_test)

    # Reuse the row labels of X_test. A default 0..n-1 index would not match
    # the index of labels taken from the same rows, and label-aligned
    # comparisons between the two would silently pair up the wrong rows.
    row_labels = getattr(X_test, "index", None)
    return pd.Series(predictions, index=row_labels)
# Make predictions
raw_predictions = make_predictions(model, X_test)

# Attach the row labels of X_test to the predictions. Strip any existing
# index first so the values are placed by position rather than re-aligned.
if isinstance(raw_predictions, pd.Series):
    raw_predictions = raw_predictions.to_numpy()
y_pred = pd.Series(raw_predictions, index=X_test.index)

print(f"Predicted {y_pred.sum()} admissions out of {len(y_pred)} applicants")
A confusion matrix helps us understand our model's mistakes:
You can just run this code, but to understand what's happening here, we want this table:
| | Predicted Positive (PP) | Predicted Negative (PN) |
|---|---|---|
| Actual Positive (P) | True Positive (`tp`) | False Negative (`fn`) |
| Actual Negative (N) | False Positive (`fp`) | True Negative (`tn`) |
And we have:
| | Boolean Series |
|---|---|
| Actual Positive (P) | `y_test == True` |
| Actual Negative (N) | `y_test == False` |
| Predicted Positive (PP) | `y_pred == True` |
| Predicted Negative (PN) | `y_pred == False` |
And we use `&` to find the entries where the two Boolean Series created by the two predicates are both True at the same time.
# JUST RUN THIS
def calculate_confusion_matrix(y_test, y_pred):
    """Count true/false positives and negatives for boolean predictions.

    The two inputs are compared position by position (first actual label
    with first prediction, and so on), regardless of any pandas index they
    carry. Values are interpreted as booleans.

    Args:
        y_test: Actual labels (Series, array or list of True/False).
        y_pred: Predicted labels, in the same row order as ``y_test``.

    Returns:
        A tuple ``(tp, tn, fp, fn)`` of integer counts.

    Raises:
        ValueError: If the two inputs differ in length.
    """
    # Work on plain arrays: `&` between two Series aligns on index labels,
    # which gives wrong counts when the indexes differ.
    actual = pd.Series(y_test).to_numpy(dtype=bool)
    predicted = pd.Series(y_pred).to_numpy(dtype=bool)

    if len(actual) != len(predicted):
        raise ValueError(
            f"y_test and y_pred must have the same length "
            f"(got {len(actual)} and {len(predicted)})"
        )

    tp = int((actual & predicted).sum())    # True Positive
    tn = int((~actual & ~predicted).sum())  # True Negative
    fp = int((~actual & predicted).sum())   # False Positive
    fn = int((actual & ~predicted).sum())   # False Negative
    return tp, tn, fp, fn
# Compute the four counts, then print them as a two-row table.
tp, tn, fp, fn = calculate_confusion_matrix(y_test, y_pred)

print(" Predicted Positive | Predicted Negative")
for row_label, left_count, right_count in (
    ("Actual Positive", tp, fn),
    ("Actual Negative", fp, tn),
):
    print(f"{row_label} |{left_count:>19d} |{right_count:>19d} ")
Now let's calculate how well our model performed:
$$
\begin{aligned}
\text{Accuracy} &= \frac{tp + tn}{tp + fp + fn + tn} \\
\text{Precision} &= \frac{tp}{tp + fp} \\
\text{Recall} &= \frac{tp}{tp + fn}
\end{aligned}
$$
def calculate_metrics(tp, tn, fp, fn):
    """Compute accuracy, precision and recall from confusion-matrix counts.

    Args:
        tp: Number of true positives.
        tn: Number of true negatives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        A tuple ``(accuracy, precision, recall)``. Any metric whose
        denominator is zero is reported as 0.0 instead of raising.
    """
    total = tp + tn + fp + fn
    predicted_positive = tp + fp
    actual_positive = tp + fn

    # Accuracy: what fraction of all predictions did we get right?
    accuracy = (tp + tn) / total if total > 0 else 0.0
    # Precision: when we predict admission, how often are we right?
    precision = tp / predicted_positive if predicted_positive > 0 else 0.0
    # Recall: of all actual admissions, what fraction did we catch?
    recall = tp / actual_positive if actual_positive > 0 else 0.0

    return accuracy, precision, recall
# Turn the confusion-matrix counts into the three summary metrics and report them.
accuracy, precision, recall = calculate_metrics(tp, tn, fp, fn)

metric_lines = [
    f"Accuracy: {accuracy:.2%} (Overall correctness)",
    f"Precision: {precision:.2%} (When we predict admit, how often are we right?)",
    f"Recall: {recall:.2%} (Of all admits, what percentage did we identify?)",
]
print("\n".join(metric_lines))
# JUST RUN THIS
# Let's look at where we went wrong

# Build the masks from plain arrays so rows are matched by position.
# Combining the Series directly with `&` aligns on index labels, and a mask
# whose index differs from df_test's cannot be used to filter it.
actual_admit = y_test.to_numpy(dtype=bool)
predicted_admit = y_pred.to_numpy(dtype=bool)

false_positives = df_test[~actual_admit & predicted_admit]
false_negatives = df_test[actual_admit & ~predicted_admit]

print("\nFalse Positives (predicted admit but rejected):")
if not false_positives.empty:
    display(false_positives[['GRE Score', 'CGPA', 'Research']].head())

print("\nFalse Negatives (predicted reject but admitted):")
if not false_negatives.empty:
    display(false_negatives[['GRE Score', 'CGPA', 'Research']].head())
Which factors matter most for admission?
You can use the following code for getting feature importance:
# Rank the features by the absolute value of their model coefficient.
coefficient_magnitudes = abs(model.coef_[0])
importance_table = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': coefficient_magnitudes,
})
importances = importance_table.sort_values('Importance', ascending=False)

print("Most important factors for admission:")
print(importances)
# BONUS CODE HERE