# JUST RUN THIS
from google.colab import drive
import pandas as pd

drive.mount('/content/gdrive')

# Load the data
df = pd.read_csv('/content/gdrive/MyDrive/datasets/admission_predict.csv')

# Convert "Chance of Admit" to a True/False "Admitted" column
# (note the trailing space in the original column names)
df["Admitted"] = df["Chance of Admit "] > 0.75
df.drop("Chance of Admit ", axis=1, inplace=True)
df.rename(columns={"LOR ": "LOR"}, inplace=True)

# Explore the data
print(f"Total applicants: {len(df)}")
print(f"Admitted: {df['Admitted'].sum()}")
print(f"Not admitted: {(~df['Admitted']).sum()}")
print("\nColumns:")
print(df.columns.tolist())

# Look at a random sample of the data
print("\nRandom sample of 5 applicants:")
df.sample(5)


# JUST RUN THIS
from sklearn.model_selection import train_test_split

# Split the data (I'm doing this for you!)
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
print(f"Training set: {len(df_train)} applicants")
print(f"Test set: {len(df_test)} applicants")


def prepare_features(df_train, df_test):
    # Input: df_train and df_test are DataFrames with all columns
    # Output: Returns X_train, X_test (features only, no Serial No. or Admitted)

    # TODO: Your code here!
    # 1. First, list all the feature columns you want to use
    #    Hint: All columns except 'Serial No.' and 'Admitted'
    #    You can type them out: feature_cols = ['GRE Score', 'TOEFL Score', ...]
    #    Or use: feature_cols = df_train.columns.drop(['Serial No.', 'Admitted']).tolist()
    # 2. Create X_train using double brackets
    #    X_train = df_train[feature_cols]
    # 3. Create X_test the same way
    #    X_test = df_test[feature_cols]
    # 4. Return both (yes, functions can return multiple values!)
    #    return X_train, X_test
    pass

# Test your function
X_train, X_test = prepare_features(df_train, df_test)
print(f"Features shape - Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Features used: {X_train.columns.tolist()}")

# Check to make sure the same features were picked for X_train and X_test
assert X_train.columns.equals(X_test.columns)


def prepare_labels(df_train, df_test):
    # Input: df_train and df_test are DataFrames
    # Output: Returns y_train, y_test (just the Admitted column)

    # TODO: Your code here!
    # 1. Extract the 'Admitted' column from df_train
    #    y_train = df_train['Admitted']  # Single brackets for a Series!
    # 2. Extract the 'Admitted' column from df_test
    #    y_test = df_test['Admitted']
    # 3. Return both values
    #    return y_train, y_test
    pass

# Test your function
y_train, y_test = prepare_labels(df_train, df_test)
print(f"Training: {y_train.sum()} admitted out of {len(y_train)}")
print(f"Testing: {y_test.sum()} admitted out of {len(y_test)}")


from sklearn.linear_model import LogisticRegression

def train_logistic_model(X_train, y_train):
    # Input: X_train (features), y_train (labels)
    # Output: Returns trained model

    # TODO: Your code here!
    # 1. Create a LogisticRegression model
    #    model = LogisticRegression(max_iter=1000)
    #    Note: max_iter=1000 gives the model more iterations to converge
    # 2. Train the model using the fit method
    #    model.fit(X_train, y_train)
    # 3. Return the trained model
    #    return model
    pass

# Train the model
model = train_logistic_model(X_train, y_train)
print("Model trained!")


def make_predictions(model, X_test):
    # Input: model (trained), X_test (features to predict)
    # Output: Returns predictions as a pandas Series

    # TODO: Your code here!
    # 1. Use the model's predict method to get predictions
    #    predictions = model.predict(X_test)
    #    This returns a numpy array of True/False values
    # 2. Wrap the array in a pandas Series, keeping X_test's index
    #    predictions_series = pd.Series(predictions, index=X_test.index)
    #    Matching indexes let us compare the predictions with y_test later
    # 3. Return the Series
    #    return predictions_series
    pass
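# --- Reference sketch (peek only if you're stuck!) ---
# One possible way to fill in the four functions above, built directly from
# the hints in their comments. Your variable names may differ; the only
# addition beyond the hints is keeping X_test's index on the predictions,
# which makes them line up with y_test later.

def prepare_features(df_train, df_test):
    # Every column except the ID and the label is a feature
    feature_cols = df_train.columns.drop(['Serial No.', 'Admitted']).tolist()
    X_train = df_train[feature_cols]
    X_test = df_test[feature_cols]
    return X_train, X_test

def prepare_labels(df_train, df_test):
    # Single brackets give a Series, not a DataFrame
    y_train = df_train['Admitted']
    y_test = df_test['Admitted']
    return y_train, y_test

def train_logistic_model(X_train, y_train):
    # max_iter=1000 gives the solver extra iterations to converge
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    return model

def make_predictions(model, X_test):
    # predict() returns a numpy array of True/False values
    predictions = model.predict(X_test)
    # Wrap it in a Series that keeps X_test's index
    predictions_series = pd.Series(predictions, index=X_test.index)
    return predictions_series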
# Make predictions
# (the pd.Series wrapper with X_test's index makes the predictions line up
#  with y_test, even if your function returned a plain numpy array)
y_pred = pd.Series(make_predictions(model, X_test), index=X_test.index)
print(f"Predicted {y_pred.sum()} admissions out of {len(y_pred)} applicants")


# JUST RUN THIS
def calculate_confusion_matrix(y_test, y_pred):
    # Input: y_test and y_pred are boolean Series with matching indexes
    # Output: Returns tp, tn, fp, fn
    tp = ((y_test == True) & (y_pred == True)).sum()    # True Positive
    tn = ((y_test == False) & (y_pred == False)).sum()  # True Negative
    fp = ((y_test == False) & (y_pred == True)).sum()   # False Positive
    fn = ((y_test == True) & (y_pred == False)).sum()   # False Negative
    return tp, tn, fp, fn

# Calculate confusion matrix
tp, tn, fp, fn = calculate_confusion_matrix(y_test, y_pred)
print("                | Predicted Positive | Predicted Negative")
print(f"Actual Positive |{tp:>19d} |{fn:>19d}")
print(f"Actual Negative |{fp:>19d} |{tn:>19d}")


def calculate_metrics(tp, tn, fp, fn):
    # Input: Confusion matrix values
    # Output: Returns accuracy, precision, recall (3 values!)

    # TODO: Your code here!
    # Calculate total predictions
    #   total = tp + tn + fp + fn
    # Accuracy: What percentage did we get right?
    #   accuracy = (tp + tn) / total
    # Precision: When we predict admission, how often are we right?
    #   precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    # Recall: Of all actual admissions, what percentage did we catch?
    #   recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    # Return all three metrics
    #   return accuracy, precision, recall
    pass

# Calculate metrics
accuracy, precision, recall = calculate_metrics(tp, tn, fp, fn)
print(f"Accuracy:  {accuracy:.2%} (Overall correctness)")
print(f"Precision: {precision:.2%} (When we predict admit, how often are we right?)")
print(f"Recall:    {recall:.2%} (Of all admits, what percentage did we identify?)")


# JUST RUN THIS
# Let's look at where we went wrong
false_positives = df_test[(y_test == False) & (y_pred == True)]
false_negatives = df_test[(y_test == True) & (y_pred == False)]

print("\nFalse Positives (predicted admit but rejected):")
if len(false_positives) > 0:
    display(false_positives[['GRE Score', 'CGPA', 'Research']].head())

print("\nFalse Negatives (predicted reject but admitted):")
if len(false_negatives) > 0:
    display(false_negatives[['GRE Score', 'CGPA', 'Research']].head())


# BONUS CODE HERE
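# --- Reference sketch for calculate_metrics (peek only if you're stuck!) ---
# One possible solution, following the hints above. The cross-check below is
# optional and separate from the bonus; accuracy_score, precision_score, and
# recall_score are standard sklearn.metrics functions.

def calculate_metrics(tp, tn, fp, fn):
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total
    # The guards avoid dividing by zero if a class is never predicted
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    return accuracy, precision, recall

# Optional sanity check: sklearn's built-in metrics should match yours
from sklearn.metrics import accuracy_score, precision_score, recall_score
print(f"sklearn accuracy:  {accuracy_score(y_test, y_pred):.2%}")
print(f"sklearn precision: {precision_score(y_test, y_pred):.2%}")
print(f"sklearn recall:    {recall_score(y_test, y_pred):.2%}")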