#!/usr/bin/env python
# coding: utf-8

# In[3]:

import numpy as np
import pandas as pd
import ISLP
import statsmodels.api as sm
from scipy.special import logsumexp

# In[18]:

# Load the document-term matrices and the 0/1 spam labels
train_matrix = pd.read_csv('trainMatrix.csv')
test_matrix = pd.read_csv('testMatrix.csv')
train_labels = pd.read_csv('trainCategory.csv')['SPAM']  # use the 'SPAM' column directly
test_labels = pd.read_csv('testCategory.csv')['SPAM']

# Laplace (add-one) smoothing: P(w | class) = (count_w + 1) / (total_count + |V|)
def laplace_smoothing(matrix):
    word_counts = matrix.sum(axis=0) + 1  # add 1 to every word count
    # The +1's above already contribute |V| to the sum, so summing the
    # smoothed counts gives exactly total_count + |V|; adding matrix.shape[1]
    # on top of that would double-count the vocabulary size.
    total_word_count = word_counts.sum()
    return word_counts / total_word_count

# Separate spam and non-spam emails in the training data
spam_emails = train_matrix[train_labels == 1]
not_spam_emails = train_matrix[train_labels == 0]

# Prior probabilities
pi_spam = len(spam_emails) / len(train_labels)
pi_not_spam = len(not_spam_emails) / len(train_labels)

# Conditional word probabilities with Laplace smoothing
p_word_given_spam = laplace_smoothing(spam_emails)
p_word_given_not_spam = laplace_smoothing(not_spam_emails)

# Naive Bayes prediction: compare the two class log-scores for each email
def predict_naive_bayes(test_matrix, p_word_given_spam, p_word_given_not_spam,
                        pi_spam, pi_not_spam):
    log_pi_spam = np.log(pi_spam)
    log_pi_not_spam = np.log(pi_not_spam)

    # Log-score per class for every email: log prior + sum of word log-likelihoods
    log_prob_spam = log_pi_spam + test_matrix.dot(np.log(p_word_given_spam))
    log_prob_not_spam = log_pi_not_spam + test_matrix.dot(np.log(p_word_given_not_spam))

    # 1 (spam) where the spam log-score wins, else 0 (not spam)
    return (log_prob_spam > log_prob_not_spam).astype(int)

# Predict on the test set and compute the classification error rate
predictions = predict_naive_bayes(test_matrix, p_word_given_spam,
                                  p_word_given_not_spam, pi_spam, pi_not_spam)
error_rate = (predictions != test_labels).mean()
print(f'Classification Error Rate: {error_rate * 100:.2f}%')

# In[19]:

# The tokens (vocabulary) are the columns of train_matrix
tokens = train_matrix.columns

# Log-ratio log(P(w|spam) / P(w|not spam)); Laplace smoothing guarantees both
# probabilities are strictly positive, so no epsilon is needed in the ratio
log_ratio = np.log(p_word_given_spam / p_word_given_not_spam)

# Positions of the 5 tokens with the largest log-ratio, in descending order
top_5_indices = np.argsort(log_ratio.values)[-5:][::-1]

print("Top 5 tokens indicative of SPAM class:")
for i in top_5_indices:
    print(f"Token: {tokens[i]}, Log-ratio: {log_ratio.iloc[i]:.4f}")
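# In[ ]:

# A minimal sketch (an addition, not part of the original assignment): the two
# class log-scores above can be normalized into an actual posterior probability
# P(spam | email) with scipy's logsumexp, which is already imported in the
# first cell. Assumes the quantities computed in the cells above
# (test_matrix, p_word_given_spam, etc.) are still in scope.
def posterior_spam(matrix, p_word_given_spam, p_word_given_not_spam,
                   pi_spam, pi_not_spam):
    # Unnormalized log-posteriors for each class
    log_spam = np.log(pi_spam) + matrix.dot(np.log(p_word_given_spam))
    log_not_spam = np.log(pi_not_spam) + matrix.dot(np.log(p_word_given_not_spam))
    # Normalize in log space to avoid underflow:
    # log P(spam | x) = log_spam - logsumexp([log_spam, log_not_spam])
    log_norm = logsumexp(np.vstack([log_spam, log_not_spam]), axis=0)
    return np.exp(log_spam - log_norm)

post = posterior_spam(test_matrix, p_word_given_spam, p_word_given_not_spam,
                      pi_spam, pi_not_spam)
print('Posterior P(spam) for the first five test emails:',
      np.round(post[:5].values, 4))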
# In[21]:

from ISLP import load_data

weekly_data = load_data('Weekly')

# Extract the predictors Lag1 and Lag2 and the response Direction
X = weekly_data[['Lag1', 'Lag2']]
y = (weekly_data['Direction'] == 'Up').astype(int)  # 'Up' -> 1, 'Down' -> 0

# Add an intercept term to the model
X = sm.add_constant(X)

# Fit a logistic regression model on the full data set
model = sm.Logit(y, X)
result = model.fit()
print(result.summary())

# In[23]:

# Remove the first observation (row 0)
weekly_data_no_first = weekly_data.iloc[1:, :]

X = weekly_data_no_first[['Lag1', 'Lag2']]
y = (weekly_data_no_first['Direction'] == 'Up').astype(int)
X = sm.add_constant(X)

# Fit a logistic regression model excluding the first observation
model = sm.Logit(y, X)
result = model.fit()
print(result.summary())

# In[31]:

from ISLP.models import ModelSpec as MS

# Dropping every other column leaves exactly Lag1 and Lag2; ModelSpec adds
# the intercept column itself
allvars = weekly_data.columns.drop(['Direction', 'Year', 'Volume', 'Today',
                                    'Lag3', 'Lag4', 'Lag5'])
design = MS(allvars)
X = design.fit_transform(weekly_data)

# Predict the held-out first observation with the model fit without it
prob = result.predict(X.iloc[[0]])
label = np.where(prob > 0.5, 'Up', 'Down')
print(f"Predicted direction: {label[0]}, "
      f"actual: {weekly_data['Direction'].iloc[0]}")

# In[32]:

# Leave-One-Out Cross-Validation for the Lag1/Lag2 logistic regression
X = weekly_data[['Lag1', 'Lag2']]
y = (weekly_data['Direction'] == 'Up').astype(int)  # 'Up' -> 1, 'Down' -> 0
X = sm.add_constant(X)

n = len(weekly_data)
errors = np.zeros(n)  # errors[i] = 1 if the i-th held-out prediction is wrong

for i in range(n):
    # Fit on all observations except the i-th
    X_train = np.delete(X.values, i, axis=0)
    y_train = np.delete(y.values, i)
    model = sm.Logit(y_train, X_train).fit(disp=False)

    # Posterior probability that the market goes up for the held-out row
    X_test = X.values[i].reshape(1, -1)
    prob = model.predict(X_test)[0]

    # Predict Up (1) if prob > 0.5, otherwise Down (0)
    prediction = 1 if prob > 0.5 else 0
    errors[i] = int(prediction != y.iloc[i])

total_errors = errors.sum()
print(f'Total number of errors: {int(total_errors)}')
print(f'LOOCV error rate: {total_errors / n:.4f}')
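# In[ ]:

# Optional sanity check (an addition; assumes scikit-learn is installed, which
# the original notebook does not require). LeaveOneOut with an unregularized
# LogisticRegression should reproduce an error rate close to the manual loop
# above; note that penalty=None requires scikit-learn >= 1.2.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut, cross_val_score

X_sk = weekly_data[['Lag1', 'Lag2']].values  # sklearn adds the intercept itself
y_sk = (weekly_data['Direction'] == 'Up').astype(int).values

clf = LogisticRegression(penalty=None)  # plain maximum likelihood, like sm.Logit
scores = cross_val_score(clf, X_sk, y_sk, cv=LeaveOneOut())  # per-fold accuracy (0 or 1)
print(f'sklearn LOOCV error rate: {1 - scores.mean():.4f}')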