import numpy as np
import math
import pandas as pd
import ISLP
import statsmodels.api as sm
from scipy.special import logsumexp
# Load datasets
train_matrix = pd.read_csv('trainMatrix.csv')
test_matrix = pd.read_csv('testMatrix.csv')
train_labels = pd.read_csv('trainCategory.csv')['SPAM'] # Directly use the 'SPAM' column
test_labels = pd.read_csv('testCategory.csv')['SPAM'] # Directly use the 'SPAM' column
# Laplace smoothing function
def laplace_smoothing(matrix):
word_counts = matrix.sum(axis=0) + 1 # Add 1 to all word counts for smoothing
total_word_count = word_counts.sum() + matrix.shape[1] # Total word count + vocab size
return word_counts / total_word_count
# Separate spam and non-spam emails in training data
spam_emails = train_matrix[train_labels == 1]
not_spam_emails = train_matrix[train_labels == 0]
# Calculate prior probabilities
pi_spam = len(spam_emails) / len(train_labels)
pi_not_spam = len(not_spam_emails) / len(train_labels)
# Calculate conditional probabilities with Laplace smoothing
p_word_given_spam = laplace_smoothing(spam_emails)
p_word_given_not_spam = laplace_smoothing(not_spam_emails)
# Prediction function using Naive Bayes
def predict_naive_bayes(test_matrix, p_word_given_spam, p_word_given_not_spam, pi_spam, pi_not_spam):
log_pi_spam = np.log(pi_spam)
log_pi_not_spam = np.log(pi_not_spam)
# Compute the log probabilities for spam and not spam for each email
log_prob_spam = log_pi_spam + test_matrix.dot(np.log(p_word_given_spam))
log_prob_not_spam = log_pi_not_spam + test_matrix.dot(np.log(p_word_given_not_spam))
# Return 1 for spam if log_prob_spam > log_prob_not_spam, else 0 for not spam
return (log_prob_spam > log_prob_not_spam).astype(int)
# Predict on the test set and calculate the classification error rate
predictions = predict_naive_bayes(test_matrix, p_word_given_spam, p_word_given_not_spam, pi_spam, pi_not_spam)
error_rate = (predictions != test_labels).mean()
# Output the classification error rate
print(f'Classification Error Rate: {error_rate * 100:.2f}%')
Classification Error Rate: 1.62%
## Extract the tokens from the columns of train_matrix
tokens = train_matrix.columns
# Calculate the log-ratio for each token
log_ratio = np.log(p_word_given_spam / (p_word_given_not_spam + 1e-9))
# Sort tokens by the log-ratio in descending order and get the indices of the top 5 tokens
top_5_indices = np.argsort(log_ratio)[-5:]
# Retrieve the corresponding token names
top_5_tokens = tokens[top_5_indices]
# Display the top 5 tokens and their log-ratio values
print("Top 5 tokens indicative of SPAM class:")
for i in top_5_indices:
print(f"Token: {tokens[i]}, Log-ratio: {log_ratio[i]}")
Top 5 tokens indicative of SPAM class: Token: valet, Log-ratio: 5.1965300645855175 Token: ebai, Log-ratio: 5.204282041389836 Token: unsubscrib, Log-ratio: 5.428197495482627 Token: spam, Log-ratio: 6.976547715936889 Token: httpaddr, Log-ratio: 7.031443437668806
/var/folders/2b/jzlyxgj50yvcsqrnq8qbkfnh0000gs/T/ipykernel_70158/202882007.py:16: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` print(f"Token: {tokens[i]}, Log-ratio: {log_ratio[i]}")
from ISLP import load_data
weekly_data = load_data('Weekly')
import statsmodels.api as sm
# Extract the predictor variables Lag1, Lag2, and the target variable Direction
X = weekly_data[['Lag1', 'Lag2']]
y = (weekly_data['Direction'] == 'Up').astype(int) # Convert 'Up' to 1 and 'Down' to 0
# Add an intercept term to the model
X = sm.add_constant(X)
# Fit a logistic regression model
model = sm.Logit(y, X)
result = model.fit()
# Print the model summary
print(result.summary())
Optimization terminated successfully. Current function value: 0.683297 Iterations 4 Logit Regression Results ============================================================================== Dep. Variable: Direction No. Observations: 1089 Model: Logit Df Residuals: 1086 Method: MLE Df Model: 2 Date: Thu, 03 Oct 2024 Pseudo R-squ.: 0.005335 Time: 23:12:50 Log-Likelihood: -744.11 converged: True LL-Null: -748.10 Covariance Type: nonrobust LLR p-value: 0.01848 ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ const 0.2212 0.061 3.599 0.000 0.101 0.342 Lag1 -0.0387 0.026 -1.477 0.140 -0.090 0.013 Lag2 0.0602 0.027 2.270 0.023 0.008 0.112 ==============================================================================
# Remove the first observation (row 0)
weekly_data_no_first = weekly_data.iloc[1:, :]
# Extract the predictor variables Lag1, Lag2, and the target variable Direction
X = weekly_data_no_first[['Lag1', 'Lag2']]
y = (weekly_data_no_first['Direction'] == 'Up').astype(int) # Convert 'Up' to 1 and 'Down' to 0
# Add an intercept term to the model
X = sm.add_constant(X)
# Fit a logistic regression model excluding the first observation
model = sm.Logit(y, X)
result = model.fit()
# Print the model summary
print(result.summary())
Optimization terminated successfully. Current function value: 0.683147 Iterations 4 Logit Regression Results ============================================================================== Dep. Variable: Direction No. Observations: 1088 Model: Logit Df Residuals: 1085 Method: MLE Df Model: 2 Date: Thu, 03 Oct 2024 Pseudo R-squ.: 0.005387 Time: 23:15:38 Log-Likelihood: -743.26 converged: True LL-Null: -747.29 Covariance Type: nonrobust LLR p-value: 0.01785 ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ const 0.2232 0.061 3.630 0.000 0.103 0.344 Lag1 -0.0384 0.026 -1.466 0.143 -0.090 0.013 Lag2 0.0608 0.027 2.291 0.022 0.009 0.113 ==============================================================================
from ISLP.models import (ModelSpec as MS,
summarize)
allvars=weekly_data.columns.drop(['Direction','Year','Volume','Today','Lag3','Lag4','Lag5'])
design=MS(allvars)
X=design.fit_transform(weekly_data)
prob = result.predict([X.iloc[0]])
label = np.where(prob > 0.5, 'Up', 'Down')
label
weekly_data.iloc[0]
Year 1990 Lag1 0.816 Lag2 1.572 Lag3 -3.936 Lag4 -0.229 Lag5 -3.484 Volume 0.154976 Today -0.27 Direction Down Name: 0, dtype: object
# Extract predictor variables (Lag1, Lag2) and the target variable (Direction)
X = weekly_data[['Lag1', 'Lag2']]
y = (weekly_data['Direction'] == 'Up').astype(int) # Convert 'Up' to 1, 'Down' to 0
# Add intercept to the data
X = sm.add_constant(X)
# Initialize an empty array to store whether an error was made (1 if error, 0 if correct)
n = len(weekly_data)
errors = np.zeros(n)
# Perform Leave-One-Out Cross-Validation
for i in range(n):
# Exclude the i-th observation
X_train = np.delete(X.values, i, axis=0)
y_train = np.delete(y.values, i)
# Fit logistic regression using all but the i-th observation
model = sm.Logit(y_train, X_train).fit(disp=False)
# Compute the posterior probability for the i-th observation
X_test = X.values[i].reshape(1, -1) # i-th observation
prob = model.predict(X_test)[0] # Posterior probability for market going up
# Predict direction (Up if prob > 0.5, Down otherwise)
prediction = 1 if prob > 0.5 else 0
# Determine if an error was made (1 if prediction != actual, 0 otherwise)
errors[i] = 1 if prediction != y[i] else 0
# Output the total number of errors
total_errors = errors.sum()
print(f'Total number of errors: {int(total_errors)}')
print(f'LOOCV error rate: {total_errors / n:.4f}')
Total number of errors: 490 LOOCV error rate: 0.4500