Notebook

In [3]:

import numpy as np
import math
import pandas as pd
import ISLP
import statsmodels.api as sm 

from scipy.special import logsumexp

In [18]:

# Load datasets
train_matrix = pd.read_csv('trainMatrix.csv')
test_matrix = pd.read_csv('testMatrix.csv')
train_labels = pd.read_csv('trainCategory.csv')['SPAM']  # Directly use the 'SPAM' column
test_labels = pd.read_csv('testCategory.csv')['SPAM']    # Directly use the 'SPAM' column

# Laplace smoothing function
def laplace_smoothing(matrix):
    word_counts = matrix.sum(axis=0) + 1  # Add 1 to all word counts for smoothing
    total_word_count = word_counts.sum() + matrix.shape[1]  # Total word count + vocab size
    return word_counts / total_word_count

# Separate spam and non-spam emails in training data
spam_emails = train_matrix[train_labels == 1]
not_spam_emails = train_matrix[train_labels == 0]

# Calculate prior probabilities
pi_spam = len(spam_emails) / len(train_labels)
pi_not_spam = len(not_spam_emails) / len(train_labels)

# Calculate conditional probabilities with Laplace smoothing
p_word_given_spam = laplace_smoothing(spam_emails)
p_word_given_not_spam = laplace_smoothing(not_spam_emails)

# Prediction function using Naive Bayes
def predict_naive_bayes(test_matrix, p_word_given_spam, p_word_given_not_spam, pi_spam, pi_not_spam):
    log_pi_spam = np.log(pi_spam)
    log_pi_not_spam = np.log(pi_not_spam)
    
    # Compute the log probabilities for spam and not spam for each email
    log_prob_spam = log_pi_spam + test_matrix.dot(np.log(p_word_given_spam))
    log_prob_not_spam = log_pi_not_spam + test_matrix.dot(np.log(p_word_given_not_spam))
    
    # Return 1 for spam if log_prob_spam > log_prob_not_spam, else 0 for not spam
    return (log_prob_spam > log_prob_not_spam).astype(int)

# Predict on the test set and calculate the classification error rate
predictions = predict_naive_bayes(test_matrix, p_word_given_spam, p_word_given_not_spam, pi_spam, pi_not_spam)
error_rate = (predictions != test_labels).mean()

# Output the classification error rate
print(f'Classification Error Rate: {error_rate * 100:.2f}%')

Classification Error Rate: 1.62%

In [19]:

## Extract the tokens from the columns of train_matrix
tokens = train_matrix.columns

# Calculate the log-ratio for each token
log_ratio = np.log(p_word_given_spam / (p_word_given_not_spam + 1e-9))

# Sort tokens by the log-ratio in descending order and get the indices of the top 5 tokens
top_5_indices = np.argsort(log_ratio)[-5:]

# Retrieve the corresponding token names
top_5_tokens = tokens[top_5_indices]

# Display the top 5 tokens and their log-ratio values
print("Top 5 tokens indicative of SPAM class:")
for i in top_5_indices:
    print(f"Token: {tokens[i]}, Log-ratio: {log_ratio[i]}")

Top 5 tokens indicative of SPAM class:
Token: valet, Log-ratio: 5.1965300645855175
Token: ebai, Log-ratio: 5.204282041389836
Token: unsubscrib, Log-ratio: 5.428197495482627
Token: spam, Log-ratio: 6.976547715936889
Token: httpaddr, Log-ratio: 7.031443437668806

/var/folders/2b/jzlyxgj50yvcsqrnq8qbkfnh0000gs/T/ipykernel_70158/202882007.py:16: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  print(f"Token: {tokens[i]}, Log-ratio: {log_ratio[i]}")

In [21]:

from ISLP import load_data

weekly_data = load_data('Weekly')
import statsmodels.api as sm

# Extract the predictor variables Lag1, Lag2, and the target variable Direction
X = weekly_data[['Lag1', 'Lag2']]
y = (weekly_data['Direction'] == 'Up').astype(int)  # Convert 'Up' to 1 and 'Down' to 0

# Add an intercept term to the model
X = sm.add_constant(X)

# Fit a logistic regression model
model = sm.Logit(y, X)
result = model.fit()

# Print the model summary
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.683297
         Iterations 4
                           Logit Regression Results                           
==============================================================================
Dep. Variable:              Direction   No. Observations:                 1089
Model:                          Logit   Df Residuals:                     1086
Method:                           MLE   Df Model:                            2
Date:                Thu, 03 Oct 2024   Pseudo R-squ.:                0.005335
Time:                        23:12:50   Log-Likelihood:                -744.11
converged:                       True   LL-Null:                       -748.10
Covariance Type:            nonrobust   LLR p-value:                   0.01848
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2212      0.061      3.599      0.000       0.101       0.342
Lag1          -0.0387      0.026     -1.477      0.140      -0.090       0.013
Lag2           0.0602      0.027      2.270      0.023       0.008       0.112
==============================================================================

In [23]:

# Remove the first observation (row 0)
weekly_data_no_first = weekly_data.iloc[1:, :]

# Extract the predictor variables Lag1, Lag2, and the target variable Direction
X = weekly_data_no_first[['Lag1', 'Lag2']]
y = (weekly_data_no_first['Direction'] == 'Up').astype(int)  # Convert 'Up' to 1 and 'Down' to 0

# Add an intercept term to the model
X = sm.add_constant(X)

# Fit a logistic regression model excluding the first observation
model = sm.Logit(y, X)
result = model.fit()

# Print the model summary
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.683147
         Iterations 4
                           Logit Regression Results                           
==============================================================================
Dep. Variable:              Direction   No. Observations:                 1088
Model:                          Logit   Df Residuals:                     1085
Method:                           MLE   Df Model:                            2
Date:                Thu, 03 Oct 2024   Pseudo R-squ.:                0.005387
Time:                        23:15:38   Log-Likelihood:                -743.26
converged:                       True   LL-Null:                       -747.29
Covariance Type:            nonrobust   LLR p-value:                   0.01785
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2232      0.061      3.630      0.000       0.103       0.344
Lag1          -0.0384      0.026     -1.466      0.143      -0.090       0.013
Lag2           0.0608      0.027      2.291      0.022       0.009       0.113
==============================================================================

In [31]:

from ISLP.models import (ModelSpec as MS,
                         summarize)
allvars=weekly_data.columns.drop(['Direction','Year','Volume','Today','Lag3','Lag4','Lag5'])
design=MS(allvars)
X=design.fit_transform(weekly_data)
prob = result.predict([X.iloc[0]])

label = np.where(prob > 0.5, 'Up', 'Down')
label
weekly_data.iloc[0]

Out[31]:

Year             1990
Lag1            0.816
Lag2            1.572
Lag3           -3.936
Lag4           -0.229
Lag5           -3.484
Volume       0.154976
Today           -0.27
Direction        Down
Name: 0, dtype: object

In [32]:

# Extract predictor variables (Lag1, Lag2) and the target variable (Direction)
X = weekly_data[['Lag1', 'Lag2']]
y = (weekly_data['Direction'] == 'Up').astype(int)  # Convert 'Up' to 1, 'Down' to 0

# Add intercept to the data
X = sm.add_constant(X)

# Initialize an empty array to store whether an error was made (1 if error, 0 if correct)
n = len(weekly_data)
errors = np.zeros(n)

# Perform Leave-One-Out Cross-Validation
for i in range(n):
    # Exclude the i-th observation
    X_train = np.delete(X.values, i, axis=0)
    y_train = np.delete(y.values, i)
    
    # Fit logistic regression using all but the i-th observation
    model = sm.Logit(y_train, X_train).fit(disp=False)
    
    # Compute the posterior probability for the i-th observation
    X_test = X.values[i].reshape(1, -1)  # i-th observation
    prob = model.predict(X_test)[0]  # Posterior probability for market going up
    
    # Predict direction (Up if prob > 0.5, Down otherwise)
    prediction = 1 if prob > 0.5 else 0
    
    # Determine if an error was made (1 if prediction != actual, 0 otherwise)
    errors[i] = 1 if prediction != y[i] else 0

# Output the total number of errors
total_errors = errors.sum()
print(f'Total number of errors: {int(total_errors)}')
print(f'LOOCV error rate: {total_errors / n:.4f}')

Total number of errors: 490
LOOCV error rate: 0.4500

In [ ]: