#!/usr/bin/env python
# coding: utf-8

# In[3]:

import numpy as np
import pandas as pd
import ISLP
import statsmodels.api as sm
from scipy.special import logsumexp

# In[18]:

# Load the document-term matrices and the 0/1 spam labels
train_matrix = pd.read_csv('trainMatrix.csv')
test_matrix = pd.read_csv('testMatrix.csv')
train_labels = pd.read_csv('trainCategory.csv')['SPAM']  # use the 'SPAM' column directly
test_labels = pd.read_csv('testCategory.csv')['SPAM']

# Laplace (add-one) smoothing: P(w | class) = (count_w + 1) / (total_count + |V|)
def laplace_smoothing(matrix):
    word_counts = matrix.sum(axis=0) + 1  # add 1 to every word count
    # The +1's above already contribute |V| to the sum, so summing the
    # smoothed counts gives exactly total_count + |V|; adding matrix.shape[1]
    # on top of that would double-count the vocabulary size.
    total_word_count = word_counts.sum()
    return word_counts / total_word_count

# Separate spam and non-spam emails in the training data
spam_emails = train_matrix[train_labels == 1]
not_spam_emails = train_matrix[train_labels == 0]

# Prior probabilities
pi_spam = len(spam_emails) / len(train_labels)
pi_not_spam = len(not_spam_emails) / len(train_labels)

# Conditional word probabilities with Laplace smoothing
p_word_given_spam = laplace_smoothing(spam_emails)
p_word_given_not_spam = laplace_smoothing(not_spam_emails)

# Naive Bayes prediction: compare the two class log-scores for each email
def predict_naive_bayes(test_matrix, p_word_given_spam, p_word_given_not_spam,
                        pi_spam, pi_not_spam):
    log_pi_spam = np.log(pi_spam)
    log_pi_not_spam = np.log(pi_not_spam)

    # Log-score per class for every email: log prior + sum of word log-likelihoods
    log_prob_spam = log_pi_spam + test_matrix.dot(np.log(p_word_given_spam))
    log_prob_not_spam = log_pi_not_spam + test_matrix.dot(np.log(p_word_given_not_spam))

    # 1 (spam) where the spam log-score wins, else 0 (not spam)
    return (log_prob_spam > log_prob_not_spam).astype(int)

# Predict on the test set and compute the classification error rate
predictions = predict_naive_bayes(test_matrix, p_word_given_spam,
                                  p_word_given_not_spam, pi_spam, pi_not_spam)
error_rate = (predictions != test_labels).mean()
print(f'Classification Error Rate: {error_rate * 100:.2f}%')

# In[19]:

# The tokens (vocabulary) are the columns of train_matrix
tokens = train_matrix.columns

# Log-ratio log(P(w|spam) / P(w|not spam)); Laplace smoothing guarantees both
# probabilities are strictly positive, so no epsilon is needed in the ratio
log_ratio = np.log(p_word_given_spam / p_word_given_not_spam)

# Positions of the 5 tokens with the largest log-ratio, in descending order
top_5_indices = np.argsort(log_ratio.values)[-5:][::-1]

print("Top 5 tokens indicative of SPAM class:")
for i in top_5_indices:
    print(f"Token: {tokens[i]}, Log-ratio: {log_ratio.iloc[i]:.4f}")
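# In[ ]:

# A minimal sketch (an addition, not part of the original assignment): the two
# class log-scores above can be normalized into an actual posterior probability
# P(spam | email) with scipy's logsumexp, which is already imported in the
# first cell. Assumes the quantities computed in the cells above
# (test_matrix, p_word_given_spam, etc.) are still in scope.
def posterior_spam(matrix, p_word_given_spam, p_word_given_not_spam,
                   pi_spam, pi_not_spam):
    # Unnormalized log-posteriors for each class
    log_spam = np.log(pi_spam) + matrix.dot(np.log(p_word_given_spam))
    log_not_spam = np.log(pi_not_spam) + matrix.dot(np.log(p_word_given_not_spam))
    # Normalize in log space to avoid underflow:
    # log P(spam | x) = log_spam - logsumexp([log_spam, log_not_spam])
    log_norm = logsumexp(np.vstack([log_spam, log_not_spam]), axis=0)
    return np.exp(log_spam - log_norm)

post = posterior_spam(test_matrix, p_word_given_spam, p_word_given_not_spam,
                      pi_spam, pi_not_spam)
print('Posterior P(spam) for the first five test emails:',
      np.round(post[:5].values, 4))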
# In[21]:

from ISLP import load_data

weekly_data = load_data('Weekly')

# Extract the predictors Lag1 and Lag2 and the response Direction
X = weekly_data[['Lag1', 'Lag2']]
y = (weekly_data['Direction'] == 'Up').astype(int)  # 'Up' -> 1, 'Down' -> 0

# Add an intercept term to the model
X = sm.add_constant(X)

# Fit a logistic regression model on the full data set
model = sm.Logit(y, X)
result = model.fit()
print(result.summary())

# In[23]:

# Remove the first observation (row 0)
weekly_data_no_first = weekly_data.iloc[1:, :]

X = weekly_data_no_first[['Lag1', 'Lag2']]
y = (weekly_data_no_first['Direction'] == 'Up').astype(int)
X = sm.add_constant(X)

# Fit a logistic regression model excluding the first observation
model = sm.Logit(y, X)
result = model.fit()
print(result.summary())

# In[31]:

from ISLP.models import ModelSpec as MS

# Dropping every other column leaves exactly Lag1 and Lag2; ModelSpec adds
# the intercept column itself
allvars = weekly_data.columns.drop(['Direction', 'Year', 'Volume', 'Today',
                                    'Lag3', 'Lag4', 'Lag5'])
design = MS(allvars)
X = design.fit_transform(weekly_data)

# Predict the held-out first observation with the model fit without it
prob = result.predict(X.iloc[[0]])
label = np.where(prob > 0.5, 'Up', 'Down')
print(f"Predicted direction: {label[0]}, "
      f"actual: {weekly_data['Direction'].iloc[0]}")

# In[32]:

# Leave-One-Out Cross-Validation for the Lag1/Lag2 logistic regression
X = weekly_data[['Lag1', 'Lag2']]
y = (weekly_data['Direction'] == 'Up').astype(int)  # 'Up' -> 1, 'Down' -> 0
X = sm.add_constant(X)

n = len(weekly_data)
errors = np.zeros(n)  # errors[i] = 1 if the i-th held-out prediction is wrong

for i in range(n):
    # Fit on all observations except the i-th
    X_train = np.delete(X.values, i, axis=0)
    y_train = np.delete(y.values, i)
    model = sm.Logit(y_train, X_train).fit(disp=False)

    # Posterior probability that the market goes up for the held-out row
    X_test = X.values[i].reshape(1, -1)
    prob = model.predict(X_test)[0]

    # Predict Up (1) if prob > 0.5, otherwise Down (0)
    prediction = 1 if prob > 0.5 else 0
    errors[i] = int(prediction != y.iloc[i])

total_errors = errors.sum()
print(f'Total number of errors: {int(total_errors)}')
print(f'LOOCV error rate: {total_errors / n:.4f}')
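# In[ ]:

# Optional sanity check (an addition; assumes scikit-learn is installed, which
# the original notebook does not require). LeaveOneOut with an unregularized
# LogisticRegression should reproduce an error rate close to the manual loop
# above; note that penalty=None requires scikit-learn >= 1.2.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut, cross_val_score

X_sk = weekly_data[['Lag1', 'Lag2']].values  # sklearn adds the intercept itself
y_sk = (weekly_data['Direction'] == 'Up').astype(int).values

clf = LogisticRegression(penalty=None)  # plain maximum likelihood, like sm.Logit
scores = cross_val_score(clf, X_sk, y_sk, cv=LeaveOneOut())  # per-fold accuracy (0 or 1)
print(f'sklearn LOOCV error rate: {1 - scores.mean():.4f}')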