import itertools
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn import linear_model
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures, StandardScaler


def synthetic_data(w, b, n):
    """Generate n points whose binary label is the sign of the linear score Xw + b."""
    X = torch.normal(0, 1, (n, len(w)))
    linear_combination = torch.matmul(X, w) + b
    y = (linear_combination > 0).float()
    return X, y.reshape((-1, 1))


# True parameters used to generate the data (and hence the Bayes boundary)
w_str = torch.tensor([2.0, -3.4])
b_str = 4.2
features, labels = synthetic_data(w_str, b_str, 1000)

plt.figure(figsize=(8, 6))
plt.scatter(features[:, 0], features[:, 1], c=labels.flatten(), cmap='coolwarm', marker='o')
plt.title('Synthetic Data for Logistic Regression')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()


def data_iter(batch_size, features, labels):
    """Yield shuffled minibatches of (features, labels)."""
    n = len(features)
    idx = list(range(n))
    random.shuffle(idx)
    for i in range(0, n, batch_size):
        batch_idx = torch.tensor(idx[i:min(i + batch_size, n)])
        yield features[batch_idx], labels[batch_idx]


def logistic_regression(X, w, b):
    return torch.sigmoid(torch.matmul(X, w) + b)


def binary_cross_entropy_loss(y_hat, y):
    epsilon = 1e-5  # to avoid log(0) errors
    return -(y * torch.log(y_hat + epsilon) + (1 - y) * torch.log(1 - y_hat + epsilon)).mean()


# Initialize weight and bias parameters
w = torch.normal(0, 0.01, size=(2, 1), requires_grad=True)
b = torch.zeros(1, requires_grad=True)

# Define the optimization algorithm
trainer = torch.optim.SGD([w, b], lr=0.03)

# Main training loop
epochs = 50
batch_size = 10
for epoch in range(epochs):
    for X, y in data_iter(batch_size, features, labels):
        trainer.zero_grad()
        y_hat = logistic_regression(X, w, b)
        train_loss = binary_cross_entropy_loss(y_hat, y)
        train_loss.backward()
        trainer.step()
    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch + 1}, Loss: {train_loss.item():.4f}')

# Weights and bias after training
print('Estimated weights:', w.detach().numpy().flatten())
print('Estimated bias:', b.item())

# Plot the fitted probability surface together with the Bayes decision boundary
x_min, x_max = features[:, 0].min() - 1, features[:, 0].max() + 1
y_min, y_max = features[:, 1].min() - 1, features[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
grid = torch.tensor(np.c_[xx.ravel(), yy.ravel()], dtype=torch.float32)
probs = logistic_regression(grid, w, b).detach().numpy().reshape(xx.shape)

plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, probs, levels=[0, 0.5, 1], cmap='coolwarm', alpha=0.6)
plt.scatter(features[:, 0], features[:, 1], c=labels.flatten(), cmap='coolwarm', marker='o')

# Bayes decision boundary: the line where w_str . x + b_str = 0
x_vals = np.linspace(x_min, x_max, 100)
bayes_boundary = -(w_str[0] / w_str[1]) * x_vals - (b_str / w_str[1])
plt.plot(x_vals, bayes_boundary, color='green', linestyle='--', label='Bayes Decision Boundary')
plt.title('Logistic Regression vs. Bayes Decision Boundary')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.show()
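# Optional sanity check (an illustrative addition, not part of the exercise above):
# because the labels are a noiseless, linearly separable function of the features,
# only the *direction* of the weight vector is identifiable, not its scale, so
# compare the weight ratio of the scratch model against sklearn's
# LogisticRegression and against the true generating parameters.
sk_logreg = LogisticRegression()
sk_logreg.fit(features.numpy(), labels.numpy().ravel())

w_hat = w.detach().numpy().flatten()
print('scratch w1/w2:', w_hat[0] / w_hat[1])
print('sklearn w1/w2:', sk_logreg.coef_.ravel()[0] / sk_logreg.coef_.ravel()[1])
print('true    w1/w2:', (w_str[0] / w_str[1]).item())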
from google.colab import drive
drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/Weekly.csv'
weekly = pd.read_csv(file_path)
print(weekly.head())

summary_stats = weekly.describe()
print("Summary Statistics:\n", summary_stats)
print("Missing Values:\n", weekly.isnull().sum())

weekly.hist(figsize=(12, 10), bins=20)
plt.suptitle('Histograms of Weekly Data')
plt.show()

sns.pairplot(weekly, hue='Direction', diag_kind='kde', markers=['o', 's'])
plt.show()

plt.figure(figsize=(10, 6))
sns.boxplot(x='Year', y='Volume', data=weekly)
plt.title('Boxplot of Trading Volume Across Years')
plt.xticks(rotation=90)
plt.show()

# Convert 'Direction' column to numerical values
weekly['Direction'] = weekly['Direction'].map({'Up': 1, 'Down': 0})

plt.figure(figsize=(8, 6))
numeric_weekly = weekly.select_dtypes(include=np.number)
sns.heatmap(numeric_weekly.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap of Weekly Data')
plt.show()

# Logistic regression on the full data set with all five lags and Volume
X = weekly[['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']]
X = sm.add_constant(X)
y = weekly['Direction']

logit_model = sm.Logit(y, X)
result = logit_model.fit()
print(result.summary())

y_pred_prob = result.predict(X)
y_pred_class = (y_pred_prob >= 0.5).astype(int)
cm = confusion_matrix(y, y_pred_class)
print("Confusion Matrix:\n", cm)
accuracy = accuracy_score(y, y_pred_class)
print(f"Overall Accuracy: {accuracy:.4f}")

# Train on years through 2008, test on 2009-2010, with Lag2 as the only predictor
train_data = weekly[weekly['Year'] <= 2008]
test_data = weekly[weekly['Year'] > 2008]

X_train = train_data[['Lag2']]
y_train = train_data['Direction']
X_test = test_data[['Lag2']]
y_test = test_data['Direction']

# statsmodels needs an explicit intercept column; keep the augmented design
# matrices under separate names so the sklearn models below use the raw Lag2
X_train_sm = sm.add_constant(X_train)
X_test_sm = sm.add_constant(X_test)

logit_model = sm.Logit(y_train, X_train_sm)
result = logit_model.fit()
print(result.summary())

y_pred_prob = result.predict(X_test_sm)
y_pred_class = (y_pred_prob >= 0.5).astype(int)
cm = confusion_matrix(y_test, y_pred_class)
print("Confusion Matrix:\n", cm)
accuracy = accuracy_score(y_test, y_pred_class)
print(f"Overall Accuracy on Test Data: {accuracy:.4f}")

# Linear discriminant analysis
lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train)
y_pred_class = lda_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_class)
print("Confusion Matrix:\n", cm)
accuracy = accuracy_score(y_test, y_pred_class)
print(f"Overall Accuracy on Test Data: {accuracy:.4f}")

# Quadratic discriminant analysis
qda_model = QuadraticDiscriminantAnalysis()
qda_model.fit(X_train, y_train)
y_pred_class = qda_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_class)
print("Confusion Matrix:\n", cm)
accuracy = accuracy_score(y_test, y_pred_class)
print(f"Overall Accuracy on Test Data: {accuracy:.4f}")

# K-nearest neighbours with K = 1
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_model.fit(X_train, y_train)
y_pred_class = knn_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_class)
print("Confusion Matrix:\n", cm)
accuracy = accuracy_score(y_test, y_pred_class)
print(f"Overall Accuracy on Test Data: {accuracy:.4f}")

# Gaussian naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred_class = nb_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_class)
print("Confusion Matrix:\n", cm)
accuracy = accuracy_score(y_test, y_pred_class)
print(f"Overall Accuracy on Test Data: {accuracy:.4f}")
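# Optional summary (an illustrative addition): the confusion matrices above are
# raw counts; sklearn's classification_report turns the most recent set of test
# predictions (here the Gaussian naive Bayes model) into per-class precision,
# recall and F1.
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_class, target_names=['Down', 'Up']))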
# Search over predictor subsets, comparing logistic regression, LDA, QDA and KNN
train_data = weekly[weekly['Year'] <= 2008]
test_data = weekly[weekly['Year'] > 2008]
all_predictors = ['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']


def evaluate_model(model, X_train, X_test, y_train, y_test, name="Model"):
    """Fit a classifier, print its test confusion matrix and return its test accuracy."""
    model.fit(X_train, y_train)
    y_pred_class = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred_class)
    accuracy = accuracy_score(y_test, y_pred_class)
    print(f"\n{name}")
    print("Confusion Matrix:\n", cm)
    print(f"Overall Accuracy on Test Data: {accuracy:.4f}")
    return accuracy


def test_combinations(predictors):
    """Fit each model family on one subset of predictors and collect test accuracies."""
    X_train = train_data[predictors]
    y_train = train_data['Direction']
    X_test = test_data[predictors]
    y_test = test_data['Direction']
    print(f"Testing with predictors: {predictors}")

    # Logistic Regression
    log_reg = LogisticRegression(max_iter=1000)
    log_reg_acc = evaluate_model(log_reg, X_train, X_test, y_train, y_test, name="Logistic Regression")

    # LDA
    lda = LinearDiscriminantAnalysis()
    lda_acc = evaluate_model(lda, X_train, X_test, y_train, y_test, name="LDA")

    # QDA
    qda = QuadraticDiscriminantAnalysis()
    qda_acc = evaluate_model(qda, X_train, X_test, y_train, y_test, name="QDA")

    # KNN: keep the best accuracy across the values of K tried
    knn_acc = 0.0
    for k in [1, 3, 5, 7, 9]:
        acc = evaluate_model(KNeighborsClassifier(n_neighbors=k),
                             X_train, X_test, y_train, y_test, name=f"KNN with K={k}")
        knn_acc = max(knn_acc, acc)

    return {'predictors': predictors,
            'log_reg_acc': log_reg_acc,
            'lda_acc': lda_acc,
            'qda_acc': qda_acc,
            'knn_acc': knn_acc}


def generate_combinations():
    """Evaluate every non-empty subset of the candidate predictors."""
    best_models = []
    for r in range(1, len(all_predictors) + 1):
        for subset in itertools.combinations(all_predictors, r):
            result = test_combinations(list(subset))
            best_models.append(result)
    return best_models


best_results = generate_combinations()
best_model = max(best_results,
                 key=lambda x: max(x['log_reg_acc'], x['lda_acc'], x['qda_acc'], x['knn_acc']))
print("\nBest model based on accuracy:")
print(best_model)
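# Optional inspection (an illustrative addition): collect the per-combination
# results into a DataFrame and sort by the best accuracy achieved by any of the
# four model families, to see which predictor subsets performed well overall.
results_df = pd.DataFrame(best_results)
results_df['best_acc'] = results_df[['log_reg_acc', 'lda_acc', 'qda_acc', 'knn_acc']].max(axis=1)
print(results_df.sort_values('best_acc', ascending=False).head(10))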