from IPython.core.interactiveshell import InteractiveShell
# Notebook setting: display the value of every bare expression statement in a
# cell, not only the last one. The type(...)/len(...) inspection lines further
# down rely on this to produce output.
InteractiveShell.ast_node_interactivity = "all"

# Requires gensim: pip install gensim

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
import nltk
from nltk.corpus import movie_reviews
# Fetch the movie_reviews corpus that is read further down.
nltk.download('movie_reviews')
# NOTE(review): no punkt-based tokenizer call is visible in this section of the
# file -- confirm whether this download is still needed.
nltk.download('punkt')

# Load the NLTK movie-review corpus as (token_list, category) pairs, one pair
# per review file.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle in place with a fixed seed. The train/test split further down uses
# random_state=42, but that only pins the *indices* it selects; an unseeded
# shuffle here would put different reviews at those indices on every run.
np.random.default_rng(42).shuffle(documents)


# Quick look at the container: a list with one entry per review.
type(documents)
len(documents)
print(documents[0])
type(documents[0])
len(documents[0])

# Each entry holds the token list first and the category second.
type(documents[0][0])
type(documents[0][1])

print(documents[0][0])
print(documents[0][1])

# Separate tokens from labels, encoding the target as an integer: 1 for 'pos',
# 0 for any other category. The corpus tokens are used as-is; tokenization and
# preprocessing could be more sophisticated.
X, y = zip(*((tokens, int(label == 'pos')) for tokens, label in documents))

# Hold out a quarter of the reviews for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Inspect the training partition and its first review.
type(X_train)
len(X_train)
print(X_train[0])
type(X_train[0])
len(X_train[0])

# Individual tokens of the first training review.
type(X_train[0][0])
type(X_train[0][1])

len(X_test)

# Fit Word2Vec embeddings on the training reviews only; each review's token
# list is passed as one sentence.
model = Word2Vec(
    sentences=X_train,
    vector_size=100,  # embedding dimensionality
    window=5,         # context window size
    min_count=1,      # keep every word, including those seen only once
    workers=4,        # training threads
)

print(model)

def document_vector(doc, w2v=None):
    """Return the mean embedding of the in-vocabulary tokens of ``doc``.

    Args:
        doc: Iterable of token strings.
        w2v: Trained model exposing ``.wv`` (with a ``key_to_index`` mapping
            and list-of-keys indexing) and ``.vector_size``. Defaults to the
            module-level ``model`` so existing one-argument calls keep working.

    Returns:
        A 1-D array of length ``vector_size``: the element-wise mean of the
        vectors of the known tokens, or all zeros when no token is known.
    """
    w2v = model if w2v is None else w2v
    # key_to_index is a dict, so each membership test is O(1); testing against
    # the index_to_key list would rescan the whole vocabulary for every token.
    vocab = w2v.wv.key_to_index
    known = [word for word in doc if word in vocab]
    if not known:
        return np.zeros(w2v.vector_size)
    return np.mean(w2v.wv[known], axis=0)

# Embed every review as a single vector, one row per document
X_train_vec = np.array(list(map(document_vector, X_train)))
X_test_vec = np.array(list(map(document_vector, X_test)))


# Fit logistic regression on the training embeddings
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_vec, y_train)

# Label the held-out reviews
y_pred = classifier.predict(X_test_vec)

# Share of held-out reviews whose predicted label matches the true one
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")