from IPython.core.interactiveshell import InteractiveShell

# Notebook setting: evaluate/display every top-level expression in a cell,
# not only the last one. The bare `type(...)` / `len(...)` expressions below
# rely on this to show their values in an interactive session.
InteractiveShell.ast_node_interactivity = "all"

# pip install gensim
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
import nltk
from nltk.corpus import movie_reviews

nltk.download('movie_reviews')
nltk.download('punkt')

# Load movie reviews from nltk as (list_of_words, category) pairs.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# Shuffle the documents in place.
# NOTE: no NumPy seed is set in this script, so the order differs between runs
# even though train_test_split below uses a fixed random_state.
np.random.shuffle(documents)

# Exploratory inspection of the loaded corpus.
type(documents)
len(documents)
print(documents[0])
type(documents[0])
len(documents[0])
type(documents[0][0])
type(documents[0][1])
print(documents[0][0])
print(documents[0][1])

# Tokenization and preprocessing could also be more sophisticated.
# Labels: 1 for the 'pos' category, 0 for anything else.
X, y = zip(*[(words, 1 if category == 'pos' else 0) for words, category in documents])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Exploratory inspection of the split.
type(X_train)
len(X_train)
print(X_train[0])
type(X_train[0])
len(X_train[0])
type(X_train[0][0])
type(X_train[0][1])
len(X_test)

# Train a Word2Vec model on the training documents only.
model = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=1, workers=4)
print(model)


def document_vector(doc):
    """Return the mean Word2Vec embedding of the words in *doc*.

    Words that are not in the vocabulary of the module-level ``model`` are
    ignored. If no word of *doc* is in the vocabulary (including an empty
    *doc*), a zero vector of length ``model.vector_size`` is returned.

    Args:
        doc: Iterable of word tokens.

    Returns:
        A 1-D NumPy array of length ``model.vector_size``.
    """
    # Remove out-of-vocabulary words. `key_to_index` is a dict, so each
    # membership test is a hash lookup rather than a scan of the
    # `index_to_key` list for every token.
    vocabulary = model.wv.key_to_index
    known_words = [word for word in doc if word in vocabulary]
    if not known_words:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[known_words], axis=0)


# Apply the function to each document
X_train_vec = np.array([document_vector(doc) for doc in X_train])
X_test_vec = np.array([document_vector(doc) for doc in X_test])

# Train a logistic regression classifier
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_vec, y_train)

# Predict the labels for the test set
y_pred = classifier.predict(X_test_vec)

# Calculate the accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")