# Notebook display setting: show the value of every top-level expression in a
# cell, not only the last one (the inspection cells further down rely on this).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# gensim must be installed before running this notebook:
#pip install gensim
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
import nltk
from nltk.corpus import movie_reviews
# Download the NLTK data packages: the movie_reviews corpus and the punkt
# tokenizer models.
nltk.download('movie_reviews')
nltk.download('punkt')
[nltk_data] Downloading package movie_reviews to /root/nltk_data... [nltk_data] Unzipping corpora/movie_reviews.zip. [nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Unzipping tokenizers/punkt.zip.
True
Load the movie reviews from NLTK, preprocess them, and prepare training and test datasets.
# Gather every review of the NLTK movie_reviews corpus together with its category
def _labelled_reviews():
    """Yield a (token list, category) pair for each file in the corpus."""
    for category in movie_reviews.categories():
        for fileid in movie_reviews.fileids(category):
            yield list(movie_reviews.words(fileid)), category


documents = list(_labelled_reviews())
# Reviews are collected category by category, so randomise their order in place
np.random.shuffle(documents)
# Inspect the loaded data: the container first, then the first
# (tokens, category) pair and each of its two parts.
type(documents)
len(documents)
print(documents[0])
type(documents[0])
len(documents[0])
# Element 0 of a pair is the token list, element 1 is the category label
type(documents[0][0])
type(documents[0][1])
print(documents[0][0])
print(documents[0][1])
list
2000
(['contact', '(', 'pg', ')', 'there', "'", 's', 'a', 'moment', 'late', 'in', 'robert', 'zemeckis', "'", 's', 'contact', 'where', 'i', 'was', 'reminded', 'of', 'why', 'i', 'started', 'writing', 'movie', 'reviews', 'in', 'the', 'first', 'place', '.', 'we', 'see', 'a', 'scientist', ',', 'dressed', 'in', 'a', 'silvery', 'space', 'suit', ',', 'walking', 'tentatively', 'across', 'a', 'narrow', 'walkway', 'leading', 'inside', 'a', 'compact', ',', 'spherical', 'space', 'pod', ',', 'unaware', 'of', 'what', 'awaits', 'when', 'the', 'ball', 'literally', 'drops', '.', 'anticipation', ',', 'excitement', ',', 'anxiety', ',', 'fear', '--', 'the', 'audience', 'experiences', 'it', 'all', 'the', 'emotional', 'tension', 'right', 'with', 'the', 'character', ',', 'nervously', ',', 'breathlessly', 'eager', 'to', 'see', 'what', 'lies', 'ahead', '.', 'it', 'is', 'this', 'sense', 'of', 'discovery', ',', 'the', 'anticipation', 'of', 'which', 'and', 'its', 'accompanying', 'exhilaration', ',', 'that', 'makes', 'this', 'adaptation', 'of', 'the', 'carl', 'sagan', 'novel', 'such', 'magical', ',', 'captivating', 'entertainment', '.', 'jodie', 'foster', 'stars', 'as', 'dr', '.', 'ellie', 'arroway', ',', 'a', 'brilliant', 'astronomer', 'who', 'dedicates', 'her', 'entire', 'life', 'to', 'searching', 'outer', 'space', 'for', 'extraterrestrial', 'radio', 'signals', '.', 'and', 'i', 'mean', 'life', '--', 'after', 'losing', 'her', 'entire', 'family', 'when', 'she', 'was', 'young', ',', 'the', 'only', 'thing', 'occupying', 'ellie', "'", 's', 'world', 'is', 'this', 'quest', 'to', 'discover', 'life', 'beyond', 'this', 'earth', '.', 'after', 'dealing', 'with', 'much', 'skepticism', 'on', 'the', 'part', 'of', 'government', 'officials', 'and', 'wealthy', 'financiers', ',', 'ellie', 'receives', 'her', 'vindication', 'when', 'she', 'stumbles', 'upon', 'an', 'incoming', 'radio', 'transmission', 'from', 'the', 'distant', 'star', 'vega', ',', 'which', 'includes', 'instructions', 'on', 'building', 'an', 
'interstellar', 'transport', '.', 'from', 'this', 'synopsis', ',', 'contact', 'does', 'not', 'sound', 'too', 'different', 'to', 'most', 'films', 'about', 'alien', 'contact', ',', 'but', 'there', 'is', 'a', 'whole', 'lot', 'more', 'to', 'this', 'intelligent', 'film', 'than', 'the', 'sci', '-', 'fi', 'hook', '.', 'the', 'alien', 'contact', 'angle', 'generates', 'a', 'great', 'amount', 'of', 'suspense', 'and', 'awe', ',', 'but', 'perhaps', 'more', 'than', 'anything', 'else', ',', 'contact', 'is', 'a', 'character', 'study', 'of', 'ellie', ',', 'whose', 'obsession', 'with', 'empirical', ',', 'scientific', 'evidence', 'has', 'erased', 'all', 'belief', 'in', 'a', 'higher', 'power', '.', 'the', 'irony', 'is', 'that', ',', 'while', 'admitting', 'to', 'having', 'no', 'religious', 'faith', ',', 'she', 'holds', 'onto', 'her', 'belief', 'in', 'extraterrestrial', 'life', 'with', 'such', 'passion', 'and', 'conviction', 'that', 'it', 'becomes', ',', 'in', 'a', 'sense', ',', 'a', 'religion', 'in', 'its', 'own', 'right', '.', 'it', 'would', 'be', 'easy', 'for', 'scripters', 'james', 'v', '.', 'hart', 'and', 'michael', 'goldenberg', ',', 'in', 'trying', 'to', 'paint', 'a', 'positive', 'image', 'of', 'the', 'heroine', ',', 'to', 'champion', 'her', 'scientific', 'beliefs', 'over', 'religious', 'ones', ',', 'but', 'they', 'wisely', 'eschew', 'easy', 'answers', ',', 'giving', 'equal', 'time', 'to', 'both', 'sides', ',', 'and', 'in', 'so', 'doing', 'depict', 'ellie', 'as', 'not', 'completely', 'sane', '.', 'in', 'the', 'end', ',', 'there', 'is', 'no', 'right', 'or', 'wrong', ',', 'nor', 'is', 'there', 'one', 'side', 'that', 'comes', 'off', 'more', 'positive', 'in', 'the', 'other', ',', 'even', 'slightly', 'so', '--', 'there', 'are', 'just', 'two', 'very', 'viable', 'points', 'of', 'view', ',', 'each', 'with', 'their', 'own', 'merits', ',', 'each', 'with', 'their', 'own', 'faults', '.', 'the', 'complex', 'role', 'of', 'ellie', 'is', 'an', 'actress', "'", 's', 'dream', ',', 'and', 'foster', 
',', 'a', 'virtual', 'shoo', '-', 'in', 'for', 'yet', 'another', 'best', 'actress', 'oscar', 'nomination', 'next', 'year', ',', 'more', 'than', 'rises', 'to', 'the', 'challenge', '.', 'she', 'conveys', 'intelligence', ',', 'determination', ',', 'warmth', ',', 'and', ',', 'in', 'a', 'gutsy', 'move', ',', 'always', 'on', 'edge', '.', 'we', 'root', 'for', 'ellie', 'and', 'feel', 'for', 'her', ',', 'but', 'we', 'also', 'feel', 'at', 'times', 'that', 'she', 'goes', 'too', 'far', '.', 'contact', 'is', 'clearly', 'foster', "'", 's', 'vehicle', ',', 'but', 'others', 'are', 'given', 'their', 'chance', 'to', 'shine', 'in', 'smaller', 'roles', '.', 'matthew', 'mcconaughey', ',', 'who', 'receives', 'outrageously', 'high', 'billing', 'for', 'his', 'smallish', 'role', ',', 'holds', 'his', 'own', 'as', 'the', 'religious', 'counterpoint', 'to', 'ellie', ',', 'spiritual', 'scholar', 'and', 'government', 'adviser', 'palmer', 'joss', '(', 'however', ',', 'his', 'main', 'storyline', ',', 'the', 'tentative', 'palmer', '-', 'ellie', 'romance', ',', 'is', 'the', 'film', "'", 's', 'weakest', 'subplot', ')', '.', 'john', 'hurt', 'is', 'effectively', 'creepy', 'as', 's', '.', 'r', '.', 'hadden', ',', 'the', 'wealthy', 'eccentric', 'who', 'provides', 'ellie', 'with', 'her', 'research', 'funding', '.', 'angela', 'bassett', 'continues', 'to', 'impress', 'in', 'her', 'bit', 'role', 'as', 'white', 'house', 'aide', 'rachel', 'constantine', '.', 'most', 'memorable', 'of', 'all', ',', 'though', ',', 'are', 'tom', 'skerritt', 'and', 'james', 'woods', ',', 'who', 'play', 'rival', 'scientist', 'dr', '.', 'david', 'drumlin', 'and', 'national', 'security', 'adviser', 'michael', 'litz', ',', 'respectively', ';', 'both', ',', 'especially', 'skerritt', ',', 'embody', 'these', 'asshole', 'characters', 'that', 'the', 'audience', 'hissed', 'just', 'about', 'every', 'single', 'one', 'of', 'their', 'appearances', '.', 'zemeckis', 'comes', 'off', 'of', 'his', 'three', '-', 'year', 'break', 'in', 'top', 'shape', 
'.', 'always', 'known', 'as', 'a', 'director', 'of', 'effects', '-', 'laden', 'extravaganzas', ',', 'it', 'comes', 'as', 'no', 'surprise', 'that', 'contact', "'", 's', 'visual', 'effects', 'are', 'quite', 'stunning', '.', 'the', 'central', 'space', 'journey', 'is', 'more', 'than', 'a', 'little', 'reminiscent', 'of', 'the', 'close', 'of', '2001', ':', 'a', 'space', 'odyssey', ',', 'but', 'with', 'more', 'advanced', 'technology', 'at', 'his', 'disposal', ',', 'zemeckis', "'", 's', 'voyage', 'is', 'even', 'trippier', 'than', 'stanley', 'kubrick', "'", 's', 'yet', 'more', 'wondrously', 'pure', '.', 'and', 'zemeckis', 'doesn', "'", 't', 'resist', 'the', 'urge', 'to', 'use', 'the', 'always', '-', 'interesting', 'incorporate', '-', 'actors', '-', 'into', '-', 'existing', '-', 'film', '-', 'footage', 'effect', ',', 'which', 'is', 'every', 'bit', 'as', 'seamless', 'here', 'as', 'it', 'was', 'in', 'forrest', 'gump', '.', 'effects', ',', 'however', ',', 'are', 'confined', 'to', 'only', 'a', 'few', 'scenes', 'and', 'clearly', 'take', 'a', 'back', 'seat', 'to', 'the', 'drama', ',', 'emotion', ',', 'and', 'pure', 'wonder', ',', 'which', 'zemeckis', 'proved', 'to', 'be', 'quite', 'adept', 'at', 'in', 'gump', '.', 'it', 'says', 'a', 'lot', 'that', ',', 'in', 'a', 'summer', 'science', 'fiction', 'film', 'such', 'as', 'this', ',', 'it', "'", 's', 'not', 'so', 'much', 'the', 'effects', 'that', 'stay', 'with', 'you', 'as', 'it', 'is', 'the', 'drama', 'and', 'the', 'issues', 'that', 'are', 'raised', '.', 'the', 'thought', '-', 'provoking', ',', 'two', '-', 'hour', '-', 'plus', 'contact', 'is', 'a', 'much', '-', 'welcome', 'change', 'of', 'pace', 'from', 'summer', 'no', '-', 'brainers', ',', 'but', 'the', 'fact', 'that', 'it', 'is', 'a', 'smart', 'film', 'does', 'not', 'mean', 'that', 'it', 'also', 'isn', "'", 't', 'entertaining', '.', 'for', 'all', 'the', 'interesting', 'questions', 'it', 'asks', ',', 'the', 'film', 'is', 'still', 'what', 'it', "'", 's', 'being', 'sold', 'as', '--', 
'"', 'a', 'journey', 'to', 'the', 'heart', 'of', 'the', 'universe', '.', '"', 'and', 'what', 'a', 'fascinating', ',', 'unforgettable', 'journey', 'it', 'is', '.'], 'pos')
tuple
2
list
str
['contact', '(', 'pg', ')', 'there', "'", 's', 'a', 'moment', 'late', 'in', 'robert', 'zemeckis', "'", 's', 'contact', 'where', 'i', 'was', 'reminded', 'of', 'why', 'i', 'started', 'writing', 'movie', 'reviews', 'in', 'the', 'first', 'place', '.', 'we', 'see', 'a', 'scientist', ',', 'dressed', 'in', 'a', 'silvery', 'space', 'suit', ',', 'walking', 'tentatively', 'across', 'a', 'narrow', 'walkway', 'leading', 'inside', 'a', 'compact', ',', 'spherical', 'space', 'pod', ',', 'unaware', 'of', 'what', 'awaits', 'when', 'the', 'ball', 'literally', 'drops', '.', 'anticipation', ',', 'excitement', ',', 'anxiety', ',', 'fear', '--', 'the', 'audience', 'experiences', 'it', 'all', 'the', 'emotional', 'tension', 'right', 'with', 'the', 'character', ',', 'nervously', ',', 'breathlessly', 'eager', 'to', 'see', 'what', 'lies', 'ahead', '.', 'it', 'is', 'this', 'sense', 'of', 'discovery', ',', 'the', 'anticipation', 'of', 'which', 'and', 'its', 'accompanying', 'exhilaration', ',', 'that', 'makes', 'this', 'adaptation', 'of', 'the', 'carl', 'sagan', 'novel', 'such', 'magical', ',', 'captivating', 'entertainment', '.', 'jodie', 'foster', 'stars', 'as', 'dr', '.', 'ellie', 'arroway', ',', 'a', 'brilliant', 'astronomer', 'who', 'dedicates', 'her', 'entire', 'life', 'to', 'searching', 'outer', 'space', 'for', 'extraterrestrial', 'radio', 'signals', '.', 'and', 'i', 'mean', 'life', '--', 'after', 'losing', 'her', 'entire', 'family', 'when', 'she', 'was', 'young', ',', 'the', 'only', 'thing', 'occupying', 'ellie', "'", 's', 'world', 'is', 'this', 'quest', 'to', 'discover', 'life', 'beyond', 'this', 'earth', '.', 'after', 'dealing', 'with', 'much', 'skepticism', 'on', 'the', 'part', 'of', 'government', 'officials', 'and', 'wealthy', 'financiers', ',', 'ellie', 'receives', 'her', 'vindication', 'when', 'she', 'stumbles', 'upon', 'an', 'incoming', 'radio', 'transmission', 'from', 'the', 'distant', 'star', 'vega', ',', 'which', 'includes', 'instructions', 'on', 'building', 'an', 
'interstellar', 'transport', '.', 'from', 'this', 'synopsis', ',', 'contact', 'does', 'not', 'sound', 'too', 'different', 'to', 'most', 'films', 'about', 'alien', 'contact', ',', 'but', 'there', 'is', 'a', 'whole', 'lot', 'more', 'to', 'this', 'intelligent', 'film', 'than', 'the', 'sci', '-', 'fi', 'hook', '.', 'the', 'alien', 'contact', 'angle', 'generates', 'a', 'great', 'amount', 'of', 'suspense', 'and', 'awe', ',', 'but', 'perhaps', 'more', 'than', 'anything', 'else', ',', 'contact', 'is', 'a', 'character', 'study', 'of', 'ellie', ',', 'whose', 'obsession', 'with', 'empirical', ',', 'scientific', 'evidence', 'has', 'erased', 'all', 'belief', 'in', 'a', 'higher', 'power', '.', 'the', 'irony', 'is', 'that', ',', 'while', 'admitting', 'to', 'having', 'no', 'religious', 'faith', ',', 'she', 'holds', 'onto', 'her', 'belief', 'in', 'extraterrestrial', 'life', 'with', 'such', 'passion', 'and', 'conviction', 'that', 'it', 'becomes', ',', 'in', 'a', 'sense', ',', 'a', 'religion', 'in', 'its', 'own', 'right', '.', 'it', 'would', 'be', 'easy', 'for', 'scripters', 'james', 'v', '.', 'hart', 'and', 'michael', 'goldenberg', ',', 'in', 'trying', 'to', 'paint', 'a', 'positive', 'image', 'of', 'the', 'heroine', ',', 'to', 'champion', 'her', 'scientific', 'beliefs', 'over', 'religious', 'ones', ',', 'but', 'they', 'wisely', 'eschew', 'easy', 'answers', ',', 'giving', 'equal', 'time', 'to', 'both', 'sides', ',', 'and', 'in', 'so', 'doing', 'depict', 'ellie', 'as', 'not', 'completely', 'sane', '.', 'in', 'the', 'end', ',', 'there', 'is', 'no', 'right', 'or', 'wrong', ',', 'nor', 'is', 'there', 'one', 'side', 'that', 'comes', 'off', 'more', 'positive', 'in', 'the', 'other', ',', 'even', 'slightly', 'so', '--', 'there', 'are', 'just', 'two', 'very', 'viable', 'points', 'of', 'view', ',', 'each', 'with', 'their', 'own', 'merits', ',', 'each', 'with', 'their', 'own', 'faults', '.', 'the', 'complex', 'role', 'of', 'ellie', 'is', 'an', 'actress', "'", 's', 'dream', ',', 'and', 'foster', 
',', 'a', 'virtual', 'shoo', '-', 'in', 'for', 'yet', 'another', 'best', 'actress', 'oscar', 'nomination', 'next', 'year', ',', 'more', 'than', 'rises', 'to', 'the', 'challenge', '.', 'she', 'conveys', 'intelligence', ',', 'determination', ',', 'warmth', ',', 'and', ',', 'in', 'a', 'gutsy', 'move', ',', 'always', 'on', 'edge', '.', 'we', 'root', 'for', 'ellie', 'and', 'feel', 'for', 'her', ',', 'but', 'we', 'also', 'feel', 'at', 'times', 'that', 'she', 'goes', 'too', 'far', '.', 'contact', 'is', 'clearly', 'foster', "'", 's', 'vehicle', ',', 'but', 'others', 'are', 'given', 'their', 'chance', 'to', 'shine', 'in', 'smaller', 'roles', '.', 'matthew', 'mcconaughey', ',', 'who', 'receives', 'outrageously', 'high', 'billing', 'for', 'his', 'smallish', 'role', ',', 'holds', 'his', 'own', 'as', 'the', 'religious', 'counterpoint', 'to', 'ellie', ',', 'spiritual', 'scholar', 'and', 'government', 'adviser', 'palmer', 'joss', '(', 'however', ',', 'his', 'main', 'storyline', ',', 'the', 'tentative', 'palmer', '-', 'ellie', 'romance', ',', 'is', 'the', 'film', "'", 's', 'weakest', 'subplot', ')', '.', 'john', 'hurt', 'is', 'effectively', 'creepy', 'as', 's', '.', 'r', '.', 'hadden', ',', 'the', 'wealthy', 'eccentric', 'who', 'provides', 'ellie', 'with', 'her', 'research', 'funding', '.', 'angela', 'bassett', 'continues', 'to', 'impress', 'in', 'her', 'bit', 'role', 'as', 'white', 'house', 'aide', 'rachel', 'constantine', '.', 'most', 'memorable', 'of', 'all', ',', 'though', ',', 'are', 'tom', 'skerritt', 'and', 'james', 'woods', ',', 'who', 'play', 'rival', 'scientist', 'dr', '.', 'david', 'drumlin', 'and', 'national', 'security', 'adviser', 'michael', 'litz', ',', 'respectively', ';', 'both', ',', 'especially', 'skerritt', ',', 'embody', 'these', 'asshole', 'characters', 'that', 'the', 'audience', 'hissed', 'just', 'about', 'every', 'single', 'one', 'of', 'their', 'appearances', '.', 'zemeckis', 'comes', 'off', 'of', 'his', 'three', '-', 'year', 'break', 'in', 'top', 'shape', 
'.', 'always', 'known', 'as', 'a', 'director', 'of', 'effects', '-', 'laden', 'extravaganzas', ',', 'it', 'comes', 'as', 'no', 'surprise', 'that', 'contact', "'", 's', 'visual', 'effects', 'are', 'quite', 'stunning', '.', 'the', 'central', 'space', 'journey', 'is', 'more', 'than', 'a', 'little', 'reminiscent', 'of', 'the', 'close', 'of', '2001', ':', 'a', 'space', 'odyssey', ',', 'but', 'with', 'more', 'advanced', 'technology', 'at', 'his', 'disposal', ',', 'zemeckis', "'", 's', 'voyage', 'is', 'even', 'trippier', 'than', 'stanley', 'kubrick', "'", 's', 'yet', 'more', 'wondrously', 'pure', '.', 'and', 'zemeckis', 'doesn', "'", 't', 'resist', 'the', 'urge', 'to', 'use', 'the', 'always', '-', 'interesting', 'incorporate', '-', 'actors', '-', 'into', '-', 'existing', '-', 'film', '-', 'footage', 'effect', ',', 'which', 'is', 'every', 'bit', 'as', 'seamless', 'here', 'as', 'it', 'was', 'in', 'forrest', 'gump', '.', 'effects', ',', 'however', ',', 'are', 'confined', 'to', 'only', 'a', 'few', 'scenes', 'and', 'clearly', 'take', 'a', 'back', 'seat', 'to', 'the', 'drama', ',', 'emotion', ',', 'and', 'pure', 'wonder', ',', 'which', 'zemeckis', 'proved', 'to', 'be', 'quite', 'adept', 'at', 'in', 'gump', '.', 'it', 'says', 'a', 'lot', 'that', ',', 'in', 'a', 'summer', 'science', 'fiction', 'film', 'such', 'as', 'this', ',', 'it', "'", 's', 'not', 'so', 'much', 'the', 'effects', 'that', 'stay', 'with', 'you', 'as', 'it', 'is', 'the', 'drama', 'and', 'the', 'issues', 'that', 'are', 'raised', '.', 'the', 'thought', '-', 'provoking', ',', 'two', '-', 'hour', '-', 'plus', 'contact', 'is', 'a', 'much', '-', 'welcome', 'change', 'of', 'pace', 'from', 'summer', 'no', '-', 'brainers', ',', 'but', 'the', 'fact', 'that', 'it', 'is', 'a', 'smart', 'film', 'does', 'not', 'mean', 'that', 'it', 'also', 'isn', "'", 't', 'entertaining', '.', 'for', 'all', 'the', 'interesting', 'questions', 'it', 'asks', ',', 'the', 'film', 'is', 'still', 'what', 'it', "'", 's', 'being', 'sold', 'as', '--', 
'"', 'a', 'journey', 'to', 'the', 'heart', 'of', 'the', 'universe', '.', '"', 'and', 'what', 'a', 'fascinating', ',', 'unforgettable', 'journey', 'it', 'is', '.'] pos
# Tokenization and preprocessing could also be more sophisticated
# Separate token lists from labels: 'pos' is encoded as 1, any other category as 0
X, y = zip(*((tokens, int(label == 'pos')) for tokens, label in documents))
# Hold out a quarter of the reviews for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
# Inspect the training split: container type, number of reviews, and the
# first review's tokens.
type(X_train)
len(X_train)
print(X_train[0])
type(X_train[0])
# Number of tokens in the first training review
len(X_train[0])
type(X_train[0][0])
type(X_train[0][1])
# Number of reviews held out for testing
len(X_test)
list
1500
['susan', 'granger', "'", 's', 'review', 'of', '"', 'bread', 'and', 'tulips', '"', '(', 'first', 'look', 'pictures', ')', 'in', 'this', 'delightfully', 'frothy', 'italian', 'romantic', 'comedy', ',', 'after', 'accidentally', 'being', 'left', 'behind', 'by', 'a', 'tour', 'bus', 'while', 'on', 'a', 'family', 'vacation', 'with', 'her', 'cranky', 'husband', 'and', 'two', 'cynical', 'teenagers', ',', 'rosalba', '(', 'licia', 'maglietta', ')', ',', 'an', 'unhappy', 'housewife', 'from', 'pescara', ',', 'finds', 'herself', '-', 'and', 'love', '-', 'in', 'venice', '.', 'for', 'the', 'first', 'time', 'in', 'years', ',', 'rosalba', "'", 's', 'on', 'her', 'own', 'when', 'she', "'", 's', 'abandoned', 'at', 'a', 'highway', 'rest', 'area', '.', 'although', 'her', 'philandering', 'husband', '(', 'antonio', 'catania', ')', ',', 'a', 'plumbing', '-', 'supply', 'dealer', ',', 'orders', 'her', 'to', 'stay', 'there', 'until', 'she', "'", 's', 'picked', 'up', ',', 'she', 'impulsively', 'accepts', 'a', 'ride', 'to', 'venice', ',', 'a', 'bohemian', 'paradise', 'which', 'she', "'", 's', 'never', 'visited', '.', 'rosalba', 'finds', 'refuge', 'and', 'romance', 'with', 'fernando', '(', 'bruno', 'ganz', ')', ',', 'a', 'gruff', 'icelandic', 'waiter', 'who', 'offers', 'her', 'a', 'spare', 'room', 'in', 'his', 'modest', 'apartment', 'and', 'prepares', 'breakfast', 'for', 'her', 'each', 'morning', '.', 'to', 'support', 'herself', ',', 'she', 'gets', 'a', 'job', 'working', 'with', 'a', 'florist', '(', 'antonio', 'catania', ')', '.', 'film', '-', 'maker', 'silvio', 'soldini', 'gently', 'explores', 'the', 'blossoming', 'of', 'this', 'bored', ',', 'middle', '-', 'aged', ',', 'middle', '-', 'class', 'woman', 'with', 'warmth', 'and', 'affection', ',', 'savoring', 'special', 'moments', 'such', 'as', 'when', 'rosalba', 'starts', 'playing', 'the', 'accordion', 'again', 'and', 'abandons', 'her', 'maroon', 'stretch', 'pants', ',', 'silver', 'jacket', 'and', 'orange', 'sneakers', 'for', 'a', 'simple', ',', 
'new', 'red', '-', 'and', '-', 'white', 'dress', 'with', 'platform', '-', 'soled', 'espadrilles', '.', 'the', 'superb', 'actors', 'slip', 'into', 'their', 'roles', 'seamlessly', ',', 'particularly', 'luminous', 'licia', 'maglietta', 'and', 'low', '-', 'key', 'bruno', 'ganz', ',', 'along', 'with', 'marina', 'massironi', 'as', 'her', 'nosy', 'massage', '-', 'therapist', 'neighbor', 'and', 'giuseppe', 'massironi', 'as', 'the', 'inept', 'plumber', '-', 'turned', '-', 'private', 'eye', 'who', "'", 's', 'sent', 'to', 'retrieve', 'her', 'on', 'orders', 'from', 'her', 'frantic', 'husband', '-', 'who', "'", 's', 'discovered', 'that', 'his', 'mistress', 'has', 'no', 'interest', 'in', 'doing', 'his', 'laundry', 'or', 'cleaning', 'the', 'house', '.', 'on', 'the', 'granger', 'movie', 'gauge', 'of', '1', 'to', '10', ',', '"', 'bread', 'and', 'tulips', '"', 'is', 'a', 'beguiling', ',', 'escapist', '8', '.', 'as', 'the', 'summer', 'ends', ',', 'it', "'", 's', 'a', 'magical', 'getaway', 'for', 'mature', 'audiences', '.', '.']
list
369
str
str
500
# Fit word embeddings on the training reviews only
model = Word2Vec(
    sentences=X_train,  # one token list per review
    vector_size=100,    # dimensionality of each word vector
    window=5,           # maximum distance between current and predicted word
    min_count=1,        # keep every word, however rare
    workers=4,          # number of training threads
)
The above code initializes and trains a Word2Vec model using the gensim library in Python.
Let's break down what each parameter means and how the model is trained:
Word2Vec parameters
sentences (X_train): This is the input data for the model. It should be a list of lists of tokens, i.e., each inner list is a sequence of words (tokens) from a single document. In the context of your code, X_train contains the tokenized text data that will be used to train the Word2Vec model.
vector_size (100): This parameter sets the dimensionality of the word vectors. In this case, each word will be represented as a 100-dimensional vector. The size of the vector is a key hyperparameter and can affect both the quality of the representations and the performance of the model, with larger sizes generally capturing more information about the word but requiring more data to train effectively.
window (5): This parameter specifies the maximum distance between the current and predicted word within a sentence. In other words, for a given target word, the model will consider up to 5 words before and after it as context words. The window size affects how much contextual information the model considers and can influence how well the model learns relationships between words that occur in broader contexts.
min_count (1): This parameter sets the minimum frequency a word must have to be included in the vocabulary. Here, it is set to 1, meaning that the model will consider all words that appear at least once in the corpus, so no word is discarded. Raising this threshold reduces the size of the model by ignoring rare words, which often occur as noise or are too infrequent to provide meaningful information.
workers (4): This is the number of worker threads to use for training the model, which can speed up the training process on multi-core machines. Here, it is set to use 4 threads, enabling parallel processing to train the model faster.
How Word2Vec is trained
The model uses the input sentences (X_train) to learn vector representations for each word in a way that similar words (in terms of context) have similar vectors. It does this either through the CBOW or Skip-Gram architecture, depending on how it's configured (default is CBOW if not specified).
After training, the model object (model) will have learned word embeddings for all words in the corpus that meet the min_count criterion. These embeddings can be accessed via model.wv, and can be used to check word similarities, as input features for machine learning models, or for any task that might benefit from a numerical representation of text data.
The trained Word2Vec model is particularly useful for NLP tasks where the semantic meaning of words and their relationships significantly influence the outcome, such as sentiment analysis, recommendation systems, and more.
# Print the trained model's string representation as a quick sanity check
print(model)
Word2Vec<vocab=35363, vector_size=100, alpha=0.025>
Convert each review into a vector by averaging the vectors of the words in the review.
def document_vector(doc):
    """Return the mean Word2Vec embedding of a tokenized document.

    Args:
        doc: Iterable of token strings.

    Returns:
        A 1-D array of length ``model.vector_size``: the average of the
        vectors of the tokens that are in the vocabulary of the module-level
        ``model``, or all zeros when none of the tokens is known.
    """
    # key_to_index is a dict, so each membership test is O(1); testing against
    # the index_to_key list would scan the whole vocabulary for every token.
    vocab = model.wv.key_to_index
    # remove out-of-vocabulary words
    known = [word for word in doc if word in vocab]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[known], axis=0)
# Embed every review; each row of the resulting arrays is one document vector
X_train_vec = np.array(list(map(document_vector, X_train)))
X_test_vec = np.array(list(map(document_vector, X_test)))
This Python code defines a function document_vector(doc) and then applies this function to each document in the X_train and X_test datasets. Let's break down the code step by step:
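Defining the document_vector function:
The function takes a single argument, doc, which represents a document. In the context of natural language processing (NLP), a document could be a sentence, a paragraph, or any other unit of text.
It first removes out-of-vocabulary words, i.e. words that are not in the vocabulary of the trained Word2Vec model (model). This is done to ensure that only words known to the model are used to compute the document vector.
It then computes the document vector by looking up the word vectors (model.wv[doc]) for each word in the document and taking the mean along the rows (axis=0) to obtain a single vector representation for the entire document.
If no words remain after filtering, it returns a zero vector whose length equals the embedding dimensionality (model.vector_size).
Applying the function to each document: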
After defining the document_vector function, the code applies this function to each document in both the training (X_train) and testing (X_test) datasets.
It creates an array X_train_vec to store the document vectors for the training set and another array X_test_vec to store the document vectors for the testing set.
For each document in X_train and X_test, the document_vector function is called, and the resulting document vectors are collected using a list comprehension.
The resulting lists are converted to NumPy arrays with np.array() to facilitate further processing or analysis.
In summary, this code snippet calculates the document vectors for each document in the training and testing datasets using the Word2Vec model trained above. The document vectors represent the semantic meaning of the documents in a continuous vector space, which can be useful for various NLP tasks such as document classification, clustering, or similarity analysis.
Train a logistic regression classifier using the document vectors.
# Fit a logistic regression classifier on the training document vectors
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X=X_train_vec, y=y_train)
# Label the held-out reviews
y_pred = classifier.predict(X=X_test_vec)
# Fraction of test reviews whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")
Accuracy: 0.69
In this example, we loaded a dataset of movie reviews, trained a Word2Vec model on the training part of the dataset, and used the averaged word vectors to create feature vectors for each review. A logistic regression model was then trained on these features to perform sentiment analysis. This example shows the power of Word2Vec in capturing semantic properties of words and using them in downstream tasks like sentiment analysis.
Keep in mind that the performance can be significantly improved with more sophisticated preprocessing, hyperparameter tuning, and possibly using more advanced models for both word embeddings and classification.