# 1st STEP: TOKENIZE
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Sample product review used as input for every step of the pipeline.
review = "Went to order a Playstation 5 on the preorder date. Could not get one to safe my life... but there were plenty of PS5 DualSense Wireless controllers available. So that's something, right? Sony should be ashamed of themselves."

# Split the review into sentences, then into word/punctuation tokens.
# (Previously the interpreter output was pasted back into the file as bare
# statements; it is kept below as comments so the script stays runnable.)
print(sent_tokenize(review))
# ['Went to order a Playstation 5 on the preorder date.', 'Could not get one to safe my life... but there were plenty of PS5 DualSense Wireless controllers available.', "So that's something, right?", 'Sony should be ashamed of themselves.']
print(word_tokenize(review))
# ['Went', 'to', 'order', 'a', 'Playstation', '5', 'on', 'the', 'preorder', 'date', '.', 'Could', 'not', 'get', 'one', 'to', 'safe', 'my', 'life', '...', 'but', 'there', 'were', 'plenty', 'of', 'PS5', 'DualSense', 'Wireless', 'controllers', 'available', '.', 'So', 'that', "'s", 'something', ',', 'right', '?', 'Sony', 'should', 'be', 'ashamed', 'of', 'themselves', '.']
# 2nd STEP: DELETE STOPWORDS
from nltk.corpus import stopwords

# Use a distinct name for the set so the `stopwords` corpus module itself is
# not shadowed (the original rebound the name `stopwords`, making the module
# unusable afterwards).
stop_words = set(stopwords.words('english'))
sent_as_token = sent_tokenize(review)
words_as_token = word_tokenize(review)

# Keep only tokens that are not English stopwords. The comparison is
# case-sensitive, so capitalized words such as 'Went', 'Could' and 'So'
# survive the filter (matching the output shown below).
filtered_sent = [word for word in words_as_token if word not in stop_words]
print(words_as_token)
print(filtered_sent)
# Output of the two prints (previously pasted back as code, which indexed one
# list literal with another and raised TypeError at runtime):
# ['Went', 'to', 'order', 'a', 'Playstation', '5', 'on', 'the', 'preorder', 'date', '.', 'Could', 'not', 'get', 'one', 'to', 'safe', 'my', 'life', '...', 'but', 'there', 'were', 'plenty', 'of', 'PS5', 'DualSense', 'Wireless', 'controllers', 'available', '.', 'So', 'that', "'s", 'something', ',', 'right', '?', 'Sony', 'should', 'be', 'ashamed', 'of', 'themselves', '.']
# ['Went', 'order', 'Playstation', '5', 'preorder', 'date', '.', 'Could', 'get', 'one', 'safe', 'life', '...', 'plenty', 'PS5', 'DualSense', 'Wireless', 'controllers', 'available', '.', 'So', "'s", 'something', ',', 'right', '?', 'Sony', 'ashamed', '.']
# 3rd STEP: STEMMING
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
# Porter-stem every token of the full (unfiltered) token list.
stemmed_sent = [stemmer.stem(word) for word in words_as_token]
print(stemmed_sent)
# Expected output (note how 'Playstation' -> 'playstat', 'plenty' -> 'plenti',
# 'controllers' -> 'control' — the Porter algorithm produces stems, not words):
# ['went', 'to', 'order', 'a', 'playstat', '5', 'on', 'the', 'preorder', 'date', '.', 'could', 'not', 'get', 'one', 'to', 'safe', 'my', 'life', '...', 'but', 'there', 'were', 'plenti', 'of', 'ps5', 'dualsens', 'wireless', 'control', 'avail', '.', 'So', 'that', "'s", 'someth', ',', 'right', '?', 'soni', 'should', 'be', 'asham', 'of', 'themselv', '.']
# 4th STEP: SPEECH TAGGING
# pos_tag takes a LIST of tokenized words and returns (token, tag) pairs using
# the Penn Treebank tag set. (The previous `tagged_words = []` initialization
# was dead code — it was overwritten on the very next line.)
tagged_words = nltk.pos_tag(words_as_token)
print(tagged_words)
# Expected output:
# [('Went', 'NN'), ('to', 'TO'), ('order', 'NN'), ('a', 'DT'), ('Playstation', 'NN'), ('5', 'CD'), ('on', 'IN'), ('the', 'DT'), ('preorder', 'NN'), ('date', 'NN'), ('.', '.'), ('Could', 'MD'), ('not', 'RB'), ('get', 'VB'), ('one', 'CD'), ('to', 'TO'), ('safe', 'VB'), ('my', 'PRP$'), ('life', 'NN'), ('...', ':'), ('but', 'CC'), ('there', 'EX'), ('were', 'VBD'), ('plenty', 'NN'), ('of', 'IN'), ('PS5', 'NNP'), ('DualSense', 'NNP'), ('Wireless', 'NNP'), ('controllers', 'NNS'), ('available', 'JJ'), ('.', '.'), ('So', 'IN'), ('that', 'DT'), ("'s", 'VBZ'), ('something', 'NN'), (',', ','), ('right', 'RB'), ('?', '.'), ('Sony', 'NNP'), ('should', 'MD'), ('be', 'VB'), ('ashamed', 'VBN'), ('of', 'IN'), ('themselves', 'PRP'), ('.', '.')]
# 5th STEP: CHUNKING
# Grammar: a "Chunk" is zero or more adverbs (<RB.?>*), zero or more verbs
# (<VB.?>*), one or more proper nouns (<NNP>+), and an optional common noun
# (<NN>?). In this review it captures the proper-noun runs.
chunk_of_grammar = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
chunkParser = nltk.RegexpParser(chunk_of_grammar)
chunked = chunkParser.parse(tagged_words)
print(chunked)
for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
    print(subtree)
chunked.draw()  # opens a Tkinter window; requires a display
# Expected output (this tree was previously pasted into the file as bare code,
# which is a SyntaxError and made the whole script unparseable):
# (S Went/NN to/TO order/NN a/DT Playstation/NN 5/CD on/IN the/DT preorder/NN date/NN ./. Could/MD not/RB get/VB one/CD to/TO safe/VB my/PRP$ life/NN .../: but/CC there/EX were/VBD plenty/NN of/IN (Chunk PS5/NNP DualSense/NNP Wireless/NNP) controllers/NNS available/JJ ./. So/IN that/DT 's/VBZ something/NN ,/, right/RB ?/. (Chunk Sony/NNP) should/MD be/VB ashamed/VBN of/IN themselves/PRP ./.)
# (Chunk PS5/NNP DualSense/NNP Wireless/NNP)
# (Chunk Sony/NNP)
# Using random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Notes: generate a synthetic classification problem; the fixed random_state
# makes the dataset (and therefore the prediction below) reproducible.
X, y = make_classification(n_samples=1000, n_features=4,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)
# print(X, y)

# Notes: Create the random forest. random_state controls bootstrapping.
clf = RandomForestClassifier(max_depth=2, random_state=0)

# Notes: Build a forest of trees from the training set (X, y).
clf.fit(X, y)

# Notes: The predicted class of an input sample is a vote by the trees in the
# forest, weighted by their probability estimates. That is, the predicted class
# is the one with the highest mean probability estimate across the trees.
print(clf.predict([[0, 0, 0, 0]]))
# Expected output:
# [1]