# 1st STEP: TOKENIZE
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Sample product review used as input for every step of the pipeline.
review = "Went to order a Playstation 5 on the preorder date. Could not get one to safe my life... but there were plenty of PS5 DualSense Wireless controllers available. So that's something, right? Sony should be ashamed of themselves."

# Split the review into sentences, then into word/punctuation tokens.
# (Previously the interpreter output was pasted back into the file as bare
# statements; it is kept below as comments so the script stays runnable.)
print(sent_tokenize(review))
# ['Went to order a Playstation 5 on the preorder date.', 'Could not get one to safe my life... but there were plenty of PS5 DualSense Wireless controllers available.', "So that's something, right?", 'Sony should be ashamed of themselves.']
print(word_tokenize(review))
# ['Went', 'to', 'order', 'a', 'Playstation', '5', 'on', 'the', 'preorder', 'date', '.', 'Could', 'not', 'get', 'one', 'to', 'safe', 'my', 'life', '...', 'but', 'there', 'were', 'plenty', 'of', 'PS5', 'DualSense', 'Wireless', 'controllers', 'available', '.', 'So', 'that', "'s", 'something', ',', 'right', '?', 'Sony', 'should', 'be', 'ashamed', 'of', 'themselves', '.']
# 2nd STEP: DELETE STOPWORDS
from nltk.corpus import stopwords

# Use a distinct name for the set so the `stopwords` corpus module itself is
# not shadowed (the original rebound the name `stopwords`, making the module
# unusable afterwards).
stop_words = set(stopwords.words('english'))
sent_as_token = sent_tokenize(review)
words_as_token = word_tokenize(review)

# Keep only tokens that are not English stopwords. The comparison is
# case-sensitive, so capitalized words such as 'Went', 'Could' and 'So'
# survive the filter (matching the output shown below).
filtered_sent = [word for word in words_as_token if word not in stop_words]
print(words_as_token)
print(filtered_sent)
# Output of the two prints (previously pasted back as code, which indexed one
# list literal with another and raised TypeError at runtime):
# ['Went', 'to', 'order', 'a', 'Playstation', '5', 'on', 'the', 'preorder', 'date', '.', 'Could', 'not', 'get', 'one', 'to', 'safe', 'my', 'life', '...', 'but', 'there', 'were', 'plenty', 'of', 'PS5', 'DualSense', 'Wireless', 'controllers', 'available', '.', 'So', 'that', "'s", 'something', ',', 'right', '?', 'Sony', 'should', 'be', 'ashamed', 'of', 'themselves', '.']
# ['Went', 'order', 'Playstation', '5', 'preorder', 'date', '.', 'Could', 'get', 'one', 'safe', 'life', '...', 'plenty', 'PS5', 'DualSense', 'Wireless', 'controllers', 'available', '.', 'So', "'s", 'something', ',', 'right', '?', 'Sony', 'ashamed', '.']
# 3rd STEP: STEMMING
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
# Porter-stem every token of the full (unfiltered) token list.
stemmed_sent = [stemmer.stem(word) for word in words_as_token]
print(stemmed_sent)
# Expected output (note how 'Playstation' -> 'playstat', 'plenty' -> 'plenti',
# 'controllers' -> 'control' — the Porter algorithm produces stems, not words):
# ['went', 'to', 'order', 'a', 'playstat', '5', 'on', 'the', 'preorder', 'date', '.', 'could', 'not', 'get', 'one', 'to', 'safe', 'my', 'life', '...', 'but', 'there', 'were', 'plenti', 'of', 'ps5', 'dualsens', 'wireless', 'control', 'avail', '.', 'So', 'that', "'s", 'someth', ',', 'right', '?', 'soni', 'should', 'be', 'asham', 'of', 'themselv', '.']
# 4th STEP: SPEECH TAGGING
# pos_tag takes a LIST of tokenized words and returns (token, tag) pairs using
# the Penn Treebank tag set. (The previous `tagged_words = []` initialization
# was dead code — it was overwritten on the very next line.)
tagged_words = nltk.pos_tag(words_as_token)
print(tagged_words)
# Expected output:
# [('Went', 'NN'), ('to', 'TO'), ('order', 'NN'), ('a', 'DT'), ('Playstation', 'NN'), ('5', 'CD'), ('on', 'IN'), ('the', 'DT'), ('preorder', 'NN'), ('date', 'NN'), ('.', '.'), ('Could', 'MD'), ('not', 'RB'), ('get', 'VB'), ('one', 'CD'), ('to', 'TO'), ('safe', 'VB'), ('my', 'PRP$'), ('life', 'NN'), ('...', ':'), ('but', 'CC'), ('there', 'EX'), ('were', 'VBD'), ('plenty', 'NN'), ('of', 'IN'), ('PS5', 'NNP'), ('DualSense', 'NNP'), ('Wireless', 'NNP'), ('controllers', 'NNS'), ('available', 'JJ'), ('.', '.'), ('So', 'IN'), ('that', 'DT'), ("'s", 'VBZ'), ('something', 'NN'), (',', ','), ('right', 'RB'), ('?', '.'), ('Sony', 'NNP'), ('should', 'MD'), ('be', 'VB'), ('ashamed', 'VBN'), ('of', 'IN'), ('themselves', 'PRP'), ('.', '.')]
# 5th STEP: CHUNKING
# Grammar: a "Chunk" is zero or more adverbs (<RB.?>*), zero or more verbs
# (<VB.?>*), one or more proper nouns (<NNP>+), and an optional common noun
# (<NN>?). In this review it captures the proper-noun runs.
chunk_of_grammar = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
chunkParser = nltk.RegexpParser(chunk_of_grammar)
chunked = chunkParser.parse(tagged_words)
print(chunked)
for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
    print(subtree)
chunked.draw()  # opens a Tkinter window; requires a display
# Expected output (this tree was previously pasted into the file as bare code,
# which is a SyntaxError and made the whole script unparseable):
# (S Went/NN to/TO order/NN a/DT Playstation/NN 5/CD on/IN the/DT preorder/NN date/NN ./. Could/MD not/RB get/VB one/CD to/TO safe/VB my/PRP$ life/NN .../: but/CC there/EX were/VBD plenty/NN of/IN (Chunk PS5/NNP DualSense/NNP Wireless/NNP) controllers/NNS available/JJ ./. So/IN that/DT 's/VBZ something/NN ,/, right/RB ?/. (Chunk Sony/NNP) should/MD be/VB ashamed/VBN of/IN themselves/PRP ./.)
# (Chunk PS5/NNP DualSense/NNP Wireless/NNP)
# (Chunk Sony/NNP)
# Using random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Notes: generate a synthetic classification problem; the fixed random_state
# makes the dataset (and therefore the prediction below) reproducible.
X, y = make_classification(n_samples=1000, n_features=4,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)
# print(X, y)

# Notes: Create the random forest. random_state controls bootstrapping.
clf = RandomForestClassifier(max_depth=2, random_state=0)

# Notes: Build a forest of trees from the training set (X, y).
clf.fit(X, y)

# Notes: The predicted class of an input sample is a vote by the trees in the
# forest, weighted by their probability estimates. That is, the predicted class
# is the one with the highest mean probability estimate across the trees.
print(clf.predict([[0, 0, 0, 0]]))
# Expected output:
# [1]