# All the IPython Notebooks in the Python Natural Language Processing lecture series by Dr. Milaan Parmar are available on GitHub.
# Load spaCy's small English pipeline and tokenize a sentence containing an
# email address and a URL — spaCy's tokenizer keeps each as one token.
import spacy

nlp = spacy.load('en_core_web_sm')

doc2 = nlp(u"I'm always here to help you all! Email:milaanparmar9@gmail.com or visit more at https://github.com/milaan9!")
for t in doc2:
    print(t)  # one token per line; "I'm" splits into "I" + "'m", "!" is its own token

# Output:
# I 'm always here to help you all ! Email:milaanparmar9@gmail.com or visit more at https://github.com/milaan9 !
# Units and currency: spaCy splits "5km" into "5" + "km" and "$10.30" into
# "$" + "10.30", but keeps the decimal number intact.
doc3 = nlp(u'A 5km NYC cab ride costs $10.30')
for t in doc3:
    print(t)

# Output:
# A 5 km NYC cab ride costs $ 10.30
# Abbreviations: "St." and "U.S." stay single tokens; the sentence-final
# period is still split off as its own token.
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")
for t in doc4:
    print(t)

# Output:
# Let 's visit St. Louis in the U.S. next year .
# Import the toolkit and the Porter Stemmer.
# NOTE: the explicit import replaces the original `from nltk.stem.porter import *`
# star-import, which pollutes the namespace (PEP 8).
import nltk
from nltk.stem.porter import PorterStemmer

p_stemmer = PorterStemmer()

words = ['run','runner','running','ran','runs','easily','fairly']
for word in words:
    print(word + ' --> ' + p_stemmer.stem(word))

# Output:
# run --> run   runner --> runner   running --> run   ran --> ran
# runs --> run   easily --> easili   fairly --> fairli
# SnowballStemmer (a.k.a. Porter2) — a refinement of the Porter algorithm.
from nltk.stem.snowball import SnowballStemmer

# The Snowball Stemmer requires that you pass a language parameter.
s_stemmer = SnowballStemmer(language='english')

words = ['run','runner','running','ran','runs','easily','fairly']
# words = ['generous','generation','generously','generate']
for word in words:
    print(word + ' --> ' + s_stemmer.stem(word))

# Output (note 'fairly' --> 'fair' here, vs. 'fairli' with Porter):
# run --> run   runner --> runner   running --> run   ran --> ran
# runs --> run   easily --> easili   fairly --> fair
# ---- Some more practice: compare the two stemmers on one word ----
words = ['consolingly']

print('Porter Stemmer:')
for word in words:
    print(word + ' --> ' + p_stemmer.stem(word))

# Output:
# Porter Stemmer:
# consolingly --> consolingli
# Snowball (Porter2) produces a shorter stem for the same word.
print('Porter2 Stemmer:')
for word in words:
    print(word + ' --> ' + s_stemmer.stem(word))

# Output:
# Porter2 Stemmer:
# consolingly --> consol
# Stemming has its drawbacks. If given the token saw, stemming might always
# return saw, whereas lemmatization would likely return either see or saw
# depending on whether the token was used as a verb or a noun. For example,
# both uses of "meeting" below are stemmed to "meet", losing the noun sense:
phrase = 'I am meeting him tomorrow at the meeting'
for word in phrase.split():
    print(word + ' --> ' + p_stemmer.stem(word))

# Output:
# I --> i   am --> am   meeting --> meet   him --> him
# tomorrow --> tomorrow   at --> at   the --> the   meeting --> meet
# Perform standard imports and print, for each token: its text, coarse POS
# tag, lemma hash (int), and lemma string.
import spacy

nlp = spacy.load('en_core_web_sm')

var1 = nlp(u"John Adam is one the researcher who invent the direction of way towards success!")
for token in var1:
    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)

# Output (abridged):
# John PROPN 11174346320140919546 John
# is   AUX   10382539506755952630 be
# invent VERB 5373681334090504585 invent
# ...
def show_lemmas(text):
    """Print one aligned row per token: text, coarse POS tag, lemma hash, lemma string.

    Each token is expected to expose ``text``, ``pos_``, ``lemma`` and
    ``lemma_`` attributes (as spaCy tokens do). Columns are left-aligned
    to widths 12, 6 and 22 for readability.
    """
    for tok in text:
        row = f'{tok.text:<12} {tok.pos_:<6} {tok.lemma:<22} {tok.lemma_}'
        print(row)
# Same sentence as var1, now rendered through the aligned helper.
var2 = nlp(u"John Adam is one the researcher who invent the direction of way towards success!")
show_lemmas(var2)

# Output (abridged):
# John         PROPN  11174346320140919546   John
# is           AUX    10382539506755952630   be
# invent       VERB   5373681334090504585    invent
# ...
# Lemmatization is context-aware: the verb "meeting" lemmatizes to "meet",
# while the noun "meeting" keeps its surface form.
var3 = nlp(u"I am meeting him tomorrow at the meeting.")
show_lemmas(var3)

# Output (abridged):
# meeting      VERB   6880656908171229526    meet
# meeting      NOUN   14798207169164081740   meeting
# ...
# A final example: "'s" lemmatizes to "be"; the misspelling "greate" is
# tagged ADJ but gets no dictionary lemma.
var4 = nlp(u"That's of the greate person in the world")
show_lemmas(var4)

# Output (abridged):
# That         PRON   4380130941430378203    that
# 's           AUX    10382539506755952630   be
# greate       ADJ    4429768169814447593    greate
# ...