In [4]:
%run ../../common_functions/import_all.py

from scipy import optimize
from scipy.integrate import quad, odeint
from scipy.interpolate import interp1d
from scipy.signal import detrend
from scipy.spatial import distance
from matplotlib.legend_handler import HandlerLine2D

from nltk.book import *                                     # will print a list of books (texts) imported from here
from nltk.text import Text
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import brown, inaugural, treebank, wordnet
from nltk import (word_tokenize, wordpunct_tokenize, sent_tokenize, pos_tag, bigrams, 
                  UnigramTagger, NgramTagger, PunktSentenceTokenizer, TreebankWordTokenizer)
from nltk.stem import (PorterStemmer,
                       LancasterStemmer,
                       SnowballStemmer, 
                       WordNetLemmatizer)

from common_functions.nltk_helpers import measure_lexical_diversity, compute_perc_word_usage, plot_freqdist_freq

%matplotlib inline

from common_functions.setup_notebook import set_css_style, setup_matplotlib, config_ipython
config_ipython()
setup_matplotlib()
set_css_style()
Out[4]:

Playing around with NLTK

Some material has been taken/adapted from the NLTK book

  • Exploring NLTK books (Text instance)
  • Exploring NLTK corpora
  • Exploring NLTK Treebank
  • Exploring the WordNet corpus

For the linguistics concepts used here, refer to the specific notebook.

Books and corpora

In [5]:
## List of all the books and sents imported
texts()
sents()

# Choose the book to play with and some words
book = text2
word = 'love'
word2 = 'him'
words = ['love', 'kiss', 'marriage', 'sense', 'children', 'house', 'hate']

# Print the first 100 tokens in the book (book is an instance of nltk.text.Text, which behaves like a list of tokens)
# Note that punctuation is included as tokens
print(book[0:100], type(book))
print(list(book)[0:100] == book[0:100])
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
sent1: Call me Ishmael .
sent2: The family of Dashwood had long been settled in Sussex .
sent3: In the beginning God created the heaven and the earth .
sent4: Fellow - Citizens of the Senate and of the House of Representatives :
sent5: I have a problem with people PMing me to lol JOIN
sent6: SCENE 1 : [ wind ] [ clop clop clop ] KING ARTHUR : Whoa there !
sent7: Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .
sent8: 25 SEXY MALE , seeks attrac older single lady , for discreet encounters .
sent9: THE suburb of Saffron Park lay on the sunset side of London , as red and ragged as a cloud of sunset .
['[', 'Sense', 'and', 'Sensibility', 'by', 'Jane', 'Austen', '1811', ']', 'CHAPTER', '1', 'The', 'family', 'of', 'Dashwood', 'had', 'long', 'been', 'settled', 'in', 'Sussex', '.', 'Their', 'estate', 'was', 'large', ',', 'and', 'their', 'residence', 'was', 'at', 'Norland', 'Park', ',', 'in', 'the', 'centre', 'of', 'their', 'property', ',', 'where', ',', 'for', 'many', 'generations', ',', 'they', 'had', 'lived', 'in', 'so', 'respectable', 'a', 'manner', 'as', 'to', 'engage', 'the', 'general', 'good', 'opinion', 'of', 'their', 'surrounding', 'acquaintance', '.', 'The', 'late', 'owner', 'of', 'this', 'estate', 'was', 'a', 'single', 'man', ',', 'who', 'lived', 'to', 'a', 'very', 'advanced', 'age', ',', 'and', 'who', 'for', 'many', 'years', 'of', 'his', 'life', ',', 'had', 'a', 'constant', 'companion'] <class 'nltk.text.Text'>
True
In [6]:
## Counts and lexical diversity
print('Num of tokens', len(book))
print('Num of counts for given word', book.count(word))
print('Lexical diversity', measure_lexical_diversity(book))
print('Fraction of use of word in book', compute_perc_word_usage(word, book))
Num of tokens 141576
Num of counts for given word 77
Lexical diversity 0.04826383002768831
Fraction of use of word in book 0.0005438774933604566
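The two helpers used above come from common_functions/nltk_helpers.py, which is not shown here; as a rough sketch (a hypothetical reimplementation, not the actual helper code), they presumably boil down to simple ratios over the token list:

In [ ]:
# Hypothetical reimplementations of the imported helpers (the real ones live in
# common_functions/nltk_helpers.py and may differ in detail)

def measure_lexical_diversity_sketch(tokens):
    """Ratio of distinct tokens to total tokens."""
    return len(set(tokens)) / len(tokens)

def compute_perc_word_usage_sketch(word, tokens):
    """Fraction of all tokens that are exactly `word`."""
    return tokens.count(word) / len(tokens)

# Sanity check against the imported helpers
print(measure_lexical_diversity_sketch(book), compute_perc_word_usage_sketch(word, book))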
In [7]:
## Concordance and context

# Choose a book and a word
book = text2
word = 'love'

# Concordance of chosen word in chosen book
print('Concordance: ')
book.concordance(word)

# Words appearing in same contexts as chosen word in chosen book
print('Words in similar context as chosen word:')
# given a word w, this finds all contexts w_1 w w_2 and then all words w' that appear in the same contexts,
# i.e. w_1 w' w_2
book.similar(word)     

# Choose two words and show the common contexts
print('Common contexts of two chosen words:')
book.common_contexts([word, word2])
Concordance: 
Displaying 25 of 77 matches:
priety of going , and her own tender love for all her three children determine
es ." " I believe you are right , my love ; it will be better that there shoul
 . It implies everything amiable . I love him already ." " I think you will li
sentiment of approbation inferior to love ." " You may esteem him ." " I have 
n what it was to separate esteem and love ." Mrs . Dashwood now took pains to 
oner did she perceive any symptom of love in his behaviour to Elinor , than sh
 how shall we do without her ?" " My love , it will be scarcely a separation .
ise . Edward is very amiable , and I love him tenderly . But yet -- he is not 
ll never see a man whom I can really love . I require so much ! He must have a
ry possible charm ." " Remember , my love , that you are not seventeen . It is
f I do not now . When you tell me to love him as a brother , I shall no more s
hat Colonel Brandon was very much in love with Marianne Dashwood . She rather 
e were ever animated enough to be in love , must have long outlived every sens
hirty - five anything near enough to love , to make him a desirable companion 
roach would have been spared ." " My love ," said her mother , " you must not 
pect that the misery of disappointed love had already been known to him . This
 most melancholy order of disastrous love . CHAPTER 12 As Elinor and Marianne 
hen she considered what Marianne ' s love for him was , a quarrel seemed almos
ctory way ;-- but you , Elinor , who love to doubt where you can -- it will no
 man whom we have all such reason to love , and no reason in the world to thin
ded as he must be of your sister ' s love , should leave her , and leave her p
cannot think that . He must and does love her I am sure ." " But with a strang
 I believe not ," cried Elinor . " I love Willoughby , sincerely love him ; an
or . " I love Willoughby , sincerely love him ; and suspicion of his integrity
deed a man could not very well be in love with either of her daughters , witho
Words in similar context as chosen word:
affection sister heart mother time see town life it dear elinor
marianne me word family her him do regard head
Common contexts of two chosen words:
to_you of_in to_and in_by to_but
In [8]:
## Collocations
print('Collocations:')
book.collocations()
Collocations:
------------------------------------------------------------
ValueError                 Traceback (most recent call last)
<ipython-input-8-45648ce89cf6> in <module>
      1 ## Collocations
      2 print('Collocations:')
----> 3 book.collocations()

~/Desktop/Mallzee/repos/plantation/venv/lib/python3.7/site-packages/nltk/text.py in collocations(self, num, window_size)
    442 
    443         collocation_strings = [
--> 444             w1 + " " + w2 for w1, w2 in self.collocation_list(num, window_size)
    445         ]
    446         print(tokenwrap(collocation_strings, separator="; "))

~/Desktop/Mallzee/repos/plantation/venv/lib/python3.7/site-packages/nltk/text.py in <listcomp>(.0)
    442 
    443         collocation_strings = [
--> 444             w1 + " " + w2 for w1, w2 in self.collocation_list(num, window_size)
    445         ]
    446         print(tokenwrap(collocation_strings, separator="; "))

ValueError: too many values to unpack (expected 2)
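The traceback looks like the known mismatch between collocation_list() and collocations() in some NLTK 3.4.x releases, where collocation_list() returns pre-joined strings rather than (w1, w2) pairs. A workaround sketch that bypasses Text.collocations() and uses the collocations module directly (roughly what collocations() does internally, minus its stopword and word-length filtering):

In [ ]:
# Workaround: build bigram collocations with a collocation finder instead of Text.collocations()
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

finder = BigramCollocationFinder.from_words(book)
finder.apply_freq_filter(2)                                  # drop bigrams seen fewer than 2 times
print(finder.nbest(BigramAssocMeasures.likelihood_ratio, 20))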
In [ ]:
# Dispersion plot of the text given some words (how far from the start of the text each word appears)
plt.grid()
book.dispersion_plot(words)
In [ ]:
## FreqDist for token counts 

fdist = FreqDist(book)                                      # FreqDist needs a tokens list, gives dict {token: counts}

word = 'love'
print('Num tokens for word %s: %d' % (word, fdist[word]))
print('Num tokens: ', fdist.N())
print('Num unique tokens', fdist.B())
print('Token with the highest count is %s with count %d' %(fdist.max(), fdist[fdist.max()]))
print('Hapaxes are (10 of them)', fdist.hapaxes()[:10])

# Plot the 50 most frequent tokens and their token counts, normal and cumulative
fdist.plot(50, title='Book token counts')
fdist.plot(50, cumulative=True, title='Book token counts, cumulative')

# Same distribution, but with frequencies instead of counts
plot_freqdist_freq(fdist, max_num=50, title='Book token frequencies')
In [10]:
## FreqDist for word lengths

fdist_wl = FreqDist([len(word) for word in book])

# Plot and show as table
fdist_wl.plot()
fdist_wl.tabulate()
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x131e9c710>
    3     2     1     4     5     6     7     8     9    10    11    12    13    14    15    17    16 
28839 24826 23009 21352 11438  9507  8158  5676  3736  2596  1278   711   334    87    24     3     2 
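The same distribution can also be read off as relative frequencies rather than raw counts, e.g.:

In [ ]:
# Relative frequencies instead of raw counts
print(fdist_wl.most_common(3))   # the three most common word lengths, with counts
print(fdist_wl.freq(3))          # fraction of tokens that are exactly 3 characters long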
In [11]:
# Conditional freq distrib on Brown corpus genres

# ConditionalFreqDist is a collection of frequency distributions, one per condition;
# it requires tuples (condition, event)

# print genres in corpus
print('All genres in Brown corpus: ', sorted(brown.categories()))

# Choose some of the categories (genres) and get the words in each
tuples = [(genre, word) for genre in ['romance', 'science_fiction'] for word in brown.words(categories=genre)]

# Building the cfdist
cfdist = ConditionalFreqDist(tuples)

# Each cfdist[condition] will be a FreqDist
type(cfdist['romance'])

# Tabulate, selecting the conditions and the specific samples (omitting the selection gives all)
cfdist.tabulate(conditions=['romance'], samples=['the', 'love', 'hate'])

# Plotting any of the dists on the condition
cfdist['romance'].plot(50, title='Counts tokens in genre romance')
cfdist['science_fiction'].plot(50, title='Counts tokens in genre science_fiction')
All genres in Brown corpus:  ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
Out[11]:
nltk.probability.FreqDist
         the love hate 
romance 2758   32    9 
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x130652f90>
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x130711fd0>

Treebank

  • Parsed sentences
In [12]:
# The Treebank corpus in NLTK contains 10% of the original Penn Treebank corpus

treebank.words()

treebank.parsed_sents()
Out[12]:
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', ...]
Out[12]:
[Tree('S', [Tree('NP-SBJ', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP-TMP', [Tree('NNP', ['Nov.']), Tree('CD', ['29'])])])]), Tree('.', ['.'])]), Tree('S', [Tree('NP-SBJ', [Tree('NNP', ['Mr.']), Tree('NNP', ['Vinken'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP-PRD', [Tree('NP', [Tree('NN', ['chairman'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('NP', [Tree('NNP', ['Elsevier']), Tree('NNP', ['N.V.'])]), Tree(',', [',']), Tree('NP', [Tree('DT', ['the']), Tree('NNP', ['Dutch']), Tree('VBG', ['publishing']), Tree('NN', ['group'])])])])])]), Tree('.', ['.'])]), ...]
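The parsed sentences are nltk.Tree objects, so the usual Tree API applies; a quick sketch of inspecting the first parse (assuming a recent NLTK, where Tree.pretty_print() is available):

In [ ]:
# Inspecting a single parsed sentence (an nltk.Tree)
t = treebank.parsed_sents()[0]

print(t.leaves()[:10])           # the tokens
print(t.pos()[:10])              # (token, POS tag) pairs
print(t.height())                # depth of the parse tree
t.pretty_print()                 # ASCII rendering of the tree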

WordNet

  • Hypernyms and Hyponyms
In [13]:
wn = wordnet

sss = wn.synsets('dog')

s1 = sss[0]
print(s1, s1.definition())

print(s1.hypernyms(), s1.hyponyms())
Synset('dog.n.01') a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds
[Synset('canine.n.02'), Synset('domestic_animal.n.01')] [Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'), Synset('dalmatian.n.02'), Synset('great_pyrenees.n.01'), Synset('griffon.n.02'), Synset('hunting_dog.n.01'), Synset('lapdog.n.01'), Synset('leonberg.n.01'), Synset('mexican_hairless.n.01'), Synset('newfoundland.n.01'), Synset('pooch.n.01'), Synset('poodle.n.01'), Synset('pug.n.01'), Synset('puppy.n.01'), Synset('spitz.n.01'), Synset('toy_dog.n.01'), Synset('working_dog.n.01')]
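Beyond hypernyms and hyponyms, synsets expose hypernym paths and similarity scores; a small sketch:

In [ ]:
# Hypernym chain up to the root, and path similarity between two synsets
dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')

print([s.name() for s in dog.hypernym_paths()[0]])   # one path from entity.n.01 down to dog.n.01
print(dog.lowest_common_hypernyms(cat))              # shared ancestor(s)
print(dog.path_similarity(cat))                      # 1 / (1 + shortest path length between the synsets)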

Text manipulation

  • Tokenizing
  • POS tagging
  • Stemming/lemmatizing
In [14]:
# tagged sentences from Brown corpus
brown_tagged_sents = brown.tagged_sents(categories='news')

# Separate tagged sents into train and test
train_sents = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.8)]
test_sents = brown_tagged_sents[int(len(brown_tagged_sents) * 0.8):]
In [15]:
# Tokenising
# NOTE: obviously the simplest (naive) sentence tokenization is splitting on the period with split('.'),
# but that would mishandle cases like "Mr. Smith."
# The same caveat applies to naively splitting a sentence into tokens

text = """What will you do? I go to the cinema this weekend. That's a test. I can't do better!"""

# The standard functions are wrappers around the recommended tokenizers, so word_tokenize is
# equivalent to tokenizer = TreebankWordTokenizer(); tokenizer.tokenize(sentence)

# Tokenizing text into sentences

# sent_tokenize calls the PunktSentenceTokenizer (recommended)
print('* Docs of PunktSentenceTokenizer:')
print(PunktSentenceTokenizer.__doc__)
st = sent_tokenize(text)
print('* Text tokenized', st)

# To train tokenizer on a bespoke text:
# import nltk.tokenize.punkt
# tokenizer = PunktSentenceTokenizer()
# text = open("someplain.txt","r").read()
# tokenizer.train(text)

# Tokenizing a sentence into tokens

# word_tokenize calls the TreebankWordTokenizer (recommended)
print('* Docs of TreebankWordTokenizer:')
print(TreebankWordTokenizer.__doc__)
tokens = word_tokenize(st[2])
print('* Sentence tokenized', tokens)
# wordpunct_tokenize calls WordPunctTokenizer, which separates all punctuation into tokens (uses a regexp)
print(WordPunctTokenizer.__doc__)
tokens_punct = wordpunct_tokenize(st[2])
print('* Sentence tokenized with a regexp tokenizer', tokens_punct)
* Docs of PunktSentenceTokenizer:

    A sentence tokenizer which uses an unsupervised algorithm to build
    a model for abbreviation words, collocations, and words that start
    sentences; and then uses that model to find sentence boundaries.
    This approach has been shown to work well for many European
    languages.
    
* Text tokenized ['What will you do?', 'I go to the cinema this weekend.', "That's a test.", "I can't do better!"]
* Docs of TreebankWordTokenizer:

    The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
    This is the method that is invoked by ``word_tokenize()``.  It assumes that the
    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

    This tokenizer performs the following steps:

    - split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
    - treat most punctuation characters as separate tokens
    - split off commas and single quotes, when followed by whitespace
    - separate periods that appear at the end of line

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks.'''
        >>> TreebankWordTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']
        >>> s = "They'll save and invest more."
        >>> TreebankWordTokenizer().tokenize(s)
        ['They', "'ll", 'save', 'and', 'invest', 'more', '.']
        >>> s = "hi, my name can't hello,"
        >>> TreebankWordTokenizer().tokenize(s)
        ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
    
* Sentence tokenized ['That', "'s", 'a', 'test', '.']

    Tokenize a text into a sequence of alphabetic and
    non-alphabetic characters, using the regexp ``\w+|[^\w\s]+``.

        >>> from nltk.tokenize import WordPunctTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> WordPunctTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
        '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    
* Sentence tokenized with a regexp tokenizer ['That', "'", 's', 'a', 'test', '.']
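If punctuation should be dropped rather than kept as separate tokens, a RegexpTokenizer with a word-only pattern is a common alternative (a sketch; the pattern is just one reasonable choice):

In [ ]:
# Keep only runs of word characters, dropping punctuation entirely
from nltk.tokenize import RegexpTokenizer

word_only_tokenizer = RegexpTokenizer(r'\w+')
print(word_only_tokenizer.tokenize(st[2]))   # ['That', 's', 'a', 'test']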
In [16]:
# POS tagging

# pos_tag uses the PerceptronTagger
print('* Tagged tokens from above', pos_tag(tokens))

# Evaluate the performance of some taggers

# The UnigramTagger assigns to each token the tag that is most frequent for that token in the training set
unigram_tagger = UnigramTagger(train_sents)
print('* Evaluation Unigram tagger:', unigram_tagger.evaluate(test_sents))

# Repeat with an NgramTagger (assigns the most probable tag given the token and the tags of the N - 1 preceding tokens)
threegram_tagger = NgramTagger(3, train_sents)  # for N=2 there is already a BigramTagger
print('* Evaluation Ngram tagger with N=3:', threegram_tagger.evaluate(test_sents))    # low accuracy due to sparsity: the trained tagger has seen few of these (token, context) combinations

# Combining taggers: start with the Ngram one and, if it can't find a tag for a token, fall back to the unigram one
t0 = UnigramTagger(train_sents)
t1 = NgramTagger(3, train_sents, backoff=t0)
print('* Evaluation combined tagger:', t1.evaluate(test_sents))
* Tagged tokens from above [('That', 'DT'), ("'s", 'VBZ'), ('a', 'DT'), ('test', 'NN'), ('.', '.')]
* Evaluation Unigram tagger: 0.8026879907509996
* Evaluation Ngram tagger with N=3: 0.05867334650031312
* Evaluation combined tagger: 0.8053374440001927
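A common extension of the backoff idea is to chain all the way down to a DefaultTagger, so that tokens unseen by every trained tagger still get a crude tag instead of None; a sketch:

In [ ]:
# Backoff chain: trigram -> bigram -> unigram -> default 'NN'
from nltk.tag import DefaultTagger, BigramTagger

t_default = DefaultTagger('NN')                          # tag anything unknown as a noun
t_uni = UnigramTagger(train_sents, backoff=t_default)
t_bi = BigramTagger(train_sents, backoff=t_uni)
t_tri = NgramTagger(3, train_sents, backoff=t_bi)
print('* Evaluation full backoff chain:', t_tri.evaluate(test_sents))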
In [17]:
# Stemming
# Stemming some words with Porter, Lancaster and Snowball stemmers

porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
snowball_stemmer = SnowballStemmer('english')

print('* Stemming with (in order) Porter, Lancaster, Snowball')
print('multiply: ', 
      porter_stemmer.stem('multiply'), 
      lancaster_stemmer.stem('multiply'), 
      snowball_stemmer.stem('multiply'))
print('mice: ', 
      porter_stemmer.stem('mice'), 
      lancaster_stemmer.stem('mice'), 
      snowball_stemmer.stem('mice'))
* Stemming with (in order) Porter, Lancaster, Snowball
multiply:  multipli multiply multipli
mice:  mice mic mice
In [18]:
# Lemmatizing with the WordNet lemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

print('mice: ', wordnet_lemmatizer.lemmatize('mice'))
mice:  mouse
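By default the WordNet lemmatizer treats every word as a noun; passing a part of speech changes the result:

In [ ]:
# The pos argument matters: the default is 'n' (noun)
print(wordnet_lemmatizer.lemmatize('running'))            # 'running' (treated as a noun)
print(wordnet_lemmatizer.lemmatize('running', pos='v'))   # 'run'
print(wordnet_lemmatizer.lemmatize('better', pos='a'))    # 'good'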

Playing with frequency distributions

In [19]:
# Setting some sentences

sentences = ['I go to school', 'I will go to the cinema', 'I like strawberries', 'I read books']
In [20]:
# FreqDist on word lengths in the chosen sentences, and on the last letter of each word

split_sentences = [sentence.split() for sentence in sentences]
all_words = []
for sent in split_sentences:
    for word in sent:
        all_words.append(word)
        
fdist = FreqDist([len(word) for word in all_words])
fdist.plot(title='Counts word lengths')

fdist = FreqDist([word[-1:] for word in all_words])
fdist.plot(title='Counts last letter')
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x13d2eb910>
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x13d2cd3d0>
In [21]:
# ConditionalFreqDist of the words, conditioned on their last letter

split_sentences = [sentence.split() for sentence in sentences]
all_words = []
for sent in split_sentences:
    for word in sent:
        all_words.append(word)
        
tuples = [(word[-1:], word) for word in all_words]
cfdist = ConditionalFreqDist(tuples)

# Can plot both at same time
cfdist.plot()

cfdist
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x131325050>
Out[21]:
<ConditionalFreqDist with 7 conditions>
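Besides plotting, the conditional distribution can be inspected directly, e.g.:

In [ ]:
# Inspect the conditions and one of the per-condition FreqDists
print(cfdist.conditions())           # the distinct last letters
print(cfdist['s'].most_common())     # words ending in 's', with their counts
cfdist.tabulate()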
In [22]:
data = [('breakfast', 'cereal'),
        ('breakfast', 'water'),
        ('evening', 'meat'), 
        ('evening', 'salad'), 
        ('evening', 'wine'),
        ('lunch', 'sandwich'),
        ('lunch', 'fruit'),
        ('lunch', 'water'),
        ('lunch', 'chocolate'),
        ('breakfast', 'milk')
       ]

# word counts per category
cfdist = ConditionalFreqDist(data)
cfdist.plot()
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x1332bbfd0>
In [23]:
# Conditional freq dist to see how word usage changes over time, on the inaugural corpus

cfdist = ConditionalFreqDist(
           (target, fileid[:4])
           for fileid in inaugural.fileids()
           for w in inaugural.words(fileid)
           for target in ['america', 'citizen']
           if w.lower().startswith(target))
cfdist.plot()

# Conditional freq dist to see which words end with chosen letters in the Brown corpus
cfdist = ConditionalFreqDist(
           (target, w)
           for w in brown.words()
           for target in ['zz']
           if w.lower().endswith(target))
cfdist.plot()
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x13d1353d0>
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x13a5e67d0>
In [ ]: