%load_ext autoreload
%autoreload 2
%matplotlib inline
import sys, os
_snlp_book_dir = ".."
sys.path.append(_snlp_book_dir)
from statnlpbook.lm import *
from statnlpbook.ohhla import *
# %cd ..
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
matplotlib.rcParams['figure.figsize'] = (10.0, 6.0)
docs = load_all_songs("../data/ohhla/train/www.ohhla.com/anonymous/j_live/")
assert len(docs) == 50, "Your ohhla corpus is corrupted, please download it again!"
trainDocs, testDocs = docs[:len(docs)//2], docs[len(docs)//2:]
train = words(trainDocs)
test = words(testDocs)
Plot the perplexity of Laplace smoothing on the given data as a function of $\alpha$ in the interval $[0.001, 0.1]$, in steps of $0.001$. Is it fair to assume that this is a convex function? Write a method that finds the optimal pseudo-count $\alpha$ for Laplace smoothing on the given data, up to some predefined numerical precision $\epsilon$, under the assumption that the perplexity is a convex function of $\alpha$. How often did you have to call perplexity to find the optimum?
Tips:
You don't need first- or second-order derivatives in this case, only the direction of descent. Think about recursively slicing up the problem.

oov_train = inject_OOVs(train)
oov_vocab = set(oov_train)
oov_test = replace_OOVs(oov_vocab, test)
bigram = NGramLM(oov_train,2)
interval = [x / 1000.0 for x in range(1, 101)]  # alpha in [0.001, 0.1] in steps of 0.001
perplexity_at_1 = perplexity(LaplaceLM(bigram, alpha=1.0), oov_test)
def plot_perplexities(interval):
    """Plots the perplexity of LaplaceLM for every alpha in interval."""
    perplexities = [perplexity(LaplaceLM(bigram, alpha), oov_test) for alpha in interval]
    plt.plot(interval, perplexities)
def find_optimal(low, high, epsilon=1e-6):
    """Returns the optimal pseudo-count alpha within the interval [low, high] and its perplexity."""
    print(high, low)  # trace of the shrinking search interval
    if high - low < epsilon:
        return high, perplexity(LaplaceLM(bigram, high), oov_test)
    else:
        # Probe just left and right of the midpoint; by convexity, the side
        # with the smaller perplexity contains the optimum.
        mid = (high + low) / 2.0
        left = perplexity(LaplaceLM(bigram, mid - epsilon), oov_test)
        right = perplexity(LaplaceLM(bigram, mid + epsilon), oov_test)
        if left < right:
            return find_optimal(low, mid, epsilon)
        else:
            return find_optimal(mid, high, epsilon)
plot_perplexities(interval)
find_optimal(0.0, 1.0)
[search trace: the printed (high, low) interval shrinks from (1.0, 0.0) via (0.5, 0.0), (0.25, 0.0), ... down to (0.022482872009277344, 0.022481918334960938)]
(0.022482872009277344, 65.3568849083981)
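With the default $\epsilon = 10^{-6}$ and the initial interval $[0, 1]$, the interval is halved 20 times (since $2^{-20} < 10^{-6}$), and each halving costs two perplexity evaluations, plus one final call: about 41 calls in total. As a cross-check, the same objective can be handed to SciPy's bounded scalar minimiser; this is a sketch assuming scipy is installed, not part of the original exercise:

from scipy.optimize import minimize_scalar

# Minimise perplexity over alpha with a bounded 1-D method; avoid alpha = 0,
# where Laplace smoothing degenerates to the unsmoothed MLE.
result = minimize_scalar(
    lambda alpha: perplexity(LaplaceLM(bigram, alpha), oov_test),
    bounds=(1e-6, 1.0), method='bounded', options={'xatol': 1e-6})
print(result.x, result.fun)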
Implement a method that tests whether a language model provides a valid probability distribution.
def sanity_check(lm, *history):
    """Throws an AssertionError if lm does not define a valid probability distribution for all words
    in the vocabulary."""
    probability_mass = sum([lm.probability(word, *history) for word in lm.vocab])
    assert abs(probability_mass - 1.0) < 1e-6, probability_mass
unigram = NGramLM(oov_train,1)
stupid = StupidBackoff(bigram, unigram, 0.1)
print(sum([stupid.probability(word, 'the') for word in stupid.vocab]))
assert OOV in stupid.vocab
sanity_check(stupid, 'the')
1.0647115579930901
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-8-9bd0c3fbec6b> in <module>()
      9 print(sum([stupid.probability(word, 'the') for word in stupid.vocab]))
     10 assert OOV in stupid.vocab
---> 11 sanity_check(stupid, 'the')

<ipython-input-8-9bd0c3fbec6b> in sanity_check(lm, *history)
      3     in the vocabulary."""
      4     probability_mass = sum([lm.probability(word, *history) for word in lm.vocab])
----> 5     assert abs(probability_mass - 1.0) < 1e-6, probability_mass
      6
      7 unigram = NGramLM(oov_train,1)

AssertionError: 1.0647115579930901
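This failure is expected: for a seen history the MLE probabilities of the observed continuations already sum to 1, and Stupid Backoff then adds $\alpha$ times the backoff probability for every unseen continuation on top of that. A toy sketch of the effect (assuming, as in the statnlpbook implementation used above, that StupidBackoff returns the main model's probability for seen continuations and $\alpha$ times the backoff probability otherwise):

# toy corpus: after 'a', only some of the vocabulary is ever seen
toy_train = ['a', 'b', 'a', 'c']
toy_stupid = StupidBackoff(NGramLM(toy_train, 2), NGramLM(toy_train, 1), 0.1)
# the seen continuations of 'a' contribute their full MLE mass, and each
# unseen word adds 0.1 times its unigram probability on top, so the sum exceeds 1
print(sum(toy_stupid.probability(word, 'a') for word in toy_stupid.vocab))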
Develop and implement a language model that subtracts a count $d\in[0,1]$ from each non-zero count in the training set. Let's first formalise this:
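Here is one formalisation, consistent with the implementation below (with $c(h,w)$ the training count of word $w$ after history $h$, $N_{>0}(h)$ the number of words with non-zero count after $h$, and $N_0(h)$ the number with zero count):

$$c'(h,w) = \begin{cases} c(h,w) - d & \text{if } c(h,w) > 0 \\ \frac{d \cdot N_{>0}(h)}{N_0(h)} & \text{otherwise} \end{cases}$$

The normaliser is left unchanged, so $p(w|h) = c'(h,w) / \sum_{w'} c(h,w')$; if every word has been seen after $h$, i.e. $N_0(h) = 0$, the counts are left untouched.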
class SubtractCount(CountLM):
    def __init__(self, base_lm, d):
        super().__init__(base_lm.vocab, base_lm.order)
        self.base_lm = base_lm
        self.d = d
        self._counts = base_lm._counts  # not good style since it is a protected member
        self.vocab = base_lm.vocab

    def counts(self, word_and_history):
        # todo: this could be cached instead of being recomputed on every call to counts
        history = word_and_history[1:]
        num_non_zero_histories = len([x for x in self.vocab if self._counts[(x,) + history] > 0])
        num_zero_histories = len(self.vocab) - num_non_zero_histories
        if num_zero_histories == 0:
            return self._counts[word_and_history]
        else:
            if self._counts[word_and_history] > 0:
                # seen: subtract d from the raw count
                return self._counts[word_and_history] - self.d
            else:
                # unseen: redistribute the subtracted mass evenly
                return self.d * num_non_zero_histories / num_zero_histories

    def norm(self, history):
        return self.base_lm.norm(history)
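Note why this still normalises: the seen words lose $d \cdot N_{>0}(h)$ of count mass in total, which is exactly what the unseen words gain, since $N_0(h) \cdot \frac{d \cdot N_{>0}(h)}{N_0(h)} = d \cdot N_{>0}(h)$. The total count mass, and hence the norm, is unchanged, so the probabilities still sum to 1.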
subtract_lm = SubtractCount(unigram, 0.1)
oov_prob = subtract_lm.probability(OOV, 'the')
rest_prob = sum([subtract_lm.probability(word, 'the') for word in subtract_lm.vocab])
print(oov_prob + rest_prob)  # rest_prob already includes OOV, so this prints 1 + p(OOV | 'the')
sanity_check(subtract_lm, 'the')
perplexity(subtract_lm, oov_test)
1.1742331911436041
91.4414922652717
Develop and implement a version of the Stupid Backoff language model whose probabilities sum up to 1.
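One way to achieve this (matching the implementation below) is to interpolate raw counts and normalisers instead of probabilities. With $h'$ the history shortened to the backoff model's order,

$$p(w|h) = \frac{c_{\text{main}}(h,w) + \alpha \cdot c_{\text{backoff}}(h',w)}{N_{\text{main}}(h) + \alpha \cdot N_{\text{backoff}}(h')}$$

Summing the numerator over all words $w$ yields exactly the denominator, so the distribution sums to 1 by construction.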
class StupidBackoffNormalized(LanguageModel):
    def __init__(self, main, backoff, alpha):
        super().__init__(main.vocab, main.order)
        self.main = main
        self.backoff = backoff
        self.alpha = alpha

    def probability(self, word, *history):
        main_counts = self.main.counts((word,) + tuple(history))
        main_norm = self.main.norm(history)
        # drop as many history words as the difference in model order
        backoff_order_diff = self.main.order - self.backoff.order
        backoff_counts = self.backoff.counts((word,) + tuple(history[:-backoff_order_diff]))
        backoff_norm = self.backoff.norm(history[:-backoff_order_diff])
        counts = main_counts + self.alpha * backoff_counts
        norm = main_norm + self.alpha * backoff_norm
        return counts / norm
less_stupid = StupidBackoffNormalized(bigram, unigram, 0.1)
print(sum([less_stupid.probability(word, 'the') for word in less_stupid.vocab]))
sanity_check(less_stupid, 'the')
perplexity(less_stupid, oov_test)
1.000000000000002
60.032236179798886
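For comparison: this normalised backoff model reaches a perplexity of about 60.0, better than both the best Laplace-smoothed bigram found above (about 65.4 at $\alpha \approx 0.022$) and the unigram-based SubtractCount model (about 91.4).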