#!/usr/bin/env python
# coding: utf-8

# # Language Model Exercises
# In these exercises you will extend and develop language models. We will use the code from the notes, wrapped in the Python module [`lm`](http://localhost:8888/edit/statnlpbook/lm.py).

# ## Setup 1: Load Libraries

# In[1]:


get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

import sys, os
_snlp_book_dir = ".."
sys.path.append(_snlp_book_dir)
from statnlpbook.lm import *
from statnlpbook.ohhla import *
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
matplotlib.rcParams['figure.figsize'] = (10.0, 6.0)


# $$
# \newcommand{\prob}{p}
# \newcommand{\vocab}{V}
# \newcommand{\params}{\boldsymbol{\theta}}
# \newcommand{\param}{\theta}
# \DeclareMathOperator{\perplexity}{PP}
# \DeclareMathOperator{\argmax}{argmax}
# \newcommand{\train}{\mathcal{D}}
# \newcommand{\counts}[2]{\#_{#1}(#2) }
# $$

# ## Setup 2: Load Data

# In[2]:


docs = load_all_songs("../data/ohhla/train/www.ohhla.com/anonymous/j_live/")
assert len(docs) == 50, "Your ohhla corpus is corrupted, please download it again!"
trainDocs, testDocs = docs[:len(docs)//2], docs[len(docs)//2:]
train = words(trainDocs)
test = words(testDocs)


# ## Task 1: Optimal Pseudo Count
# 
# Plot the perplexity of Laplace smoothing on the given data as a function of alpha, over the interval [0.001, 0.1] in steps of 0.001. Is it fair to assume that this is a convex function? Write a method that finds the optimal pseudo-count `alpha` for [Laplace smoothing](https://github.com/uclmr/stat-nlp-book/blob/python/statnlpbook/lm.py#L180) on the given data, up to some predefined numerical precision `epsilon`, under the assumption that the perplexity is a convex function of alpha. How many times did you have to call `perplexity` to find the optimum?
# 
# Tips:
# 
# You don't need first- or second-order derivatives here, only the direction of descent. Think about recursively slicing up the interval. A sketch of one such scheme is given after Task 2 below.

# In[3]:


oov_train = inject_OOVs(train)
oov_vocab = set(oov_train)
oov_test = replace_OOVs(oov_vocab, test)
bigram = NGramLM(oov_train, 2)

interval = [x / 1000.0 for x in range(1, 100, 1)]
perplexity_at_1 = perplexity(LaplaceLM(bigram, alpha=1.0), oov_test)


def plot_perplexities(interval):
    """Plots the perplexity of LaplaceLM for every alpha in the interval."""
    perplexities = [0.0 for alpha in interval]  # todo
    plt.plot(interval, perplexities)


def find_optimal(low, high, epsilon=1e-6):
    """Returns the optimal pseudo-count alpha within the interval [low, high] and its perplexity."""
    print(low, high)
    if high - low < epsilon:
        return 0.0  # todo
    else:
        return 0.0  # todo


plot_perplexities(interval)
find_optimal(0.0, 1.0)


# ## Task 2: Sanity Check LM
# Implement a method that tests whether a language model provides a valid probability distribution.

# In[4]:


def sanity_check(lm, *history):
    """Raises an AssertionError if lm does not define a valid probability distribution for all words in the vocabulary."""
    probability_mass = 1.0  # todo
    assert abs(probability_mass - 1.0) < 1e-6, probability_mass


unigram = NGramLM(oov_train, 1)
stupid = StupidBackoff(bigram, unigram, 0.1)
print(sum([stupid.probability(word, 'the') for word in stupid.vocab]))
sanity_check(stupid, 'the')
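
# For reference, here is one way the `# todo` pieces of Tasks 1 and 2 could be filled in. This is only a sketch, not the canonical solution from the notes, and it uses separate `*_sketch` names so it does not overwrite the stubs above. The search is a ternary search: since a convex function has a single minimum, comparing the perplexity at two interior points tells you which outer third of the interval cannot contain the optimum, so that third can be discarded and the search recursed on the rest. It only relies on `perplexity`, `LaplaceLM`, `bigram` and `oov_test` as already used in this notebook.

# In[ ]:


def plot_perplexities_sketch(interval):
    """Plots the perplexity of LaplaceLM for every alpha in the interval."""
    perplexities = [perplexity(LaplaceLM(bigram, alpha), oov_test) for alpha in interval]
    plt.plot(interval, perplexities)


def find_optimal_sketch(low, high, epsilon=1e-6):
    """Returns a pseudo-count alpha within epsilon of the optimum in [low, high], and its perplexity.
    Assumes, as stated in the task, that perplexity is convex in alpha."""
    if high - low < epsilon:
        alpha = (low + high) / 2.0
        return alpha, perplexity(LaplaceLM(bigram, alpha), oov_test)
    # evaluate two interior points; for a convex function the minimum cannot lie
    # in the outer third on the side of the larger value, so drop that third
    third = (high - low) / 3.0
    m1, m2 = low + third, high - third
    if perplexity(LaplaceLM(bigram, m1), oov_test) < perplexity(LaplaceLM(bigram, m2), oov_test):
        return find_optimal_sketch(low, m2, epsilon)
    else:
        return find_optimal_sketch(m1, high, epsilon)


def sanity_check_sketch(lm, *history):
    """Raises an AssertionError if lm's probabilities do not sum to one over its vocabulary for the given history."""
    probability_mass = sum(lm.probability(word, *history) for word in lm.vocab)
    assert abs(probability_mass - 1.0) < 1e-6, probability_mass


# uncomment to run (each perplexity call retrains nothing but does scan the test set):
# plot_perplexities_sketch(interval)
# find_optimal_sketch(0.0, 1.0)


# Each recursive step discards a third of the interval at the cost of two `perplexity` calls, so for a unit-length starting interval the optimum is found in roughly $2 \log(1/\epsilon) / \log(1.5)$ calls.
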
# ## Task 3: Subtract Count LM
# Develop and implement a language model that subtracts a count $d\in[0,1]$ from each non-zero count in the training set. Let's first formalize this:
# 
# \begin{align}
# \#_{w=0}(h_n) &= \sum_{w \in V} \mathbf{1}[\counts{\train}{h_n,w} = 0]\\
# \#_{w>0}(h_n) &= \sum_{w \in V} \mathbf{1}[\counts{\train}{h_n,w} > 0]\\
# \prob(w|h_n) &=
# \begin{cases}
# \frac{\counts{\train}{h_n,w} - d}{\counts{\train}{h_n}} & \mbox{if }\counts{\train}{h_n,w} > 0 \\\\
# \frac{???}{\counts{\train}{h_n}} & \mbox{otherwise}
# \end{cases}
# \end{align}

# In[5]:


class SubtractCount(CountLM):
    def __init__(self, base_lm, d):
        super().__init__(base_lm.vocab, base_lm.order)
        self.base_lm = base_lm
        self.d = d
        self._counts = base_lm._counts  # not good style since it is a protected member
        self.vocab = base_lm.vocab

    def counts(self, word_and_history):
        if self._counts[word_and_history] > 0:
            return 0.0  # todo
        else:
            return 0.0  # todo

    def norm(self, history):
        return self.base_lm.norm(history)


subtract_lm = SubtractCount(unigram, 0.1)
oov_prob = subtract_lm.probability(OOV, 'the')
rest_prob = sum([subtract_lm.probability(word, 'the') for word in subtract_lm.vocab])
print(oov_prob + rest_prob)
sanity_check(subtract_lm, 'the')
perplexity(subtract_lm, oov_test)


# ## Task 4: Normalisation of Stupid LM
# Develop and implement a version of the [stupid language model](https://github.com/uclmr/stat-nlp-book/blob/python/statnlpbook/lm.py#L205) that provides probabilities summing up to 1.

# In[6]:


class StupidBackoffNormalized(LanguageModel):
    def __init__(self, main, backoff, alpha):
        super().__init__(main.vocab, main.order)
        self.main = main
        self.backoff = backoff
        self.alpha = alpha

    def probability(self, word, *history):
        return 0.0  # todo


less_stupid = StupidBackoffNormalized(bigram, unigram, 0.1)
print(sum([less_stupid.probability(word, 'the') for word in less_stupid.vocab]))
sanity_check(less_stupid, 'the')
perplexity(less_stupid, oov_test)
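
# For reference, here is one way the two `# todo` cells above could be completed. For Task 3, one numerator that makes the distribution sum to one is obtained by spreading the subtracted mass evenly over the unseen words, i.e. replacing $???$ with $\frac{d \cdot \#_{w>0}(h_n)}{\#_{w=0}(h_n)}$. These are sketches, not the canonical solutions from the notes: they use separate `*Sketch` class names so they do not overwrite the stubs above, and they assume that `NGramLM` stores its counts under keys of the form `(word,) + history` and exposes the same `counts`/`norm` interface as `CountLM` — if your `lm` module differs, adapt accordingly. Interpolating counts rather than probabilities in the Task 4 sketch keeps the model well defined even for histories the main model has never seen.

# In[ ]:


class SubtractCountSketch(CountLM):
    """Possible sketch for Task 3: absolute discounting.
    Assumes count keys are tuples of the form (word,) + history."""

    def __init__(self, base_lm, d):
        super().__init__(base_lm.vocab, base_lm.order)
        self.base_lm = base_lm
        self.d = d
        self._counts = base_lm._counts

    def counts(self, word_and_history):
        count = self._counts.get(word_and_history, 0)
        if count > 0:
            # subtract d from every non-zero count
            return count - self.d
        # spread the freed mass d * #_{w>0}(h) evenly over the #_{w=0}(h) unseen words;
        # num_unseen >= 1 here because the queried word itself is unseen
        history = word_and_history[1:]
        num_seen = sum(1 for w in self.vocab if self._counts.get((w,) + history, 0) > 0)
        num_unseen = len(self.vocab) - num_seen
        return self.d * num_seen / num_unseen

    def norm(self, history):
        # the redistribution above keeps the total count for each history unchanged
        return self.base_lm.norm(history)


class StupidBackoffNormalizedSketch(LanguageModel):
    """Possible sketch for Task 4: interpolate main and backoff counts, so that the
    probabilities sum to one by construction. Same key-structure assumption as above."""

    def __init__(self, main, backoff, alpha):
        super().__init__(main.vocab, main.order)
        self.main = main
        self.backoff = backoff
        self.alpha = alpha

    def probability(self, word, *history):
        # truncate the history to what each component model conditions on
        main_history = tuple(history[-(self.main.order - 1):]) if self.main.order > 1 else ()
        backoff_history = tuple(history[-(self.backoff.order - 1):]) if self.backoff.order > 1 else ()
        counts = (self.main.counts((word,) + main_history) +
                  self.alpha * self.backoff.counts((word,) + backoff_history))
        norm = (self.main.norm(main_history) +
                self.alpha * self.backoff.norm(backoff_history))
        return counts / norm


# uncomment to compare against the stubs above:
# sanity_check(SubtractCountSketch(unigram, 0.1), 'the')
# sanity_check(StupidBackoffNormalizedSketch(bigram, unigram, 0.1), 'the')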