We've checked out statistical approaches to language models in the last notebook. Now let's find out what deep learning has to offer.
We're gonna use the same dataset as before, except this time we build a language model that's character-level, not word-level.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
Working on the character level means that we don't need to deal with a large vocabulary or missing words. Heck, we can even keep uppercase letters in the text! The downside, however, is that all our sequences just got a lot longer.
However, we still need special tokens:
BOS, EOS = ' ', '\n'
data = pd.read_json("./arxivData.json")
lines = data.apply(lambda row: (row['title'] + ' ; ' + row['summary'])[:512], axis=1) \
.apply(lambda line: BOS + line.replace(EOS, ' ') + EOS) \
.tolist()
# if you missed the seminar, download data here - https://yadi.sk/d/_nGyU2IajjR9-w
Our next step is building a char-level vocabulary. Put simply, you need to assemble a list of all unique tokens in the dataset; a sketch follows the cell below in case you're stuck.
# get all unique characters from lines (including capital letters and symbols)
tokens = <YOUR CODE>
tokens = sorted(tokens)
n_tokens = len(tokens)
print('n_tokens =', n_tokens)
assert 100 < n_tokens < 150
assert BOS in tokens and EOS in tokens
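One possible sketch for the cell above (not the only solution):
# every unique character that occurs anywhere in the corpus
tokens = set(''.join(lines))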
We can now assign each character its index in the tokens list. This way we can encode a string into a TF-friendly integer vector.
# dictionary of character -> its identifier (index in tokens list)
token_to_id = <YOUR CODE>
assert len(tokens) == len(token_to_id), "dictionaries must have same size"
for i in range(n_tokens):
assert token_to_id[tokens[i]] == i, "token identifier must be its position in tokens list"
print("Seems alright!")
Our final step is to assemble several strings into an integer matrix of shape [batch_size, text_length].
The only problem is that each sequence has a different length. We can work around that by padding short sequences with extra EOS or cropping long sequences. Here's how it works:
def to_matrix(lines, max_len=None, pad=token_to_id[EOS], dtype='int32'):
"""Casts a list of lines into tf-digestable matrix"""
max_len = max_len or max(map(len, lines))
lines_ix = np.zeros([len(lines), max_len], dtype) + pad
for i in range(len(lines)):
line_ix = list(map(token_to_id.get, lines[i][:max_len]))
lines_ix[i, :len(line_ix)] = line_ix
return lines_ix
# Example: cast 3 dummy lines to a matrix, padding with EOS
dummy_lines = [
' abc\n',
' abacaba\n',
' abc1234567890\n',
]
print(to_matrix(dummy_lines))
Just like for N-gram LMs, we want to estimate the probability of text as a joint probability of tokens (symbols this time):

$$P(X) = \prod_t P(x_t \mid x_0, \dots, x_{t-1}).$$

Instead of counting all possible statistics, we want to train a neural network with parameters $\theta$ that estimates the conditional probabilities:

$$ P(x_t \mid x_0, \dots, x_{t-1}) \approx p(x_t \mid x_0, \dots, x_{t-1}, \theta) $$

But before we optimize, we need to define our neural network. Let's start with a fixed-window (aka convolutional) architecture:
import tensorflow as tf
import keras, keras.layers as L
sess = tf.InteractiveSession()
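To make the "causal convolution" idea concrete, here is a hedged sketch of layers that fit it. Names and sizes are illustrative, and it assumes your Keras version supports causal padding in L.Conv1D:
# padding='causal' pads on the left only, so position t never sees tokens after t
sketch_emb = L.Embedding(n_tokens, 16)
sketch_conv = L.Conv1D(64, kernel_size=5, padding='causal', activation='relu')
sketch_head = L.Dense(n_tokens)  # per-step pre-softmax logits
# a forward pass would then be: sketch_head(sketch_conv(sketch_emb(input_ix)))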
class FixedWindowLanguageModel:
def __init__(self, n_tokens=n_tokens, emb_size=16, hid_size=64):
"""
A fixed-window model that looks at the 5 (or more) previous symbols.
Note: a fixed-window LM effectively performs a convolution over a sequence of tokens.
This convolution only looks at the current and previous tokens.
Such convolution can be represented as a sequence of 2 operations:
- pad input vectors by {strides * (filter_size - 1)} zero vectors on the "left", do not pad right
- perform regular convolution with {filter_size} and {strides}
You can stack several convolutions at once
"""
#YOUR CODE - create layers/variables and any metadata you want, e.g. self.emb = L.Embedding(...)
<...>
#END OF YOUR CODE
self.prefix_ix = tf.placeholder('int32', [None, None])
self.next_token_probs = tf.nn.softmax(self(self.prefix_ix)[:, -1])
def __call__(self, input_ix):
"""
compute language model logits given input tokens
:param input_ix: batch of sequences with token indices, tf tensor: int32[batch_size, sequence_length]
:returns: pre-softmax linear outputs of language model [batch_size, sequence_length, n_tokens]
these outputs will be used as logits to compute P(x_t | x_0, ..., x_{t - 1})
"""
# YOUR CODE - apply layers
return <...>
def get_possible_next_tokens(self, prefix=BOS, temperature=1.0, max_len=100, sess=sess):
""" :returns: probabilities of next token, dict {token : prob} for all tokens """
probs = sess.run(self.next_token_probs, {self.prefix_ix: to_matrix([prefix])})[0]
return dict(zip(tokens, probs))
window_lm = FixedWindowLanguageModel()
dummy_input_ix = tf.constant(to_matrix(dummy_lines))
dummy_lm_out = window_lm(dummy_input_ix)
# note: tensorflow and keras layers only create variables after they're first applied (called)
sess.run(tf.global_variables_initializer())
dummy_logits = sess.run(dummy_lm_out)
assert dummy_logits.shape == (len(dummy_lines), max(map(len, dummy_lines)), n_tokens), "please check output shape"
assert np.all(np.isfinite(dummy_logits)), "inf/nan encountered"
assert not np.allclose(dummy_logits.sum(-1), 1), "please predict linear outputs, don't use softmax (maybe you've just got unlucky)"
# test for lookahead
dummy_input_ix_2 = tf.constant(to_matrix([line[:3] + 'e' * (len(line) - 3) for line in dummy_lines]))
dummy_lm_out_2 = window_lm(dummy_input_ix_2)
dummy_logits_2 = sess.run(dummy_lm_out_2)
assert np.allclose(dummy_logits[:, :3] - dummy_logits_2[:, :3], 0), "your model's predictions depend on FUTURE tokens. " \
" Make sure you don't allow any layers to look ahead of current token." \
" You can also get this error if your model is not deterministic (e.g. dropout). Disable it for this test."
We can now tune our network's parameters to minimize categorical crossentropy over the training dataset $D$:

$$ L = \frac1{|D|} \sum_{X \in D} \sum_{x_t \in X} - \log p(x_t \mid x_0, \dots, x_{t-1}, \theta) $$

As usual with neural nets, this optimization is performed via stochastic gradient descent with backprop. One can also note that minimizing crossentropy is equivalent to minimizing model perplexity or KL-divergence, and to maximizing log-likelihood.
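In particular, when $L$ is averaged per token, perplexity is just its exponent, $\mathrm{PPL} = e^{L}$, so improving one improves the other.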
def compute_lengths(input_ix, eos_ix=token_to_id[EOS]):
""" compute length of each line in input ix (incl. first EOS), int32 vector of shape [batch_size] """
count_eos = tf.cumsum(tf.to_int32(tf.equal(input_ix, eos_ix)), axis=1, exclusive=True)  # EOS tokens strictly before each position
lengths = tf.reduce_sum(tf.to_int32(tf.equal(count_eos, 0)), axis=1)  # positions up to and including the first EOS
return lengths
print('matrix:\n', dummy_input_ix.eval())
print('lengths:', compute_lengths(dummy_input_ix).eval())
input_ix = tf.placeholder('int32', [None, None])
logits = window_lm(input_ix[:, :-1])
reference_answers = input_ix[:, 1:]
# Your task: implement loss function as per formula above
# your loss should only be computed on actual tokens, excluding padding
# predicting actual tokens and the first EOS does count; subsequent EOSes don't
# you will likely need to use compute_lengths and/or tf.sequence_mask to get it right.
<YOUR CODE>
loss = <...>
# operation to update network weights
train_step = <...>
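For reference, one hedged sketch of a masked loss that satisfies the checks below. It averages per token rather than per line, and Adam with default settings is an arbitrary choice:
lengths = compute_lengths(input_ix)
# reference_answers[t] = input_ix[t + 1], so a line of length L (incl. its first EOS)
# contributes L - 1 prediction targets
mask = tf.sequence_mask(lengths - 1, maxlen=tf.shape(input_ix)[1] - 1, dtype=tf.float32)
xent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=reference_answers, logits=logits)
loss = tf.reduce_sum(xent * mask) / tf.reduce_sum(mask)
train_step = tf.train.AdamOptimizer().minimize(loss)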
loss_1 = sess.run(loss, {input_ix: to_matrix(dummy_lines, max_len=50)})
loss_2 = sess.run(loss, {input_ix: to_matrix(dummy_lines, max_len=100)})
assert (np.ndim(loss_1) == 0) and (0 < loss_1 < 100), "loss must be a positive scalar"
assert np.allclose(loss_1, loss_2), 'do not include AFTER first EOS into loss. '\
'Hint: use tf.sequence_mask. Beware +/-1 errors. And be careful when averaging!'
Now let's train our model on minibatches of data.
from sklearn.model_selection import train_test_split
train_lines, dev_lines = train_test_split(lines, test_size=0.25, random_state=42)
sess.run(tf.global_variables_initializer())
batch_size = 256
score_dev_every = 250
train_history, dev_history = [], []
def score_lines(dev_lines, batch_size):
""" computes average loss over the entire dataset """
dev_loss_num, dev_loss_len = 0., 0.
for i in range(0, len(dev_lines), batch_size):
batch_ix = to_matrix(dev_lines[i: i + batch_size])
dev_loss_num += sess.run(loss, {input_ix: batch_ix}) * len(batch_ix)
dev_loss_len += len(batch_ix)
return dev_loss_num / dev_loss_len
def generate(lm, prefix=BOS, temperature=1.0, max_len=100):
"""
Samples an output sequence from the probability distribution produced by lm
:param temperature: samples proportionally to lm probabilities ** (1 / temperature);
if temperature == 0, always takes the most likely token. Break ties arbitrarily.
"""
while True:
token_probs = lm.get_possible_next_tokens(prefix)
tokens, probs = zip(*token_probs.items())
if temperature == 0:
next_token = tokens[np.argmax(probs)]
else:
probs = np.array([p ** (1. / temperature) for p in probs])
probs /= sum(probs)
next_token = np.random.choice(tokens, p=probs)
prefix += next_token
if next_token == EOS or len(prefix) > max_len: break
return prefix
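For intuition: temperature = 0.5 squares the probabilities and renormalizes them, sharpening the distribution; temperature > 1 flattens it; and temperature → 0 recovers greedy argmax decoding.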
if len(dev_history) == 0:
dev_history.append((0, score_lines(dev_lines, batch_size)))
print("Before training:", generate(window_lm, 'Bridging'))
from IPython.display import clear_output
from random import sample
from tqdm import trange
for i in trange(len(train_history), 5000):
batch = to_matrix(sample(train_lines, batch_size))
loss_i, _ = sess.run([loss, train_step], {input_ix: batch})
train_history.append((i, loss_i))
if (i + 1) % 50 == 0:
clear_output(True)
plt.scatter(*zip(*train_history), alpha=0.1, label='train_loss')
if len(dev_history):
plt.plot(*zip(*dev_history), color='red', label='dev_loss')
plt.legend(); plt.grid(); plt.show()
print("Generated examples (tau=0.5):")
for j in range(3):
print(generate(window_lm, temperature=0.5))
if (i + 1) % score_dev_every == 0:
print("Scoring dev...")
dev_history.append((i, score_lines(dev_lines, batch_size)))
print('#%i Dev loss: %.3f' % dev_history[-1])
assert np.mean(train_history[:10], axis=0)[1] > np.mean(train_history[-10:], axis=0)[1], "The model didn't converge."
print("Final dev loss:", dev_history[-1][-1])
for i in range(10):
print(generate(window_lm, temperature=0.5))
Fixed-size architectures are reasonably good at capturing short-term dependencies, but their design prevents them from capturing any signal outside their window. We can mitigate this problem by using a recurrent neural network:
$$ h_0 = \vec 0; \quad h_{t+1} = \mathrm{RNN}(x_t, h_t) $$

$$ p(x_t \mid x_0, \dots, x_{t-1}, \theta) = \mathrm{dense}_{\mathrm{softmax}}(h_t) $$

Such a model processes one token at a time, left to right, and maintains a hidden state vector between steps. Theoretically, it can learn arbitrarily long temporal dependencies given a large enough hidden size.
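A hedged sketch of layers matching the recommended recipe below (illustrative names and sizes; a GRU works just as well as an LSTM here):
# return_sequences=True makes the layer emit h_t for every step,
# so one dense head can produce logits for all positions at once
rnn_sketch_emb = L.Embedding(n_tokens, 16)
rnn_sketch_cell = L.LSTM(256, return_sequences=True)
rnn_sketch_head = L.Dense(n_tokens)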
class RNNLanguageModel:
def __init__(self, n_tokens=n_tokens, emb_size=16, hid_size=256):
"""
Build a recurrent language model.
You are free to choose anything you want, but the recommended architecture is
- token embeddings
- one or more LSTM/GRU layers with hid_size units
- linear layer to predict logits
"""
# YOUR CODE - create layers/variables/etc
<...>
#END OF YOUR CODE
self.prefix_ix = tf.placeholder('int32', [None, None])
self.next_token_probs = tf.nn.softmax(self(self.prefix_ix)[:, -1])
def __call__(self, input_ix):
"""
compute language model logits given input tokens
:param input_ix: batch of sequences with token indices, tf tensor: int32[batch_size, sequence_length]
:returns: pre-softmax linear outputs of language model [batch_size, sequence_length, n_tokens]
these outputs will be used as logits to compute P(x_t | x_0, ..., x_{t - 1})
"""
#YOUR CODE
return <...>
def get_possible_next_tokens(self, prefix=BOS, temperature=1.0, max_len=100, sess=sess):
""" :returns: probabilities of next token, dict {token : prob} for all tokens """
probs = sess.run(self.next_token_probs, {self.prefix_ix: to_matrix([prefix])})[0]
return dict(zip(tokens, probs))
rnn_lm = RNNLanguageModel()
dummy_input_ix = tf.constant(to_matrix(dummy_lines))
dummy_lm_out = rnn_lm(dummy_input_ix)
# note: tensorflow and keras layers only create variables after they're first applied (called)
sess.run(tf.global_variables_initializer())
dummy_logits = sess.run(dummy_lm_out)
assert dummy_logits.shape == (len(dummy_lines), max(map(len, dummy_lines)), n_tokens), "please check output shape"
assert np.all(np.isfinite(dummy_logits)), "inf/nan encountered"
assert not np.allclose(dummy_logits.sum(-1), 1), "please predict linear outputs, don't use softmax (maybe you've just got unlucky)"
# test for lookahead
dummy_input_ix_2 = tf.constant(to_matrix([line[:3] + 'e' * (len(line) - 3) for line in dummy_lines]))
dummy_lm_out_2 = rnn_lm(dummy_input_ix_2)
dummy_logits_2 = sess.run(dummy_lm_out_2)
assert np.allclose(dummy_logits[:, :3] - dummy_logits_2[:, :3], 0), "your model's predictions depend on FUTURE tokens. " \
" Make sure you don't allow any layers to look ahead of current token." \
" You can also get this error if your model is not deterministic (e.g. dropout). Disable it for this test."
Our RNN language model should optimize the same loss function as the fixed-window model. But there's a catch: since an RNN recurrently multiplies gradients through many time steps, gradient values may explode, breaking your model. The common solution to that problem is to clip gradients, either individually or by their global norm.
Your task here is to prepare a tensorflow graph that minimizes the same loss function. If you encounter large loss fluctuations during training, add gradient clipping, e.g. as in the sketch below.
Note: gradient clipping is not exclusive to RNNs. Convolutional networks with enough depth often suffer from the same issue.
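Here is a hedged sketch of global-norm clipping to adapt once your `loss` below is defined (the threshold 5.0 is an arbitrary illustrative choice):
opt = tf.train.AdamOptimizer()
grads, variables = zip(*opt.compute_gradients(loss))
clipped_grads, _ = tf.clip_by_global_norm(grads, 5.0)  # rescale if global norm exceeds 5.0
train_step = opt.apply_gradients(zip(clipped_grads, variables))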
input_ix = tf.placeholder('int32', [None, None])
logits = rnn_lm(input_ix[:, :-1])
reference_answers = input_ix[:, 1:]
# Copy the loss function and train step from the fixed-window model training
loss = <...>
# and the train step
train_step = <...>
loss_1 = sess.run(loss, {input_ix: to_matrix(dummy_lines, max_len=50)})
loss_2 = sess.run(loss, {input_ix: to_matrix(dummy_lines, max_len=100)})
assert (np.ndim(loss_1) == 0) and (0 < loss_1 < 100), "loss must be a positive scalar"
assert np.allclose(loss_1, loss_2), 'do not include AFTER first EOS into loss. Hint: use tf.sequence_mask. Be careful when averaging!'
sess.run(tf.global_variables_initializer())
batch_size = 128
score_dev_every = 250
train_history, dev_history = [], []
dev_history.append((0, score_lines(dev_lines, batch_size)))
for i in trange(len(train_history), 5000):
batch = to_matrix(sample(train_lines, batch_size))
loss_i, _ = sess.run([loss, train_step], {input_ix: batch})
train_history.append((i, loss_i))
if (i + 1) % 50 == 0:
clear_output(True)
plt.scatter(*zip(*train_history), alpha=0.1, label='train_loss')
if len(dev_history):
plt.plot(*zip(*dev_history), color='red', label='dev_loss')
plt.legend(); plt.grid(); plt.show()
print("Generated examples (tau=0.5):")
for j in range(3):
print(generate(rnn_lm, temperature=0.5))
if (i + 1) % score_dev_every == 0:
print("Scoring dev...")
dev_history.append((i, score_lines(dev_lines, batch_size)))
print('#%i Dev loss: %.3f' % dev_history[-1])
assert np.mean(train_history[:10], axis=0)[1] > np.mean(train_history[-10:], axis=0)[1], "The model didn't converge."
print("Final dev loss:", dev_history[-1][-1])
for i in range(10):
print(generate(rnn_lm, temperature=0.5))
So you've learned the building blocks of neural language models; you can now combine them to build the ultimate monster.
You are NOT required to submit this assignment. Please make sure you don't miss your deadline because of it :)