#!/usr/bin/env python
# coding: utf-8
#
Table of Contents
#
# In[1]:
# code for loading the format for the notebook
import os
# path : store the current path to convert back to it later
path = os.getcwd()
os.chdir(os.path.join('..', '..', 'notebook_format'))
from formats import load_style
load_style(plot_style=False)
# In[2]:
os.chdir(path)
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('config', "InlineBackend.figure_format='retina'")
import os
import re
import time
import numpy as np
from operator import itemgetter
from typing import Dict, Tuple, List, Set
get_ipython().run_line_magic('watermark', "-a 'Ethen' -d -t -v -p numpy,sentencepiece")
# # Byte Pair Encoding (BPE) - Handling Rare Words with Subword Tokenization
# NLP techniques, be it word embeddings or tfidf often works with a fixed vocabulary size. Due to this, rare words in the corpus would all be considered out of vocabulary, and is often times replaced with a default unknown token, ``. Then when it comes to feature representation, these unknown tokens often times get some global default values. e.g. mean embedding values of all the word embeddings.
#
# Character level and n-gram representation aside, one seminal way of dealing with this rare word issue is **Byte Pair Encoding**. At a high level it works by encoding rare or unknown words as sequence of subword units. e.g. Imagine the model sees an out of vocabulary word `talking`. Instead of directly falling back to a default unknown token, ``, we may have noticed that we have words such as `talked, talks` appearing in the corpus. Hence, if these words got segmented as `talk ed`, `talk s`, etc. Now these words all have a `talk` in common and the model might be able to pick up more information compared with falling back to a unknown token.
#
# The gist is, by breaking down our corpus into subword units, any word that doesn't appear in our training corpus can be broken down into these subword units. How to break the corpus down into subword units becomes the next question that we'll tackle.
#
# **Byte Pair Encoding**, is a data compression algorithm that iteratively replaces the most frequent pair of bytes in a sequence with a single, unused byte. e.g.
#
# - aaabdaaabac. aa is the most frequent pair of bytes and we replace it with a unused byte Z.
# - ZabdZabac. ab is now the most frequent pair of bytes, we replace it with Y.
#
# To adapt this idea for word segmentation, instead of replacing frequent pair of bytes, we now merge subword pairs that frequently occur. To elaborate:
#
# - We initialize the vocabulary with the character vocabulary, and represent each word as a sequence of characters, plus a special end-of-word symbol '/w', which allows us to restore the original tokenization after translation. More on this below.
# - Next, we iteratively count all symbol pairs and merge each occurrence of the most frequent pair (a, b) into ab. Each merge operation produces a new symbol which represents a character n-gram.
# - We'll keep repeating the previous process until the target vocabulary size is reached.
#
# A special end of word symbol is important. As without it, say if there is a token `est`, this token could be in the word `est ern`, or the wold `small est`. The meaning between the two of them are quite different. With the end of word symbol, if there is a token `est/w`, the model immediately know that this is the token for the wold `small est` instead of `est ern`.
#
# An example use case is [neural machine translation (NMT)](https://arxiv.org/abs/1508.07909), where subword tokenization was shown to have superior performance compared to using larger vocabulary size or fall-back dictionary.
# ## Implementation From Scratch
# Start by initializing the vocabulary with character vocabulary plus a special end of word symbol.
# In[3]:
# assuming we've extracted from our raw text and this is the character
# vocabulary that we've ended up with, along with their frequency
vocab = {
'l o w ': 5,
'l o w e r ': 2,
'n e w e s t ': 6,
'w i d e s t ': 3,
'h a p p i e r ': 2
}
vocab
# We then count the frequency of each consecutive character pair.
# In[4]:
def get_pair_stats(vocab: Dict[str, int]) -> Dict[Tuple[str, str], int]:
"""Get counts of pairs of consecutive symbols."""
pairs = {}
for word, frequency in vocab.items():
symbols = word.split()
# count occurrences of pairs
for i in range(len(symbols) - 1):
pair = (symbols[i], symbols[i + 1])
current_frequency = pairs.get(pair, 0)
pairs[pair] = current_frequency + frequency
return pairs
# In[5]:
pair_stats = get_pair_stats(vocab)
pair_stats
# We find the most frequent, and merge them together into a new token whenever we encounter them in the vocabulary.
# In[6]:
def merge_vocab(best_pair: Tuple[str, str], vocab_in: Dict[str, int]) -> Dict[str, int]:
"""Step 3. Merge all occurrences of the most frequent pair"""
vocab_out = {}
# re.escape
# ensures the characters of our input pair will be handled as is and
# not get mistreated as special characters in the regular expression.
pattern = re.escape(' '.join(best_pair))
replacement = ''.join(best_pair)
for word_in in vocab_in:
# replace most frequent pair in all vocabulary
word_out = re.sub(pattern, replacement, word_in)
vocab_out[word_out] = vocab_in[word_in]
return vocab_out
# In this particular round, the e,s consecutive pair was identified as the most frequent pair, then in the new vocabulary, all the e,s was merged together into a single token.
# In[7]:
best_pair = max(pair_stats, key=pair_stats.get)
print(best_pair)
new_vocab = merge_vocab(best_pair, vocab)
new_vocab
# This was only 1 iteration of the merging process, we can iteratively perform this merging step until we reach the number of merges or the number of tokens we would like to have.
# In[8]:
vocab = {
'l o w ': 5,
'l o w e r ': 2,
'n e w e s t ': 6,
'w i d e s t ': 3,
'h a p p i e r ': 2
}
# we store the best pair during each iteration for encoding new vocabulary, more on this later
bpe_codes = {}
num_merges = 10 # hyperparameter
for i in range(num_merges):
print('\niteration', i)
pair_stats = get_pair_stats(vocab)
if not pair_stats:
break
best_pair = max(pair_stats, key=pair_stats.get)
bpe_codes[best_pair] = i
print('vocabulary: ', vocab)
print('best pair:', best_pair)
vocab = merge_vocab(best_pair, vocab)
print('\nfinal vocabulary: ', vocab)
print('\nbyte pair encoding: ', bpe_codes)
# ### Applying Encodings
# Now that we've see the process of "learning" the byte pair encodings. We now turn our attention to the process of "applying" them to new vocabulary.
#
# While possible:
#
# - Get all the bigram symbol for the word that we wish to encode.
# - Find the symbol pair in our byte pair codes that appeared first among the symbols that were merged.
# - Apply the merge on the word.
# In[9]:
# first convert an input word to the list of character format
original_word = 'lowest'
word = list(original_word)
word.append('')
word
# In[10]:
def get_pairs(word: List[str]) -> Set[Tuple[str, str]]:
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
# In[11]:
# gets the set of possible bigram symbol
pairs = get_pairs(word)
pairs
# In[12]:
# attempt to find it in the byte pair codes
bpe_codes_pairs = [(pair, bpe_codes[pair]) for pair in pairs if pair in bpe_codes]
pair_to_merge = min(bpe_codes_pairs, key=itemgetter(1))[0]
pair_to_merge
# In[13]:
def create_new_word(word: List[str], pair_to_merge: Tuple[str, str]) -> List[str]:
first, second = pair_to_merge
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except ValueError:
new_word.extend(word[i:])
break
if i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(first)
i += 1
return new_word
# In[14]:
# stitch together the new word
new_word = create_new_word(word, pair_to_merge)
new_word
# The previous couple of code chunks shows one iteration of applying the byte pair codes to encode a new word. The next one puts everything together into a single function.
# In[15]:
def encode(original_word: str, bpe_codes: Dict[Tuple[str, str], int]) -> List[str]:
if len(original_word) == 1:
return original_word
word = list(original_word)
word.append('')
while True:
pairs = get_pairs(word)
bpe_codes_pairs = [(pair, bpe_codes[pair]) for pair in pairs if pair in bpe_codes]
if not bpe_codes_pairs:
break
pair_to_merge = min(bpe_codes_pairs, key=itemgetter(1))[0]
word = create_new_word(word, pair_to_merge)
return word
# In[16]:
original_word = 'lowest'
encode(original_word, bpe_codes)
# In[17]:
bpe_codes
# Judging from the result, we can see that even though the word lowest did not appear in our "training" data, but because "low" and and ending "est" are both byte pair codes that were learned, the word got encoded into low est.
# ## Sentencepiece
# Upon seeing an educational implementation, we will give a more efficient implementation, `sentencepiece`, a swing. Consider reading the [overview section](https://github.com/google/sentencepiece#overview) of the software for a high-level introduction of the package.
# In[18]:
# download some sample text for demonstration
get_ipython().system('wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt')
# To train the subword unit, we call the `SentencePieceTrainer.train` method by passing in our parameters. I've specified some of the commonly used parameters in the example below. As for what are the available options, I find it helpful to just search in the source code of the [parameter parser](https://github.com/google/sentencepiece/blob/d4dd947fe71c4fa4ee24ad8297beee32887d8828/src/spec_parser.h).
# In[19]:
from sentencepiece import SentencePieceTrainer, SentencePieceProcessor
input_file = 'botchan.txt'
max_num_words = 10000
model_type = 'bpe'
model_prefix = 'sentencepiece'
pad_id = 0
unk_id = 1
bos_id = 2
eos_id = 3
sentencepiece_params = ' '.join([
'--input={}'.format(input_file),
'--model_type={}'.format(model_type),
'--model_prefix={}'.format(model_prefix),
'--vocab_size={}'.format(max_num_words),
'--pad_id={}'.format(pad_id),
'--unk_id={}'.format(unk_id),
'--bos_id={}'.format(bos_id),
'--eos_id={}'.format(eos_id)
])
print(sentencepiece_params)
SentencePieceTrainer.train(sentencepiece_params)
# Some comments regarding the parameters:
#
# - `input` expects a .txt file on disk. Hence, if we have a python variable, e.g. a list of sentences that we wish to learn the subword unit using sentencepiece, we would have to make an additional step to write it to a file on disk.
# - `model_type` can take in `bpe`, `unigram`, `char`, `word`, allowing us to experiment with different tokenization schemes. The [`unigram`](https://arxiv.org/abs/1804.10959) method was not introduced in this notebook.
# - `pad_id`, specifying these ids can be important with we're using sentencepiece in conjunction with other libraries, e.g. 1 library may have already by default preserved the token id 0 for padding characters, in that case, we can explicitly specifying that padding id to sentencepiece to be consistent.
#
# Upon training the model, we can load it, the model resides in `model_prefix.model`, where `model_prefix` is a parameter that we also get to specify.
# In[20]:
sp = SentencePieceProcessor()
sp.load("{}.model".format(model_prefix))
print('Found %s unique tokens.' % sp.get_piece_size())
# Showcasing some common operations.
# In[21]:
# encode: text => id
# given a new text, we can convert it to subword units
original = 'This is a test'
encoded_pieces = sp.encode_as_pieces(original)
print(encoded_pieces)
# or convert it to numeric id for downstream modeling
encoded_ids = sp.encode_as_ids(original)
print(encoded_ids)
# In[22]:
# decode: piece/id => text
# we can convert the subword units back to the original text
decoded_pieces = sp.decode_pieces(encoded_pieces)
print(decoded_pieces)
# we can convert the numeric id back to the original text
decoded_ids = sp.decode_ids(encoded_ids)
print(decoded_ids)
# In[23]:
# id <=> piece conversion
original = '▁This'
# finding the numeric id of the particular subword unit
piece_id = sp.piece_to_id('▁This')
print(piece_id)
# obtaining the subword unit of a particular numeric id
print(sp.id_to_piece(piece_id))
# This concludes our introduction to one of the subword tokenization methods, Byte Pair Encoding and one of the open-source packages that's available out there [sentencepiece](https://github.com/google/sentencepiece).
#
# Some other options out there at the time of writing this documentation includes, [YouTokenToMe](https://github.com/VKCOM/YouTokenToMe) and [fastBPE](https://github.com/glample/fastBPE). YouTokenToMe claims to be faster than both sentencepiece and fastBPE, and sentencepiece supports additional subword tokenization method.
#
# Subword tokenization is a commonly used technique in modern NLP pipeline, and it's definitely worth understanding and adding to our toolkit.
# # Reference
# - [Jupyter Notebook: Byte Pair Encoding](http://ufal.mff.cuni.cz/~helcl/courses/npfl116/ipython/byte_pair_encoding.html)
# - [Jupyter Notebook: Sentencepiece python module](https://nbviewer.jupyter.org/github/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb)
# - [StackOverflow: How is WordPiece tokenization helpful to effectively deal with rare words problem in NLP?](https://stackoverflow.com/questions/55382596/how-is-wordpiece-tokenization-helpful-to-effectively-deal-with-rare-words-proble)
# - [Blog: Byte Pair Encoding — The Dark Horse of Modern NLP](https://towardsdatascience.com/byte-pair-encoding-the-dark-horse-of-modern-nlp-eb36c7df4f10)
# - [Paper: R. Sennrich, B. Haddow, A. Birch - Neural Machine Translation of Rare Words with Subword Units (2015)](https://arxiv.org/abs/1508.07909)