This notebook contains the code samples from the video below, which is part of the Hugging Face course.
#@title
from IPython.display import HTML
HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/DJimQynXZsQ?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
Install the Transformers and Datasets libraries to run this notebook.
! pip install datasets transformers[sentencepiece]
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained(
    "huggingface-course/bert-base-uncased-tokenizer-without-normalizer"
)
text = "here is a sentence adapted to our tokenizer"
print(tokenizer.tokenize(text))
text = "এই বাক্যটি আমাদের টোকেনাইজারের উপযুক্ত নয়"
print(tokenizer.tokenize(text))
text = "this tokenizer does not know àccënts and CAPITAL LETTERS"
print(tokenizer.tokenize(text))
text = "the medical vocabulary is divided into many sub-token: paracetamol, phrayngitis"
print(tokenizer.tokenize(text))
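A rough way to quantify this mismatch (a quick sketch, not from the video) is to compare word count with token count; a tokenizer well adapted to a text keeps the ratio close to one.
tokens = tokenizer.tokenize(text)
print(f"{len(text.split())} words -> {len(tokens)} tokens")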
from datasets import load_dataset
raw_datasets = load_dataset("code_search_net", "python")
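Before building a training corpus, it is worth a quick look at what we loaded; in code_search_net, the whole_func_string column holds the full source of each function (printing one truncated example is just a sanity check, not part of the video).
print(raw_datasets["train"])
print(raw_datasets["train"][0]["whole_func_string"][:200])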
def get_training_corpus():
    # Yield the training texts in batches of 1,000 so the whole
    # dataset never has to sit in memory at once.
    dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), 1000):
        samples = dataset[start_idx : start_idx + 1000]
        yield samples["whole_func_string"]
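Note that a generator is exhausted after a single pass, which is why each call to get_training_corpus() must return a fresh one. As a quick check (a sketch, not from the video), we can pull the first batch and confirm it is a list of strings:
first_batch = next(get_training_corpus())
print(len(first_batch), type(first_batch[0]))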
from transformers import AutoTokenizer
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")
training_corpus = get_training_corpus()
new_tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)  # 52,000 = target vocabulary size
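train_new_from_iterator is only available on fast (Rust-backed) tokenizers, so we can confirm that is what we got back:
print(new_tokenizer.is_fast)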
new_tokenizer.save_pretrained("code-search-net-tokenizer")
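To double-check the save, we can reload the tokenizer from the local folder (a sanity check; from_pretrained accepts a local path as well as a Hub repo id):
reloaded_tokenizer = AutoTokenizer.from_pretrained("code-search-net-tokenizer")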
example = """class LinearLayer():
def __init__(self, input_size, output_size):
self.weight = torch.randn(input_size, output_size)
self.bias = torch.zeros(output_size)
def __call__(self, x):
return x @ self.weights + self.bias
"""
print(old_tokenizer.tokenize(example))
print(new_tokenizer.tokenize(example))
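Comparing the lengths of the two outputs gives a quick measure of the improvement (a rough sketch): fewer tokens for the same code means shorter sequences for the model.
print(len(old_tokenizer.tokenize(example)), len(new_tokenizer.tokenize(example)))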