This notebook contains the code samples from the video below, which is part of the Hugging Face course.
#@title
from IPython.display import HTML
HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/DJimQynXZsQ?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
Install the Transformers and Datasets libraries to run this notebook.
! pip install datasets transformers[sentencepiece]
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained(
    "huggingface-course/bert-base-uncased-tokenizer-without-normalizer"
)
text = "here is a sentence adapted to our tokenizer"
print(tokenizer.tokenize(text))
text = "এই বাক্যটি আমাদের টোকেনাইজারের উপযুক্ত নয়"
print(tokenizer.tokenize(text))
text = "this tokenizer does not know àccënts and CAPITAL LETTERS"
print(tokenizer.tokenize(text))
text = "the medical vocabulary is divided into many sub-token: paracetamol, phrayngitis"
print(tokenizer.tokenize(text))
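A rough way to quantify this mismatch (a quick sketch, not from the video) is to compare word count with token count; a tokenizer well adapted to a text keeps the ratio close to one.
tokens = tokenizer.tokenize(text)
print(f"{len(text.split())} words -> {len(tokens)} tokens")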
from datasets import load_dataset
raw_datasets = load_dataset("code_search_net", "python")
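Before building a training corpus, it is worth a quick look at what we loaded; in code_search_net, the whole_func_string column holds the full source of each function (printing one truncated example is just a sanity check, not part of the video).
print(raw_datasets["train"])
print(raw_datasets["train"][0]["whole_func_string"][:200])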
def get_training_corpus():
    # Yield the training texts in batches of 1,000 so the whole
    # dataset never has to sit in memory at once.
    dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), 1000):
        samples = dataset[start_idx : start_idx + 1000]
        yield samples["whole_func_string"]
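Note that a generator is exhausted after a single pass, which is why each call to get_training_corpus() must return a fresh one. As a quick check (a sketch, not from the video), we can pull the first batch and confirm it is a list of strings:
first_batch = next(get_training_corpus())
print(len(first_batch), type(first_batch[0]))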
from transformers import AutoTokenizer
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")
training_corpus = get_training_corpus()
new_tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)  # 52,000 = target vocabulary size
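train_new_from_iterator is only available on fast (Rust-backed) tokenizers, so we can confirm that is what we got back:
print(new_tokenizer.is_fast)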
new_tokenizer.save_pretrained("code-search-net-tokenizer")
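To double-check the save, we can reload the tokenizer from the local folder (a sanity check; from_pretrained accepts a local path as well as a Hub repo id):
reloaded_tokenizer = AutoTokenizer.from_pretrained("code-search-net-tokenizer")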
example = """class LinearLayer():
def __init__(self, input_size, output_size):
self.weight = torch.randn(input_size, output_size)
self.bias = torch.zeros(output_size)
def __call__(self, x):
return x @ self.weights + self.bias
"""
print(old_tokenizer.tokenize(example))
print(new_tokenizer.tokenize(example))
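Comparing the lengths of the two outputs gives a quick measure of the improvement (a rough sketch): fewer tokens for the same code means shorter sequences for the model.
print(len(old_tokenizer.tokenize(example)), len(new_tokenizer.tokenize(example)))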