This notebook contains the code samples from the video below, which is part of the Hugging Face course.
#@title
# Render the course video inline in the notebook output.
from IPython.display import HTML

video_iframe = (
    '<iframe width="560" height="315" '
    'src="https://www.youtube.com/embed/3umI3tm27Vw?rel=0&controls=0&showinfo=0" '
    'frameborder="0" allowfullscreen></iframe>'
)
HTML(video_iframe)
Install the Transformers and Datasets libraries to run this notebook.
# Install the libraries this notebook depends on: Datasets, and Transformers
# with the sentencepiece extra (needed by some tokenizers).
! pip install datasets transformers[sentencepiece]
from transformers import AutoTokenizer

# Load the pretrained cased-BERT tokenizer from the Hugging Face Hub.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

sample_text = "Let's talk about tokenizers superpowers."

# Show the token IDs for the sentence. Printed twice on purpose: the
# original notebook ran this in two separate cells.
print(tokenizer(sample_text)["input_ids"])
print(tokenizer(sample_text)["input_ids"])

# Inspect the token strings and, for each token, the index of the word
# it came from (None for special tokens such as [CLS]/[SEP]).
encoding = tokenizer(sample_text)
print(encoding.tokens())
print(encoding.word_ids())
# Ask the (fast) tokenizer for character offsets so every token can be
# mapped back to its exact span in the original string.
sentence = "Let's talk about tokenizers superpowers."
encoding = tokenizer(sentence, return_offsets_mapping=True)
print(encoding.tokens())
print(encoding["offset_mapping"])