This notebook regroups the code sample of the video below, which is a part of the Hugging Face course.

In [ ]:

#@title
from IPython.display import HTML

HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/4IIC2jI9CaU?rel=0&amp;controls=0&amp;showinfo=0" frameborder="0" allowfullscreen></iframe>')

Out[ ]:

Install the Transformers and Datasets libraries to run this notebook.

In [ ]:

! pip install datasets transformers[sentencepiece]

In [ ]:

from transformers import AutoTokenizer

text = "This is a text with àccënts and CAPITAL LETTERS"

tokenizer = AutoTokenizer.from_pretrained("albert-large-v2")
print(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))

tokenizer = AutoTokenizer.from_pretrained("huggingface-course/albert-tokenizer-without-normalizer")
print(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))

In [ ]:

text = "un père indigné"

tokenizer = AutoTokenizerFast.from_pretrained('distilbert-base-uncased')
print(tokenizer.backend_tokenizer.normalizer.normalize_str(text))