This notebook gathers the code samples from the video below, which is part of the Hugging Face course.
#@title
from IPython.display import HTML

# Build the embedded YouTube player once, then leave it as the cell's last
# expression so the notebook front end renders it.
video_embed = HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/4IIC2jI9CaU?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
video_embed
Install the Transformers and Datasets libraries to run this notebook.
! pip install datasets transformers[sentencepiece]
from transformers import AutoTokenizer
# Sample sentence mixing accented characters and upper-case words.
text = "This is a text with àccënts and CAPITAL LETTERS"

# Load the tokenizer published with the "albert-large-v2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("albert-large-v2")

# Encode the text to ids, then map the ids back to token strings so the
# tokens the tokenizer actually produced can be inspected.
token_ids = tokenizer.encode(text)
tokens = tokenizer.convert_ids_to_tokens(token_ids)
print(tokens)
# Same experiment with a different checkpoint; its name indicates an ALBERT
# tokenizer saved without a normalizer (presumably for comparison with the
# output of the previous cell).
tokenizer = AutoTokenizer.from_pretrained(
    "huggingface-course/albert-tokenizer-without-normalizer"
)

# Round-trip the same `text` through ids and back to token strings.
token_ids = tokenizer.encode(text)
tokens = tokenizer.convert_ids_to_tokens(token_ids)
print(tokens)
# French sample containing accented characters.
text = "un père indigné"

# `AutoTokenizerFast` is not a name this notebook imports (nor a class that
# transformers exports), so the original call raised NameError. The imported
# `AutoTokenizer` returns a fast (Rust-backed) tokenizer by default when one is
# available, which is what exposes `backend_tokenizer` below.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Apply only the normalization step of the underlying `tokenizers` pipeline to
# the raw string and show the result.
print(tokenizer.backend_tokenizer.normalizer.normalize_str(text))