This notebook contains the code samples from the video below, which is part of the Hugging Face course.
#@title
from IPython.display import HTML
HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/Yffk5aydLzg?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
Install the Transformers and Datasets libraries to run this notebook.
! pip install datasets transformers[sentencepiece]
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
inputs = tokenizer("Let's try to tokenize!")
print(inputs["input_ids"])
[101, 2292, 1005, 1055, 3046, 2000, 19204, 4697, 999, 102]
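Calling the tokenizer directly runs the whole pipeline at once: splitting into tokens, mapping them to IDs, and adding the model's special tokens. The 101 and 102 wrapping the sequence are BERT's [CLS] and [SEP] tokens, which you can verify with a quick check (a minimal sketch reusing the tokenizer above):
# 101 and 102 are the IDs of BERT's special tokens, matching the output above.
print(tokenizer.cls_token_id, tokenizer.sep_token_id)
print(tokenizer.convert_ids_to_tokens([101, 102]))  # should print ['[CLS]', '[SEP]']
The cells that follow break this pipeline apart step by step.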
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokens = tokenizer.tokenize("Let's try to tokenize!")
print(tokens)
['let', "'", 's', 'try', 'to', 'token', '##ize', '!']
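BERT's tokenizer uses a WordPiece-style subword vocabulary: the "##" prefix marks a token that continues the previous word, which is why "tokenize" becomes 'token' and '##ize'. As a sketch, convert_tokens_to_string reassembles the pieces (the exact spacing below is what I'd expect from the BERT tokenizer, not a guarantee):
# "##" marks a continuation subword; convert_tokens_to_string merges them back.
print(tokenizer.convert_tokens_to_string(tokens))  # roughly "let ' s try to tokenize !"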
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("albert-base-v1")
tokens = tokenizer.tokenize("Let's try to tokenize!")
print(tokens)
['▁let', "'", 's', '▁try', '▁to', '▁to', 'ken', 'ize', '!']
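ALBERT uses a SentencePiece-based tokenizer instead, so the convention is inverted: "▁" (U+2581) marks the start of a new word, and unmarked tokens like 'ken' and 'ize' are continuations. A minimal sketch to undo the split, assuming the same tokens variable as above:
# In SentencePiece vocabularies, "▁" marks word starts rather than continuations.
print(tokenizer.convert_tokens_to_string(tokens))  # should recover "let's try to tokenize!"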
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokens = tokenizer.tokenize("Let's try to tokenize!")
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(input_ids)
[2292, 1005, 1055, 3046, 2000, 19204, 4697, 999]
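Note that convert_tokens_to_ids is a plain vocabulary lookup: the IDs match the first cell's output except that 101 ([CLS]) and 102 ([SEP]) are missing, because no special tokens are added at this stage. The lookup is reversible, as a quick sketch:
# convert_ids_to_tokens maps the IDs back to the subword tokens, one for one.
print(tokenizer.convert_ids_to_tokens(input_ids))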
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokens = tokenizer.tokenize("Let's try to tokenize!")
input_ids = tokenizer.convert_tokens_to_ids(tokens)
final_inputs = tokenizer.prepare_for_model(input_ids)
print(final_inputs["input_ids"])
[101, 2292, 1005, 1055, 3046, 2000, 19204, 4697, 999, 102]
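prepare_for_model is what adds the special tokens (and can also handle truncation and padding), giving the same input_ids as calling the tokenizer directly. If you only need the special-token step, a lower-level sketch:
# build_inputs_with_special_tokens just wraps the IDs with [CLS]/[SEP].
print(tokenizer.build_inputs_with_special_tokens(input_ids))
# expected to match the output above: [101, 2292, ..., 999, 102]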
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
inputs = tokenizer("Let's try to tokenize!")
print(tokenizer.decode(inputs["input_ids"]))
[CLS] let's try to tokenize! [SEP]
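decode maps the IDs back to a string, special tokens included. To drop them, pass skip_special_tokens=True:
# skip_special_tokens=True strips [CLS] and [SEP] from the decoded text.
print(tokenizer.decode(inputs["input_ids"], skip_special_tokens=True))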
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
inputs = tokenizer("Let's try to tokenize!")
print(tokenizer.decode(inputs["input_ids"]))
<s>Let's try to tokenize!</s>
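Each model has its own special tokens: RoBERTa wraps the sequence in &lt;s&gt; and &lt;/s&gt; where BERT uses [CLS] and [SEP]. Rather than hardcoding them, you can inspect what a given tokenizer uses:
# special_tokens_map lists the tokenizer's own special tokens (cls, sep, pad, ...).
print(tokenizer.special_tokens_map)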
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
inputs = tokenizer("Let's try to tokenize!")
print(inputs)
{'input_ids': [101, 2292, 1005, 1055, 3046, 2000, 19204, 4697, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
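Besides input_ids, the tokenizer returns token_type_ids (segment IDs, used by BERT to distinguish the two sentences of a pair; all 0 here since there is only one sentence) and attention_mask (1 for real tokens, 0 for padding). The mask becomes meaningful once you batch sequences of different lengths, as in this sketch:
# With padding=True, shorter sequences are padded and their attention_mask is 0
# at the padded positions, so the model ignores them.
batch = tokenizer(["Let's try to tokenize!", "Short"], padding=True)
print(batch["attention_mask"])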