This notebook collects the code samples from the video below, which is part of the Hugging Face course.
#@title
from IPython.display import HTML

# Embed the course video for this notebook. The markup is split over adjacent
# string literals purely for readability; they join into one HTML string.
HTML(
    '<iframe width="560" height="315" '
    'src="https://www.youtube.com/embed/M6adb1j2jPI?rel=0&controls=0&showinfo=0" '
    'frameborder="0" allowfullscreen></iframe>'
)
Install the Transformers and Datasets libraries to run this notebook.
! pip install datasets transformers[sentencepiece]
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this.",
]

# Two explicit steps per sentence: split the text into tokens, then convert
# each list of tokens into the matching list of ids.
tokens = list(map(tokenizer.tokenize, sentences))
ids = list(map(tokenizer.convert_tokens_to_ids, tokens))

# One line of ids per sentence; note that the two lists differ in length.
print(ids[0], ids[1], sep="\n")
[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012] [1045, 5223, 2023, 1012]
import torch

# Ids of the two example sentences. The rows have different lengths (14 vs 4).
ids = [
    [1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
    [1045, 5223, 2023, 1012],
]

# A tensor has to be rectangular, so building one from ragged rows fails.
# That failure is the point of this cell: catch it and display the message so
# that "Restart & Run All" can continue past this cell instead of stopping here.
try:
    input_ids = torch.tensor(ids)
except ValueError as error:
    print(f"{type(error).__name__}: {error}")
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-2-b3483c81e2fd> in <module> 4 [1045, 5223, 2023, 1012]] 5 ----> 6 input_ids = torch.tensor(ids) ValueError: expected sequence of length 14 at dim 1 (got 4)
import torch

# Same two sentences as before, but the short row is padded by hand with ten
# zeros so that both rows have length 14 and the batch is rectangular.
ids = [
    [1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
    [1045, 5223, 2023, 1012] + [0] * 10,
]

input_ids = torch.tensor(ids)
from transformers import AutoTokenizer
# Reload the tokenizer for `checkpoint`, which is defined in an earlier cell.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Bare last expression: the notebook displays the id this tokenizer uses for
# padding, i.e. the value to fill short rows with when padding by hand.
tokenizer.pad_token_id
0
from transformers import AutoModelForSequenceClassification

# Each sentence as its own batch of size one ...
ids1 = torch.tensor(
    [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]]
)
ids2 = torch.tensor([[1045, 5223, 2023, 1012]])

# ... and both sentences together, the short one padded with ten zeros.
all_ids = torch.tensor(
    [
        [1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
        [1045, 5223, 2023, 1012] + [0] * 10,
    ]
)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Print the logits for the first sentence, the second sentence, and then the
# padded batch. No attention mask is passed here, so compare the second row of
# the batched logits with the logits of the second sentence on its own.
for batch in (ids1, ids2, all_ids):
    print(model(batch).logits)
tensor([[-2.7276, 2.8789]], grad_fn=<AddmmBackward>) tensor([[ 3.9497, -3.1357]], grad_fn=<AddmmBackward>) tensor([[-2.7276, 2.8789], [ 1.5444, -1.3998]], grad_fn=<AddmmBackward>)
# Padded batch: the second row carries ten trailing padding ids (zeros).
all_ids = torch.tensor(
    [
        [1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
        [1045, 5223, 2023, 1012] + [0] * 10,
    ]
)

# Mask with the same shape as `all_ids`: 1 marks a real token, 0 marks padding.
attention_mask = torch.tensor(
    [
        [1] * 14,
        [1] * 4 + [0] * 10,
    ]
)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Reference logits for each sentence run on its own (`ids1` and `ids2` come
# from an earlier cell).
output1 = model(ids1)
output2 = model(ids2)
print(output1.logits, output2.logits, sep="\n")
tensor([[-2.7276, 2.8789]], grad_fn=<AddmmBackward>) tensor([[ 3.9497, -3.1357]], grad_fn=<AddmmBackward>)
# Run the padded batch again, this time passing the attention mask alongside
# the ids, and print the resulting logits for comparison with the single runs.
output = model(input_ids=all_ids, attention_mask=attention_mask)
print(output.logits)
tensor([[-2.7276, 2.8789], [ 3.9497, -3.1357]], grad_fn=<AddmmBackward>)
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this.",
]

# Calling the tokenizer on the whole list with padding=True returns the padded
# input ids together with the matching attention mask in one step.
encoded = tokenizer(sentences, padding=True)
print(encoded)
{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 1045, 5223, 2023, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}