! pip install datasets transformers[sentencepiece]
from transformers import AutoTokenizer
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this.",
]
# Tokenize each sentence, then map the token strings to their vocabulary ids
tokens = [tokenizer.tokenize(sentence) for sentence in sentences]
ids = [tokenizer.convert_tokens_to_ids(token_list) for token_list in tokens]
print(ids[0])
print(ids[1])
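# The two id sequences have different lengths (14 vs. 4), which is why they
# can't be batched into a single tensor as-is; a quick check on the lists above:
print(len(ids[0]), len(ids[1]))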
import torch

ids = [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
       [1045, 5223, 2023, 1012]]

# This fails as-is: the two sequences have different lengths, so they can't be
# stacked into a rectangular tensor
try:
    input_ids = torch.tensor(ids)
except ValueError as e:
    print(e)
import torch

# Pad the shorter sequence to the same length; 0 is this checkpoint's pad token id
ids = [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
       [1045, 5223, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
input_ids = torch.tensor(ids)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# The tokenizer records which id it uses for padding
print(tokenizer.pad_token_id)
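# A sketch of padding by hand with tokenizer.pad_token_id instead of a
# hardcoded 0 (raw_ids, max_len, and padded are local names introduced here
# for illustration, redefining the unpadded lists so the snippet is self-contained):
raw_ids = [
    [1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
    [1045, 5223, 2023, 1012],
]
max_len = max(len(seq) for seq in raw_ids)
padded = [seq + [tokenizer.pad_token_id] * (max_len - len(seq)) for seq in raw_ids]
print(torch.tensor(padded).shape)  # torch.Size([2, 14])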
from transformers import AutoModelForSequenceClassification

ids1 = torch.tensor(
    [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]]
)
ids2 = torch.tensor([[1045, 5223, 2023, 1012]])
all_ids = torch.tensor(
    [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
     [1045, 5223, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
print(model(ids1).logits)
print(model(ids2).logits)
# Note: the second row of the batched result differs from the ids2 run,
# because the model attends to the pad tokens as if they were real input
print(model(all_ids).logits)
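# A quick check of that discrepancy (a sketch; the tolerance is arbitrary):
# without a mask, the padded batch's second row should NOT match the unpadded run
print(torch.allclose(model(ids2).logits, model(all_ids).logits[1:], atol=1e-4))  # expected: False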
all_ids = torch.tensor(
    [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
     [1045, 5223, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
)
# 1 = attend to this position, 0 = ignore it (the padding)
attention_mask = torch.tensor(
    [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
     [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
)
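# The mask can also be derived from the ids themselves, since pad positions
# hold tokenizer.pad_token_id (a sketch; derived_mask is a name introduced
# here for illustration, equivalent to the hand-written mask above):
derived_mask = (all_ids != tokenizer.pad_token_id).long()
print(torch.equal(derived_mask, attention_mask))  # expected: True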
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

output1 = model(ids1)
output2 = model(ids2)
print(output1.logits)
print(output2.logits)

# Passing the attention mask makes the model ignore the pad tokens, so the
# padded batch now reproduces the unpadded results
output = model(all_ids, attention_mask=attention_mask)
print(output.logits)
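# Counterpart check (a sketch; tolerance arbitrary): with the mask, the padded
# batch's second row should match the unpadded run up to float noise
print(torch.allclose(output2.logits, output.logits[1:], atol=1e-4))  # expected: True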
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this.",
]

# padding=True produces the padded input_ids and the matching attention_mask
# in one call, replacing all the manual steps above
print(tokenizer(sentences, padding=True))
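# End-to-end sketch: padding=True gives back both input_ids and attention_mask,
# which can be fed straight to the model (re-loading the model here so the
# snippet is self-contained):
import torch
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
batch = tokenizer(sentences, padding=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**batch)
print(outputs.logits)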