! pip install datasets transformers[sentencepiece]
from transformers import AutoTokenizer
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this.",
]
# Tokenize each sentence, then map the token strings to their vocabulary ids
tokens = [tokenizer.tokenize(sentence) for sentence in sentences]
ids = [tokenizer.convert_tokens_to_ids(token_list) for token_list in tokens]
print(ids[0])
print(ids[1])
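# The two id sequences have different lengths (14 vs. 4), which is why they
# can't be batched into a single tensor as-is; a quick check on the lists above:
print(len(ids[0]), len(ids[1]))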
import torch

ids = [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
       [1045, 5223, 2023, 1012]]

# This fails as-is: the two sequences have different lengths, so they can't be
# stacked into a rectangular tensor
try:
    input_ids = torch.tensor(ids)
except ValueError as e:
    print(e)
import torch

# Pad the shorter sequence to the same length; 0 is this checkpoint's pad token id
ids = [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
       [1045, 5223, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
input_ids = torch.tensor(ids)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# The tokenizer records which id it uses for padding
print(tokenizer.pad_token_id)
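# A sketch of padding by hand with tokenizer.pad_token_id instead of a
# hardcoded 0 (raw_ids, max_len, and padded are local names introduced here
# for illustration, redefining the unpadded lists so the snippet is self-contained):
raw_ids = [
    [1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
    [1045, 5223, 2023, 1012],
]
max_len = max(len(seq) for seq in raw_ids)
padded = [seq + [tokenizer.pad_token_id] * (max_len - len(seq)) for seq in raw_ids]
print(torch.tensor(padded).shape)  # torch.Size([2, 14])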
from transformers import AutoModelForSequenceClassification

ids1 = torch.tensor(
    [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]]
)
ids2 = torch.tensor([[1045, 5223, 2023, 1012]])
all_ids = torch.tensor(
    [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
     [1045, 5223, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
print(model(ids1).logits)
print(model(ids2).logits)
# Note: the second row of the batched result differs from the ids2 run,
# because the model attends to the pad tokens as if they were real input
print(model(all_ids).logits)
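# A quick check of that discrepancy (a sketch; the tolerance is arbitrary):
# without a mask, the padded batch's second row should NOT match the unpadded run
print(torch.allclose(model(ids2).logits, model(all_ids).logits[1:], atol=1e-4))  # expected: False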
all_ids = torch.tensor(
    [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
     [1045, 5223, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
)
# 1 = attend to this position, 0 = ignore it (the padding)
attention_mask = torch.tensor(
    [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
     [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
)
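# The mask can also be derived from the ids themselves, since pad positions
# hold tokenizer.pad_token_id (a sketch; derived_mask is a name introduced
# here for illustration, equivalent to the hand-written mask above):
derived_mask = (all_ids != tokenizer.pad_token_id).long()
print(torch.equal(derived_mask, attention_mask))  # expected: True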
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

output1 = model(ids1)
output2 = model(ids2)
print(output1.logits)
print(output2.logits)

# Passing the attention mask makes the model ignore the pad tokens, so the
# padded batch now reproduces the unpadded results
output = model(all_ids, attention_mask=attention_mask)
print(output.logits)
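# Counterpart check (a sketch; tolerance arbitrary): with the mask, the padded
# batch's second row should match the unpadded run up to float noise
print(torch.allclose(output2.logits, output.logits[1:], atol=1e-4))  # expected: True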
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this.",
]

# padding=True produces the padded input_ids and the matching attention_mask
# in one call, replacing all the manual steps above
print(tokenizer(sentences, padding=True))
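# End-to-end sketch: padding=True gives back both input_ids and attention_mask,
# which can be fed straight to the model (re-loading the model here so the
# snippet is self-contained):
import torch
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
batch = tokenizer(sentences, padding=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**batch)
print(outputs.logits)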