!pip install datasets evaluate transformers[sentencepiece]

import torch
from torch.optim import AdamW  # transformers.AdamW is deprecated; use the PyTorch implementation
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Same as in the previous example
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is the new part: attach labels so the model can compute a loss
batch["labels"] = torch.tensor([1, 1])

optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

# Load the MRPC dataset from the GLUE benchmark
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

# Inspect the first training example and the column types
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

raw_train_dataset.features

from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenizing each column separately loses the pairing between the two sentences
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

# Passing two sentences produces a single input, with token_type_ids marking each segment
inputs = tokenizer("This is the first sentence.", "This is the second one.")
inputs

tokenizer.convert_ids_to_tokens(inputs["input_ids"])

# Tokenizing the whole training split at once works, but keeps everything in memory
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
)

# Better: map a tokenization function over the dataset, batch by batch
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

# Dynamic padding: pad each batch only to the length of its longest sample
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
[len(x) for x in samples["input_ids"]]

batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}
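
# --- Added illustration (not part of the original notebook) ---
# A minimal sketch of how the data collator is typically plugged into a PyTorch
# DataLoader, so each batch is padded only to the length of its longest sample.
# The column handling mirrors the cells above; the batch size of 8 is arbitrary.
from torch.utils.data import DataLoader

train_dataset = tokenized_datasets["train"].remove_columns(["sentence1", "sentence2", "idx"])
train_dataset = train_dataset.rename_column("label", "labels")
train_dataset.set_format("torch")

train_dataloader = DataLoader(
    train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator
)
for batch in train_dataloader:
    # Shapes vary from batch to batch because padding is applied per batch
    print({k: v.shape for k, v in batch.items()})
    break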