!pip install datasets transformers[sentencepiece]

# --- Approach 1: pad every example to a fixed length (max_length=128) ---
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples):
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.with_format("torch")

train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=16, shuffle=True)

# Every batch has the same shape, torch.Size([16, 128]), regardless of how
# short the sentences in it actually are.
for step, batch in enumerate(train_dataloader):
    print(batch["input_ids"].shape)
    if step > 5:
        break

# --- Approach 2: dynamic padding with DataCollatorWithPadding ---
# Tokenize without padding so each example keeps its natural length; the
# collator then pads each batch only up to its longest sequence.
from transformers import DataCollatorWithPadding

def tokenize_function(examples):
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.with_format("torch")

data_collator = DataCollatorWithPadding(tokenizer)
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)

# Batch shapes now vary: torch.Size([16, <longest sequence in the batch>]).
for step, batch in enumerate(train_dataloader):
    print(batch["input_ids"].shape)
    if step > 5:
        break
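
# A minimal sketch, not part of the original notebook (the names `samples` and
# `batch` below are illustrative): the collator is just a callable, so it can
# be applied directly to a list of features to observe dynamic padding without
# going through a DataLoader.
samples = [tokenized_datasets["train"][i] for i in range(8)]
print([len(s["input_ids"]) for s in samples])  # varying lengths before padding
batch = data_collator(samples)
print({k: v.shape for k, v in batch.items()})  # all padded to the longest of the 8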