! pip install datasets transformers[sentencepiece]
from datasets import load_dataset
from transformers import AutoTokenizer
# Load the MRPC paraphrase dataset from the GLUE benchmark.
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
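# Quick sanity check (a minimal sketch, not part of the pipeline below):
# for a sentence pair, BERT's tokenizer inserts [SEP] between the two
# sentences and marks them with token_type_ids (0 for the first, 1 for the second).
example = tokenizer("This is the first sentence.", "This is the second one.")
print(tokenizer.convert_ids_to_tokens(example["input_ids"]))
print(example["token_type_ids"])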
def tokenize_function(examples):
    # First approach: pad every example to a fixed length of 128 tokens.
    return tokenizer(
        examples["sentence1"], examples["sentence2"], padding="max_length", truncation=True, max_length=128
    )
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Drop columns the model cannot consume and rename "label" to the "labels"
# key that PyTorch models from transformers expect.
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.with_format("torch")
from torch.utils.data import DataLoader
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=16, shuffle=True)
for step, batch in enumerate(train_dataloader):
    print(batch["input_ids"].shape)  # always torch.Size([16, 128]) with fixed padding
    if step > 5:
        break
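# Rough check (a sketch): with padding="max_length", much of each batch is
# padding; attention_mask is 0 on the padded positions.
batch = next(iter(train_dataloader))
pad_fraction = 1.0 - batch["attention_mask"].float().mean().item()
print(f"padding fraction in this batch: {pad_fraction:.0%}")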
# Second approach: tokenize without padding and pad dynamically per batch.
def tokenize_function(examples):
    # Sequences keep their natural lengths; the collator pads them later.
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.with_format("torch")
from transformers import DataCollatorWithPadding

# The collator pads each batch to the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer)
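# Direct use (a sketch): the collator can also pad a plain list of feature
# dicts, which is exactly what the DataLoader hands it under the hood.
features = [tokenized_datasets["train"][i] for i in range(4)]
print(data_collator(features)["input_ids"].shape)  # padded to the longest of the 4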
train_dataloader = DataLoader(
tokenized_datasets["train"], batch_size=16, shuffle=True, collate_fn=data_collator
)
for step, batch in enumerate(train_dataloader):
    print(batch["input_ids"].shape)  # sequence length now varies from batch to batch
    if step > 5:
        break
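# Sanity check (a sketch): with dynamic padding, all tensors in a batch share
# the same per-batch sequence length, usually well below 128.
batch = next(iter(train_dataloader))
print({k: v.shape for k, v in batch.items()})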