! pip install datasets transformers[sentencepiece]
from datasets import load_dataset

# MRPC (Microsoft Research Paraphrase Corpus): sentence pairs labeled as
# paraphrases or not, part of the GLUE benchmark.
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets
raw_datasets["train"]
raw_datasets["train"][6]
raw_datasets["train"][:5]
raw_datasets["train"].features
from transformers import AutoTokenizer
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
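# Illustrative aside: tokenizing one sentence pair directly shows the fields
# BERT expects, including token_type_ids marking which sentence each token
# belongs to. The example sentences are placeholders.
encoding = tokenizer("This is the first sentence.", "This is the second one.")
print(encoding.keys())
print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))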
def tokenize_function(example):
    # Tokenize the two sentences as a pair and pad/truncate everything
    # to a fixed length of 128 tokens.
    return tokenizer(
        example["sentence1"], example["sentence2"], padding="max_length", truncation=True, max_length=128
    )
# Without batched=True, map calls tokenize_function once per example.
tokenized_datasets = raw_datasets.map(tokenize_function)
print(tokenized_datasets.column_names)
def tokenize_function(examples):
    # With batched=True, examples is a dict of lists, and the tokenizer
    # processes the whole batch of sentence pairs in one call (much faster).
    return tokenizer(
        examples["sentence1"], examples["sentence2"], padding="max_length", truncation=True, max_length=128
    )
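# Illustrative check (optional): slicing a Dataset yields a dict of lists,
# which is exactly what tokenize_function receives when mapped with
# batched=True.
batch = raw_datasets["train"][:2]
print(tokenize_function(batch)["input_ids"][0][:10])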
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Drop the raw text columns the model cannot consume, rename "label" to
# "labels" (the argument name Transformers models expect), and return
# TensorFlow tensors when the dataset is indexed.
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.with_format("tensorflow")
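# Illustrative check (optional): with the "tensorflow" format set, indexing
# should now return tf.Tensor objects rather than Python lists.
print(type(tokenized_datasets["train"][:2]["input_ids"]))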
tokenized_datasets["train"]
# Keep a small 100-example subset for quick experiments.
small_train_dataset = tokenized_datasets["train"].select(range(100))
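# A minimal sketch of how the formatted subset could be fed to Keras,
# assuming the fixed padding length of 128 set above (so from_tensor_slices
# receives rectangular tensors). The column names match the BERT tokenizer
# output; the shuffle buffer and batch size here are illustrative choices.
import tensorflow as tf

features = {
    name: small_train_dataset[name]
    for name in ("input_ids", "token_type_ids", "attention_mask")
}
labels = small_train_dataset["labels"]
tf_train = tf.data.Dataset.from_tensor_slices((features, labels)).shuffle(100).batch(8)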