!pip install datasets transformers[sentencepiece]

from datasets import load_dataset

# Load the MRPC task from the GLUE benchmark: pairs of sentences labeled
# as paraphrases (1) or not (0).
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

# Inspect the training split: a single example, a slice of five, and the
# column names and types.
raw_datasets["train"]
raw_datasets["train"][6]
raw_datasets["train"][:5]
raw_datasets["train"].features

from transformers import AutoTokenizer

checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize each sentence pair as a single sequence, padding and truncating
# to a fixed length of 128 tokens.
def tokenize_function(examples):
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

# map() applies tokenize_function to every example; the new columns returned
# by the tokenizer (input_ids, token_type_ids, attention_mask) are added
# alongside the original ones.
tokenized_datasets = raw_datasets.map(tokenize_function)
print(tokenized_datasets.column_names)

# batched=True passes batches of examples instead of single examples, which
# is much faster with the Rust-backed fast tokenizer. tokenize_function
# works unchanged, since the tokenizer accepts lists of sentences.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Drop the raw-text columns the model cannot use, rename "label" to the
# "labels" argument the model expects, and make the dataset return
# TensorFlow tensors.
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.with_format("tensorflow")
tokenized_datasets["train"]

# Keep only the first 100 training examples for quick experiments.
small_train_dataset = tokenized_datasets["train"].select(range(100))
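
# Usage sketch (not in the original): because padding="max_length" gives
# every row the same length, the TensorFlow-formatted columns come back as
# dense tensors and can feed a tf.data pipeline directly. The name
# tf_train_dataset and the batch size of 8 are illustrative choices.
import tensorflow as tf

train_features = {
    key: small_train_dataset[key]
    for key in ("input_ids", "token_type_ids", "attention_mask")
}
train_labels = small_train_dataset["labels"]

# Build a shuffled, batched tf.data pipeline from the in-memory tensors.
tf_train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_features, train_labels))
    .shuffle(buffer_size=100)
    .batch(8)
)

# Newer versions of the datasets library also provide Dataset.to_tf_dataset(),
# which builds an equivalent tf.data.Dataset in one call.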