# Install the Datasets and Transformers libraries
! pip install datasets transformers[sentencepiece]

from datasets import load_dataset

# Load the MNLI subset of the GLUE benchmark
raw_datasets = load_dataset("glue", "mnli")
raw_datasets

from transformers import AutoTokenizer

# The default tokenizer is "fast": it is backed by the Rust tokenizers library
fast_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_with_fast(examples):
    return fast_tokenizer(
        examples["premise"], examples["hypothesis"], truncation=True
    )


# use_fast=False forces the pure-Python "slow" implementation
slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)


def tokenize_with_slow(examples):
    return slow_tokenizer(
        examples["premise"], examples["hypothesis"], truncation=True
    )


# Time each tokenizer over the whole dataset, first example by example,
# then in batches (batched=True is where fast tokenizers pull ahead)
%time tokenized_datasets = raw_datasets.map(tokenize_with_fast)
%time tokenized_datasets = raw_datasets.map(tokenize_with_slow)
%time tokenized_datasets = raw_datasets.map(tokenize_with_fast, batched=True)
%time tokenized_datasets = raw_datasets.map(tokenize_with_slow, batched=True)
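
# A follow-up sketch, not part of the timings above: Dataset.map also
# accepts num_proc to spread the work across several processes, which
# can narrow the gap for the slow tokenizer. The value num_proc=8 is an
# arbitrary example; pick one that matches your machine's CPU count.
%time tokenized_datasets = raw_datasets.map(tokenize_with_slow, batched=True, num_proc=8)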