This notebook contains the code samples from the video below, which is part of the Hugging Face course.
#@title
from IPython.display import HTML
HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/g8quOxoqhHQ?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
Install the Transformers and Datasets libraries to run this notebook.
! pip install datasets transformers[sentencepiece]
from datasets import load_dataset
raw_datasets = load_dataset("glue", "mnli")
raw_datasets
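MNLI examples carry "premise" and "hypothesis" fields, which the tokenization functions below rely on. As an optional quick check (not part of the original video), you can inspect one training example:
# Optional sanity check: look at the first training example to see the
# "premise" and "hypothesis" columns used by the tokenization functions below.
raw_datasets["train"][0]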
from transformers import AutoTokenizer
fast_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
def tokenize_with_fast(examples):
    return fast_tokenizer(
        examples["premise"], examples["hypothesis"], truncation=True
    )
slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
def tokenize_with_slow(examples):
    return slow_tokenizer(
        examples["premise"], examples["hypothesis"], truncation=True
    )
%time tokenized_datasets = raw_datasets.map(tokenize_with_fast)
%time tokenized_datasets = raw_datasets.map(tokenize_with_slow)
%time tokenized_datasets = raw_datasets.map(tokenize_with_fast, batched=True)
%time tokenized_datasets = raw_datasets.map(tokenize_with_slow, batched=True)
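As a further optional variation (not shown in the video), Dataset.map also accepts a num_proc argument to spread the work over several processes; a minimal sketch, with the worker count chosen arbitrarily:
# Optional: parallelize the map call across 4 processes (adjust to your machine).
%time tokenized_datasets = raw_datasets.map(tokenize_with_fast, batched=True, num_proc=4)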