This notebook contains the code samples from the video below, which is part of the Hugging Face course.
#@title
from IPython.display import HTML
HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/8PmhEIXhBvI?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
Install the Transformers and Datasets libraries to run this notebook.
! pip install datasets transformers[sentencepiece]
from datasets import load_dataset
raw_datasets = load_dataset("wikitext", "wikitext-2-raw-v1")
raw_datasets["train"]
from datasets import load_dataset
from transformers import AutoTokenizer
raw_datasets = load_dataset("imdb")
raw_datasets = raw_datasets.remove_columns("label")
model_checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
context_length = 128
def tokenize_pad_and_truncate(texts):
    # Pad or truncate every review to exactly context_length tokens
    return tokenizer(
        texts["text"], truncation=True, padding="max_length", max_length=context_length
    )
tokenized_datasets = raw_datasets.map(tokenize_pad_and_truncate, batched=True)
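As a quick sanity check (illustrative, not from the video), every example should now contain exactly context_length tokens, since we both pad and truncate to max_length:
len(tokenized_datasets["train"][0]["input_ids"])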
def tokenize_and_chunk(texts):
    # return_overflowing_tokens=True splits reviews longer than context_length
    # into several chunks instead of dropping the extra tokens
    return tokenizer(
        texts["text"], truncation=True, max_length=context_length,
        return_overflowing_tokens=True
    )
tokenized_datasets = raw_datasets.map(
    tokenize_and_chunk, batched=True, remove_columns=["text"]
)
len(raw_datasets["train"]), len(tokenized_datasets["train"])
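The tokenized dataset has more rows than the raw one: with return_overflowing_tokens=True, each review longer than context_length is split into several chunks, and each chunk becomes its own example (which is also why the original text column had to be removed, since its length no longer matches). A minimal sketch of that chunking on a single review:
sample = tokenizer(
    raw_datasets["train"][0]["text"], truncation=True, max_length=context_length,
    return_overflowing_tokens=True,
)
len(sample["input_ids"])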
def tokenize_and_chunk(texts):
    # Concatenate all the tokenized reviews into one long stream of token ids,
    # separating documents with [SEP] (distilbert-base-cased has no EOS token,
    # so tokenizer.eos_token_id would be None here)
    all_input_ids = []
    for input_ids in tokenizer(texts["text"])["input_ids"]:
        all_input_ids.extend(input_ids)
        all_input_ids.append(tokenizer.sep_token_id)

    # Split the stream into chunks of context_length tokens
    chunks = []
    for idx in range(0, len(all_input_ids), context_length):
        chunks.append(all_input_ids[idx : idx + context_length])
    return {"input_ids": chunks}
tokenized_datasets = raw_datasets.map(tokenize_and_chunk, batched=True, remove_columns=["text"])
len(raw_datasets["train"]), len(tokenized_datasets["train"])
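With this concatenate-then-split approach no padding is wasted: every chunk has exactly context_length tokens, except possibly the leftover chunk at the end of each mapped batch. An illustrative check of the first few chunk lengths:
[len(ids) for ids in tokenized_datasets["train"][:5]["input_ids"]]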
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)
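The collator handles the last step of masked language modeling preprocessing: it pads a list of samples into a batch, randomly masks roughly 15% of the tokens, and builds the labels tensor (with -100 at positions where no prediction is required). A minimal sketch of calling it on two tokenized samples:
samples = [tokenized_datasets["train"][i] for i in range(2)]
batch = data_collator(samples)
batch["input_ids"].shape, batch["labels"].shape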