This notebook contains the code samples from the video below, which is part of the Hugging Face course.
#@title
from IPython.display import HTML
HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/qgaM0weJHpA?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
Install the Transformers and Datasets libraries to run this notebook.
! pip install datasets transformers[sentencepiece]
from datasets import load_dataset
raw_datasets = load_dataset("squad")
raw_datasets = raw_datasets.remove_columns(["id", "title"])
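As a quick check that is not in the original video, we can list the columns left after dropping "id" and "title".
# Optional check (not in the video): the remaining columns of the training split.
print(raw_datasets["train"].column_names)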
def prepare_data(example):
    answer = example["answers"]["text"][0]
    example["answer_start"] = example["answers"]["answer_start"][0]
    example["answer_end"] = example["answer_start"] + len(answer)
    return example
raw_datasets = raw_datasets.map(prepare_data, remove_columns=["answers"])
raw_datasets["train"]
print(f"Context: {raw_datasets['train'][0]['context']")
print(f"Question: {raw_datasets['train'][0]['question']")
start = raw_datasets["train"][0]["answer_start"]
end = raw_datasets["train"][0]["answer_end"]
print(f"\nAnswer: {raw_datasets['train'][0]['context'][start:end]}")
from transformers import AutoTokenizer
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
example = raw_datasets["train"][0]
inputs = tokenizer(
    example["question"],
    example["context"],
    truncation="only_second",
    padding="max_length",
    max_length=384,
    stride=128,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)
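A quick look at what the tokenizer returned (this check is not in the original video): with overflowing tokens enabled, a long context is split into several features, and the sequence IDs tell us which tokens belong to the question, the context, or the special tokens.
# Not part of the original video: inspect the tokenized features.
# With return_overflowing_tokens=True, one long example can produce several features.
print(len(inputs["input_ids"]))
# sequence_ids marks special tokens as None, question tokens as 0 and context tokens as 1.
print(inputs.sequence_ids(0)[:20])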
def find_labels(offsets, answer_start, answer_end, sequence_ids):
    # Find the start and end of the context in the tokenized input.
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # If the answer is not fully inside the context, label it (0, 0).
    if offsets[context_start][0] > answer_start or offsets[context_end][1] < answer_end:
        return (0, 0)
    else:
        # Otherwise, find the first and last tokens of the answer.
        idx = context_start
        while idx <= context_end and offsets[idx][0] <= answer_start:
            idx += 1
        start_position = idx - 1

        idx = context_end
        while idx >= context_start and offsets[idx][1] >= answer_end:
            idx -= 1
        end_position = idx + 1
        return start_position, end_position
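As a sanity check that is not in the original video, we can call find_labels on a small hand-made offset mapping; all values below are made up purely for illustration.
# Hypothetical toy example (not from the video): token 0 is a special token, token 1 is a
# question token, and tokens 3 to 7 cover the context in slices of 5 characters each.
toy_offsets = [(0, 0), (0, 4), (0, 0), (0, 5), (5, 10), (10, 15), (15, 20), (20, 25), (0, 0)]
toy_sequence_ids = [None, 0, None, 1, 1, 1, 1, 1, None]
# An answer spanning characters 10 to 20 of the context falls on tokens 5 and 6.
print(find_labels(toy_offsets, 10, 20, toy_sequence_ids))  # (5, 6)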
start, end = find_labels(
    inputs["offset_mapping"][0],
    example["answer_start"],
    example["answer_end"],
    inputs.sequence_ids(0),
)
tokenizer.decode(inputs["input_ids"][0][start: end+1])
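The decoded span should match the answer text sliced out of the raw context with the character-level labels; printing it is an optional check that is not in the video.
# Optional check (not in the video): compare with the character-level answer in the raw context.
print(example["context"][example["answer_start"]:example["answer_end"]])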
def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",
        padding="max_length",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    inputs["start_positions"] = []
    inputs["end_positions"] = []

    for i, offset in enumerate(offset_mapping):
        # Map each feature back to the example it came from to fetch its answer span.
        sample_idx = sample_map[i]
        start, end = find_labels(
            offset,
            examples["answer_start"][sample_idx],
            examples["answer_end"][sample_idx],
            inputs.sequence_ids(i),
        )
        inputs["start_positions"].append(start)
        inputs["end_positions"].append(end)
    return inputs
tokenized_datasets = raw_datasets.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
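Because long contexts are split into several features, the processed training set has at least as many rows as the raw one; the comparison below is an optional check that is not in the video.
# Optional check (not in the video): splitting long contexts creates extra features,
# so the tokenized training set is at least as large as the raw one.
print(len(raw_datasets["train"]), len(tokenized_datasets["train"]))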