This notebook contains the code samples from the video below, which is part of the Hugging Face course.
#@title
from IPython.display import HTML
HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/qgaM0weJHpA?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
Install the Transformers and Datasets libraries to run this notebook.
! pip install datasets transformers[sentencepiece]
from datasets import load_dataset
raw_datasets = load_dataset("squad")
raw_datasets = raw_datasets.remove_columns(["id", "title"])
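As a quick check that is not in the original video, we can list the columns left after dropping "id" and "title".
# Optional check (not in the video): the remaining columns of the training split.
print(raw_datasets["train"].column_names)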
def prepare_data(example):
    answer = example["answers"]["text"][0]
    example["answer_start"] = example["answers"]["answer_start"][0]
    example["answer_end"] = example["answer_start"] + len(answer)
    return example
raw_datasets = raw_datasets.map(prepare_data, remove_columns=["answers"])
raw_datasets["train"]
print(f"Context: {raw_datasets['train'][0]['context']")
print(f"Question: {raw_datasets['train'][0]['question']")
start = raw_datasets["train"][0]["answer_start"]
end = raw_datasets["train"][0]["answer_end"]
print(f"\nAnswer: {raw_datasets['train'][0]['context'][start:end]}")
from transformers import AutoTokenizer
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
example = raw_datasets["train"][0]
inputs = tokenizer(
    example["question"],
    example["context"],
    truncation="only_second",
    padding="max_length",
    max_length=384,
    stride=128,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)
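A quick look at what the tokenizer returned (this check is not in the original video): with overflowing tokens enabled, a long context is split into several features, and the sequence IDs tell us which tokens belong to the question, the context, or the special tokens.
# Not part of the original video: inspect the tokenized features.
# With return_overflowing_tokens=True, one long example can produce several features.
print(len(inputs["input_ids"]))
# sequence_ids marks special tokens as None, question tokens as 0 and context tokens as 1.
print(inputs.sequence_ids(0)[:20])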
def find_labels(offsets, answer_start, answer_end, sequence_ids):
    # Find the start and end of the context in the tokenized input.
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # If the answer is not fully inside the context, label it (0, 0).
    if offsets[context_start][0] > answer_start or offsets[context_end][1] < answer_end:
        return (0, 0)
    else:
        # Otherwise, find the first and last tokens of the answer.
        idx = context_start
        while idx <= context_end and offsets[idx][0] <= answer_start:
            idx += 1
        start_position = idx - 1

        idx = context_end
        while idx >= context_start and offsets[idx][1] >= answer_end:
            idx -= 1
        end_position = idx + 1
        return start_position, end_position
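As a sanity check that is not in the original video, we can call find_labels on a small hand-made offset mapping; all values below are made up purely for illustration.
# Hypothetical toy example (not from the video): token 0 is a special token, token 1 is a
# question token, and tokens 3 to 7 cover the context in slices of 5 characters each.
toy_offsets = [(0, 0), (0, 4), (0, 0), (0, 5), (5, 10), (10, 15), (15, 20), (20, 25), (0, 0)]
toy_sequence_ids = [None, 0, None, 1, 1, 1, 1, 1, None]
# An answer spanning characters 10 to 20 of the context falls on tokens 5 and 6.
print(find_labels(toy_offsets, 10, 20, toy_sequence_ids))  # (5, 6)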
start, end = find_labels(
    inputs["offset_mapping"][0],
    example["answer_start"],
    example["answer_end"],
    inputs.sequence_ids(0),
)
tokenizer.decode(inputs["input_ids"][0][start: end+1])
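The decoded span should match the answer text sliced out of the raw context with the character-level labels; printing it is an optional check that is not in the video.
# Optional check (not in the video): compare with the character-level answer in the raw context.
print(example["context"][example["answer_start"]:example["answer_end"]])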
def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",
        padding="max_length",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    inputs["start_positions"] = []
    inputs["end_positions"] = []

    for i, offset in enumerate(offset_mapping):
        # Map each feature back to the example it came from to fetch its answer span.
        sample_idx = sample_map[i]
        start, end = find_labels(
            offset,
            examples["answer_start"][sample_idx],
            examples["answer_end"][sample_idx],
            inputs.sequence_ids(i),
        )
        inputs["start_positions"].append(start)
        inputs["end_positions"].append(end)
    return inputs
tokenized_datasets = raw_datasets.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
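Because long contexts are split into several features, the processed training set has at least as many rows as the raw one; the comparison below is an optional check that is not in the video.
# Optional check (not in the video): splitting long contexts creates extra features,
# so the tokenized training set is at least as large as the raw one.
print(len(raw_datasets["train"]), len(tokenized_datasets["train"]))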