This notebook contains the code samples from the video below, which is part of the Hugging Face course.
#@title
from IPython.display import HTML
HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/VN67ZpN33Ss?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
Install the Transformers and Datasets libraries to run this notebook.
! pip install datasets transformers[sentencepiece]
def find_labels(offsets, answer_start, answer_end, sequence_ids):
    # Find the start and end of the context in the tokenized input.
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # If the answer is not fully inside the context, label it (0, 0).
    if offsets[context_start][0] > answer_start or offsets[context_end][1] < answer_end:
        return (0, 0)
    else:
        # Otherwise, find the first and last tokens of the answer.
        idx = context_start
        while idx <= context_end and offsets[idx][0] <= answer_start:
            idx += 1
        start_position = idx - 1

        idx = context_end
        while idx >= context_start and offsets[idx][1] >= answer_end:
            idx -= 1
        end_position = idx + 1

        return start_position, end_position
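As a quick sanity check, here is the function on a hand-made input; the offsets and sequence IDs below are made up for illustration rather than produced by a real tokenizer:

sequence_ids = [None, 0, 0, None, 1, 1, 1, 1, None]  # tokens 4-7 are the context
offsets = [(0, 0), (0, 3), (4, 8), (0, 0), (0, 5), (6, 10), (11, 15), (16, 20), (0, 0)]

# An answer spanning characters 6 to 15 of the context covers tokens 5 and 6.
print(find_labels(offsets, 6, 15, sequence_ids))  # (5, 6)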
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",
        padding="max_length",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )

    offset_mapping = inputs["offset_mapping"]
    sample_map = inputs.pop("overflow_to_sample_mapping")
    inputs["start_positions"] = []
    inputs["end_positions"] = []
    inputs["example_id"] = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        inputs["example_id"].append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        # Keep only the offsets of context tokens, to ease post-processing later.
        offset_mapping[i] = [(o if s == 1 else None) for o, s in zip(offset, sequence_ids)]

        start, end = find_labels(
            offset, examples["answer_start"][sample_idx], examples["answer_end"][sample_idx], sequence_ids
        )
        inputs["start_positions"].append(start)
        inputs["end_positions"].append(end)

    return inputs
from datasets import load_dataset
from transformers import AutoTokenizer
model_checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
raw_datasets = load_dataset("squad")
raw_datasets = raw_datasets.remove_columns(["title"])
def prepare_data(example):
    answer = example["answers"]["text"][0]
    example["answer_start"] = example["answers"]["answer_start"][0]
    example["answer_end"] = example["answer_start"] + len(answer)
    return example
validation_set = raw_datasets["validation"].map(prepare_data, remove_columns=["answers"])
validation_features = validation_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=validation_set.column_names,
)
len(validation_set), len(validation_features)  # more features than examples: long contexts are split into several features
from transformers import TFAutoModelForQuestionAnswering
tf_validation_set = validation_features.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    batch_size=16,
    shuffle=False,
)
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
predictions = model.predict(tf_validation_set)
start_logits = predictions.start_logits
end_logits = predictions.end_logits
import collections
example_to_feature = collections.defaultdict(list)
# Map each example to the list of feature indices it generated.
for idx, feature in enumerate(validation_features):
    example_id = feature["example_id"]
    example_to_feature[example_id].append(idx)
The score of a candidate span is the probability of its start position times the probability of its end position; since the softmax normalization terms are the same for every pair, ranking spans by this product is equivalent to ranking them by the sum of the two logits:

score[start_pos, end_pos] = start_probabilities[start_pos] * end_probabilities[end_pos]
logit_score[start_pos, end_pos] = start_logits[start_pos] + end_logits[end_pos]
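Not part of the video, but as a minimal NumPy sketch of that formula: the whole matrix of pair scores for one feature can be built with broadcasting (the helper name pair_logit_scores is made up here; invalid pairs such as an end before a start still need masking, which the loops below handle directly):

import numpy as np

def pair_logit_scores(start_logits_1d, end_logits_1d):
    # scores[i, j] = start_logits_1d[i] + end_logits_1d[j]
    return start_logits_1d[:, None] + end_logits_1d[None, :]

scores = pair_logit_scores(np.array([1.0, 3.0, 2.0]), np.array([0.5, 2.5, 1.5]))
print(scores.shape)  # (3, 3)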
import numpy as np
start_logit = start_logits[0]
end_logit = end_logits[0]
offsets = validation_features[0]["offset_mapping"]
context = validation_set[0]["context"]
start_indexes = np.argsort(start_logit)[-1 : -21 : -1].tolist()
end_indexes = np.argsort(end_logit)[-1 : -21 : -1].tolist()
answers = []
for start_index in start_indexes:
    for end_index in end_indexes:
        # Predicting (0, 0) means no answer.
        if start_index == 0 and end_index == 0:
            answers.append({"text": "", "logit_score": start_logit[start_index] + end_logit[end_index]})
        # Skip answers that are not fully in the context.
        elif offsets[start_index] is None or offsets[end_index] is None:
            continue
        # Skip answers with a length that is either < 0 or > max_answer_length.
        elif end_index < start_index or end_index - start_index + 1 > 30:
            continue
        else:
            answers.append({
                "text": context[offsets[start_index][0] : offsets[end_index][1]],
                "logit_score": start_logit[start_index] + end_logit[end_index],
            })
predicted_answer = max(answers, key=lambda x: x["logit_score"])
print(f"Predicted answer: {predicted_answer}")
answer_start = validation_set[0]["answer_start"]
answer_end = validation_set[0]["answer_end"]
right_answer = context[answer_start:answer_end]
print(f"Theoretical answer: {right_answer}")
from tqdm.auto import tqdm

predicted_answers = {}
for example in tqdm(validation_set):
    example_id = example["id"]
    context = example["context"]
    answers = []
    for feature_index in example_to_feature[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = validation_features[feature_index]["offset_mapping"]
        start_indexes = np.argsort(start_logit)[-1 : -11 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -11 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Predicting (0, 0) means no answer.
                if start_index == 0 and end_index == 0:
                    answers.append({"text": "", "logit_score": start_logit[start_index] + end_logit[end_index]})
                # Skip answers that are not fully in the context.
                elif offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                elif end_index < start_index or end_index - start_index + 1 > 30:
                    continue
                else:
                    answers.append({
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    })
    best_answer = max(answers, key=lambda x: x["logit_score"])
    predicted_answers[example_id] = best_answer["text"]
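The video stops here, but as a quick sanity check, predicted_answers is already close to the format the SQuAD metric expects; a minimal sketch with the evaluate library (assuming it is installed, and using the untouched raw_datasets["validation"] split, which still has its "answers" column, for the references):

import evaluate

metric = evaluate.load("squad")
formatted_predictions = [
    {"id": example_id, "prediction_text": text} for example_id, text in predicted_answers.items()
]
references = [
    {"id": ex["id"], "answers": ex["answers"]} for ex in raw_datasets["validation"]
]
print(metric.compute(predictions=formatted_predictions, references=references))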