This notebook contains the code samples from the video below, which is part of the Hugging Face course.
#@title
# Embed the companion course video in the notebook output.
from IPython.display import HTML

video_iframe = (
    '<iframe width="560" height="315" '
    'src="https://www.youtube.com/embed/iY2AZYdZAr0?rel=0&controls=0&showinfo=0" '
    'frameborder="0" allowfullscreen></iframe>'
)
HTML(video_iframe)
Install the Transformers and Datasets libraries to run this notebook.
! pip install datasets transformers[sentencepiece]
# Load the CoNLL-2003 NER dataset and reshape it for token classification.
from datasets import load_dataset

raw_datasets = load_dataset("conll2003")

# Drop the annotations this notebook does not use and rename the remaining
# columns to the names the rest of the code expects ("labels" / "words").
raw_datasets = (
    raw_datasets
    .remove_columns(["chunk_tags", "id", "pos_tags"])
    .rename_column("ner_tags", "labels")
    .rename_column("tokens", "words")
)

raw_datasets["train"]

# Inspect the first training example: a list of words with one label per word.
print(raw_datasets["train"][0]["words"])
print(raw_datasets["train"][0]["labels"])

# Human-readable names for the integer label ids.
label_names = raw_datasets["train"].features["labels"].feature.names
label_names
from transformers import AutoTokenizer

# Tokenizer matching the model we will fine-tune.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenize one example; is_split_into_words=True tells the tokenizer the input
# is already a list of words, not a single raw string.
sample_words = raw_datasets["train"][0]["words"]
inputs = tokenizer(sample_words, is_split_into_words=True)
inputs.tokens()
def shift_label(label):
    """Turn a B-XXX label id into its matching I-XXX id.

    In this label scheme B-XXX ids are odd and the corresponding I-XXX id is
    the next integer, so odd ids are bumped by one; every other id (O and
    I-XXX, which are even) is returned unchanged.
    """
    return label + 1 if label % 2 == 1 else label
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Special tokens (word_id of None) receive -100 so they are ignored by the
    loss. The first token of each word keeps that word's label; any following
    token of the same word gets the I-XXX form of a B-XXX label (odd id
    bumped to the next even id).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token ([CLS], [SEP], ...): mask it out of the loss.
            aligned.append(-100)
            continue
        label = labels[word_id]
        if word_id == previous_word and label % 2 == 1:
            # Continuation token of the same word: B-XXX becomes I-XXX.
            label += 1
        previous_word = word_id
        aligned.append(label)
    return aligned
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and realign their labels.

    examples is a batch dict with "words" (lists of words) and "labels"
    (word-level label ids). Returns the tokenized batch with a "labels" key
    holding one label per token, produced by align_labels_with_tokens.
    """
    tokenized_inputs = tokenizer(
        examples["words"], truncation=True, is_split_into_words=True
    )
    # word_ids(i) maps each token of example i back to its source word index.
    tokenized_inputs["labels"] = [
        align_labels_with_tokens(labels, tokenized_inputs.word_ids(i))
        for i, labels in enumerate(examples["labels"])
    ]
    return tokenized_inputs
# Apply the tokenization + label alignment to every split; batched=True passes
# examples to tokenize_and_align_labels in batches so the fast tokenizer can
# process many sentences per call.
tokenized_datasets = raw_datasets.map(tokenize_and_align_labels, batched=True)
from transformers import DataCollatorForTokenClassification
# Collator that pads both the token ids and the per-token "labels" lists to a
# common length within each batch (presumably padding labels with -100 so the
# padded positions are ignored by the loss — see the transformers docs).
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)