This notebook contains the code samples from the video below, which is part of the Hugging Face course.
#@title
# Embed the companion course video in the notebook output.
from IPython.display import HTML

video_iframe = (
    '<iframe width="560" height="315" '
    'src="https://www.youtube.com/embed/iY2AZYdZAr0?rel=0&controls=0&showinfo=0" '
    'frameborder="0" allowfullscreen></iframe>'
)
HTML(video_iframe)
Install the Transformers and Datasets libraries to run this notebook.
! pip install datasets transformers[sentencepiece]
# Load the CoNLL-2003 NER dataset and reshape it for token classification.
from datasets import load_dataset

raw_datasets = load_dataset("conll2003")

# Drop the annotations this notebook does not use and rename the remaining
# columns to the names the rest of the code expects ("labels" / "words").
raw_datasets = (
    raw_datasets
    .remove_columns(["chunk_tags", "id", "pos_tags"])
    .rename_column("ner_tags", "labels")
    .rename_column("tokens", "words")
)

raw_datasets["train"]

# Inspect the first training example: a list of words with one label per word.
print(raw_datasets["train"][0]["words"])
print(raw_datasets["train"][0]["labels"])

# Human-readable names for the integer label ids.
label_names = raw_datasets["train"].features["labels"].feature.names
label_names
from transformers import AutoTokenizer

# Tokenizer matching the model we will fine-tune.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenize one example; is_split_into_words=True tells the tokenizer the input
# is already a list of words, not a single raw string.
sample_words = raw_datasets["train"][0]["words"]
inputs = tokenizer(sample_words, is_split_into_words=True)
inputs.tokens()
def shift_label(label):
    """Turn a B-XXX label id into its matching I-XXX id.

    In this label scheme B-XXX ids are odd and the corresponding I-XXX id is
    the next integer, so odd ids are bumped by one; every other id (O and
    I-XXX, which are even) is returned unchanged.
    """
    return label + 1 if label % 2 == 1 else label
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Special tokens (word_id of None) receive -100 so they are ignored by the
    loss. The first token of each word keeps that word's label; any following
    token of the same word gets the I-XXX form of a B-XXX label (odd id
    bumped to the next even id).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token ([CLS], [SEP], ...): mask it out of the loss.
            aligned.append(-100)
            continue
        label = labels[word_id]
        if word_id == previous_word and label % 2 == 1:
            # Continuation token of the same word: B-XXX becomes I-XXX.
            label += 1
        previous_word = word_id
        aligned.append(label)
    return aligned
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and realign their labels.

    examples is a batch dict with "words" (lists of words) and "labels"
    (word-level label ids). Returns the tokenized batch with a "labels" key
    holding one label per token, produced by align_labels_with_tokens.
    """
    tokenized_inputs = tokenizer(
        examples["words"], truncation=True, is_split_into_words=True
    )
    # word_ids(i) maps each token of example i back to its source word index.
    tokenized_inputs["labels"] = [
        align_labels_with_tokens(labels, tokenized_inputs.word_ids(i))
        for i, labels in enumerate(examples["labels"])
    ]
    return tokenized_inputs
# Apply the tokenization + label alignment to every split; batched=True passes
# examples to tokenize_and_align_labels in batches so the fast tokenizer can
# process many sentences per call.
tokenized_datasets = raw_datasets.map(tokenize_and_align_labels, batched=True)
from transformers import DataCollatorForTokenClassification
# Collator that pads both the token ids and the per-token "labels" lists to a
# common length within each batch (presumably padding labels with -100 so the
# padded positions are ignored by the loss — see the transformers docs).
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)