This notebook contains the code samples from the video below, which is part of the Hugging Face course.
#@title
from IPython.display import HTML
HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/0E7ltQB7fM8?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
Install the Transformers and Datasets libraries to run this notebook.
! pip install datasets transformers[sentencepiece]
from transformers import pipeline
token_classifier = pipeline("token-classification")
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")
token_classifier = pipeline("token-classification", aggregation_strategy="simple")
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")
from transformers import AutoTokenizer, AutoModelForTokenClassification
model_checkpoint = ""
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
inputs = tokenizer(example, return_tensors="pt")
outputs = model(**inputs)
print(inputs["input_ids"].shape)
print(outputs.logits.shape)
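The logits have shape (batch size, sequence length, number of labels), so each token gets one score per label. A quick way to confirm the label count (using the config of the model loaded above):
print(model.config.num_labels)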
import torch
probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()
# Take the argmax over the label dimension to get one predicted label id per token
predictions = outputs.logits.argmax(dim=-1)[0].tolist()
print(predictions)
model.config.id2label
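To see the predicted label for each token rather than raw ids, you can map the predictions through id2label (a small sketch reusing the predictions list computed above):
print([model.config.id2label[pred] for pred in predictions])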
results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]
for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label != "O":
        start, end = offsets[idx]
        results.append(
            {"entity": label, "score": probabilities[idx][pred],
             "word": tokens[idx], "start": start, "end": end}
        )
print(results)
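Because the offsets come straight from the offset mapping, each start/end pair indexes back into the original sentence. A quick spot-check that the spans line up (using the results just printed; note that subword tokens may carry a ## prefix not present in the raw text):
for result in results:
    print(result["word"], "->", example[result["start"]:result["end"]])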
import numpy as np

label_map = model.config.id2label
results = []
idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label = label_map[pred]
    if label != "O":
        # Remove the B- or I- prefix
        label = label[2:]
        start, _ = offsets[idx]
        # Grab all the tokens labeled with I-label
        all_scores = []
        while idx < len(predictions) and label_map[predictions[idx]] == f"I-{label}":
            all_scores.append(probabilities[idx][pred])
            _, end = offsets[idx]
            idx += 1
        # The score is the mean of all the scores of the tokens in that grouped entity
        score = np.mean(all_scores).item()
        word = example[start:end]
        results.append(
            {"entity_group": label, "score": score,
             "word": word, "start": start, "end": end}
        )
    idx += 1
print(results)
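These grouped results should line up with what the pipeline returned with aggregation_strategy="simple" at the top of the notebook; as a sanity check, you can run the aggregated pipeline on the same sentence and compare (assuming token_classifier still holds the aggregated pipeline defined earlier):
print(token_classifier(example))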