!pip install datasets evaluate transformers[sentencepiece]

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
encoding = tokenizer(example)
print(type(encoding))

print(tokenizer.is_fast)
print(encoding.is_fast)

print(encoding.tokens())
print(encoding.word_ids())

start, end = encoding.word_to_chars(3)
print(example[start:end])  # "Sylvain": word 3 spans characters 11 to 18

from transformers import pipeline

token_classifier = pipeline("token-classification")
print(token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn."))

token_classifier = pipeline("token-classification", aggregation_strategy="simple")
print(token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn."))

from transformers import AutoTokenizer, TFAutoModelForTokenClassification

model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForTokenClassification.from_pretrained(model_checkpoint)

example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
inputs = tokenizer(example, return_tensors="tf")
outputs = model(**inputs)

print(inputs["input_ids"].shape)  # (1, 19): one sequence of 19 tokens
print(outputs.logits.shape)  # (1, 19, 9): one logit per token for each of the 9 labels

import tensorflow as tf

probabilities = tf.math.softmax(outputs.logits, axis=-1)[0]
probabilities = probabilities.numpy().tolist()
predictions = tf.math.argmax(outputs.logits, axis=-1)[0]
predictions = predictions.numpy().tolist()
print(predictions)

print(model.config.id2label)

results = []
tokens = inputs.tokens()

for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label != "O":
        results.append(
            {"entity": label, "score": probabilities[idx][pred], "word": tokens[idx]}
        )

print(results)

inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
print(inputs_with_offsets["offset_mapping"])

print(example[12:14])  # "yl": the characters spanned by the "##yl" token

results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]

for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label != "O":
        start, end = offsets[idx]
        results.append(
            {
                "entity": label,
                "score": probabilities[idx][pred],
                "word": tokens[idx],
                "start": start,
                "end": end,
            }
        )

print(results)

print(example[33:45])  # "Hugging Face"

import numpy as np

results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]

idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label = model.config.id2label[pred]
    if label != "O":
        # Remove the B- or I-
        label = label[2:]
        start, _ = offsets[idx]

        # Grab all the tokens labeled with I-label. This checkpoint uses the
        # IOB1 convention: every token of an entity, including the first, is
        # tagged I-XXX, and B-XXX only appears to separate two adjacent
        # entities of the same type.
        all_scores = []
        while (
            idx < len(predictions)
            and model.config.id2label[predictions[idx]] == f"I-{label}"
        ):
            # Index with each token's own prediction, not the first token's
            all_scores.append(probabilities[idx][predictions[idx]])
            _, end = offsets[idx]
            idx += 1

        # The score is the mean of all the scores of the tokens in that grouped entity
        score = np.mean(all_scores).item()
        word = example[start:end]
        results.append(
            {
                "entity_group": label,
                "score": score,
                "word": word,
                "start": start,
                "end": end,
            }
        )
    idx += 1

print(results)
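
# Sanity check (a sketch, not part of the original course code): the grouped
# entities built by hand above should line up with what the pipeline returns
# with aggregation_strategy="simple" -- same entity groups and character spans.
# Scores may differ slightly depending on how the probabilities are averaged.
pipeline_results = token_classifier(example)
for ours, theirs in zip(results, pipeline_results):
    assert ours["entity_group"] == theirs["entity_group"]
    assert (ours["start"], ours["end"]) == (theirs["start"], theirs["end"])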
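
# A small illustration of why the character offsets matter (an addition, not
# from the course): they let us mark up entities directly in the original,
# untokenized string. Walking through the entities from the end of the string
# backwards keeps the earlier offsets valid while we insert text.
highlighted = example
for entity in sorted(results, key=lambda e: e["start"], reverse=True):
    span = highlighted[entity["start"] : entity["end"]]
    highlighted = (
        highlighted[: entity["start"]]
        + f"[{span} | {entity['entity_group']}]"
        + highlighted[entity["end"] :]
    )
print(highlighted)
# My name is [Sylvain | PER] and I work at [Hugging Face | ORG] in [Brooklyn | LOC].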
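
# Another use of the fast tokenizer's mappings (a sketch, not from the course):
# instead of grouping tokens by entity, collapse the predictions to one label
# per word with word_ids(), keeping the label of each word's first token, then
# recover the character span with word_to_chars().
word_results = []
seen_words = set()
for idx, word_id in enumerate(inputs_with_offsets.word_ids()):
    if word_id is None or word_id in seen_words:
        continue  # skip special tokens and non-initial subword tokens
    seen_words.add(word_id)
    label = model.config.id2label[predictions[idx]]
    if label != "O":
        span = inputs_with_offsets.word_to_chars(word_id)
        word_results.append(
            {
                "entity": label,
                "word": example[span.start : span.end],
                "start": span.start,
                "end": span.end,
            }
        )
print(word_results)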