Install the Transformers, Datasets, and Evaluate libraries to run this notebook.
!pip install datasets evaluate transformers[sentencepiece]
!apt install git-lfs
You will need to set up Git; adapt your email and name in the following cell.
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"
You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.
from huggingface_hub import notebook_login
notebook_login()
from datasets import load_dataset
raw_datasets = load_dataset("conll2003")
raw_datasets
DatasetDict({
    train: Dataset({
        features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['chunk_tags', 'id', 'ner_tags', 'pos_tags', 'tokens'],
        num_rows: 3453
    })
})
raw_datasets["train"][0]["tokens"]
['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
raw_datasets["train"][0]["ner_tags"]
[3, 0, 7, 0, 0, 0, 7, 0, 0]
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature
Sequence(feature=ClassLabel(num_classes=9, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], names_file=None, id=None), length=-1, id=None)
label_names = ner_feature.feature.names
label_names
['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
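If you prefer not to index into label_names by hand, the ClassLabel feature can do the conversion for you; this is just a quick check on the same first training example.
# Convert the integer tags of the first example back to their string names
ner_feature.feature.int2str(raw_datasets["train"][0]["ner_tags"])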
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)
print(line1)
print(line2)
'EU    rejects German call to boycott British lamb .'
'B-ORG O       B-MISC O    O  O       B-MISC  O    O'
from transformers import AutoTokenizer
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.is_fast
True
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()
['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']
inputs.word_ids()
[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]
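To make the mapping explicit, you can print each token next to the index of the word it came from; this is just a small inspection cell reusing the inputs from above.
# Special tokens map to None, and both pieces of "lamb" map to word index 7
for token, word_id in zip(inputs.tokens(), inputs.word_ids()):
    print(f"{token:10} -> {word_id}")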
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as the previous token
            label = labels[word_id]
            # If the label is B-XXX, we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))
[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]
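A quick sanity check you may want to run: the aligned labels should have exactly one entry per token, special tokens included.
# One label per token, [CLS] and [SEP] included
assert len(align_labels_with_tokens(labels, word_ids)) == len(inputs.tokens())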
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
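It can be worth peeking at the processed dataset to confirm that only the model inputs and the new labels column remain (the original CoNLL columns were dropped by remove_columns).
# The tokenizer outputs plus "labels"; the original CoNLL columns are gone
print(tokenized_datasets["train"].column_names)
print(tokenized_datasets["train"][0]["labels"])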
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, return_tensors="tf"
)
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]
<tf.Tensor: shape=(2, 12), dtype=int64, numpy=
array([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
       [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])>
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]
[-100, 1, 2, -100]
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)
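If you want to double-check what the collator produces once batched, you can pull a single batch out of the tf.data.Dataset; the exact sequence length will vary, since each batch is padded to its own longest sample.
# Each element is a dict of tensors with shape (batch_size, sequence_length)
for batch in tf_train_dataset.take(1):
    print({key: value.shape for key, value in batch.items()})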
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}
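These two dictionaries are simply inverses of each other; a quick look at one entry makes the convention clear.
# id2label maps integer ids to tag names, label2id goes the other way
print(id2label[3], label2id["B-ORG"])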
from transformers import TFAutoModelForTokenClassification
model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)
model.config.num_labels
9
from huggingface_hub import notebook_login
notebook_login()
from transformers import create_optimizer
import tensorflow as tf
# Train in mixed-precision float16
# Comment this line out if you're using a GPU that will not benefit from this
tf.keras.mixed_precision.set_global_policy("mixed_float16")
# The number of training steps is the number of samples in the dataset, divided by the batch size
# and then multiplied by the total number of epochs. Note that tf_train_dataset here is a batched
# tf.data.Dataset, not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_epochs = 3
num_train_steps = len(tf_train_dataset) * num_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
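If you're curious, the schedule returned by create_optimizer is a standard Keras learning-rate schedule, so you can query it at a few steps to watch the linear decay; this is just a quick inspection, assuming no warmup steps were requested as above.
# The learning rate decays linearly from 2e-5 towards 0 over the course of training
print(float(schedule(0)), float(schedule(num_train_steps // 2)), float(schedule(num_train_steps - 1)))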
model.compile(optimizer=optimizer)
from transformers.keras_callbacks import PushToHubCallback
callback = PushToHubCallback(output_dir="bert-finetuned-ner", tokenizer=tokenizer)
model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    callbacks=[callback],
    epochs=num_epochs,
)
!pip install seqeval
import evaluate
metric = evaluate.load("seqeval")
labels = raw_datasets["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])
{'MISC': {'precision': 1.0, 'recall': 0.5, 'f1': 0.67, 'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.67,
 'overall_f1': 0.8,
 'overall_accuracy': 0.89}
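In practice you often only track the aggregate figures; a small helper like the one below (the name is just illustrative) pulls them out of the seqeval output.
# Keep just the overall metrics from the per-entity dictionary returned by seqeval
def overall_metrics(results):
    return {key: results[f"overall_{key}"] for key in ["precision", "recall", "f1", "accuracy"]}

overall_metrics(metric.compute(predictions=[predictions], references=[labels]))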
import numpy as np
all_predictions = []
all_labels = []
for batch in tf_eval_dataset:
    logits = model.predict(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(label_names[predicted_idx])
            all_labels.append(label_names[label_idx])
metric.compute(predictions=[all_predictions], references=[all_labels])
{'LOC': {'precision': 0.91, 'recall': 0.92, 'f1': 0.91, 'number': 1668},
 'MISC': {'precision': 0.70, 'recall': 0.79, 'f1': 0.74, 'number': 702},
 'ORG': {'precision': 0.85, 'recall': 0.90, 'f1': 0.88, 'number': 1661},
 'PER': {'precision': 0.95, 'recall': 0.95, 'f1': 0.95, 'number': 1617},
 'overall_precision': 0.87,
 'overall_recall': 0.91,
 'overall_f1': 0.89,
 'overall_accuracy': 0.97}
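If you plan to re-run this evaluation later (for instance on the test split once it is converted to a tf.data.Dataset), it is convenient to wrap the loop above in a function; this sketch reuses exactly the same logic, under a hypothetical name.
def evaluate_token_classification(dataset):
    all_predictions, all_labels = [], []
    for batch in dataset:
        logits = model.predict(batch)["logits"]
        predictions = np.argmax(logits, axis=-1)
        for prediction, label in zip(predictions, batch["labels"]):
            for predicted_idx, label_idx in zip(prediction, label):
                # -100 marks special tokens and padding, which we ignore
                if label_idx == -100:
                    continue
                all_predictions.append(label_names[predicted_idx])
                all_labels.append(label_names[label_idx])
    return metric.compute(predictions=[all_predictions], references=[all_labels])

# For example: evaluate_token_classification(tf_eval_dataset)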
from transformers import pipeline
# Replace this with your own checkpoint
model_checkpoint = "huggingface-course/bert-finetuned-ner"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")
[{'entity_group': 'PER', 'score': 0.9988506, 'word': 'Sylvain', 'start': 11, 'end': 18},
 {'entity_group': 'ORG', 'score': 0.9647625, 'word': 'Hugging Face', 'start': 33, 'end': 45},
 {'entity_group': 'LOC', 'score': 0.9986118, 'word': 'Brooklyn', 'start': 49, 'end': 57}]
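If you want to see how individual tokens are labeled before they are grouped into entities, you can build the pipeline without an aggregation strategy (the default), which returns one prediction per token instead of one per entity.
# With no aggregation, each token keeps its own B-/I- tag and score
raw_token_classifier = pipeline("token-classification", model=model_checkpoint)
raw_token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")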