This notebook gathers the code samples from the video below, which is part of the Hugging Face course.
#@title
from IPython.display import HTML
# Wrap an iframe pointing at the YouTube embed URL in an HTML display object.
# NOTE(review): presumably this relies on being the last expression of a
# notebook cell to be rendered — confirm if the cell layout changes.
HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/Zh0FfmVrKX0?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
Install the Transformers and Datasets libraries to run this notebook.
! pip install datasets transformers[sentencepiece]
from datasets import load_dataset, load_metric
# Load the "cola" configuration of the "glue" dataset.
raw_datasets = load_dataset("glue", "cola")
# Bare expression: shows the loaded object when run as a notebook cell.
raw_datasets
from transformers import AutoTokenizer
# Checkpoint name shared by the tokenizer here and the model loaded later,
# so both come from the same pretrained weights.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def preprocess_function(examples):
    """Tokenize the ``"sentence"`` entry of ``examples``.

    Uses the module-level ``tokenizer`` with ``truncation=True``.

    Args:
        examples: Mapping that provides a ``"sentence"`` entry. It is passed
            to ``map(..., batched=True)`` elsewhere in this file, so this is
            presumably a batch of sentences — confirm against the caller.

    Returns:
        Whatever ``tokenizer`` returns for those sentences.
    """
    return tokenizer(examples["sentence"], truncation=True)
# Apply preprocess_function over raw_datasets; batched=True means the function
# is called with batches rather than single examples.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
from transformers import AutoModelForSequenceClassification
# Load the model from the same checkpoint name that was used for the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
import numpy as np
# load_metric is already imported earlier in this file; the repeated import is
# harmless and keeps this cell runnable on its own.
# NOTE(review): load_metric is reported as deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version before upgrading.
from datasets import load_metric
# Metric object for the "cola" configuration of "glue"; used by compute_metrics.
metric = load_metric("glue", "cola")
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` is
            reduced with ``argmax`` over its last axis, so it is assumed to
            hold one score per class along that axis — confirm against the
            caller.

    Returns:
        The result of ``metric.compute`` for the predicted class indices
        against ``labels``.
    """
    predictions, labels = eval_pred
    # Index of the highest score along the last axis is the predicted class.
    predictions = np.argmax(predictions, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
from transformers import TrainingArguments
# Training configuration: evaluate and checkpoint once per epoch, and upload
# to the Hub under the output directory's name.
args = TrainingArguments(
    output_dir="bert-fine-tuned-cola",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=True,
)
from transformers import Trainer
# Wire the model, training configuration, tokenized splits, metric callback
# and tokenizer into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# Run fine-tuning with the configuration given to the Trainer.
trainer.train()
# Upload through the Trainer. NOTE(review): the string is presumably the
# commit message — confirm against the Trainer API.
trainer.push_to_hub("End of training")
# Push the model and the tokenizer directly to the named repository.
# NOTE(review): with push_to_hub=True in the training arguments this may
# duplicate what the Trainer upload already did — confirm before relying on it.
repo_name = "bert-fine-tuned-cola"
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)
# Class names of the "label" feature of the training split.
label_names = raw_datasets["train"].features["label"].names
# Bare expression: shows the names when run as a notebook cell.
label_names
# Ids are assigned by position in label_names. Both mappings use integer ids:
# the config documents id2label as int -> str and label2id as str -> int, and
# string keys would make in-session lookups such as id2label[0] fail.
model.config.id2label = dict(enumerate(label_names))
model.config.label2id = {name: idx for idx, name in enumerate(label_names)}
# Re-declared so this step can run on its own, then upload the updated config.
repo_name = "bert-fine-tuned-cola"
model.config.push_to_hub(repo_name)