This notebook gathers the code samples from the video below, which is part of the Hugging Face course.
from IPython.display import HTML
HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/L-WSwUWde1U?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
Install the Transformers and Datasets libraries to run this notebook.
! pip install datasets transformers[sentencepiece]
import numpy as np

from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
raw_datasets = load_dataset("glue", "mnli")
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def preprocess_function(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
args = TrainingArguments(
    "distilbert-finetuned-mnli",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)
metric = load_metric("glue", "mnli")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)
trainer = Trainer(
    model,
    args,
    train_dataset=raw_datasets["train"],  # bug: these are the raw, untokenized datasets
    eval_dataset=raw_datasets["validation_matched"],
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.train_dataset[0]
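The call to trainer.train() fails because the Trainer was given the raw datasets: the first training example only contains the original text columns, none of the tensor inputs the model expects. A minimal sketch of that check (the printed keys follow the GLUE MNLI schema):

first_example = trainer.train_dataset[0]
print(first_example.keys())  # dict_keys(['premise', 'hypothesis', 'label', 'idx'])
# No "input_ids" or "attention_mask": the model has nothing to consume,
# so the fix below is to pass tokenized_datasets instead of raw_datasets.
print("input_ids" in first_example)  # False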
import numpy as np

from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
raw_datasets = load_dataset("glue", "mnli")
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def preprocess_function(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
args = TrainingArguments(
    "distilbert-finetuned-mnli",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)
metric = load_metric("glue", "mnli")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],  # fixed: the tokenized datasets
    eval_dataset=tokenized_datasets["validation_matched"],
    compute_metrics=compute_metrics,
)
trainer.train()
for batch in trainer.get_train_dataloader():
    break
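Training still fails, this time while building the first batch: without a tokenizer (or an explicit data collator), the Trainer falls back to a default collator that plainly stacks tensors, and the tokenized sequences all have different lengths. A minimal sketch of the diagnosis and the fix, assuming the column names produced above:

print([len(ids) for ids in tokenized_datasets["train"][:8]["input_ids"]])
# varying lengths, so plain stacking cannot work

# DataCollatorWithPadding pads each batch to a common length; passing
# tokenizer=tokenizer to the Trainer (next cell) makes it use this collator.
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
samples = tokenized_datasets["train"].remove_columns(["premise", "hypothesis", "idx"])
batch = data_collator([samples[i] for i in range(8)])
print({k: v.shape for k, v in batch.items()})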
import numpy as np

from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
raw_datasets = load_dataset("glue", "mnli")
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def preprocess_function(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
args = TrainingArguments(
    "distilbert-finetuned-mnli",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)
metric = load_metric("glue", "mnli")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation_matched"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,  # fixed: the Trainer now pads each batch properly
)
trainer.train()
for batch in trainer.get_train_dataloader():
    break
outputs = trainer.model.cpu()(**batch)
model.num_labels
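Fetching a batch now works, but the manual forward pass on CPU raises an index error on the labels: the model head was created with the default two labels, while MNLI has three classes. Checking both sides makes the mismatch obvious (the class names follow the GLUE MNLI label feature):

print(model.num_labels)  # 2
print(raw_datasets["train"].features["label"])
# ClassLabel(names=['entailment', 'neutral', 'contradiction'], ...)
# A label of 2 is out of bounds for a two-class head, hence the error;
# the fix is to load the model with num_labels=3, as in the next cell.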
import numpy as np

from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
raw_datasets = load_dataset("glue", "mnli")
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def preprocess_function(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=3  # fixed: MNLI has three classes
)
args = TrainingArguments(
    "distilbert-finetuned-mnli",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)
metric = load_metric("glue", "mnli")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation_matched"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()
for batch in trainer.get_train_dataloader():
    break
# Run one full training step by hand: forward pass, loss, backward pass,
# and optimizer step, to check nothing else breaks before a long training run.
outputs = trainer.model.cpu()(**batch)
loss = outputs.loss
loss.backward()
trainer.create_optimizer()
trainer.optimizer.step()
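With the training step validated, the evaluation side can be checked the same way before launching a full run: feed the logits and labels from the batch at hand through compute_metrics. A minimal sketch using the objects defined above:

predictions = outputs.logits.detach().cpu().numpy()
labels = batch["labels"].cpu().numpy()
print(compute_metrics((predictions, labels)))  # e.g. {'accuracy': ...}

If this runs, trainer.evaluate() and the full trainer.train() should go through without new surprises.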