Since this chapter is about debugging, the language of the examples matters little here; what we care about is the logic of the code, so we can understand where the error comes from.
Install the 🤗 Transformers and 🤗 Datasets libraries to run this notebook.
!pip install datasets transformers[sentencepiece]
from datasets import load_dataset, load_metric
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer,
)
raw_datasets = load_dataset("glue", "mnli")
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def preprocess_function(examples):
return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
args = TrainingArguments(
f"distilbert-finetuned-mnli",
evaluation_strategy="epoch",
save_strategy="epoch",
learning_rate=2e-5,
num_train_epochs=3,
weight_decay=0.01,
)
metric = load_metric("glue", "mnli")
def compute_metrics(eval_pred):
predictions, labels = eval_pred
return metric.compute(predictions=predictions, references=labels)
trainer = Trainer(
model,
args,
train_dataset=raw_datasets["train"],
eval_dataset=raw_datasets["validation_matched"],
compute_metrics=compute_metrics,
)
trainer.train()
trainer.train_dataset[0]
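Inspecting the first training example is the quickest way to spot the problem: the Trainer was handed raw_datasets, so the example still contains the raw premise and hypothesis strings and no input_ids. A minimal sketch of that sanity check:

# A tokenized example should expose an input_ids field; the raw MNLI example
# only has premise, hypothesis, label and idx.
sample = trainer.train_dataset[0]
print(sorted(sample.keys()))
print("tokenized:", "input_ids" in sample)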
from datasets import load_dataset, load_metric
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer,
)
raw_datasets = load_dataset("glue", "mnli")
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def preprocess_function(examples):
return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
args = TrainingArguments(
f"distilbert-finetuned-mnli",
evaluation_strategy="epoch",
save_strategy="epoch",
learning_rate=2e-5,
num_train_epochs=3,
weight_decay=0.01,
)
metric = load_metric("glue", "mnli")
def compute_metrics(eval_pred):
predictions, labels = eval_pred
return metric.compute(predictions=predictions, references=labels)
trainer = Trainer(
model,
args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation_matched"],
compute_metrics=compute_metrics,
)
trainer.train()
tokenizer.decode(trainer.train_dataset[0]["input_ids"])
trainer.train_dataset[0].keys()
type(trainer.model)
trainer.train_dataset[0]["attention_mask"]
len(trainer.train_dataset[0]["attention_mask"]) == len(
trainer.train_dataset[0]["input_ids"]
)
trainer.train_dataset[0]["label"]
trainer.train_dataset.features["label"].names
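These spot checks can be bundled into a small helper that decodes one example, shows the raw text it came from, and prints the human-readable label. A minimal sketch, built only on the objects defined above:

def spot_check(index=0):
    # Decode the tokenized inputs and print the raw text and label next to them.
    example = trainer.train_dataset[index]
    raw = raw_datasets["train"][index]
    label_names = trainer.train_dataset.features["label"].names
    print(tokenizer.decode(example["input_ids"]))
    print(raw["premise"], "|", raw["hypothesis"])
    print("label:", example["label"], "->", label_names[example["label"]])

spot_check()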
for batch in trainer.get_train_dataloader():
break
data_collator = trainer.get_train_dataloader().collate_fn
data_collator
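Printing the collate function shows the Trainer fell back to the padding-free default collator, because neither a data_collator nor a tokenizer was passed to it. A minimal sketch of how to confirm that, assuming the Trainer version used here keeps the collator unwrapped:

from transformers import default_data_collator

# Without a tokenizer, the Trainer defaults to default_data_collator,
# which does no padding and will choke on variable-length input_ids.
print(data_collator is default_data_collator)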
from datasets import load_dataset, load_metric
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
TrainingArguments,
Trainer,
)
raw_datasets = load_dataset("glue", "mnli")
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def preprocess_function(examples):
return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
args = TrainingArguments(
f"distilbert-finetuned-mnli",
evaluation_strategy="epoch",
save_strategy="epoch",
learning_rate=2e-5,
num_train_epochs=3,
weight_decay=0.01,
)
metric = load_metric("glue", "mnli")
def compute_metrics(eval_pred):
predictions, labels = eval_pred
return metric.compute(predictions=predictions, references=labels)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
model,
args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation_matched"],
compute_metrics=compute_metrics,
data_collator=data_collator,
tokenizer=tokenizer,
)
trainer.train()
data_collator = trainer.get_train_dataloader().collate_fn
batch = data_collator([trainer.train_dataset[i] for i in range(4)])
data_collator = trainer.get_train_dataloader().collate_fn
actual_train_set = trainer._remove_unused_columns(trainer.train_dataset)
batch = data_collator([actual_train_set[i] for i in range(4)])
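This time the batch builds without complaint; a quick, illustrative look at the tensor shapes confirms that the collator padded everything to the same length:

# All tensors share the batch dimension; input_ids and attention_mask
# also share the padded sequence length.
print({k: v.shape for k, v in batch.items()})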
for batch in trainer.get_train_dataloader():
break
outputs = trainer.model.cpu()(**batch)
trainer.model.config.num_labels
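The forward pass on CPU gives a readable stack trace that ends in the loss computation, and trainer.model.config.num_labels explains it: the classification head was built with 2 labels while MNLI has 3 classes. A minimal sketch that makes the mismatch explicit:

# The size of the classification head must match the number of classes in the data.
print("model labels:  ", trainer.model.config.num_labels)
print("dataset labels:", trainer.train_dataset.features["label"].num_classes)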
from datasets import load_dataset, load_metric
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
TrainingArguments,
Trainer,
)
raw_datasets = load_dataset("glue", "mnli")
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def preprocess_function(examples):
return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)
args = TrainingArguments(
f"distilbert-finetuned-mnli",
evaluation_strategy="epoch",
save_strategy="epoch",
learning_rate=2e-5,
num_train_epochs=3,
weight_decay=0.01,
)
metric = load_metric("glue", "mnli")
def compute_metrics(eval_pred):
predictions, labels = eval_pred
return metric.compute(predictions=predictions, references=labels)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
model,
args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation_matched"],
compute_metrics=compute_metrics,
data_collator=data_collator,
tokenizer=tokenizer,
)
for batch in trainer.get_train_dataloader():
break
outputs = trainer.model.cpu()(**batch)
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: v.to(device) for k, v in batch.items()}
outputs = trainer.model.to(device)(**batch)
loss = outputs.loss
loss.backward()
trainer.create_optimizer()
trainer.optimizer.step()
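With the forward pass, the backward pass, and one optimizer step all running on a real batch, the training mechanics are sound. If training still misbehaved at this point, a minimal sketch of a follow-up check would be to scan the gradients for non-finite values:

# Look for NaN or infinite gradients left by the backward pass.
for name, param in trainer.model.named_parameters():
    if param.grad is not None and not torch.isfinite(param.grad).all():
        print("non-finite gradient in", name)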
# This will take a long time and error out, so you shouldn't run this cell
trainer.train()
trainer.evaluate()
for batch in trainer.get_eval_dataloader():
break
batch = {k: v.to(device) for k, v in batch.items()}
with torch.no_grad():
outputs = trainer.model(**batch)
predictions = outputs.logits.cpu().numpy()
labels = batch["labels"].cpu().numpy()
compute_metrics((predictions, labels))
predictions.shape, labels.shape
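The shapes make the bug clear: metric.compute expects class indices, but it was given raw logits of shape (batch_size, 3). Converting the logits with an argmax, as sketched here, is exactly what the corrected compute_metrics below does:

# Logits -> predicted class indices, which is what the GLUE metric expects.
metric.compute(predictions=predictions.argmax(axis=-1), references=labels)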
import numpy as np
def compute_metrics(eval_pred):
predictions, labels = eval_pred
predictions = np.argmax(predictions, axis=1)
return metric.compute(predictions=predictions, references=labels)
compute_metrics((predictions, labels))
import numpy as np
from datasets import load_dataset, load_metric
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
TrainingArguments,
Trainer,
)
raw_datasets = load_dataset("glue", "mnli")
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def preprocess_function(examples):
return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)
args = TrainingArguments(
f"distilbert-finetuned-mnli",
evaluation_strategy="epoch",
save_strategy="epoch",
learning_rate=2e-5,
num_train_epochs=3,
weight_decay=0.01,
)
metric = load_metric("glue", "mnli")
def compute_metrics(eval_pred):
predictions, labels = eval_pred
predictions = np.argmax(predictions, axis=1)
return metric.compute(predictions=predictions, references=labels)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
model,
args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation_matched"],
compute_metrics=compute_metrics,
data_collator=data_collator,
tokenizer=tokenizer,
)
trainer.train()
for batch in trainer.get_train_dataloader():
break
batch = {k: v.to(device) for k, v in batch.items()}
trainer.create_optimizer()
for _ in range(20):
outputs = trainer.model(**batch)
loss = outputs.loss
loss.backward()
trainer.optimizer.step()
trainer.optimizer.zero_grad()
with torch.no_grad():
outputs = trainer.model(**batch)
preds = outputs.logits
labels = batch["labels"]
compute_metrics((preds.cpu().numpy(), labels.cpu().numpy()))
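If the whole pipeline is wired correctly, twenty optimization steps on a single batch should be enough for the model to memorize it, so the accuracy computed on that same batch should be at or very near 1.0. A minimal closing sketch, assuming the GLUE MNLI metric reports an "accuracy" key and using an arbitrary 0.95 threshold:

# On a memorized batch, accuracy should be essentially perfect.
results = compute_metrics((preds.cpu().numpy(), labels.cpu().numpy()))
assert results["accuracy"] > 0.95, "the model failed to overfit a single batch"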