Install the 🤗 Datasets and 🤗 Transformers libraries to run this notebook.
!pip install datasets transformers[sentencepiece]
!pip install accelerate
# To run training on a TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs
You will need to configure Git; adapt your email and name in the following cell.
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"
You will also need to be logged in to the Hugging Face Hub. Run the following cell and enter your credentials.
from huggingface_hub import notebook_login
notebook_login()
from datasets import load_dataset
french_dataset = load_dataset("amazon_reviews_multi", "fr")
english_dataset = load_dataset("amazon_reviews_multi", "en")
french_dataset
def show_samples(dataset, num_samples=3, seed=42):
sample = dataset["train"].shuffle(seed=seed).select(range(num_samples))
for example in sample:
print(f"\n'>> Title: {example['review_title']}'")
print(f"'>> Review: {example['review_body']}'")
show_samples(french_dataset)
french_dataset.set_format("pandas")
french_df = french_dataset["train"][:]
# Show the counts for the top 20 products
french_df["product_category"].value_counts()[:20]
def filter_books(example):
return (
example["product_category"] == "book"
or example["product_category"] == "digital_ebook_purchase"
)
french_dataset.reset_format()
french_books = french_dataset.filter(filter_books)
english_books = english_dataset.filter(filter_books)
show_samples(french_books)
from datasets import concatenate_datasets, DatasetDict
books_dataset = DatasetDict()
for split in english_books.keys():
books_dataset[split] = concatenate_datasets(
[english_books[split], french_books[split]]
)
books_dataset[split] = books_dataset[split].shuffle(seed=42)
# A few examples
show_samples(books_dataset)
books_dataset = books_dataset.filter(lambda x: len(x["review_title"].split()) > 2)
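As a quick sanity check (this cell is an addition, not part of the original notebook), you can count how many examples survive the title-length filter:
# Number of remaining examples per split (sketch)
{split: len(ds) for split, ds in books_dataset.items()}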
from transformers import AutoTokenizer
model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
inputs = tokenizer("J'ai adoré lire les Hunger Games !")
inputs
tokenizer.convert_ids_to_tokens(inputs.input_ids)
max_input_length = 512
max_target_length = 30
def preprocess_function(examples):
model_inputs = tokenizer(
examples["review_body"], max_length=max_input_length, truncation=True
)
    # Set up the tokenizer for the targets
with tokenizer.as_target_tokenizer():
labels = tokenizer(
examples["review_title"], max_length=max_target_length, truncation=True
)
model_inputs["labels"] = labels["input_ids"]
return model_inputs
tokenized_datasets = books_dataset.map(preprocess_function, batched=True)
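To double-check the preprocessing, here is a small sketch (assuming the tokenized_datasets produced above) that decodes the first training example back to text:
sample = tokenized_datasets["train"][0]
# The inputs should be the (possibly truncated) review body...
print(tokenizer.decode(sample["input_ids"], skip_special_tokens=True))
# ...and the labels should be the review title
print(tokenizer.decode(sample["labels"], skip_special_tokens=True))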
generated_summary = "J'ai absolument adoré lire les Hunger Games"
reference_summary = "J'ai adoré lire les Hunger Games"
!pip install rouge_score
from datasets import load_metric
rouge_score = load_metric("rouge")
scores = rouge_score.compute(
predictions=[generated_summary], references=[reference_summary]
)
scores
scores["rouge1"].mid
!pip install nltk
import nltk
nltk.download("punkt")
from nltk.tokenize import sent_tokenize
def three_sentence_summary(text):
return "\n".join(sent_tokenize(text)[:3])
print(three_sentence_summary(books_dataset["train"][1]["review_body"]))
def evaluate_baseline(dataset, metric):
summaries = [three_sentence_summary(text) for text in dataset["review_body"]]
return metric.compute(predictions=summaries, references=dataset["review_title"])
import pandas as pd
score = evaluate_baseline(books_dataset["validation"], rouge_score)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict((rn, round(score[rn].mid.fmeasure * 100, 2)) for rn in rouge_names)
rouge_dict
from transformers import AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
from transformers import Seq2SeqTrainingArguments
batch_size = 8
num_train_epochs = 8
# Show the training loss at every epoch
logging_steps = len(tokenized_datasets["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
output_dir=f"{model_name}-finetuned-amazon-en-fr",
evaluation_strategy="epoch",
learning_rate=5.6e-5,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
weight_decay=0.01,
save_total_limit=3,
num_train_epochs=num_train_epochs,
predict_with_generate=True,
logging_steps=logging_steps,
push_to_hub=True,
)
import numpy as np
def compute_metrics(eval_pred):
predictions, labels = eval_pred
    # Decode generated summaries into text
decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
result = rouge_score.compute(
predictions=decoded_preds, references=decoded_labels, use_stemmer=True
)
    # Extract the median scores
result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
return {k: round(v, 4) for k, v in result.items()}
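As a quick check with toy inputs (hypothetical examples, not from the original notebook), you can call compute_metrics directly on a pair of token-id arrays:
fake_preds = np.array([tokenizer("J'ai adoré lire les Hunger Games").input_ids])
fake_labels = np.array([tokenizer("J'ai absolument adoré lire les Hunger Games").input_ids])
compute_metrics((fake_preds, fake_labels))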
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
tokenized_datasets = tokenized_datasets.remove_columns(
books_dataset["train"].column_names
)
features = [tokenized_datasets["train"][i] for i in range(2)]
data_collator(features)
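If you only want to inspect the shapes of the padded batch returned by the collator, a quick sketch:
batch = data_collator(features)
{k: v.shape for k, v in batch.items()}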
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
model,
args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
data_collator=data_collator,
tokenizer=tokenizer,
compute_metrics=compute_metrics,
)
trainer.train()
trainer.evaluate()
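Because push_to_hub=True was set in the training arguments, you can upload the final model, tokenizer, and a model card once training is finished; a minimal sketch:
trainer.push_to_hub(commit_message="Training complete", tags="summarization")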
tokenized_datasets.set_format("torch")
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
from torch.utils.data import DataLoader
batch_size = 8
train_dataloader = DataLoader(
tokenized_datasets["train"],
shuffle=True,
collate_fn=data_collator,
batch_size=batch_size,
)
eval_dataloader = DataLoader(
tokenized_datasets["validation"], collate_fn=data_collator, batch_size=batch_size
)
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=2e-5)
from accelerate import Accelerator
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader
)
from transformers import get_scheduler
num_train_epochs = 10
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
"linear",
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=num_training_steps,
)
def postprocess_text(preds, labels):
preds = [pred.strip() for pred in preds]
labels = [label.strip() for label in labels]
    # ROUGE expects a newline after each sentence
preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
return preds, labels
from huggingface_hub import get_full_repo_name
model_name = "test-bert-finetuned-squad-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name
from huggingface_hub import Repository
output_dir = "results-mt5-finetuned-squad-accelerate"
repo = Repository(output_dir, clone_from=repo_name)
from tqdm.auto import tqdm
import torch
import numpy as np
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
    # Training
model.train()
for step, batch in enumerate(train_dataloader):
outputs = model(**batch)
loss = outputs.loss
accelerator.backward(loss)
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.update(1)
# Evaluation
model.eval()
for step, batch in enumerate(eval_dataloader):
with torch.no_grad():
generated_tokens = accelerator.unwrap_model(model).generate(
batch["input_ids"],
attention_mask=batch["attention_mask"],
)
generated_tokens = accelerator.pad_across_processes(
generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
)
labels = batch["labels"]
        # If we did not pad to max length, we need to pad the labels too
labels = accelerator.pad_across_processes(
batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
)
generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
labels = accelerator.gather(labels).cpu().numpy()
        # Replace -100 in the labels as we can't decode them
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
if isinstance(generated_tokens, tuple):
generated_tokens = generated_tokens[0]
decoded_preds = tokenizer.batch_decode(
generated_tokens, skip_special_tokens=True
)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
decoded_preds, decoded_labels = postprocess_text(
decoded_preds, decoded_labels
)
rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)
    # Compute the metrics
result = rouge_score.compute()
    # Extract the median ROUGE scores
result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
result = {k: round(v, 4) for k, v in result.items()}
print(f"Epoch {epoch}:", result)
    # Save and upload
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(output_dir)
repo.push_to_hub(
commit_message=f"Training in progress epoch {epoch}", blocking=False
)
from transformers import pipeline
hub_model_id = "huggingface-course/mt5-small-finetuned-amazon-en-fr"
summarizer = pipeline("summarization", model=hub_model_id)
def print_summary(idx):
review = books_dataset["test"][idx]["review_body"]
title = books_dataset["test"][idx]["review_title"]
summary = summarizer(books_dataset["test"][idx]["review_body"])[0]["summary_text"]
print(f"'>>> Review: {review}'")
print(f"\n'>>> Title: {title}'")
print(f"\n'>>> Summary: {summary}'")
print_summary(100)
print_summary(0)
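The summarization pipeline also accepts standard generation arguments, so you can nudge the length of the generated titles; a sketch with hypothetical values:
summarizer(books_dataset["test"][0]["review_body"], max_length=30, min_length=5)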