#!/usr/bin/env python
# coding: utf-8

# # Fine-tuning a model with the Trainer API

# Install the 🤗 Transformers and 🤗 Datasets libraries to run this notebook.

# In[ ]:


get_ipython().system('pip install datasets transformers[sentencepiece]')


# In[ ]:


from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Load the French split of the PAWS-X paraphrase dataset and the CamemBERT tokenizer
raw_datasets = load_dataset("paws-x", "fr")
checkpoint = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    # Tokenize the two sentences as a pair; padding is applied later, per batch, by the collator
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


# In[ ]:


from transformers import TrainingArguments

# Only the output directory is required; all other hyperparameters keep their defaults
training_args = TrainingArguments("test-trainer")


# In[ ]:


from transformers import AutoModelForSequenceClassification

# Two labels: paraphrase or not
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


# In[ ]:


from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)


# In[ ]:


trainer.train()


# Beware: one epoch takes 12h!

# In[ ]:


predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)


# In[ ]:


import numpy as np

# Convert logits to predicted class indices
preds = np.argmax(predictions.predictions, axis=-1)


# In[ ]:


from datasets import load_metric

# The GLUE MRPC metric reports accuracy and F1, which also suit this binary paraphrase task
metric = load_metric("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)


# In[ ]:


def compute_metrics(eval_preds):
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


# In[ ]:


# Rebuild the Trainer with per-epoch evaluation and the compute_metrics function
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
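

# The rebuilt `Trainer` above is never launched in this notebook. As a minimal
# sketch (an addition, not part of the original cells), relaunching training now
# logs accuracy and F1 at the end of every epoch, and `trainer.evaluate()` runs a
# standalone pass over the validation set that returns the same metrics.

# In[ ]:


# trainer.train()          # reports metrics each epoch via compute_metrics
# trainer.evaluate()       # returns a dict such as {"eval_accuracy": ..., "eval_f1": ...}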
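

# Note: `datasets.load_metric` has since been deprecated in recent versions of
# 🤗 Datasets in favour of the 🤗 Evaluate library. A minimal equivalent sketch,
# assuming `evaluate` is installed (`pip install evaluate`):

# In[ ]:


# import evaluate
#
# metric = evaluate.load("glue", "mrpc")
# metric.compute(predictions=preds, references=predictions.label_ids)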