!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"

from huggingface_hub import notebook_login

notebook_login()

from datasets import load_dataset

spanish_dataset = load_dataset("amazon_reviews_multi", "es")
english_dataset = load_dataset("amazon_reviews_multi", "en")
english_dataset


def show_samples(dataset, num_samples=3, seed=42):
    sample = dataset["train"].shuffle(seed=seed).select(range(num_samples))
    for example in sample:
        print(f"\n'>> Title: {example['review_title']}'")
        print(f"'>> Review: {example['review_body']}'")


show_samples(english_dataset)

english_dataset.set_format("pandas")
english_df = english_dataset["train"][:]
# Show counts for top 20 products
english_df["product_category"].value_counts()[:20]


def filter_books(example):
    return (
        example["product_category"] == "book"
        or example["product_category"] == "digital_ebook_purchase"
    )


english_dataset.reset_format()

spanish_books = spanish_dataset.filter(filter_books)
english_books = english_dataset.filter(filter_books)
show_samples(english_books)

from datasets import concatenate_datasets, DatasetDict

books_dataset = DatasetDict()

for split in english_books.keys():
    books_dataset[split] = concatenate_datasets(
        [english_books[split], spanish_books[split]]
    )
    books_dataset[split] = books_dataset[split].shuffle(seed=42)

# Peek at a few examples
show_samples(books_dataset)

# Keep only reviews whose titles are longer than two words
books_dataset = books_dataset.filter(lambda x: len(x["review_title"].split()) > 2)

from transformers import AutoTokenizer

model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

inputs = tokenizer("I loved reading the Hunger Games!")
inputs

tokenizer.convert_ids_to_tokens(inputs.input_ids)

max_input_length = 512
max_target_length = 30


def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["review_body"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["review_title"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_datasets = books_dataset.map(preprocess_function, batched=True)

generated_summary = "I absolutely loved reading the Hunger Games"
reference_summary = "I loved reading the Hunger Games"

!pip install rouge_score

import evaluate

rouge_score = evaluate.load("rouge")

scores = rouge_score.compute(
    predictions=[generated_summary], references=[reference_summary]
)
scores

# evaluate's ROUGE implementation returns the aggregated F-measure as a float
scores["rouge1"]

!pip install nltk

import nltk

nltk.download("punkt")

from nltk.tokenize import sent_tokenize


def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])


print(three_sentence_summary(books_dataset["train"][1]["review_body"]))


def evaluate_baseline(dataset, metric):
    summaries = [three_sentence_summary(text) for text in dataset["review_body"]]
    return metric.compute(predictions=summaries, references=dataset["review_title"])


import pandas as pd

score = evaluate_baseline(books_dataset["validation"], rouge_score)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
# Scores are already F-measures, so just rescale them to percentages
rouge_dict = dict((rn, round(score[rn] * 100, 2)) for rn in rouge_names)
rouge_dict

from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
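# Optional sanity check (my addition, not in the original notebook): decode one
# preprocessed example to confirm that the truncated review body and the title
# labels line up as expected before any training starts.
sample = tokenized_datasets["train"][0]
print(tokenizer.decode(sample["input_ids"], skip_special_tokens=True)[:200])
print(tokenizer.decode(sample["labels"], skip_special_tokens=True))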
from huggingface_hub import notebook_login

notebook_login()

from transformers import Seq2SeqTrainingArguments

batch_size = 8
num_train_epochs = 8
# Show the training loss with every epoch
logging_steps = len(tokenized_datasets["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-amazon-en-es",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=True,
)

import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = [
        "\n".join(sent_tokenize(label.strip())) for label in decoded_labels
    ]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # evaluate's ROUGE returns the F-measures as floats, so just rescale them
    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}


from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

tokenized_datasets = tokenized_datasets.remove_columns(
    books_dataset["train"].column_names
)

features = [tokenized_datasets["train"][i] for i in range(2)]
data_collator(features)

from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.evaluate()

tokenized_datasets.set_format("torch")

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

from torch.utils.data import DataLoader

batch_size = 8
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=batch_size
)

from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

from transformers import get_scheduler

num_train_epochs = 10
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # ROUGE expects a newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels
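# Optional smoke test (my addition, not part of the original notebook): generate
# from a single prepared batch to confirm that the data collator, the accelerator
# device placement, and the generate() call all work before running the full
# training loop. max_target_length reuses the value defined during preprocessing.
import torch

batch = next(iter(eval_dataloader))
with torch.no_grad():
    sample_ids = accelerator.unwrap_model(model).generate(
        batch["input_ids"],
        attention_mask=batch["attention_mask"],
        max_length=max_target_length,
    )
# The model has not been fine-tuned yet at this point, so the decoded output
# will be poor; this only checks that the plumbing runs end to end.
print(tokenizer.batch_decode(sample_ids, skip_special_tokens=True))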
from huggingface_hub import get_full_repo_name

model_name = "mt5-small-finetuned-amazon-en-es-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

from huggingface_hub import Repository

output_dir = "results-mt5-finetuned-amazon-en-es-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

from tqdm.auto import tqdm
import torch
import numpy as np

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )

            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )
            # If we did not pad to max length, we need to pad the labels too
            labels = accelerator.pad_across_processes(
                batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
            )

            generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
            labels = accelerator.gather(labels).cpu().numpy()

            # Replace -100 in the labels as we can't decode them
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            if isinstance(generated_tokens, tuple):
                generated_tokens = generated_tokens[0]
            decoded_preds = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            decoded_preds, decoded_labels = postprocess_text(
                decoded_preds, decoded_labels
            )

            rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Compute metrics
    result = rouge_score.compute()
    # evaluate's ROUGE returns the F-measures as floats, so just rescale them
    result = {key: value * 100 for key, value in result.items()}
    result = {k: round(v, 4) for k, v in result.items()}
    print(f"Epoch {epoch}:", result)

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

from transformers import pipeline

hub_model_id = "huggingface-course/mt5-small-finetuned-amazon-en-es"
summarizer = pipeline("summarization", model=hub_model_id)


def print_summary(idx):
    review = books_dataset["test"][idx]["review_body"]
    title = books_dataset["test"][idx]["review_title"]
    summary = summarizer(review)[0]["summary_text"]
    print(f"'>>> Review: {review}'")
    print(f"\n'>>> Title: {title}'")
    print(f"\n'>>> Summary: {summary}'")


print_summary(100)

print_summary(0)
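# A small hedged extension (my addition, not in the original notebook): score the
# Hub model's summaries against the reference titles on a handful of test
# examples; the slice size of 8 is arbitrary.
test_slice = books_dataset["test"].select(range(8))
preds = [summarizer(text)[0]["summary_text"] for text in test_slice["review_body"]]
print(rouge_score.compute(predictions=preds, references=test_slice["review_title"]))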