from IPython.display import display, HTML
display(HTML('<style>.container { width:100% !important; }</style>'))
!nvidia-smi
Sun Feb 25 13:20:57 2024 +---------------------------------------------------------------------------------------+ | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 | |-----------------------------------------+----------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+======================+======================| | 0 NVIDIA GeForce RTX 4090 On | 00000000:01:00.0 Off | Off | | 0% 36C P8 31W / 450W | 8684MiB / 24564MiB | 0% Default | | | | N/A | +-----------------------------------------+----------------------+----------------------+ +---------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=======================================================================================| +---------------------------------------------------------------------------------------+
!pip -q uninstall transformers -y
!pip -q install transformers[sentencepiece]
!pip -q install transformers
!pip -q install accelerate -U
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
!pip -q install pandas matplotlib torch nltk tqdm transformers datasets transformers[sentencepiece]
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
# Importing
import pandas as pd
import matplotlib.pyplot as plt
import torch
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
nltk.download('punkt')
# model download
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset, load_from_disk, load_metric
# finetuning
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq
import warnings
warnings.filterwarnings('ignore')
[nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Package punkt is already up-to-date! 2024-02-25 13:21:18.076170: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered 2024-02-25 13:21:18.076240: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered 2024-02-25 13:21:18.077366: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered 2024-02-25 13:21:18.084780: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. 2024-02-25 13:21:19.164485: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
device='gpu' if torch.cuda.is_available() else 'cpu'
print(device)
gpu
model='google/pegasus-cnn_dailymail'
tokenizer=AutoTokenizer.from_pretrained(model)
model_pegasus=AutoModelForSeq2SeqLM.from_pretrained(model).to('cuda')
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataset=load_dataset('samsum')
dataset
DatasetDict({ train: Dataset({ features: ['id', 'dialogue', 'summary'], num_rows: 14732 }) test: Dataset({ features: ['id', 'dialogue', 'summary'], num_rows: 819 }) validation: Dataset({ features: ['id', 'dialogue', 'summary'], num_rows: 818 }) })
# train dataset
train = dataset['train']
# dialogues in train dataset
dialogue = train['dialogue']
# summary in train dataset
summary = train['summary']
print('Fetch first dialogue and summary')
print(dialogue[0], summary[0])
Fetch first dialogue and summary Amanda: I baked cookies. Do you want some? Jerry: Sure! Amanda: I'll bring you tomorrow :-) Amanda baked cookies and will bring Jerry some tomorrow.
def convert_examples_to_features(data_in_batch):
# Tokenize the dialogues in the batch
input_encoding = tokenizer(data_in_batch['dialogue'],
max_length=1024,
truncation=True)
# Tokenize the summaries in the batch
target_encoding = tokenizer(data_in_batch['summary'],
max_length=128,
truncation=True)
# Return a dictionary containing input and target tokenized sequences
return {
'input_ids': input_encoding['input_ids'], # Input token IDs for dialogues
'attention_mask': input_encoding['attention_mask'], # Attention mask for dialogue inputs
'labels': target_encoding['input_ids'] # Target token IDs for summaries (used as labels)
}
dataset_en = dataset.map(convert_examples_to_features, batched = True)
# Display the first example from the training dataset
display(
dataset_en['train'], # Display the training dataset
dataset_en['train']['input_ids'][0], # Display the input token IDs of the first example
dataset_en['train']['attention_mask'][0], # Display the attention mask of the first example
dataset_en['train']['labels'][0] # Display the target token IDs (labels) of the first example
)
Dataset({ features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'], num_rows: 14732 })
[12195, 151, 125, 7091, 3659, 107, 842, 119, 245, 181, 152, 10508, 151, 7435, 147, 12195, 151, 125, 131, 267, 650, 119, 3469, 29344, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[12195, 7091, 3659, 111, 138, 650, 10508, 181, 3469, 107, 1]
# reference: https://huggingface.co/docs/transformers/v4.38.1/en/main_classes/trainer#transformers.Seq2SeqTrainingArguments
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
training_args=TrainingArguments(
output_dir='pegasus-samsum',
num_train_epochs=1,
warmup_steps=500,
per_device_train_batch_size=1,
per_device_eval_batch_size=1,
weight_decay=0.01,
logging_steps=10,
evaluation_strategy='steps',
eval_steps=500,
save_steps=1e6,
gradient_accumulation_steps=16
)
trainer=Trainer(model=model_pegasus,
args=training_args,
tokenizer=tokenizer,
data_collator=seq2seq_data_collator,
train_dataset=dataset_en['train'],
eval_dataset=dataset_en['validation'])
trainer.train()
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Step | Training Loss | Validation Loss |
---|---|---|
500 | 1.665300 | 1.482481 |
TrainOutput(global_step=920, training_loss=1.8287915426751842, metrics={'train_runtime': 1404.8123, 'train_samples_per_second': 10.487, 'train_steps_per_second': 0.655, 'total_flos': 5528248038285312.0, 'train_loss': 1.8287915426751842, 'epoch': 1.0})
# Save the Pegasus model
model_pegasus.save_pretrained('pegasus-samsum-model')
# Save the tokenizer used with the Pegasus model
tokenizer.save_pretrained('samsum-tokenizer')
Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41. Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}
('samsum-tokenizer/tokenizer_config.json', 'samsum-tokenizer/special_tokens_map.json', 'samsum-tokenizer/spiece.model', 'samsum-tokenizer/added_tokens.json', 'samsum-tokenizer/tokenizer.json')
# Load tokenizer and model from the pretrained directories
tokenizer = AutoTokenizer.from_pretrained('samsum-tokenizer')
model = AutoModelForSeq2SeqLM.from_pretrained('pegasus-samsum-model')
# Create a pipeline object for summarization using the loaded model and tokenizer
pipeline_obj = pipeline('summarization', model=model, tokenizer=tokenizer)
# Define a sample text for summarization
sample_text_for_text = dataset['train']['dialogue'][2]
actual_summary = dataset['train']['summary'][2]
print(f'Sample text: {sample_text_for_text}')
Sample text: Tim: Hi, what's up? Kim: Bad mood tbh, I was going to do lots of stuff but ended up procrastinating Tim: What did you plan on doing? Kim: Oh you know, uni stuff and unfucking my room Kim: Maybe tomorrow I'll move my ass and do everything Kim: We were going to defrost a fridge so instead of shopping I'll eat some defrosted veggies Tim: For doing stuff I recommend Pomodoro technique where u use breaks for doing chores Tim: It really helps Kim: thanks, maybe I'll do that Tim: I also like using post-its in kaban style
print(f'Actual summary: {actual_summary}')
Actual summary: Kim may try the pomodoro technique recommended by Tim to get more stuff done.
# Generate a summary using the pipeline object
gen_kwargs = {'length_penalty': 0.8, 'num_beams':8, 'max_length': 128}
display(pipeline_obj(sample_text_for_text, **gen_kwargs))
prediction = pipeline_obj(sample_text_for_text, **gen_kwargs)[0]['summary_text']
[{'summary_text': "Kim was going to do lots of stuff but ended up procrastinating. She'll move her ass tomorrow and do everything. Tim recommends Pomodoro technique for doing chores."}]
# Evaluation
# evalation of model
# Evaluation
def generate_batch_sized_chunks(list_of_elements, batch_size):
'''split the dataset into smaller batches that we can process simultaneously
Yield successive batch-sized chunks from list_of_elements.'''
for i in range(0, len(list_of_elements), batch_size):
yield list_of_elements[i : i + batch_size]
def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
batch_size=16, device=device,
column_text='article',
column_summary='highlights'):
# Divide input text data and target summaries into batches
article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))
# Iterate over batches and track progress using tqdm
for article_batch, target_batch in tqdm(zip(article_batches, target_batches), total=len(article_batches)):
# Tokenize the input text data for model input
inputs = tokenizer(article_batch, max_length=1024, truncation=True,
padding='max_length', return_tensors='pt')
# Generate summaries using the model
summaries = model.generate(input_ids=inputs['input_ids'].to('cuda'),
attention_mask=inputs['attention_mask'].to('cuda'),
length_penalty=0.8, num_beams=8, max_length=128)
''' Parameter for length penalty ensures that the model
does not generate sequences that are too long. '''
# Decode the generated summaries for evaluation
decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
clean_up_tokenization_spaces=True)
for s in summaries]
decoded_summaries = [d.replace('', ' ') for d in decoded_summaries]
# Add the generated summaries and references to the metric for evaluation
metric.add_batch(predictions=decoded_summaries, references=target_batch)
# Finally compute and return the ROUGE scores.
score = metric.compute()
return score
rouge_names=['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
rouge_metric = load_metric('rouge')
score = calculate_metric_on_test_ds(dataset['test'][0:10],
rouge_metric,
trainer.model,
tokenizer,
batch_size = 2,
column_text = 'dialogue',
column_summary = 'summary'
)
rouge_dict = dict( (rn, score[rn].mid.fmeasure) for rn in rouge_names )
pd.DataFrame(rouge_dict, index = [f'pegasus'] )
100%|██████████| 5/5 [00:03<00:00, 1.30it/s]
rouge1 | rouge2 | rougeL | rougeLsum | |
---|---|---|---|---|
pegasus | 0.0247 | 0.0 | 0.024573 | 0.024587 |