Notebook

In [1]:

from IPython.display import display, HTML
display(HTML('<style>.container { width:100% !important; }</style>'))

In [2]:

!nvidia-smi

Sun Feb 25 13:20:57 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0 Off |                  Off |
|  0%   36C    P8              31W / 450W |   8684MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
+---------------------------------------------------------------------------------------+

In [3]:

!pip -q uninstall transformers -y
!pip -q install transformers[sentencepiece]
!pip -q install transformers
!pip -q install accelerate -U

WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv

In [4]:

!pip -q install pandas matplotlib torch nltk tqdm transformers datasets transformers[sentencepiece]

WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv

In [5]:

# Importing
import pandas as pd
import matplotlib.pyplot as plt
import torch
import nltk

from nltk.tokenize import sent_tokenize
from tqdm import tqdm
nltk.download('punkt')

# model download
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset, load_from_disk, load_metric

# finetuning
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2024-02-25 13:21:18.076170: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-25 13:21:18.076240: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-25 13:21:18.077366: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-25 13:21:18.084780: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-25 13:21:19.164485: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT

In [6]:

device='gpu' if torch.cuda.is_available() else 'cpu'
print(device)

gpu

In [7]:

model='google/pegasus-cnn_dailymail'
tokenizer=AutoTokenizer.from_pretrained(model)
model_pegasus=AutoModelForSeq2SeqLM.from_pretrained(model).to('cuda')

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

In [8]:

dataset=load_dataset('samsum')
dataset

Out[8]:

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [9]:

# train dataset
train = dataset['train']

# dialogues in train dataset
dialogue = train['dialogue']

# summary in train dataset
summary = train['summary']

print('Fetch first dialogue and summary')
print(dialogue[0], summary[0])

Fetch first dialogue and summary
Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-) Amanda baked cookies and will bring Jerry some tomorrow.

In [10]:

def convert_examples_to_features(data_in_batch):
    # Tokenize the dialogues in the batch
    input_encoding = tokenizer(data_in_batch['dialogue'],
                               max_length=1024,
                               truncation=True)

    # Tokenize the summaries in the batch
    target_encoding = tokenizer(data_in_batch['summary'],
                                max_length=128,
                                truncation=True)

    # Return a dictionary containing input and target tokenized sequences
    return {
        'input_ids': input_encoding['input_ids'],  # Input token IDs for dialogues
        'attention_mask': input_encoding['attention_mask'],  # Attention mask for dialogue inputs
        'labels': target_encoding['input_ids']  # Target token IDs for summaries (used as labels)
    }

In [11]:

dataset_en = dataset.map(convert_examples_to_features, batched = True)

In [12]:

# Display the first example from the training dataset
display(
    dataset_en['train'],  # Display the training dataset
    dataset_en['train']['input_ids'][0],  # Display the input token IDs of the first example
    dataset_en['train']['attention_mask'][0],  # Display the attention mask of the first example
    dataset_en['train']['labels'][0]  # Display the target token IDs (labels) of the first example
)

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 14732
})

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

[12195, 7091, 3659, 111, 138, 650, 10508, 181, 3469, 107, 1]

In [13]:

# reference: https://huggingface.co/docs/transformers/v4.38.1/en/main_classes/trainer#transformers.Seq2SeqTrainingArguments
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args=TrainingArguments(
    output_dir='pegasus-samsum',
    num_train_epochs=1,
    warmup_steps=500,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=1e6,
    gradient_accumulation_steps=16
)
trainer=Trainer(model=model_pegasus,
                args=training_args,
                tokenizer=tokenizer,
                data_collator=seq2seq_data_collator,
                train_dataset=dataset_en['train'],
                eval_dataset=dataset_en['validation'])

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.

[920/920 23:22, Epoch 0/1]

Step	Training Loss	Validation Loss
500	1.665300	1.482481

Out[13]:

TrainOutput(global_step=920, training_loss=1.8287915426751842, metrics={'train_runtime': 1404.8123, 'train_samples_per_second': 10.487, 'train_steps_per_second': 0.655, 'total_flos': 5528248038285312.0, 'train_loss': 1.8287915426751842, 'epoch': 1.0})

In [14]:

# Save the Pegasus model
model_pegasus.save_pretrained('pegasus-samsum-model')

# Save the tokenizer used with the Pegasus model
tokenizer.save_pretrained('samsum-tokenizer')

Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.
Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}

Out[14]:

('samsum-tokenizer/tokenizer_config.json',
 'samsum-tokenizer/special_tokens_map.json',
 'samsum-tokenizer/spiece.model',
 'samsum-tokenizer/added_tokens.json',
 'samsum-tokenizer/tokenizer.json')

In [15]:

# Load tokenizer and model from the pretrained directories
tokenizer = AutoTokenizer.from_pretrained('samsum-tokenizer')
model = AutoModelForSeq2SeqLM.from_pretrained('pegasus-samsum-model')

# Create a pipeline object for summarization using the loaded model and tokenizer
pipeline_obj = pipeline('summarization', model=model, tokenizer=tokenizer)

# Define a sample text for summarization
sample_text_for_text = dataset['train']['dialogue'][2]
actual_summary = dataset['train']['summary'][2]
print(f'Sample text: {sample_text_for_text}')

Sample text: Tim: Hi, what's up?
Kim: Bad mood tbh, I was going to do lots of stuff but ended up procrastinating
Tim: What did you plan on doing?
Kim: Oh you know, uni stuff and unfucking my room
Kim: Maybe tomorrow I'll move my ass and do everything
Kim: We were going to defrost a fridge so instead of shopping I'll eat some defrosted veggies
Tim: For doing stuff I recommend Pomodoro technique where u use breaks for doing chores
Tim: It really helps
Kim: thanks, maybe I'll do that
Tim: I also like using post-its in kaban style

In [20]:

print(f'Actual summary: {actual_summary}')

Actual summary: Kim may try the pomodoro technique recommended by Tim to get more stuff done.

In [16]:

# Generate a summary using the pipeline object
gen_kwargs = {'length_penalty': 0.8, 'num_beams':8, 'max_length': 128}
display(pipeline_obj(sample_text_for_text, **gen_kwargs))
prediction = pipeline_obj(sample_text_for_text, **gen_kwargs)[0]['summary_text']

[{'summary_text': "Kim was going to do lots of stuff but ended up procrastinating. She'll move her ass tomorrow and do everything. Tim recommends Pomodoro technique for doing chores."}]

In [21]:

# Evaluation
# evalation of model
# Evaluation

def generate_batch_sized_chunks(list_of_elements, batch_size):
    '''split the dataset into smaller batches that we can process simultaneously
    Yield successive batch-sized chunks from list_of_elements.'''
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]



def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text='article',
                               column_summary='highlights'):
    # Divide input text data and target summaries into batches
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    # Iterate over batches and track progress using tqdm
    for article_batch, target_batch in tqdm(zip(article_batches, target_batches), total=len(article_batches)):

        # Tokenize the input text data for model input
        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                           padding='max_length', return_tensors='pt')

        # Generate summaries using the model
        summaries = model.generate(input_ids=inputs['input_ids'].to('cuda'),
                                   attention_mask=inputs['attention_mask'].to('cuda'),
                                   length_penalty=0.8, num_beams=8, max_length=128)

        ''' Parameter for length penalty ensures that the model
        does not generate sequences that are too long. '''

        # Decode the generated summaries for evaluation
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                              clean_up_tokenization_spaces=True)
                            for s in summaries]

        decoded_summaries = [d.replace('', ' ') for d in decoded_summaries]

        # Add the generated summaries and references to the metric for evaluation
        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    #  Finally compute and return the ROUGE scores.
    score = metric.compute()
    return score

In [22]:

rouge_names=['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
rouge_metric = load_metric('rouge')

In [23]:

score = calculate_metric_on_test_ds(dataset['test'][0:10],
                                    rouge_metric,
                                    trainer.model,
                                    tokenizer,
                                    batch_size = 2,
                                    column_text = 'dialogue',
                                    column_summary = 'summary'
                                    )

rouge_dict = dict( (rn, score[rn].mid.fmeasure) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = [f'pegasus'] )

100%|██████████| 5/5 [00:03<00:00,  1.30it/s]

Out[23]:

	rouge1	rouge2	rougeL	rougeLsum
pegasus	0.0247	0.0	0.024573	0.024587