!pip install datasets peft -qq
!pip install accelerate -qq
!pip install bitsandbytes -qq
from google.colab import userdata
from huggingface_hub import login
from datasets import load_dataset
import torch
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
DataCollatorForLanguageModeling,
Trainer,
BitsAndBytesConfig)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
# Define the model ID for the pre-trained model
model_id = "google/gemma-2b"
# Configure BitsAndBytes for 4-bit quantization
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16)
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Load the model with the specified quantization configuration and device mapping
base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
# Check if the tokenizer has a pad token defined
if tokenizer.pad_token is None:
    # If not, assign the end-of-sequence (eos) token as the pad token
    tokenizer.pad_token = tokenizer.eos_token
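# Optional sanity check (a sketch, not part of the original run): inspect the
# 4-bit model's memory footprint and confirm that its projection layers were
# replaced by bitsandbytes Linear4bit modules
print(f"Memory footprint: {base_model.get_memory_footprint() / 1e9:.2f} GB")
print(type(base_model.model.layers[0].self_attn.q_proj))  # expect a bitsandbytes Linear4bit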
# Print base model
print(base_model)
GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): GemmaRMSNorm()
  )
  (lm_head): Linear(in_features=2048, out_features=256000, bias=False)
)
# Print model configuration
base_model.config
GemmaConfig {
  "_name_or_path": "google/gemma-2b",
  "architectures": [
    "GemmaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "eos_token_id": 1,
  "head_dim": 256,
  "hidden_act": "gelu",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 16384,
  "max_position_embeddings": 8192,
  "model_type": "gemma",
  "num_attention_heads": 8,
  "num_hidden_layers": 18,
  "num_key_value_heads": 1,
  "pad_token_id": 0,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.0",
  "use_cache": true,
  "vocab_size": 256000
}
# Helper to report trainable parameters
def print_trainable_parameters(model):
    """
    Return a summary of trainable vs. total parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    return (
        f"Trainable model parameters: {trainable_params}\n"
        f"All model parameters: {all_param}\n"
        f"Percentage of trainable model parameters: {100 * trainable_params / all_param:.2f}%"
    )
print(print_trainable_parameters(base_model))
Trainable model parameters: 524363776
All model parameters: 1515268096
Percentage of trainable model parameters: 34.61%
# Test the base model before fine-tuning
text = "Quote: If you want to know what a man's like"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = base_model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
Quote: If you want to know what a man's like, take a look at how he treats his inferiors, for that's what he'll
# Load dataset
data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
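# Quick look at one tokenized example (a sketch; "quote" is the raw text field
# in Abirate/english_quotes and "input_ids" was added by the map above)
sample = data["train"][0]
print(sample["quote"])
print(sample["input_ids"][:10])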
# Configure Low-Rank Adaptation (LoRA) settings
lora_config = LoraConfig(
r=8, # Rank of the low-rank decomposition
lora_alpha=32, # Scaling factor for the LoRA updates
lora_dropout=0.05, # Dropout rate to apply to the LoRA updates
bias="none", # Specify how to handle biases in LoRA
target_modules=[ # List of module names to apply LoRA to
'q_proj', # Query projection in attention mechanism
'k_proj', # Key projection in attention mechanism
'v_proj', # Value projection in attention mechanism
'o_proj', # Output projection in attention mechanism
'gate_proj', # Gate projection
'up_proj', # Up projection in feed-forward network
'down_proj', # Down projection in feed-forward network
'lm_head', # Language model head
],
task_type="CAUSAL_LM", # Task type: Causal Language Modeling
)
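# Back-of-the-envelope check: LoRA parameters added to a single q_proj layer
# (2048 -> 2048) with r=8, compared with the full weight matrix
r, d_in, d_out = 8, 2048, 2048
lora_params = r * d_in + r * d_out   # A: d_in x r, B: r x d_out
full_params = d_in * d_out
print(f"{lora_params} vs {full_params} ({100 * lora_params / full_params:.2f}%)")  # 32768 vs 4194304 (0.78%)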
# LoRA-trainable version of the model
model = prepare_model_for_kbit_training(base_model)
model = get_peft_model(model, lora_config)
# Trainable parameter count
print(print_trainable_parameters(model))
Trainable model parameters: 11870208
All model parameters: 1527138304
Percentage of trainable model parameters: 0.78%
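# PEFT also ships a built-in helper that prints a similar summary (its totals
# can differ slightly because it counts packed 4-bit weights differently)
model.print_trainable_parameters()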
# Print Lora version of base model
print(model)
PeftModelForCausalLM( (base_model): LoraModel( (model): GemmaForCausalLM( (model): GemmaModel( (embed_tokens): Embedding(256000, 2048, padding_idx=0) (layers): ModuleList( (0-17): 18 x GemmaDecoderLayer( (self_attn): GemmaSdpaAttention( (q_proj): lora.Linear4bit( (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2048, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=2048, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() ) (k_proj): lora.Linear4bit( (base_layer): Linear4bit(in_features=2048, out_features=256, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2048, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=256, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() ) (v_proj): lora.Linear4bit( (base_layer): Linear4bit(in_features=2048, out_features=256, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2048, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=256, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() ) (o_proj): lora.Linear4bit( (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2048, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=2048, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() ) (rotary_emb): GemmaRotaryEmbedding() ) (mlp): GemmaMLP( (gate_proj): lora.Linear4bit( (base_layer): Linear4bit(in_features=2048, out_features=16384, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2048, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=16384, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() ) (up_proj): lora.Linear4bit( (base_layer): Linear4bit(in_features=2048, out_features=16384, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2048, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=16384, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() ) (down_proj): lora.Linear4bit( (base_layer): Linear4bit(in_features=16384, out_features=2048, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=16384, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=2048, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() ) (act_fn): PytorchGELUTanh() ) (input_layernorm): GemmaRMSNorm() (post_attention_layernorm): GemmaRMSNorm() ) ) (norm): GemmaRMSNorm() ) (lm_head): lora.Linear( (base_layer): Linear(in_features=2048, out_features=256000, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, 
inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2048, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=256000, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() ) ) ) )
# Define the training arguments
training_args = TrainingArguments(
output_dir="gemma-ft", # Output directory for the fine-tuned model
learning_rate=2e-4, # Learning rate for the optimizer
logging_steps=1, # Log training progress every 1 step
max_steps=2, # Maximum number of training steps (set to 2 for testing)
optim="paged_adamw_8bit", # Optimizer to use (8-bit AdamW optimizer)
gradient_checkpointing=True, # Enable gradient checkpointing to save memory
save_steps=1, # Save a checkpoint every 1 step
per_device_train_batch_size=1, # Batch size for training (set to 1 for testing)
)
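# Note: max_steps=2 and a batch size of 1 make this a smoke test. For a real run
# one would typically train for full epochs with a larger effective batch size,
# e.g. (illustrative values only, not taken from the original notebook):
# num_train_epochs=1, per_device_train_batch_size=4, gradient_accumulation_steps=4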
# Create the Trainer object
trainer = Trainer(
model=model, # The pre-trained model to fine-tune
train_dataset=data["train"], # The training dataset
args=training_args, # The training arguments
data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False) # Data collator for language modeling (mlm=False means no masked language modeling)
)
max_steps is given, it will override any value given in num_train_epochs
# Disable the KV cache during training (it is incompatible with gradient checkpointing and would only trigger warnings)
model.config.use_cache = False
# Train model
trainer.train()
/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
Step | Training Loss
---|---
1 | 1.537000
2 | 1.443500
/usr/local/lib/python3.10/dist-packages/peft/utils/save_and_load.py:180: UserWarning: Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.
/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
TrainOutput(global_step=2, training_loss=1.4902946949005127, metrics={'train_runtime': 17.0333, 'train_samples_per_second': 0.117, 'train_steps_per_second': 0.117, 'total_flos': 550276276224.0, 'train_loss': 1.4902946949005127, 'epoch': 0.0007974481658692185})
# Re-enable the KV cache for inference
model.config.use_cache = True
# Save trained model
trainer.model.save_pretrained("gemma-ft", save_embedding_layers=True)
# Log in to the Hugging Face Hub with a write token stored in Colab secrets
write_key = userdata.get('HF_TOKEN')
login(write_key)
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
# from huggingface_hub import notebook_login
# notebook_login()
# Push the LoRA adapters to the Hub (the repo name is derived from output_dir;
# the string passed here becomes the commit message)
trainer.push_to_hub("bmartinc80/gemma-ft")
/usr/local/lib/python3.10/dist-packages/peft/utils/save_and_load.py:180: UserWarning: Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.
CommitInfo(commit_url='https://huggingface.co/bmartinc80/gemma-ft/commit/5f3255dd574f9f38f8e26e1a03be7e60899bf391', commit_message='bmartinc80/gemma-ft', commit_description='', oid='5f3255dd574f9f38f8e26e1a03be7e60899bf391', pr_url=None, pr_revision=None, pr_num=None)
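# The pushed adapter (or the local copy saved earlier) can be re-attached to a
# freshly loaded base model without retraining (a sketch; the repo id matches
# the commit URL above, "gemma-ft" is the local save directory):
# from peft import PeftModel
# reloaded_base = AutoModelForCausalLM.from_pretrained(
#     model_id, quantization_config=bnb_config, device_map={"": 0})
# reloaded_model = PeftModel.from_pretrained(reloaded_base, "bmartinc80/gemma-ft")  # or "gemma-ft"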
# Merge Adapters with model
merged_model = model.merge_and_unload()
/usr/local/lib/python3.10/dist-packages/peft/tuners/lora/bnb.py:325: UserWarning: Merge lora module to 4-bit linear may get different generations due to rounding errors.
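# The merged model can now be saved (or pushed) as a standalone checkpoint that
# no longer needs PEFT at load time (a sketch; the directory name is arbitrary):
# merged_model.save_pretrained("gemma-ft-merged")
# tokenizer.save_pretrained("gemma-ft-merged")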
text = "Quote: If you want to know what a man's like"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = merged_model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:91: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
Quote: If you want to know what a man's like, take a look at how he treats his inferiors, for that is what he will treat his