!pip install datasets peft -qq
!pip install accelerate -qq
!pip install bitsandbytes -qq
from google.colab import userdata
from huggingface_hub import login
from datasets import load_dataset
import torch
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
DataCollatorForLanguageModeling,
Trainer,
BitsAndBytesConfig)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
# Define the model ID for the pre-trained model
model_id = "google/gemma-2b"
# Configure BitsAndBytes for 4-bit quantization
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16)
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Load the model with the specified quantization configuration and device mapping
base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
# Check if the tokenizer has a pad token defined
if tokenizer.pad_token is None:
    # If not, assign the end-of-sequence (eos) token as the pad token
    tokenizer.pad_token = tokenizer.eos_token
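# Optional sanity check (a sketch, not part of the original run): inspect the
# 4-bit model's memory footprint and confirm that its projection layers were
# replaced by bitsandbytes Linear4bit modules
print(f"Memory footprint: {base_model.get_memory_footprint() / 1e9:.2f} GB")
print(type(base_model.model.layers[0].self_attn.q_proj))  # expect a bitsandbytes Linear4bit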
# Print base model
print(base_model)
GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): GemmaRMSNorm()
  )
  (lm_head): Linear(in_features=2048, out_features=256000, bias=False)
)
# Print model configuration
base_model.config
GemmaConfig {
  "_name_or_path": "google/gemma-2b",
  "architectures": [
    "GemmaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "eos_token_id": 1,
  "head_dim": 256,
  "hidden_act": "gelu",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 16384,
  "max_position_embeddings": 8192,
  "model_type": "gemma",
  "num_attention_heads": 8,
  "num_hidden_layers": 18,
  "num_key_value_heads": 1,
  "pad_token_id": 0,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.0",
  "use_cache": true,
  "vocab_size": 256000
}
# Helper to report trainable parameters
def print_trainable_parameters(model):
    """
    Return a summary of trainable vs. total parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    return (
        f"Trainable model parameters: {trainable_params}\n"
        f"All model parameters: {all_param}\n"
        f"Percentage of trainable model parameters: {100 * trainable_params / all_param:.2f}%"
    )
print(print_trainable_parameters(base_model))
Trainable model parameters: 524363776
All model parameters: 1515268096
Percentage of trainable model parameters: 34.61%
# Test the base model before fine-tuning
text = "Quote: If you want to know what a man's like"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = base_model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
Quote: If you want to know what a man's like, take a look at how he treats his inferiors, for that's what he'll
# Load dataset
data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
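# Quick look at one tokenized example (a sketch; "quote" is the raw text field
# in Abirate/english_quotes and "input_ids" was added by the map above)
sample = data["train"][0]
print(sample["quote"])
print(sample["input_ids"][:10])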
# Configure Low-Rank Adaptation (LoRA) settings
lora_config = LoraConfig(
r=8, # Rank of the low-rank decomposition
lora_alpha=32, # Scaling factor for the LoRA updates
lora_dropout=0.05, # Dropout rate to apply to the LoRA updates
bias="none", # Specify how to handle biases in LoRA
target_modules=[ # List of module names to apply LoRA to
'q_proj', # Query projection in attention mechanism
'k_proj', # Key projection in attention mechanism
'v_proj', # Value projection in attention mechanism
'o_proj', # Output projection in attention mechanism
'gate_proj', # Gate projection
'up_proj', # Up projection in feed-forward network
'down_proj', # Down projection in feed-forward network
'lm_head', # Language model head
],
task_type="CAUSAL_LM", # Task type: Causal Language Modeling
)
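# Back-of-the-envelope check: LoRA parameters added to a single q_proj layer
# (2048 -> 2048) with r=8, compared with the full weight matrix
r, d_in, d_out = 8, 2048, 2048
lora_params = r * d_in + r * d_out   # A: d_in x r, B: r x d_out
full_params = d_in * d_out
print(f"{lora_params} vs {full_params} ({100 * lora_params / full_params:.2f}%)")  # 32768 vs 4194304 (0.78%)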
# LoRA-trainable version of the model
model = prepare_model_for_kbit_training(base_model)
model = get_peft_model(model, lora_config)
# Trainable parameter count
print(print_trainable_parameters(model))
Trainable model parameters: 11870208
All model parameters: 1527138304
Percentage of trainable model parameters: 0.78%
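# PEFT also ships a built-in helper that prints a similar summary (its totals
# can differ slightly because it counts packed 4-bit weights differently)
model.print_trainable_parameters()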
# Print Lora version of base model
print(model)
PeftModelForCausalLM( (base_model): LoraModel( (model): GemmaForCausalLM( (model): GemmaModel( (embed_tokens): Embedding(256000, 2048, padding_idx=0) (layers): ModuleList( (0-17): 18 x GemmaDecoderLayer( (self_attn): GemmaSdpaAttention( (q_proj): lora.Linear4bit( (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2048, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=2048, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() ) (k_proj): lora.Linear4bit( (base_layer): Linear4bit(in_features=2048, out_features=256, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2048, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=256, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() ) (v_proj): lora.Linear4bit( (base_layer): Linear4bit(in_features=2048, out_features=256, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2048, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=256, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() ) (o_proj): lora.Linear4bit( (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2048, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=2048, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() ) (rotary_emb): GemmaRotaryEmbedding() ) (mlp): GemmaMLP( (gate_proj): lora.Linear4bit( (base_layer): Linear4bit(in_features=2048, out_features=16384, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2048, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=16384, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() ) (up_proj): lora.Linear4bit( (base_layer): Linear4bit(in_features=2048, out_features=16384, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2048, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=16384, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() ) (down_proj): lora.Linear4bit( (base_layer): Linear4bit(in_features=16384, out_features=2048, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=16384, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=2048, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() ) (act_fn): PytorchGELUTanh() ) (input_layernorm): GemmaRMSNorm() (post_attention_layernorm): GemmaRMSNorm() ) ) (norm): GemmaRMSNorm() ) (lm_head): lora.Linear( (base_layer): Linear(in_features=2048, out_features=256000, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, 
inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=2048, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=256000, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() ) ) ) )
# Define the training arguments
training_args = TrainingArguments(
output_dir="gemma-ft", # Output directory for the fine-tuned model
learning_rate=2e-4, # Learning rate for the optimizer
logging_steps=1, # Log training progress every 1 step
max_steps=2, # Maximum number of training steps (set to 2 for testing)
optim="paged_adamw_8bit", # Optimizer to use (8-bit AdamW optimizer)
gradient_checkpointing=True, # Enable gradient checkpointing to save memory
save_steps=1, # Save a checkpoint every 1 step
per_device_train_batch_size=1, # Batch size for training (set to 1 for testing)
)
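# Note: max_steps=2 and a batch size of 1 make this a smoke test. For a real run
# one would typically train for full epochs with a larger effective batch size,
# e.g. (illustrative values only, not taken from the original notebook):
# num_train_epochs=1, per_device_train_batch_size=4, gradient_accumulation_steps=4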
# Create the Trainer object
trainer = Trainer(
model=model, # The pre-trained model to fine-tune
train_dataset=data["train"], # The training dataset
args=training_args, # The training arguments
data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False) # Data collator for language modeling (mlm=False means no masked language modeling)
)
max_steps is given, it will override any value given in num_train_epochs
# Disable the KV cache during training (it is incompatible with gradient checkpointing and would only trigger warnings)
model.config.use_cache = False
# Train model
trainer.train()
/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
Step | Training Loss
---|---
1 | 1.537000
2 | 1.443500
/usr/local/lib/python3.10/dist-packages/peft/utils/save_and_load.py:180: UserWarning: Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.
/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
TrainOutput(global_step=2, training_loss=1.4902946949005127, metrics={'train_runtime': 17.0333, 'train_samples_per_second': 0.117, 'train_steps_per_second': 0.117, 'total_flos': 550276276224.0, 'train_loss': 1.4902946949005127, 'epoch': 0.0007974481658692185})
# Re-enable the KV cache for inference
model.config.use_cache = True
# Save trained model
trainer.model.save_pretrained("gemma-ft", save_embedding_layers=True)
# Log in to the Hugging Face Hub with a write token stored in Colab secrets
write_key = userdata.get('HF_TOKEN')
login(write_key)
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
# from huggingface_hub import notebook_login
# notebook_login()
# Push the LoRA adapters to the Hub (the repo name is derived from output_dir;
# the string passed here becomes the commit message)
trainer.push_to_hub("bmartinc80/gemma-ft")
/usr/local/lib/python3.10/dist-packages/peft/utils/save_and_load.py:180: UserWarning: Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.
CommitInfo(commit_url='https://huggingface.co/bmartinc80/gemma-ft/commit/5f3255dd574f9f38f8e26e1a03be7e60899bf391', commit_message='bmartinc80/gemma-ft', commit_description='', oid='5f3255dd574f9f38f8e26e1a03be7e60899bf391', pr_url=None, pr_revision=None, pr_num=None)
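# The pushed adapter (or the local copy saved earlier) can be re-attached to a
# freshly loaded base model without retraining (a sketch; the repo id matches
# the commit URL above, "gemma-ft" is the local save directory):
# from peft import PeftModel
# reloaded_base = AutoModelForCausalLM.from_pretrained(
#     model_id, quantization_config=bnb_config, device_map={"": 0})
# reloaded_model = PeftModel.from_pretrained(reloaded_base, "bmartinc80/gemma-ft")  # or "gemma-ft"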
# Merge Adapters with model
merged_model = model.merge_and_unload()
/usr/local/lib/python3.10/dist-packages/peft/tuners/lora/bnb.py:325: UserWarning: Merge lora module to 4-bit linear may get different generations due to rounding errors.
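# The merged model can now be saved (or pushed) as a standalone checkpoint that
# no longer needs PEFT at load time (a sketch; the directory name is arbitrary):
# merged_model.save_pretrained("gemma-ft-merged")
# tokenizer.save_pretrained("gemma-ft-merged")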
text = "Quote: If you want to know what a man's like"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = merged_model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:91: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
Quote: If you want to know what a man's like, take a look at how he treats his inferiors, for that is what he will treat his