# Transformers installation
! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

import numpy as np
from datasets import Dataset

# Build a dummy classification dataset of random token ids: the runs below only measure
# memory footprint and throughput, not model quality.
seq_len, dataset_size = 512, 512
dummy_data = {
    "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)),
    "labels": np.random.randint(0, 1, (dataset_size)),  # all-zero dummy labels are fine for benchmarking
}
ds = Dataset.from_dict(dummy_data)
ds.set_format("pt")

from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()


# Baseline GPU memory before anything is loaded.
print_gpu_utilization()

# Loading the CUDA kernels alone already occupies some GPU memory.
import torch

torch.ones((1, 1)).to("cuda")
print_gpu_utilization()

# Memory occupied by the model weights.
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased").to("cuda")
print_gpu_utilization()

default_args = {
    "output_dir": "tmp",
    "evaluation_strategy": "steps",
    "num_train_epochs": 1,
    "log_level": "error",
    "report_to": "none",
}

from transformers import TrainingArguments, Trainer, logging

logging.set_verbosity_error()

# Vanilla training: batch size 4, full precision.
training_args = TrainingArguments(per_device_train_batch_size=4, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

# Gradient accumulation: batch size 1 accumulated over 4 steps (effective batch size 4).
training_args = TrainingArguments(per_device_train_batch_size=1, gradient_accumulation_steps=4, **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

# Gradient accumulation + gradient checkpointing.
training_args = TrainingArguments(
    per_device_train_batch_size=1, gradient_accumulation_steps=4, gradient_checkpointing=True, **default_args
)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

# Mixed precision (fp16) training.
training_args = TrainingArguments(per_device_train_batch_size=4, fp16=True, **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

# Gradient accumulation + gradient checkpointing + fp16 combined.
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    **default_args,
)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

# Adafactor optimizer instead of AdamW to reduce optimizer-state memory.
training_args = TrainingArguments(per_device_train_batch_size=4, optim="adafactor", **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

# Adafactor combined with gradient accumulation, gradient checkpointing and fp16.
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    optim="adafactor",
    **default_args,
)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)
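# Hypothetical helper (an addition, not part of the original notebook): the experiments above and
# below all run in the same process, so releasing unreachable objects and the CUDA cache between
# runs keeps the print_gpu_utilization() readings comparable. A minimal sketch using standard
# gc / torch.cuda calls; objects still referenced (e.g. `model`, `trainer`) remain allocated.
import gc


def flush_memory():
    # Collect unreachable Python objects, then return cached CUDA blocks to the driver.
    gc.collect()
    torch.cuda.empty_cache()
    print_gpu_utilization()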
"params": [p for n, p in model.named_parameters() if n not in decay_parameters], "weight_decay": 0.0, }, ] optimizer_kwargs = { "betas": (training_args.adam_beta1, training_args.adam_beta2), "eps": training_args.adam_epsilon, } optimizer_kwargs["lr"] = training_args.learning_rate adam_bnb_optim = bnb.optim.Adam8bit( optimizer_grouped_parameters, betas=(training_args.adam_beta1, training_args.adam_beta2), eps=training_args.adam_epsilon, lr=training_args.learning_rate, ) trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(adam_bnb_optim, None)) result = trainer.train() print_summary(result) training_args = TrainingArguments( per_device_train_batch_size=1, gradient_accumulation_steps=4, gradient_checkpointing=True, fp16=True, **default_args, ) trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(adam_bnb_optim, None)) result = trainer.train() print_summary(result) training_args = TrainingArguments( per_device_train_batch_size=1, gradient_accumulation_steps=4, gradient_checkpointing=True, fp16=True, **default_args, ) from accelerate import Accelerator from torch.utils.data.dataloader import DataLoader dataloader = DataLoader(ds, batch_size=training_args.per_device_train_batch_size) if training_args.gradient_checkpointing: model.gradient_checkpointing_enable() accelerator = Accelerator(fp16=training_args.fp16) model, optimizer, dataloader = accelerator.prepare(model, adam_bnb_optim, dataloader) model.train() for step, batch in enumerate(dataloader, start=1): loss = model(**batch).loss loss = loss / training_args.gradient_accumulation_steps accelerator.backward(loss) if step % training_args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() print_gpu_utilization() model.gradient_checkpointing_enable()