#!/usr/bin/env python
# coding: utf-8

# # Launching Multi-Node Training from a Jupyter Environment

# > Using the `notebook_launcher` to use Accelerate from inside a Jupyter Notebook

# ## General Overview
# 
# This notebook covers how to run the `cv_example.py` script as a Jupyter Notebook and train it on a distributed system. It also covers the few specific requirements needed to ensure your environment is configured properly and your data is prepared properly, and finally how to launch training.

# ## Configuring the Environment
# 
# Before any training can be performed, an Accelerate config file must exist on the system. Usually this can be done by running the following in a terminal:
# 
# ```bash
# accelerate config
# ```
# 
# However, if the general defaults are fine and you are *not* running on a TPU, Accelerate has a utility to quickly write your GPU configuration into a config file via `write_basic_config`.
# 
# The following cell restarts Jupyter after writing the configuration, since CUDA code was called to perform this. CUDA cannot be initialized more than once (once for the notebook's default single-GPU use, and then again when `notebook_launcher` is called). It's fine to debug in the notebook and make calls to CUDA, but remember that in order to finally train, a full cleanup and restart needs to be performed, such as what is shown below:

# In[1]:

#import os
#from accelerate.utils import write_basic_config
#write_basic_config()  # Write a config file
#os._exit(00)  # Restart the notebook


# ## Preparing the Dataset and Model
# 
# Next you should prepare your dataset. As mentioned earlier, great care should be taken when preparing the `DataLoaders` and model to make sure that **nothing** is put on *any* GPU.
# 
# If you do need code that touches the GPU, it is recommended to put that specific code into a function and call that function from within the notebook launcher interface, which will be shown later.
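# To make that rule concrete, the cell below is an illustrative note only (it is not part of the original example script, and the commented lines are examples of what to *avoid*, not code to run):

# In[ ]:

# Any of the following at the top level of the notebook would initialize CUDA
# in this process and cause `notebook_launcher` to fail later on:
#
#   model = create_model("resnet50d", pretrained=True).cuda()
#   images = torch.randn(8, 3, 224, 224, device="cuda")
#   torch.cuda.set_device(0)
#
# The same lines are perfectly fine *inside* the function handed to
# `notebook_launcher` (see `training_loop` below), because each spawned worker
# process initializes CUDA for itself.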
# 
# Make sure the dataset is downloaded based on the directions [here](https://github.com/huggingface/accelerate/tree/main/examples#simple-vision-example)

# In[2]:

import os, re, torch, PIL
import numpy as np

from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose, RandomResizedCrop, Resize, ToTensor

from accelerate import Accelerator
from accelerate.utils import set_seed
from timm import create_model


# First we'll create a function to extract the class name based on a filename:

# In[3]:

import os
data_dir = "../../images"
fnames = os.listdir(data_dir)
fname = fnames[0]
print(fname)


# In the case here, the label is `beagle`:

# In[4]:

import re

def extract_label(fname):
    stem = fname.split(os.path.sep)[-1]
    return re.search(r"^(.*)_\d+\.jpg$", stem).groups()[0]


# In[5]:

extract_label(fname)


# Next we'll create a `Dataset` class:

# In[6]:

class PetsDataset(Dataset):
    def __init__(self, file_names, image_transform=None, label_to_id=None):
        self.file_names = file_names
        self.image_transform = image_transform
        self.label_to_id = label_to_id

    def __len__(self):
        return len(self.file_names)

    def __getitem__(self, idx):
        fname = self.file_names[idx]
        raw_image = PIL.Image.open(fname)
        image = raw_image.convert("RGB")
        if self.image_transform is not None:
            image = self.image_transform(image)
        label = extract_label(fname)
        if self.label_to_id is not None:
            label = self.label_to_id[label]
        return {"image": image, "label": label}


# And build our dataset:

# In[7]:

# Grab all the image filenames
fnames = [
    os.path.join("../../images", fname)
    for fname in fnames
    if fname.endswith(".jpg")
]

# Build the labels
all_labels = [
    extract_label(fname)
    for fname in fnames
]
id_to_label = list(set(all_labels))
id_to_label.sort()
label_to_id = {lbl: i for i, lbl in enumerate(id_to_label)}


# > Note: This will be stored inside of a function as we'll be setting our seed during training.

# In[8]:

def get_dataloaders(batch_size: int = 64):
    "Builds a set of dataloaders with a batch_size"
    random_perm = np.random.permutation(len(fnames))
    cut = int(0.8 * len(fnames))
    train_split = random_perm[:cut]
    eval_split = random_perm[cut:]

    # For training we use a simple RandomResizedCrop
    train_tfm = Compose([
        RandomResizedCrop((224, 224), scale=(0.5, 1.0)),
        ToTensor()
    ])
    train_dataset = PetsDataset(
        [fnames[i] for i in train_split],
        image_transform=train_tfm,
        label_to_id=label_to_id
    )

    # For evaluation we use a deterministic Resize
    eval_tfm = Compose([
        Resize((224, 224)),
        ToTensor()
    ])
    eval_dataset = PetsDataset(
        [fnames[i] for i in eval_split],
        image_transform=eval_tfm,
        label_to_id=label_to_id
    )

    # Instantiate dataloaders
    train_dataloader = DataLoader(
        train_dataset,
        shuffle=True,
        batch_size=batch_size,
        num_workers=4
    )
    eval_dataloader = DataLoader(
        eval_dataset,
        shuffle=False,
        batch_size=batch_size * 2,
        num_workers=4
    )
    return train_dataloader, eval_dataloader
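# As an optional, purely CPU-side sanity check (this cell is not part of the original script and the `sanity_*` names are just illustrative), we can pull one batch and confirm the shapes. Since nothing here touches the GPU, it is safe to run at the top level of the notebook:

# In[ ]:

sanity_train_dl, sanity_eval_dl = get_dataloaders(batch_size=8)
sanity_batch = next(iter(sanity_train_dl))
# Expect something like: torch.Size([8, 3, 224, 224]) torch.Size([8])
print(sanity_batch["image"].shape, sanity_batch["label"].shape)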
# ## Writing the Training Function
# 
# Now we can build our training loop. `notebook_launcher` works by passing in a function to call that will be run across the distributed system.
# 
# Here is a basic training loop for our animal classification problem:

# In[9]:

from torch.optim.lr_scheduler import CosineAnnealingLR


# In[10]:

def training_loop(mixed_precision="fp16", seed: int = 42, batch_size: int = 64):
    set_seed(seed)
    # Initialize accelerator
    accelerator = Accelerator(mixed_precision=mixed_precision)
    # Build dataloaders
    train_dataloader, eval_dataloader = get_dataloaders(batch_size)

    # Instantiate the model (we build the model here so that the seed also controls new weight initializations)
    model = create_model("resnet50d", pretrained=True, num_classes=len(label_to_id))

    # Freeze the base model
    for param in model.parameters():
        param.requires_grad = False
    for param in model.get_classifier().parameters():
        param.requires_grad = True

    # We normalize the batches of images to be a bit faster
    mean = torch.tensor(model.default_cfg["mean"])[None, :, None, None]
    std = torch.tensor(model.default_cfg["std"])[None, :, None, None]

    # To make these constants available on the active device, we set them to the accelerator device
    mean = mean.to(accelerator.device)
    std = std.to(accelerator.device)

    # Instantiate the optimizer
    optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-2 / 25)

    # Instantiate the learning rate scheduler
    lr_scheduler = OneCycleLR(
        optimizer=optimizer,
        max_lr=3e-2,
        epochs=5,
        steps_per_epoch=len(train_dataloader)
    )

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # Now we train the model
    for epoch in range(5):
        model.train()
        for step, batch in enumerate(train_dataloader):
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch = {k: v.to(accelerator.device) for k, v in batch.items()}
            inputs = (batch["image"] - mean) / std
            outputs = model(inputs)
            loss = torch.nn.functional.cross_entropy(outputs, batch["label"])
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        model.eval()
        accurate = 0
        num_elems = 0
        for _, batch in enumerate(eval_dataloader):
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch = {k: v.to(accelerator.device) for k, v in batch.items()}
            inputs = (batch["image"] - mean) / std
            with torch.no_grad():
                outputs = model(inputs)
            predictions = outputs.argmax(dim=-1)
            accurate_preds = accelerator.gather(predictions) == accelerator.gather(batch["label"])
            num_elems += accurate_preds.shape[0]
            accurate += accurate_preds.long().sum()

        eval_metric = accurate.item() / num_elems
        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}: {100 * eval_metric:.2f}")


# All that's left is to use the `notebook_launcher`.
# 
# We pass in the function, the arguments (as a tuple), and the number of processes to train on. (See the [documentation](https://huggingface.co/docs/accelerate/launcher) for more information)

# In[11]:

from accelerate import notebook_launcher


# In[13]:

args = ("fp16", 42, 64)
notebook_launcher(training_loop, args, num_processes=2)
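# The tuple is unpacked positionally into `training_loop`, so `("fp16", 42, 64)` means `mixed_precision="fp16"`, `seed=42`, `batch_size=64` in every spawned process. As an aside not in the original example, the same call adapts to other hardware simply by changing `num_processes`:

# In[ ]:

# For example, on a single machine with 8 GPUs (commented out so this cell is
# safe to run anywhere):
# notebook_launcher(training_loop, ("fp16", 42, 64), num_processes=8)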
# And that's it!

# ## Conclusion
# 
# This notebook showed how to perform distributed training from inside of a Jupyter Notebook. Some key notes to remember:
# 
# - Make sure to save any code that uses CUDA (or CUDA imports) for the function passed to `notebook_launcher`
# - Set `num_processes` to the number of devices used for training (such as the number of GPUs, CPUs, or TPU cores)
# 
# A compact recap of the overall pattern is sketched below.
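# The final cell is a minimal, hedged skeleton of that pattern rather than part of the original example; the name `my_training_function` is hypothetical, and the body should be filled in with your own model, data, and loop:

# In[ ]:

def my_training_function(mixed_precision="fp16", seed: int = 42):
    # Everything that touches CUDA lives in here; it runs once per spawned process.
    set_seed(seed)
    accelerator = Accelerator(mixed_precision=mixed_precision)
    # ... build the model, dataloaders and optimizer, call accelerator.prepare(...),
    # then run the training and evaluation loops ...

# Launch it across the available devices (commented out so this cell is safe to run):
# notebook_launcher(my_training_function, ("fp16", 42), num_processes=2)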