#!/usr/bin/env python
# coding: utf-8

# Deep Learning Models -- A collection of various deep learning architectures, models, and tips for TensorFlow and PyTorch in Jupyter Notebooks.
# - Author: Sebastian Raschka
# - GitHub Repository: https://github.com/rasbt/deeplearning-models

# In[1]:

get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', "-a 'Sebastian Raschka' -v -p torch")

# # Increase the Batch Size (AlexNet CIFAR-10 Classifier)

# This notebook experiments with increasing the batch size during training, inspired by the paper
#
# - Smith, S. L., Kindermans, P. J., Ying, C., & Le, Q. V. (2017). Don't decay the learning rate, increase the batch size. arXiv preprint arXiv:1711.00489.
#
# To summarize the main points of the paper:
#
# - Stochastic gradient descent adds noise to the optimization problem; during the early training epochs, this noise helps with exploring the loss landscape and, in general, with escaping sharp minima, which are associated with poor generalization.
#
# - However, over the course of training, one wants to decay the learning rate gradually (similar to simulated annealing) for fine-tuning, i.e., to help with convergence.
#
# - Due to the relationship between learning rate, batch size, and momentum, one can instead increase the batch size rather than decrease the learning rate to reduce this noise. This way, more training examples are used in each update, and fewer parameter updates may be required overall to converge.
#
# The relationship between the gradient noise scale $g$, the learning rate, and the batch size is as follows:
#
# $$g=\epsilon\left(\frac{N}{B}-1\right),$$
#
# where $\epsilon$ is the learning rate, $B$ is the batch size, and $N$ is the number of training examples.
#
# With an added momentum term $m$, this becomes:
#
# $$\begin{aligned} g &=\frac{\epsilon}{1-m}\left(\frac{N}{B}-1\right) \\ & \approx \frac{\epsilon N}{B(1-m)} \end{aligned}.$$
#
# (A quick numerical illustration of this relationship follows the architecture note below.)

# ### Network Architecture

# In this notebook, the CIFAR-10 dataset is used for training a classic AlexNet network [1] for classification:
#
# - [1] Krizhevsky, Alex, Ilya Sutskever, and Geoffrey E. Hinton. "[ImageNet classification with deep convolutional neural networks.](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf)" In Advances in Neural Information Processing Systems, pp. 1097-1105. 2012.
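# As a quick numerical illustration of the noise-scale relationship above
# (this cell is not part of the original experiments), the helper below
# evaluates g ~= eps*N / (B*(1-m)) for the learning rate and training-set
# size used later in this notebook and a few candidate batch sizes; the
# momentum value m=0.9 is an illustrative choice. Doubling B reduces g by
# roughly the same factor as halving eps would, which is the motivation for
# increasing the batch size instead of decaying the learning rate.

def approx_noise_scale(eps, num_train, batch_size, momentum=0.9):
    # approximate noise scale g ~ eps*N / (B*(1-m)), valid for B << N
    return eps * num_train / (batch_size * (1.0 - momentum))

for b in (256, 512, 1024, 2048):
    print(f'batch size {b:4d} -> approx. noise scale '
          f'{approx_noise_scale(eps=0.0001, num_train=48000, batch_size=b):.4f}')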
# ## Imports

# In[2]:

import os
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Subset
from torchvision import datasets
from torchvision import transforms

import matplotlib.pyplot as plt
from PIL import Image

if torch.cuda.is_available():
    torch.backends.cudnn.deterministic = True

# In[3]:

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# ## Model Settings

# In[4]:

##########################
### SETTINGS
##########################

# Hyperparameters
RANDOM_SEED = 1
LEARNING_RATE = 0.0001
BATCH_SIZE = 256
NUM_EPOCHS = 40

# Architecture
NUM_CLASSES = 10

# Other
DEVICE = "cuda:0"

# ## Dataset

# In[5]:

train_indices = torch.arange(0, 48000)
valid_indices = torch.arange(48000, 50000)

train_transform = transforms.Compose([transforms.Resize((70, 70)),
                                      transforms.RandomCrop((64, 64)),
                                      transforms.ToTensor()])

test_transform = transforms.Compose([transforms.Resize((70, 70)),
                                     transforms.CenterCrop((64, 64)),
                                     transforms.ToTensor()])

train_and_valid = datasets.CIFAR10(root='data',
                                   train=True,
                                   transform=train_transform,
                                   download=True)

train_dataset = Subset(train_and_valid, train_indices)
valid_dataset = Subset(train_and_valid, valid_indices)

test_dataset = datasets.CIFAR10(root='data',
                                train=False,
                                transform=test_transform,
                                download=False)

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE,
                          num_workers=4,
                          shuffle=True)

valid_loader = DataLoader(dataset=valid_dataset,
                          batch_size=BATCH_SIZE,
                          num_workers=4,
                          shuffle=False)

test_loader = DataLoader(dataset=test_dataset,
                         batch_size=BATCH_SIZE,
                         num_workers=4,
                         shuffle=False)

# In[6]:

# Checking the dataset
print('Training Set:\n')
for images, labels in train_loader:
    print('Image batch dimensions:', images.size())
    print('Image label dimensions:', labels.size())
    break

# Checking the dataset
print('\nValidation Set:')
for images, labels in valid_loader:
    print('Image batch dimensions:', images.size())
    print('Image label dimensions:', labels.size())
    break

# Checking the dataset
print('\nTesting Set:')
for images, labels in test_loader:
    print('Image batch dimensions:', images.size())
    print('Image label dimensions:', labels.size())
    break

# ## Model

# In[7]:

##########################
### MODEL
##########################

class AlexNet(nn.Module):

    def __init__(self, num_classes):
        super(AlexNet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), 256 * 6 * 6)
        logits = self.classifier(x)
        probas = F.softmax(logits, dim=1)
        return logits, probas
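# A minimal smoke test of the architecture (not part of the original
# notebook): pushing a small random batch of 64x64 RGB images (the size
# produced by the crop transforms above) through an AlexNet instance on the
# CPU should yield logits and probabilities of shape (batch_size, 10).
# The batch size of 4 used here is an arbitrary illustration value.

_check_model = AlexNet(num_classes=NUM_CLASSES)
_check_model.eval()
with torch.no_grad():
    _logits, _probas = _check_model(torch.randn(4, 3, 64, 64))
print(_logits.shape, _probas.shape)  # expected: torch.Size([4, 10]) for both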
# In[8]:

def compute_acc(model, data_loader, device):
    correct_pred, num_examples = 0, 0
    model.eval()
    for i, (features, targets) in enumerate(data_loader):
        features = features.to(device)
        targets = targets.to(device)
        logits, probas = model(features)
        _, predicted_labels = torch.max(probas, 1)
        num_examples += targets.size(0)
        assert predicted_labels.size() == targets.size()
        correct_pred += (predicted_labels == targets).sum()
    return correct_pred.float()/num_examples * 100

# # Training 1: Constant Batch Size

# In[9]:

torch.manual_seed(RANDOM_SEED)

model = AlexNet(NUM_CLASSES)
model.to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# In[10]:

start_time = time.time()

cost_list = []
train_acc_list, valid_acc_list = [], []

for epoch in range(NUM_EPOCHS):

    model.train()
    for batch_idx, (features, targets) in enumerate(train_loader):

        features = features.to(DEVICE)
        targets = targets.to(DEVICE)

        ### FORWARD AND BACK PROP
        logits, probas = model(features)
        cost = F.cross_entropy(logits, targets)
        optimizer.zero_grad()
        cost.backward()

        ### UPDATE MODEL PARAMETERS
        optimizer.step()

        #################################################
        ### CODE ONLY FOR LOGGING BEYOND THIS POINT
        #################################################

        cost_list.append(cost.item())
        if not batch_idx % 150:
            print(f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                  f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                  f'Cost: {cost:.4f}')

    model.eval()
    with torch.set_grad_enabled(False):  # save memory during inference

        train_acc = compute_acc(model, train_loader, device=DEVICE)
        valid_acc = compute_acc(model, valid_loader, device=DEVICE)

        print(f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d}\n'
              f'Train ACC: {train_acc:.2f} | Validation ACC: {valid_acc:.2f}')

        train_acc_list.append(train_acc)
        valid_acc_list.append(valid_acc)

    elapsed = (time.time() - start_time)/60
    print(f'Time elapsed: {elapsed:.2f} min')

elapsed = (time.time() - start_time)/60
print(f'Total Training Time: {elapsed:.2f} min')

# ## Evaluation

# In[11]:

plt.plot(cost_list, label='Minibatch cost')
plt.plot(np.convolve(cost_list,
                     np.ones(200,)/200, mode='valid'),
         label='Running average')

plt.ylabel('Cross Entropy')
plt.xlabel('Iteration')
plt.legend()
plt.show()

# In[12]:

plt.plot(np.arange(1, NUM_EPOCHS+1), train_acc_list, label='Training')
plt.plot(np.arange(1, NUM_EPOCHS+1), valid_acc_list, label='Validation')

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# In[13]:

with torch.set_grad_enabled(False):
    test_acc = compute_acc(model=model,
                           data_loader=test_loader,
                           device=DEVICE)
    valid_acc = compute_acc(model=model,
                            data_loader=valid_loader,
                            device=DEVICE)

print(f'Validation ACC: {valid_acc:.2f}%')
print(f'Test ACC: {test_acc:.2f}%')

# # Training 2: Increasing Batch Size

# In[14]:

torch.manual_seed(RANDOM_SEED)

model = AlexNet(NUM_CLASSES)
model.to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# In[15]:

batch_sizes = np.arange(256, 5121, 512)
batch_size_index = 0
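# A small sanity check of the schedule (not part of the original notebook):
# with NUM_EPOCHS = 40 and ten candidate batch sizes, the condition used in
# the training loop below rebuilds the DataLoader at epochs 24, 28, 32, and
# 36 (0-indexed), so only the first four entries of `batch_sizes`
# (256, 768, 1280, 1792) are actually used.

_idx = 0
for _epoch in range(NUM_EPOCHS):
    if _epoch > (NUM_EPOCHS//2) and not _epoch % (NUM_EPOCHS//len(batch_sizes)):
        print(f'Epoch {_epoch:02d}: rebuild DataLoader with batch size {int(batch_sizes[_idx])}')
        _idx += 1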
# In[16]:

start_time = time.time()

cost_list = []
train_acc_list, valid_acc_list = [], []

for epoch in range(NUM_EPOCHS):

    ### INCREASE BATCH SIZE
    if epoch > (NUM_EPOCHS//2) and not epoch % (NUM_EPOCHS//len(batch_sizes)):
        train_loader = DataLoader(dataset=train_dataset,
                                  batch_size=int(batch_sizes[batch_size_index]),
                                  num_workers=4,
                                  shuffle=True)
        batch_size_index += 1

    model.train()
    for batch_idx, (features, targets) in enumerate(train_loader):

        features = features.to(DEVICE)
        targets = targets.to(DEVICE)

        ### FORWARD AND BACK PROP
        logits, probas = model(features)
        cost = F.cross_entropy(logits, targets)
        optimizer.zero_grad()
        cost.backward()

        ### UPDATE MODEL PARAMETERS
        optimizer.step()

        #################################################
        ### CODE ONLY FOR LOGGING BEYOND THIS POINT
        #################################################

        cost_list.append(cost.item())
        if not batch_idx % 150:
            print(f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                  f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                  f'Cost: {cost:.4f} | Batch size: {train_loader.batch_size}')

    model.eval()
    with torch.set_grad_enabled(False):  # save memory during inference

        train_acc = compute_acc(model, train_loader, device=DEVICE)
        valid_acc = compute_acc(model, valid_loader, device=DEVICE)

        print(f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d}\n'
              f'Train ACC: {train_acc:.2f} | Validation ACC: {valid_acc:.2f}')

        train_acc_list.append(train_acc)
        valid_acc_list.append(valid_acc)

    elapsed = (time.time() - start_time)/60
    print(f'Time elapsed: {elapsed:.2f} min')

elapsed = (time.time() - start_time)/60
print(f'Total Training Time: {elapsed:.2f} min')

# In[17]:

plt.plot(cost_list, label='Minibatch cost')
plt.plot(np.convolve(cost_list,
                     np.ones(200,)/200, mode='valid'),
         label='Running average')

plt.ylabel('Cross Entropy')
plt.xlabel('Iteration')
plt.legend()
plt.show()

# In[18]:

plt.plot(np.arange(1, NUM_EPOCHS+1), train_acc_list, label='Training')
plt.plot(np.arange(1, NUM_EPOCHS+1), valid_acc_list, label='Validation')

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# In[19]:

with torch.set_grad_enabled(False):
    test_acc = compute_acc(model=model,
                           data_loader=test_loader,
                           device=DEVICE)
    valid_acc = compute_acc(model=model,
                            data_loader=valid_loader,
                            device=DEVICE)

print(f'Validation ACC: {valid_acc:.2f}%')
print(f'Test ACC: {test_acc:.2f}%')
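# For comparison, a minimal sketch (not part of the original experiments) of
# the conventional alternative that the paper argues against: keeping the
# batch size fixed and decaying the learning rate instead. The MultiStepLR
# milestones match the (0-indexed) epochs at which the batch size is
# increased above; the decay factor of 0.5 is an illustrative choice, not a
# value taken from the paper or from the experiments above.

torch.manual_seed(RANDOM_SEED)

model = AlexNet(NUM_CLASSES)
model.to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                 milestones=[24, 28, 32, 36],
                                                 gamma=0.5)

# rebuild the training loader with the original, constant batch size
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE,
                          num_workers=4,
                          shuffle=True)

for epoch in range(NUM_EPOCHS):

    model.train()
    for features, targets in train_loader:
        features = features.to(DEVICE)
        targets = targets.to(DEVICE)

        logits, probas = model(features)
        cost = F.cross_entropy(logits, targets)
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()

    # decay the learning rate once per epoch according to the milestones
    scheduler.step()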