#!/usr/bin/env python
# coding: utf-8

# Deep Learning Models -- A collection of various deep learning architectures, models, and tips for TensorFlow and PyTorch in Jupyter Notebooks.
# - Author: Sebastian Raschka
# - GitHub Repository: https://github.com/rasbt/deeplearning-models

# In[1]:


get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', "-a 'Sebastian Raschka' -v -p torch")


# # Model Zoo -- Distribute a Model Across Multiple GPUs with Pipeline Parallelism (VGG-16 Example)

# This notebook demos the pipeline parallelism functionality added in PyTorch 1.8, using VGG-16 as an example. For more details, see https://pytorch.org/docs/1.8.0/pipeline.html?highlight=pipeline#.

# # 1) Setup

# In[2]:


import sys
import torch

sys.path.insert(0, "..")  # to include ../helper_evaluate.py etc.

from helper_utils import set_all_seeds, set_deterministic
from helper_evaluate import compute_accuracy
from helper_data import get_dataloaders_cifar10
from helper_train import train_classifier_simple_v1


# In[3]:


##########################
### SETTINGS
##########################

# Data settings
num_classes = 10

# Hyperparameters
random_seed = 1
learning_rate = 0.0001
batch_size = 128
num_epochs = 50


# In[4]:


set_all_seeds(random_seed)
#set_deterministic()


# In[5]:


##########################
### Dataset
##########################

train_loader, valid_loader, test_loader = get_dataloaders_cifar10(
    batch_size,
    num_workers=2,
    validation_fraction=0.1)


# # 2) Regular (1-GPU) Training

# This section implements the VGG-16 network in the conventional manner as a reference. The next section replicates it using pipeline parallelism.

# In[6]:


##########################
### Model
##########################


class VGG16(torch.nn.Module):

    def __init__(self, num_classes):
        super().__init__()

        # calculate same padding:
        # (w - k + 2*p)/s + 1 = o
        # => p = (s(o-1) - w + k)/2

        self.block_1 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=3, out_channels=64, kernel_size=(3, 3),
                            stride=(1, 1), padding=1),  # (1(32-1) - 32 + 3)/2 = 1
            torch.nn.ReLU(),
            torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3, 3),
                            stride=(1, 1), padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )

        self.block_2 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3, 3),
                            stride=(1, 1), padding=1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(3, 3),
                            stride=(1, 1), padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )

        self.block_3 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3, 3),
                            stride=(1, 1), padding=1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3, 3),
                            stride=(1, 1), padding=1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3, 3),
                            stride=(1, 1), padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )

        self.block_4 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=256, out_channels=512, kernel_size=(3, 3),
                            stride=(1, 1), padding=1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3),
                            stride=(1, 1), padding=1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3),
                            stride=(1, 1), padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )

        self.block_5 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3),
                            stride=(1, 1), padding=1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3),
                            stride=(1, 1), padding=1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3),
                            stride=(1, 1), padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        )

        self.classifier = torch.nn.Sequential(
            torch.nn.Flatten(),
            torch.nn.Linear(512, 4096),
            torch.nn.ReLU(True),
            #torch.nn.Dropout(p=0.5),
            torch.nn.Linear(4096, 4096),
            torch.nn.ReLU(True),
            #torch.nn.Dropout(p=0.5),
            torch.nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        x = self.block_1(x)
        x = self.block_2(x)
        x = self.block_3(x)
        x = self.block_4(x)
        x = self.block_5(x)
        x = self.classifier(x)  # logits
        return x


model = VGG16(num_classes=num_classes)

device = torch.device('cuda:0')
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


# In[7]:


_ = train_classifier_simple_v1(num_epochs=num_epochs, model=model,
                               optimizer=optimizer, device=device,
                               train_loader=train_loader,
                               valid_loader=valid_loader,
                               logging_interval=200)
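# Optionally, we can sanity-check the trained single-GPU model on the held-out test set before moving on to the pipelined version. The cell below is a minimal sketch written in plain PyTorch (it deliberately avoids assumptions about the helper functions) that counts top-1 correct predictions over `test_loader`:

# In[ ]:


# Sketch: top-1 test accuracy for the single-GPU reference model.
model.eval()
with torch.no_grad():
    num_correct, num_examples = 0, 0
    for features, targets in test_loader:
        features, targets = features.to(device), targets.to(device)
        logits = model(features)                   # raw class scores
        predictions = torch.argmax(logits, dim=1)  # predicted class per example
        num_correct += (predictions == targets).sum().item()
        num_examples += targets.size(0)
    print(f'Test accuracy: {num_correct / num_examples:.4f}')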
# ## 3) VGG16 with Pipeline Parallelism

# Below, we first define the blocks that we are going to wrap into the pipelined model:

# In[8]:


block_1 = torch.nn.Sequential(
    torch.nn.Conv2d(in_channels=3, out_channels=64, kernel_size=(3, 3),
                    stride=(1, 1), padding=1),  # (1(32-1) - 32 + 3)/2 = 1
    torch.nn.ReLU(),
    torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3, 3),
                    stride=(1, 1), padding=1),
    torch.nn.ReLU(),
    torch.nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
)

block_2 = torch.nn.Sequential(
    torch.nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3, 3),
                    stride=(1, 1), padding=1),
    torch.nn.ReLU(),
    torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(3, 3),
                    stride=(1, 1), padding=1),
    torch.nn.ReLU(),
    torch.nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
)

block_3 = torch.nn.Sequential(
    torch.nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3, 3),
                    stride=(1, 1), padding=1),
    torch.nn.ReLU(),
    torch.nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3, 3),
                    stride=(1, 1), padding=1),
    torch.nn.ReLU(),
    torch.nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3, 3),
                    stride=(1, 1), padding=1),
    torch.nn.ReLU(),
    torch.nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
)

block_4 = torch.nn.Sequential(
    torch.nn.Conv2d(in_channels=256, out_channels=512, kernel_size=(3, 3),
                    stride=(1, 1), padding=1),
    torch.nn.ReLU(),
    torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3),
                    stride=(1, 1), padding=1),
    torch.nn.ReLU(),
    torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3),
                    stride=(1, 1), padding=1),
    torch.nn.ReLU(),
    torch.nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
)

block_5 = torch.nn.Sequential(
    torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3),
                    stride=(1, 1), padding=1),
    torch.nn.ReLU(),
    torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3),
                    stride=(1, 1), padding=1),
    torch.nn.ReLU(),
    torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3, 3),
                    stride=(1, 1), padding=1),
    torch.nn.ReLU(),
    torch.nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
)

classifier = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(512, 4096),
    torch.nn.ReLU(True),
    #torch.nn.Dropout(p=0.5),
    torch.nn.Linear(4096, 4096),
    torch.nn.ReLU(True),
    #torch.nn.Dropout(p=0.5),
    torch.nn.Linear(4096, num_classes),
)
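# Before distributing the blocks, we can chain them on the CPU and pass a dummy CIFAR-10-sized batch through to confirm that the shapes line up. This is a small sketch for illustration only (the dummy tensor is not part of the training data):

# In[ ]:


# Sketch: shape check with a dummy batch of 2 images of size 3x32x32.
with torch.no_grad():
    dummy = torch.rand(2, 3, 32, 32)
    for blk in (block_1, block_2, block_3, block_4, block_5, classifier):
        dummy = blk(dummy)
        print(dummy.shape)
# Expected: the spatial size halves after each block (32 -> 16 -> 8 -> 4 -> 2 -> 1),
# and the classifier returns a (2, num_classes) logit tensor.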
# Before setting up the environment for the distributed run, we check whether the distributed setting is supported on our machine. The following should return `True`:

# In[9]:


torch.distributed.is_available()


# Next, we set the following environment variables for your machine.
# For `MASTER_ADDR`, just use the IP address of your machine, e.g., 123.45.67.89:

# In[ ]:


get_ipython().run_line_magic('env', 'MASTER_ADDR=xxx.xx.xx.xx')


# Choose a free port:

# In[11]:


get_ipython().run_line_magic('env', 'MASTER_PORT=8891')


# Set up the RPC framework if it is not already running (more details at https://pytorch.org/docs/stable/rpc.html):

# In[12]:


try:
    torch.distributed.rpc.init_rpc(name='node1', rank=0, world_size=1)
except RuntimeError as e:
    if str(e) == 'Address already in use':
        pass
    else:
        raise RuntimeError(e)


# This is the main part for running the model on multiple GPUs:
#
# 1. We place each block on a GPU and wrap the individual blocks into a `Sequential` model.
# 2. The `chunks` argument refers to the number of microbatches; for more details, see https://pytorch.org/docs/1.8.0/pipeline.html?highlight=pipeline#

# In[13]:


from torch.distributed.pipeline.sync import Pipe

block_1 = block_1.cuda(0)
block_2 = block_2.cuda(0)
block_3 = block_3.cuda(2)
block_4 = block_4.cuda(2)
block_5 = block_5.cuda(3)
classifier = classifier.cuda(0)

model_parallel = torch.nn.Sequential(
    block_1, block_2, block_3, block_4, block_5, classifier)

model_parallel = Pipe(model_parallel, chunks=8)

optimizer = torch.optim.Adam(model_parallel.parameters(), lr=learning_rate)


# In[ ]:


_ = train_classifier_simple_v1(num_epochs=num_epochs, model=model_parallel,
                               optimizer=optimizer, device=torch.device('cuda:0'),
                               train_loader=train_loader,
                               valid_loader=valid_loader,
                               logging_interval=200)


# As we can see, training is slower than before. But this is expected: the main selling point of pipeline parallelism is to utilize more GPUs when a model does not fit into the memory of a single GPU, not to speed up training.
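# To see that the model is indeed spread over several devices, we can inspect how much memory each visible GPU currently holds. The following is a small sketch using `torch.cuda.memory_allocated`, assuming the multi-GPU placement used above:

# In[ ]:


# Sketch: report per-GPU allocated memory to confirm the pipeline stages
# are distributed across devices.
for i in range(torch.cuda.device_count()):
    allocated_mb = torch.cuda.memory_allocated(i) / 1024**2
    print(f'cuda:{i} -- {allocated_mb:.1f} MB allocated')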