#!/usr/bin/env python
# coding: utf-8

# ### Data Augmentation in PyTorch

# We will use the Caltech-256 image dataset in this tutorial. The dataset contains a total of 30,607 images spanning 256 object categories (plus a clutter category). You can download the dataset [here](http://www.vision.caltech.edu/Image_Datasets/Caltech256/).

# When carrying out deep learning tasks involving images, you can use a host of image augmentation techniques.
# 
# In this article, we will focus on the ones we can implement programmatically. Let's take a look at some of those:
# 
# * `Resize`: resizes an image. This helps in particular when you have very high-resolution images and want to bring them down to a lower resolution, which can make neural network training much faster.
# * `Cropping`: crops an image. Programmatically, the two most common variants are center cropping and random cropping.
# * `Flipping`: flips an image vertically or horizontally, changing its orientation.
# * `Rotating`: rotates an image by a given number of degrees.

# In[1]:


# imports
import torch
import torchvision.transforms as transforms
import glob
import matplotlib.pyplot as plt
import numpy as np
import torchvision
import time

from torch.utils.data import DataLoader, Dataset
from PIL import Image

# All the images are stored according to the category they belong to, with each category as its own directory. We can use the glob module to collect all the image paths into a list.

# In[2]:


image_list = glob.glob('256_ObjectCategories/*/*.jpg')
print(len(image_list))

# The PyTorch transforms module will help define all the image augmentations and transforms that we need to apply to the images.

# In[3]:


# define pytorch transforms
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((300, 300)),
    transforms.CenterCrop((100, 100)),
    transforms.RandomCrop((80, 80)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=(-90, 90)),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Next, we write our custom dataset class, then prepare the dataset and data loader that apply the PyTorch transforms and image augmentations.

# In[4]:


# PyTorch image augmentation dataset
class PyTorchImageDataset(Dataset):
    def __init__(self, image_list, transforms=None):
        self.image_list = image_list
        self.transforms = transforms

    def __len__(self):
        return len(self.image_list)

    def __getitem__(self, i):
        image = plt.imread(self.image_list[i])
        image = Image.fromarray(image).convert('RGB')
        image = np.asarray(image).astype(np.uint8)
        if self.transforms is not None:
            # ToTensor() in the pipeline already returns a float tensor
            return self.transforms(image).float()
        return torch.tensor(image, dtype=torch.float)

# In[5]:


pytorch_dataset = PyTorchImageDataset(image_list=image_list, transforms=transform)
pytorch_dataloader = DataLoader(dataset=pytorch_dataset, batch_size=16, shuffle=True)

# In[6]:


def show_img(img):
    plt.figure(figsize=(18, 15))
    # unnormalize
    img = img / 2 + 0.5
    npimg = img.numpy()
    npimg = np.clip(npimg, 0., 1.)
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()

# We can now visualize a single batch of images. Plotting them gives us an idea of how the transforms are being applied.

# In[9]:


data = iter(pytorch_dataloader)
images = next(data)  # note: `data.next()` is Python 2 syntax; use next(data)

# show images
show_img(torchvision.utils.make_grid(images))

# We can clearly see that the image augmentations have been applied: all the images have been resized, cropped, and rotated.
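# Before timing the full data loader, it can help to sanity-check the pipeline on a single image. A minimal sketch, assuming `image_list` is non-empty and the file is an RGB JPEG:

# In[ ]:


# hedged sketch: run one image through the pipeline and inspect the output
sample = plt.imread(image_list[0])   # H x W x 3 uint8 array for an RGB JPEG
augmented = transform(sample)        # 3 x 80 x 80 float tensor after RandomCrop
print(augmented.shape, augmented.min().item(), augmented.max().item())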
# In[8]:


start = time.time()
for i, data in enumerate(pytorch_dataloader):
    images = data
end = time.time()
time_spent = (end - start) / 60
print(f"{time_spent:.3} minutes")

# In simple terms, on top of the per-epoch training time, carrying out all of the above image augmentations adds an extra 2.33 minutes per epoch. That is a lot of time if you plan to train a model for more than a few hundred epochs.

# ### Timm Lib
# 
# `timm` is a deep learning library created by Ross Wightman. It is a collection of SOTA computer vision models, layers, utilities, optimizers, schedulers, data loaders, augmentations, and training/validation scripts, with the ability to reproduce ImageNet training results. [Link](https://fastai.github.io/timmdocs/)

# `timm` supports a wide variety of augmentations, one of which is Mixup. CutMix followed Mixup, and most deep learning practitioners use one of the two in their training pipelines to improve performance.

# In[ ]:


pip install timm

# In[1]:


import torch
from timm.data.mixup import Mixup
from timm.data.dataset import ImageDataset
from timm.data.loader import create_loader

# In[53]:


def get_dataset_and_loader(mixup_args):
    mixup_fn = Mixup(**mixup_args)
    dataset = ImageDataset('imagenette2-320')
    loader = create_loader(dataset,
                           input_size=(3, 224, 224),
                           batch_size=4,
                           is_training=True,
                           use_prefetcher=False)
    return mixup_fn, dataset, loader

# Visualize a few images with Mixup

# In[54]:


import torchvision
import numpy as np
from matplotlib import pyplot as plt

# In[55]:


def imshow(inp, title=None):
    """Imshow for Tensor."""
    inp = inp.numpy().transpose((1, 2, 0))
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    inp = std * inp + mean
    inp = np.clip(inp, 0, 1)
    plt.imshow(inp)
    if title is not None:
        plt.title(title)
    plt.pause(0.001)  # pause a bit so that plots are updated

# In[56]:


mixup_args = {
    'mixup_alpha': 1.,
    'cutmix_alpha': 0.,
    'cutmix_minmax': None,
    'prob': 1.0,
    'switch_prob': 0.,
    'mode': 'batch',
    'label_smoothing': 0,
    'num_classes': 1000}

# In[59]:


mixup_fn, dataset, loader = get_dataset_and_loader(mixup_args)

inputs, classes = next(iter(loader))
out = torchvision.utils.make_grid(inputs)
imshow(out, title=[x.item() for x in classes])

# In[60]:


inputs, classes = mixup_fn(inputs, classes)
out = torchvision.utils.make_grid(inputs)
imshow(out, title=[x.item() for x in classes.argmax(1)])

# Visualize a few images with CutMix

# In[37]:


mixup_args = {
    'mixup_alpha': 0.,
    'cutmix_alpha': 1.0,
    'cutmix_minmax': None,
    'prob': 1.0,
    'switch_prob': 0.,
    'mode': 'batch',
    'label_smoothing': 0,
    'num_classes': 1000}

# In[38]:


mixup_fn, dataset, loader = get_dataset_and_loader(mixup_args)

inputs, classes = next(iter(loader))
out = torchvision.utils.make_grid(inputs)
imshow(out, title=[x.item() for x in classes])

# In[39]:


inputs, classes = mixup_fn(inputs, classes)
out = torchvision.utils.make_grid(inputs)
imshow(out, title=[x.item() for x in classes.argmax(1)])

# Until now, all operations were applied batch-wise; that is, the same mixing was applied to every element in a batch. By passing `mode='elem'` to the `Mixup` function, we can change this to element-wise. In this case, CutMix or Mixup is applied to each item in the batch individually, based on `mixup_args`, as the demo below shows.
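# One practical note before the element-wise demo: `Mixup` converts the integer labels into soft targets, so a training loop needs a loss that accepts them. `timm` provides `SoftTargetCrossEntropy` for this. A minimal sketch (the random logits below are a stand-in for a real model's output, not part of the original tutorial):

# In[ ]:


from timm.loss import SoftTargetCrossEntropy

criterion = SoftTargetCrossEntropy()

inputs, classes = next(iter(loader))
mixed_inputs, soft_targets = mixup_fn(inputs, classes)
logits = torch.randn(mixed_inputs.size(0), 1000)  # stand-in for model(mixed_inputs)
print(criterion(logits, soft_targets).item())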
# In[40]:


mixup_args = {
    'mixup_alpha': 0.3,
    'cutmix_alpha': 0.3,
    'cutmix_minmax': None,
    'prob': 1.0,
    'switch_prob': 0.5,
    'mode': 'elem',
    'label_smoothing': 0,
    'num_classes': 1000}

# In[43]:


mixup_fn, dataset, loader = get_dataset_and_loader(mixup_args)

inputs, classes = next(iter(loader))
out = torchvision.utils.make_grid(inputs)
imshow(out, title=[x.item() for x in classes])

# In[44]:


inputs, classes = mixup_fn(inputs, classes)
out = torchvision.utils.make_grid(inputs)
imshow(out, title=[x.item() for x in classes.argmax(1)])

# ## PyTorch Lightning Tutorial
# 
# PyTorch makes it easy to build complex AI models. But once the research gets complicated and things like multi-GPU training, 16-bit precision, and TPU training get mixed in, users are likely to introduce bugs.

# PyTorch Lightning solves exactly this problem. Lightning structures your PyTorch code so that it can abstract away the details of training. This makes AI research scalable and fast to iterate on.

# ### MNIST example in plain PyTorch

# In[1]:


import torch
from torch import nn
from torch import optim
from torchvision import datasets, transforms
from torch.utils.data import random_split, DataLoader

# In[2]:


# Define my model: a small MLP with a residual connection (hence the name)
class ResNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(28 * 28, 64)
        self.l2 = nn.Linear(64, 64)
        self.l3 = nn.Linear(64, 10)
        self.do = nn.Dropout(0.1)

    def forward(self, x):
        h1 = nn.functional.relu(self.l1(x))
        h2 = nn.functional.relu(self.l2(h1))
        do = self.do(h2 + h1)
        logits = self.l3(do)
        return logits

model = ResNet()

# That's it! This model defines the computational graph that takes an MNIST image as input and converts it to class scores (logits) over the 10 digits 0-9.

# In[3]:


# Define my optimiser
optimiser = optim.SGD(model.parameters(), lr=1e-2)

# In[4]:


# Define my loss
loss = nn.CrossEntropyLoss()

# In[5]:


# Train, Val split
train_data = datasets.MNIST('data', train=True, download=True, transform=transforms.ToTensor())
train, val = random_split(train_data, [55000, 5000])
train_loader = DataLoader(train, batch_size=32)
val_loader = DataLoader(val, batch_size=32)

# In[6]:


# My training and validation loops
nb_epochs = 5
for epoch in range(nb_epochs):
    losses = list()
    accuracies = list()
    model.train()  # because I use Dropout
    for batch in train_loader:
        x, y = batch

        # x: b x 1 x 28 x 28
        b = x.size(0)
        x = x.view(b, -1)

        # 1 forward
        l = model(x)

        # 2 compute the objective function
        J = loss(l, y)

        # 3 clean the gradients
        model.zero_grad()
        # optimiser.zero_grad()

        # 4 accumulate the partial derivatives of J w.r.t. the parameters
        J.backward()

        # 5 step in the opposite direction of the gradient
        optimiser.step()

        losses.append(J.item())
        accuracies.append(y.eq(l.detach().argmax(dim=1)).float().mean())

    print(f'Epoch {epoch + 1}', end=', ')
    print(f'training loss: {torch.tensor(losses).mean():.2f}', end=', ')
    print(f'training accuracy: {torch.tensor(accuracies).mean():.2f}')

    losses = list()
    accuracies = list()
    model.eval()
    for batch in val_loader:
        x, y = batch

        # x: b x 1 x 28 x 28
        b = x.size(0)
        x = x.view(b, -1)

        # 1 forward
        with torch.no_grad():
            l = model(x)

        # 2 compute the objective function
        J = loss(l, y)

        losses.append(J.item())
        accuracies.append(y.eq(l.detach().argmax(dim=1)).float().mean())

    print(f'Epoch {epoch + 1}', end=', ')
    print(f'validation loss: {torch.tensor(losses).mean():.2f}', end=', ')
    print(f'validation accuracy: {torch.tensor(accuracies).mean():.2f}')
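# One more piece of boilerplate the manual loop leaves to us is checkpointing. A minimal sketch of saving and restoring the trained weights (the filename is an arbitrary choice):

# In[ ]:


# save the trained weights
torch.save(model.state_dict(), 'mnist_mlp.pt')

# restore them into a fresh instance of the same architecture
restored = ResNet()
restored.load_state_dict(torch.load('mnist_mlp.pt'))
restored.eval()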
# ### PyTorch Lightning version

# #### PyTorch Lightning
# 
# 1. model/network architecture
# 2. optimizer
# 3. data
# 4. training loop ("the magic")
# 5. validation loop ("the validation magic")

# In[20]:


import pytorch_lightning as pl
from torchmetrics.functional import accuracy


class ResNet(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(28 * 28, 64)
        self.l2 = nn.Linear(64, 64)
        self.l3 = nn.Linear(64, 10)
        self.do = nn.Dropout(0.1)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, x):
        h1 = nn.functional.relu(self.l1(x))
        h2 = nn.functional.relu(self.l2(h1))
        do = self.do(h2 + h1)
        logits = self.l3(do)
        return logits

    def configure_optimizers(self):
        optimiser = optim.Adam(self.parameters(), lr=1e-2)
        return optimiser

    def training_step(self, batch, batch_idx):
        x, y = batch

        # x: b x 1 x 28 x 28
        b = x.size(0)
        x = x.view(b, -1)

        # 1 forward (use self(x), not a global `model`)
        logits = self(x)

        # 2 compute the objective function
        J = self.loss(logits, y)

        # optionally, log the accuracy to the progress bar:
        # acc = accuracy(logits, y)
        # self.log('train_acc', acc, prog_bar=True)
        return J

    def validation_step(self, batch, batch_idx):
        # stubbed for now; a sketch completing it appears after the
        # differences list below
        pass

    def train_dataloader(self):
        train_data = datasets.MNIST('data', train=True, download=True,
                                    transform=transforms.ToTensor())
        train_loader = DataLoader(train_data, batch_size=32)
        return train_loader

    def val_dataloader(self):
        # stubbed for now
        pass


model = ResNet()

# #### Model
# This means you can use a LightningModule exactly as you would a plain PyTorch module.
# 
# #### Data
# In short, data preparation has 3 steps:
# 1. Image transforms (these are highly subjective).
# 2. Generate training, validation, and test dataset splits.
# 3. Wrap each dataset split in a DataLoader.
# 
# #### Optimizer
# Here we choose how we are going to do the optimization. We'll use Adam instead of SGD because it is a good default in most DL research. In PyTorch, the optimizer is given the weights to optimize when we initialize it.
# 
# #### Training loop
# Notice a few things about this structure:
# 1. It is highly organized.
# 2. It is the SAME PyTorch code, except it's been organized.
# 3. The inner loop of the PyTorch training code became the ```training_step```. But we didn't have to do any of the gradient bookkeeping, because Lightning does it automatically!

# To train the Lightning MNIST model, we simply use the Lightning Trainer, which automatically handles all the other stuff from the PyTorch version: the boilerplate we would otherwise have to duplicate in every single project we start.

# In[24]:


trainer = pl.Trainer(progress_bar_refresh_rate=20,
                     max_epochs=5,
                     gpus=1,
                     fast_dev_run=1)  # fast_dev_run does a quick sanity pass; remove it for a full run
trainer.fit(model)

# Save/load model: Lightning writes checkpoints and logs under `lightning_logs/` automatically.

# In[22]:


ls lightning_logs/

# ## PyTorch and PyTorch Lightning differences
# 
# Let's call out a few differences:
# 1. Without Lightning, the PyTorch code is allowed to be in arbitrary parts of the file. With Lightning, this is structured. This is why Lightning is more of a template for PyTorch than a framework!
# 2. In Lightning we didn't have to write any training loops. Instead, we wrote what happens inside the loop:
# 
# ```python
# for epoch in range(epochs):
#     for batch in data:
#         # training_step
#         loss.backward()
#         optimizer.step()
# ```
# 3. We saw the weights summarized.
# 4. We got a free progress bar.
# 5. The validation and training loops were automated.
# 6. We saved weights automatically.
# 7. We got free TensorBoard logging.
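# Regarding points 5 and 6 above: the `validation_step` and `val_dataloader` hooks were left as stubs in the LightningModule. A hedged sketch of how they could be completed (these methods go inside the `ResNet(pl.LightningModule)` class; the MNIST test split stands in for a validation set to keep the sketch self-contained):
# 
# ```python
# def validation_step(self, batch, batch_idx):
#     x, y = batch
#     x = x.view(x.size(0), -1)
#     logits = self(x)
#     J = self.loss(logits, y)
#     self.log('val_loss', J)  # picked up by the default TensorBoard logger
#     return J
# 
# def val_dataloader(self):
#     val_data = datasets.MNIST('data', train=False, download=True,
#                               transform=transforms.ToTensor())
#     return DataLoader(val_data, batch_size=32)
# ```
# Lightning calls both hooks automatically during `fit`, so no extra loop is needed.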
# ## Additional PyTorch Lightning Features
# 
# Although those benefits are super nice, the real benefits don't show up until your project gets more complicated.

# ### Single GPU training
# For instance, assume you want to run this same code on a GPU. To do that in the plain PyTorch example, you'd have to make changes like these (note that modules and tensors are moved with `.cuda()` or `.to()`, not `.gpu()`):
# 
# ```python
# model.cuda(0)
# 
# for batch in data:
#     x, y = batch
#     x, y = x.cuda(0), y.cuda(0)
# ```
# 
# But in Lightning you don't have to change your code! Simply add a trainer flag:

# In[ ]:


Trainer(gpus=1)

# ### TPU training
# How about running on a TPU? Enabling this in the plain PyTorch example would be very complicated and is outside the scope of this tutorial.
# 
# You would also have to install a library called torch_xla (for both PyTorch and Lightning).
# 
# But after installing it, in Lightning you just set this flag:

# In[ ]:


Trainer(tpu_cores=8)

# ### Multiple GPUs
# What about multiple GPUs? In the plain PyTorch example, you would have to use the DataParallel or DistributedDataParallel classes, which are also outside the scope of this tutorial.
# 
# In Lightning you would do:

# In[ ]:


Trainer(gpus=8, accelerator='ddp')  # `distributed_backend='ddp'` in older Lightning versions

# ### 16-bit precision
# What about 16-bit precision? In the plain PyTorch example, that would again get very complicated, and it is outside the scope of this tutorial.
# 
# Historically this required NVIDIA's apex library (for both PyTorch and Lightning); newer PyTorch versions ship native automatic mixed precision.
# 
# But in Lightning you can just set the flag:

# In[ ]:


Trainer(precision=16)

# ### Logging
# Remember that you got TensorBoard for free? Lightning supports more advanced visualization platforms, which again are trivial to switch to:

# In[ ]:


# replace tensorboard
from pytorch_lightning.loggers import CometLogger
Trainer(logger=CometLogger(...))

# ### Finding bottlenecks
# What if your code is slow and you want to figure out where the issues are? You can try the [PyTorch bottleneck utility](https://pytorch.org/docs/stable/bottleneck.html), which can be complicated to run and interpret.
# 
# But in Lightning you can get basic profiling by doing the following:

# In[ ]:


Trainer(profiler=True)

# ### 40+ other features
# The point is that just by organizing your PyTorch code into a LightningModule, you get access to features like the ones above and 40+ others, such as:
# 
# - Gradient clipping
# - Automatic truncated backpropagation through time
# - HPC cluster auto-resubmission
# - Early stopping
# - ...
# - Built-in debugging tests
# - Multi-node training (yup!)

# ## Reusability and Readability
# The main underlying benefit behind all of this is that with Lightning you've successfully abstracted out all the code that is hardware-specific or related to engineering. This makes your code highly readable and easier to reproduce, because it isn't littered with engineering code.

# ## Rigorous testing
# Another benefit of Lightning is that it is rigorously tested daily. If Lightning handles 90% of your code, that part is very well tested and known to be correct. This limits the footprint of bugs to the 10% you have to write.