#!/usr/bin/env python
# coding: utf-8

# # Convolutional Neural Nets
#
# This type of neural net is predominantly (and heavily) used in image processing. This lesson works with the CIFAR-10 dataset.
#
# ## Useful terms:
#
# 1. Conv2D
# 2. MaxPool2D
# 3. BatchNormalization
#
# ## Further Reading:
# https://ujjwalkarn.me/2016/08/11/intuitive-explanation-convnets/

# In[1]:


get_ipython().system('pip install tqdm')


# In[1]:


import numpy as np
import matplotlib.pyplot as plt
from urllib.request import urlretrieve
from os.path import isfile, isdir
from tqdm import tqdm
import tarfile
import pickle

from keras.models import Sequential
from keras.layers import Dense, Activation, Conv2D, MaxPool2D, Flatten, BatchNormalization, Dropout

get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:


cifar10_dataset_folder_path = 'cifar-10-batches-py'


class DLProgress(tqdm):
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        self.total = total_size
        self.update((block_num - self.last_block) * block_size)
        self.last_block = block_num


if not isfile('cifar-10-python.tar.gz'):
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc='CIFAR-10 Dataset') as pbar:
        urlretrieve(
            'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz',
            'cifar-10-python.tar.gz',
            pbar.hook)

if not isdir(cifar10_dataset_folder_path):
    with tarfile.open('cifar-10-python.tar.gz') as tar:
        tar.extractall()

label_dict = dict(zip(range(10), ['airplane', 'automobile', 'bird', 'cat', 'deer',
                                  'dog', 'frog', 'horse', 'ship', 'truck']))

width = height = 32
channels = 3
train_examples = 50000

# Get the test set
with open(cifar10_dataset_folder_path + '/test_batch', mode='rb') as file:
    batch = pickle.load(file, encoding='latin1')
test_x = batch['data'].reshape((len(batch['data']), 3, 32, 32)).transpose(0, 2, 3, 1)/255
test_y = batch['labels']


# In[4]:


def get_batch(batch_size):
    n_batches = 5
    while True:
        for batch_id in range(1, n_batches + 1):
            with open(cifar10_dataset_folder_path + '/data_batch_' + str(batch_id), mode='rb') as file:
                batch = pickle.load(file, encoding='latin1')
            features = batch['data'].reshape((len(batch['data']), 3, 32, 32)).transpose(0, 2, 3, 1)
            labels = batch['labels']
            for start in range(0, len(features), batch_size):
                end = min(start + batch_size, len(features))
                yield features[start:end]/255, np.array(labels[start:end])


# In[5]:


x, y = next(get_batch(5))
for im, label in zip(x, y):
    plt.imshow(im)
    plt.title(label_dict[label])
    plt.show()


# ## Basic logistic multiclass classification:

# In[6]:


batch_size = 1000
gen = get_batch(batch_size)
x_train, y_train = next(gen)


# In[7]:


from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression()
logistic.fit(x_train.reshape(batch_size, -1), y_train)


# In[8]:


y_pred = logistic.predict(test_x.reshape(len(test_x), -1))
y_pred[:10]


# Predicting the probabilities for the first 3 images:

# In[9]:


logistic.predict_proba(test_x[:3].reshape(3, -1))


# Accuracy of the predictions:

# In[10]:


np.count_nonzero(y_pred == test_y)/len(test_y)


# The number of parameters for a fully connected layer mapping all 32*32*3 input values to the 10 classes:

# In[11]:


32*32*3*10


# ## Keras Multilayered Perceptron (Neural Net)

# In[12]:


def get_batch(batch_size):
    # Same generator as before, but each image is flattened into a vector for the dense network.
    n_batches = 5
    while True:
        for batch_id in range(1, n_batches + 1):
            with open(cifar10_dataset_folder_path + '/data_batch_' + str(batch_id), mode='rb') as file:
                batch = pickle.load(file, encoding='latin1')
            features = batch['data'].reshape((len(batch['data']), 3, 32, 32)).transpose(0, 2, 3, 1)
            labels = batch['labels']
            for start in range(0, len(features), batch_size):
                end = min(start + batch_size, len(features))
                x = features[start:end]/255
                y = labels[start:end]
                yield x.reshape(len(x), -1), np.array(y)


# It is important to note that for classification problems we use the **Categorical Crossentropy Loss**. When there are only two classes we can use the Logistic Loss (Binary Crossentropy Loss). For regression problems we use the **Mean Squared Error**.
#
# The Cross Entropy loss is defined as:
# $$\mathcal{L} = -\frac{1}{N}\sum_i \left[\mathcal{I}(y_i=1)\log(p_{i1})+\mathcal{I}(y_i=2)\log(p_{i2})+\cdots+\mathcal{I}(y_i=K)\log(p_{iK})\right]$$
# where $N$ is the number of training instances, $K$ is the number of classes and $p_{ik}$ is the predicted probability that instance $i$ belongs to class $k$.
#
# Softmax takes a $D$-dimensional vector and squeezes it through a function such that the $D$ outputs are positive and sum to one.
# $$
# \text{softmax}(\mathbf{y})_d = \frac{\exp(y_d)}{\exp(y_1)+\cdots+\exp(y_D)}
# $$

# ### 1 Hidden Layer

# In[2]:


get_ipython().run_line_magic('pinfo', 'Dense')


# In[13]:


model = Sequential()
# TODO: Do a 'Normal' 1 Hidden layer NN (Refresher https://keras.io/#getting-started-30-seconds-to-keras)
# Note that the number of inputs is width*height*channels
# The last layer is a softmax layer (it outputs the probability of the ten classes)
# The loss function is 'sparse_categorical_crossentropy' and use either 'adagrad' or 'adadelta' as the optimizer
# (One possible solution is sketched in the next cell.)

model.summary()
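# A possible completion of the exercise above -- a minimal sketch. Only the input size, the softmax
# output, the loss and the optimizer are prescribed by the TODO; the hidden-layer width of 512 and
# the 'relu' activation are assumptions, and other reasonable choices work just as well.

# In[ ]:


model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(width*height*channels,)))  # hidden layer on the flattened image
model.add(Dense(10, activation='softmax'))                                      # probabilities for the ten classes
model.compile(optimizer='adagrad', loss='sparse_categorical_crossentropy')
model.summary()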
# In[14]:


batch_size = 256
model.fit_generator(get_batch(batch_size=batch_size), train_examples//batch_size, epochs=1)


# In[15]:


y_pred = model.predict_classes(test_x.reshape(len(test_x), -1))
np.count_nonzero(y_pred == test_y)/len(test_y)


# ## Convolutional Neural Networks (CNN)

# **Points to note**
# 1. One convolution filter, connected to **one** node above, is simply a Dense layer with most of its weights set to zero.
# 2. The same filter, connected to multiple nodes, is weight tying/sharing.
#
# Consider the following convolution mask:
#
# ![](cnn.png)

# In[16]:


def get_batch(batch_size):
    # Same generator again, but the images keep their (32, 32, 3) shape for the convolutional network.
    n_batches = 5
    while True:
        for batch_id in range(1, n_batches + 1):
            with open(cifar10_dataset_folder_path + '/data_batch_' + str(batch_id), mode='rb') as file:
                batch = pickle.load(file, encoding='latin1')
            features = batch['data'].reshape((len(batch['data']), 3, 32, 32)).transpose(0, 2, 3, 1)
            labels = batch['labels']
            for start in range(0, len(features), batch_size):
                end = min(start + batch_size, len(features))
                x = features[start:end]/255
                y = labels[start:end]
                yield x, np.array(y)


# ### Using the max pooling layer:

# In[7]:


get_ipython().run_line_magic('pinfo', 'Conv2D')


# In[ ]:


get_ipython().run_line_magic('pinfo', 'MaxPool2D')


# In[14]:


model = Sequential()
# TODO: Get 3 layers of Conv2D followed by MaxPool2D
# The first layer requires input_shape = (width, height, channels)
# Set activation='relu' and padding='same' in all Conv2D layers; MaxPool2D does not have an activation
# All you need to specify is the kernel_size and filters parameters
# As a rule of thumb the number of filters doubles from layer to layer, e.g. choose 4, 8, 16 for the 3 layers
# (One possible solution is sketched after the model.summary() cell below.)

model.add(Flatten())
model.add(Dense(10, activation='softmax'))
# Note: you do not apply dropout to the final layer.

model.compile(optimizer='adadelta', loss='sparse_categorical_crossentropy')


# In[15]:


model.summary()
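# A possible completion of the convolution exercise above -- a sketch only. The (3, 3) kernel size
# matches the parameter-count cells below; the filter counts 8, 16, 32 (doubling per layer) are an
# assumption chosen so those counts line up, and the 4, 8, 16 suggested in the TODO works as well.

# In[ ]:


model = Sequential()
model.add(Conv2D(8, kernel_size=(3, 3), padding='same', activation='relu',
                 input_shape=(width, height, channels)))                       # 3*3*3*8 + 8 = 224 parameters
model.add(MaxPool2D(pool_size=(2, 2)))
model.add(Conv2D(16, kernel_size=(3, 3), padding='same', activation='relu'))   # 3*3*8*16 + 16 = 1168 parameters
model.add(MaxPool2D(pool_size=(2, 2)))
model.add(Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu'))
model.add(MaxPool2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(10, activation='softmax'))
model.compile(optimizer='adadelta', loss='sparse_categorical_crossentropy')
model.summary()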
# In[ ]:


32*32*3  # number of input values per image (width * height * channels)


# In[5]:


3*3*3*8+8  # parameters in the first conv layer: 3x3 kernel, 3 input channels, 8 filters, plus 8 biases


# In[7]:


3*3*8*16+16  # parameters in the second conv layer: 3x3 kernel, 8 input channels, 16 filters, plus 16 biases


# In[16]:


batch_size = 256
model.fit_generator(get_batch(batch_size=batch_size), train_examples//batch_size, epochs=5)


# In[17]:


y_pred = model.predict_classes(test_x)
np.count_nonzero(y_pred == test_y)/len(test_y)


# In[18]:


plt.figure(figsize=(12, 12))
idx = np.random.choice(len(test_x), 5, replace=False)
p = model.predict(test_x[idx])
for i in range(len(idx)):
    plt.subplot(5, 2, 2*i+1)
    plt.imshow(test_x[idx[i]])
    plt.title(label_dict[test_y[idx[i]]])
    # plt.show()
    pred_label = np.argsort(-p[i])[:3]
    pred_prob = [p[i][l] for l in pred_label]
    pred_label = [label_dict[l] for l in pred_label]
    plt.subplot(5, 2, 2*i+2)
    plt.bar(range(3), pred_prob)
    plt.xticks(range(3), pred_label)
    # plt.show()
plt.show()


# ## Batch Normalization
#
# Batch Normalization rescales the output of the weight multiplication to zero mean and unit variance **before** it is passed through the activation layer. This keeps the gradients from becoming too large or too small, which makes learning faster.
#
# https://www.quora.com/Why-does-batch-normalization-help

# In[3]:


get_ipython().run_line_magic('pinfo', 'BatchNormalization')


# In[36]:


model = Sequential()
model.add(Conv2D(8, kernel_size=(3, 3), padding='same', input_shape=(width, height, channels)))
model.add(Activation('relu'))
model.add(MaxPool2D(pool_size=(2, 2)))

model.add(Conv2D(16, kernel_size=(3, 3), padding='same'))
# TODO: add a BatchNormalization() layer
model.add(Activation('relu'))
# TODO: add a MaxPool2D layer

# TODO: Add another set of Conv2D followed by BatchNormalization, followed by relu activation, followed by maxpool (4 lines of code)

# TODO: flatten the layer
# TODO: Add the last softmax layer
# (One possible solution is sketched at the end of the notebook.)

model.compile(optimizer='adadelta', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


# In[37]:


model.summary()


# In[32]:


batch_size = 256
model.fit_generator(get_batch(batch_size=batch_size), train_examples//batch_size, epochs=5)


# In[33]:


y_pred = model.predict_classes(test_x)
np.count_nonzero(y_pred == test_y)/len(test_y)


# In[34]:


plt.figure(figsize=(12, 12))
idx = np.random.choice(len(test_x), 5, replace=False)
p = model.predict(test_x[idx])
for i in range(len(idx)):
    plt.subplot(5, 2, 2*i+1)
    plt.imshow(test_x[idx[i]])
    plt.title(label_dict[test_y[idx[i]]])
    # plt.show()
    pred_label = np.argsort(-p[i])[:3]
    pred_prob = [p[i][l] for l in pred_label]
    pred_label = [label_dict[l] for l in pred_label]
    plt.subplot(5, 2, 2*i+2)
    plt.bar(range(3), pred_prob)
    plt.xticks(range(3), pred_label)
    # plt.show()
plt.show()


# In[ ]:
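# A possible completion of the Batch Normalization exercise above -- a sketch for reference.
# The first two blocks follow the layer sizes given in the exercise; the third block's 32 filters
# is an assumption, continuing the doubling pattern.

# In[ ]:


model = Sequential()
model.add(Conv2D(8, kernel_size=(3, 3), padding='same', input_shape=(width, height, channels)))
model.add(Activation('relu'))
model.add(MaxPool2D(pool_size=(2, 2)))

model.add(Conv2D(16, kernel_size=(3, 3), padding='same'))
model.add(BatchNormalization())   # normalize before the activation
model.add(Activation('relu'))
model.add(MaxPool2D(pool_size=(2, 2)))

model.add(Conv2D(32, kernel_size=(3, 3), padding='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(MaxPool2D(pool_size=(2, 2)))

model.add(Flatten())
model.add(Dense(10, activation='softmax'))

model.compile(optimizer='adadelta', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()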