#!/usr/bin/env python
# coding: utf-8

# # Convolution Nets for MNIST

# Deep Learning models can take quite a bit of time to run, particularly if a GPU isn't used.
#
# In the interest of time, you could sample a subset of observations (e.g. $1000$) that are a particular number of your choice (e.g. $6$) and $1000$ observations that aren't that particular number (i.e. $\neq 6$).
#
# We will build a model on a subset of this kind and see how it performs on the test dataset.

# In[ ]:

# Import the required libraries
import numpy as np
np.random.seed(1338)

from keras.datasets import mnist


# In[ ]:

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten


# In[ ]:

from keras.layers.convolutional import Conv2D
from keras.layers.pooling import MaxPooling2D


# In[ ]:

from keras.utils import np_utils
from keras.optimizers import SGD


# ## Loading Data

# In[ ]:

# Load the training and testing data
(X_train, y_train), (X_test, y_test) = mnist.load_data()


# In[ ]:

# Keep a copy of the original (unscaled) test images for plotting later
X_test_orig = X_test


# ## Data Preparation

# In[ ]:

from keras import backend as K


# #### Very Important:
# When dealing with images & convolutions, it is paramount to handle `image_data_format` properly

# In[ ]:

img_rows, img_cols = 28, 28

if K.image_data_format() == 'channels_first':
    shape_ord = (1, img_rows, img_cols)
else:  # channels_last
    shape_ord = (img_rows, img_cols, 1)


# #### Preprocess and Normalise Data

# In[ ]:

X_train = X_train.reshape((X_train.shape[0],) + shape_ord)
X_test = X_test.reshape((X_test.shape[0],) + shape_ord)

X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Scale pixel intensities from [0, 255] to [0, 1]
X_train /= 255
X_test /= 255


# In[ ]:

np.random.seed(1338)  # for reproducibility!

# Test data
Y = y_test.copy()

# Converting the output to binary classification (Six=1, Not Six=0)
Y_test = (Y == 6).astype(int)

# Selecting the 5918 examples where the output is 6
X_six = X_train[y_train == 6].copy()
Y_six = y_train[y_train == 6].copy()

# Selecting the examples where the output is not 6
X_not_six = X_train[y_train != 6].copy()
Y_not_six = y_train[y_train != 6].copy()

# Selecting 6000 random examples from the data that
# only contains the data where the output is not 6
# (note: indices must be drawn from the *not-six* pool, i.e. X_not_six)
random_rows = np.random.randint(0, X_not_six.shape[0], 6000)
X_not_six = X_not_six[random_rows]
Y_not_six = Y_not_six[random_rows]


# In[ ]:

# Appending the data with output as 6 and data with output as != 6
X_train = np.append(X_six, X_not_six)

# Reshaping the appended data to the appropriate form
X_train = X_train.reshape((X_six.shape[0] + X_not_six.shape[0],) + shape_ord)

# Appending the labels and converting the labels to
# binary classification (Six=1, Not Six=0)
Y_labels = np.append(Y_six, Y_not_six)
Y_train = (Y_labels == 6).astype(int)


# In[ ]:

print(X_train.shape, Y_labels.shape, X_test.shape, Y_test.shape)


# In[ ]:

# Converting the classes to their binary categorical (one-hot) form
nb_classes = 2
Y_train = np_utils.to_categorical(Y_train, nb_classes)
Y_test = np_utils.to_categorical(Y_test, nb_classes)
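# `np_utils.to_categorical` one-hot encodes the integer labels. As a quick
# illustration (a minimal sketch, not part of the pipeline above), with
# `nb_classes = 2` the binary labels map as follows:
#
# ```python
# from keras.utils import np_utils
#
# np_utils.to_categorical([0, 1, 1, 0], 2)
# # array([[ 1.,  0.],
# #        [ 0.,  1.],
# #        [ 0.,  1.],
# #        [ 1.,  0.]])
# ```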
# # A simple CNN

# In[ ]:

# -- Initializing the values for the convolution neural network

nb_epoch = 2  # kept very low! Please increase if you have a GPU

batch_size = 64
# number of convolutional filters to use
nb_filters = 32
# size of pooling area for max pooling
nb_pool = 2
# convolution kernel size
nb_conv = 3

# Note: this SGD instance is defined for reference; the models below are
# compiled with the string alias 'sgd' (i.e. default optimizer parameters)
sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)


# #### Step 1: Model Definition

# In[ ]:

model = Sequential()

# note: the very first layer **must** always specify the input_shape
model.add(Conv2D(nb_filters, (nb_conv, nb_conv),
                 padding='valid', input_shape=shape_ord))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(nb_classes))
model.add(Activation('softmax'))


# #### Step 2: Compile

# In[ ]:

model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])


# #### Step 3: Fit

# In[ ]:

hist = model.fit(X_train, Y_train, batch_size=batch_size,
                 epochs=nb_epoch, verbose=1,
                 validation_data=(X_test, Y_test))


# In[ ]:

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

plt.figure()
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.plot(hist.history['loss'])
plt.plot(hist.history['val_loss'])
plt.legend(['Training', 'Validation'])

plt.figure()
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.plot(hist.history['acc'])
plt.plot(hist.history['val_acc'])
plt.legend(['Training', 'Validation'], loc='lower right')


# ### Step 4: Evaluate

# In[ ]:

print('Available Metrics in Model: {}'.format(model.metrics_names))


# In[ ]:

# Evaluating the model on the test data
loss, accuracy = model.evaluate(X_test, Y_test, verbose=0)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)


# ### Let's plot our model Predictions!

# In[ ]:

# matplotlib was already imported above; repeated here so this cell
# also runs standalone
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# In[ ]:

n_preds = 15  # renamed from `slice`, which shadows the Python builtin

predicted = model.predict(X_test[:n_preds]).argmax(-1)

plt.figure(figsize=(16, 8))
for i in range(n_preds):
    plt.subplot(1, n_preds, i + 1)
    plt.imshow(X_test_orig[i], interpolation='nearest')
    plt.text(0, 0, predicted[i], color='black',
             bbox=dict(facecolor='white', alpha=1))
    plt.axis('off')


# # Adding more Dense Layers

# In[ ]:

model = Sequential()

model.add(Conv2D(nb_filters, (nb_conv, nb_conv),
                 padding='valid', input_shape=shape_ord))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))


# In[ ]:

model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])

model.fit(X_train, Y_train, batch_size=batch_size,
          epochs=nb_epoch, verbose=1,
          validation_data=(X_test, Y_test))


# In[ ]:

# Evaluating the model on the test data
score, accuracy = model.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score)
print('Test accuracy:', accuracy)


# # Adding Dropout

# In[ ]:

model = Sequential()

model.add(Conv2D(nb_filters, (nb_conv, nb_conv),
                 padding='valid', input_shape=shape_ord))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))


# In[ ]:

model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])

model.fit(X_train, Y_train, batch_size=batch_size,
          epochs=nb_epoch, verbose=1,
          validation_data=(X_test, Y_test))


# In[ ]:

# Evaluating the model on the test data
score, accuracy = model.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score)
print('Test accuracy:', accuracy)
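# Before stacking more convolution layers, it helps to track how the feature
# map shapes evolve. A worked calculation (added here for clarity) for our
# settings, i.e. $28 \times 28$ inputs, $3 \times 3$ kernels,
# `padding='valid'` and $2 \times 2$ max pooling:
#
# ```python
# # 'valid' convolution: output side = input side - kernel side + 1
# 28 - 3 + 1   # first Conv2D      -> 26x26 feature maps
# 26 - 3 + 1   # second Conv2D     -> 24x24 feature maps
# 24 // 2      # 2x2 MaxPooling2D  -> 12x12 feature maps
# ```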
# # Adding more Convolution Layers

# In[ ]:

model = Sequential()

model.add(Conv2D(nb_filters, (nb_conv, nb_conv),
                 padding='valid', input_shape=shape_ord))
model.add(Activation('relu'))
model.add(Conv2D(nb_filters, (nb_conv, nb_conv)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))


# In[ ]:

model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])

model.fit(X_train, Y_train, batch_size=batch_size,
          epochs=nb_epoch, verbose=1,
          validation_data=(X_test, Y_test))


# In[ ]:

# Evaluating the model on the test data
score, accuracy = model.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score)
print('Test accuracy:', accuracy)


# # Exercise

# The above code has been written as a function.
#
# Change some of the **hyperparameters** and see what happens.
# (A parameterised variant is sketched right after the timing cell below.)

# In[ ]:

# Function for constructing the convolution neural network
# Feel free to add parameters, if you want
def build_model():
    """Build, train and evaluate the two-convolution CNN defined above."""
    model = Sequential()
    model.add(Conv2D(nb_filters, (nb_conv, nb_conv),
                     padding='valid', input_shape=shape_ord))
    model.add(Activation('relu'))
    model.add(Conv2D(nb_filters, (nb_conv, nb_conv)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
    model.add(Dropout(0.25))

    model.add(Flatten())
    model.add(Dense(128))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='sgd',
                  metrics=['accuracy'])

    model.fit(X_train, Y_train, batch_size=batch_size,
              epochs=nb_epoch, verbose=1,
              validation_data=(X_test, Y_test))

    # Evaluating the model on the test data
    score, accuracy = model.evaluate(X_test, Y_test, verbose=0)
    print('Test score:', score)
    print('Test accuracy:', accuracy)


# In[ ]:

# Timing how long it takes to build the model and test it.
get_ipython().run_line_magic('timeit', '-n1 -r1 build_model()')
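# One way to tackle the exercise: a minimal sketch (not part of the original
# notebook) of a parameterised `build_model`, where `n_filters`, `kernel`,
# `dense_units` and `dropout` are hypothetical parameter names chosen here
# for illustration:
#
# ```python
# def build_model(n_filters=32, kernel=3, dense_units=128, dropout=0.5):
#     model = Sequential()
#     model.add(Conv2D(n_filters, (kernel, kernel),
#                      padding='valid', input_shape=shape_ord))
#     model.add(Activation('relu'))
#     model.add(Conv2D(n_filters, (kernel, kernel)))
#     model.add(Activation('relu'))
#     model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
#     model.add(Dropout(0.25))
#     model.add(Flatten())
#     model.add(Dense(dense_units))
#     model.add(Activation('relu'))
#     model.add(Dropout(dropout))
#     model.add(Dense(nb_classes))
#     model.add(Activation('softmax'))
#     model.compile(loss='categorical_crossentropy',
#                   optimizer='sgd', metrics=['accuracy'])
#     model.fit(X_train, Y_train, batch_size=batch_size,
#               epochs=nb_epoch, verbose=1,
#               validation_data=(X_test, Y_test))
#     return model
#
# model = build_model(n_filters=64, dropout=0.25)
# ```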
# # Batch Normalisation

# Normalize the activations of the previous layer at each batch, i.e. apply a transformation that maintains the mean activation close to 0 and the activation standard deviation close to 1.

# ## How to BatchNorm in Keras
#
# ```python
# from keras.layers.normalization import BatchNormalization
#
# BatchNormalization(axis=-1, momentum=0.99, epsilon=0.001, center=True, scale=True,
#                    beta_initializer='zeros', gamma_initializer='ones',
#                    moving_mean_initializer='zeros', moving_variance_initializer='ones',
#                    beta_regularizer=None, gamma_regularizer=None,
#                    beta_constraint=None, gamma_constraint=None)
# ```
#
# #### Arguments
#
# - `axis`: Integer, the axis that should be normalized (typically the features axis). For instance, after a `Conv2D` layer with `data_format="channels_first"`, set `axis=1` in `BatchNormalization`.
# - `momentum`: Momentum for the moving mean and the moving variance.
# - `epsilon`: Small float added to variance to avoid dividing by zero.
# - `center`: If True, add offset of `beta` to normalized tensor. If False, `beta` is ignored.
# - `scale`: If True, multiply by `gamma`. If False, `gamma` is not used. When the next layer is linear (also e.g. `nn.relu`), this can be disabled since the scaling will be done by the next layer.
# - `beta_initializer`: Initializer for the beta weight.
# - `gamma_initializer`: Initializer for the gamma weight.
# - `moving_mean_initializer`: Initializer for the moving mean.
# - `moving_variance_initializer`: Initializer for the moving variance.
# - `beta_regularizer`: Optional regularizer for the beta weight.
# - `gamma_regularizer`: Optional regularizer for the gamma weight.
# - `beta_constraint`: Optional constraint for the beta weight.
# - `gamma_constraint`: Optional constraint for the gamma weight.
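# As a quick illustration (a minimal sketch, not part of the original
# notebook), BatchNorm is typically inserted between a layer's linear output
# and its activation. Rewriting the first simple CNN from above this way
# would look like:
#
# ```python
# from keras.layers.normalization import BatchNormalization
#
# model = Sequential()
# model.add(Conv2D(nb_filters, (nb_conv, nb_conv),
#                  padding='valid', input_shape=shape_ord))
# # default axis=-1 normalizes the features axis for channels_last data;
# # with channels_first, set axis=1 as noted in the arguments above
# model.add(BatchNormalization())
# model.add(Activation('relu'))
# model.add(Flatten())
# model.add(Dense(nb_classes))
# model.add(Activation('softmax'))
# ```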