#!/usr/bin/env python
# coding: utf-8

# # Part 3 - Evaluating Models
# 
# In this notebook we will cover the following topics:
# 
# * Evaluating model performance
# * Controlling overfitting with dropout
# * Increasing model complexity

# ## Evaluating Model Performance
# 
# One of the most important parts of the data science workflow is evaluating the performance of a trained model and deciding:
# 
# 1. Is it good enough? (If so, stop!)
# 2. If not, how should it be changed?
# 
# Let's load up Keras and train an overly simple model on the CIFAR10 data.

# In[1]:


import numpy as np
np.warnings.filterwarnings('ignore')  # Hide np.floating warning

import keras
from keras.datasets import cifar10
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D

# Prevent TensorFlow from grabbing all the GPU memory
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

import holoviews as hv
hv.extension('bokeh')


# ### Load the Data
# 
# Same data preparation as before.
# 
# (*Pro tip*: If this weren't a tutorial, we'd move this repetitive code to a Python module and import it in each notebook, ensuring the data is prepared consistently in every experiment.)

# In[2]:


from keras.datasets import cifar10
import keras.utils

(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Save an unmodified copy of y_test for later, flattened to one column
y_test_true = y_test[:, 0].copy()

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

num_classes = 10
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

# The data only has numeric categories, so we also keep the string labels handy
cifar10_labels = np.array(['airplane', 'automobile', 'bird', 'cat', 'deer',
                           'dog', 'frog', 'horse', 'ship', 'truck'])


# ### Create a Basic Model
# 
# This model resembles the one from the previous notebook, but we've removed one of the convolutional groups.

# In[3]:


model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=x_train.shape[1:]))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])


# In[4]:


history = model.fit(x_train, y_train,
                    batch_size=128,
                    epochs=8,
                    verbose=1,
                    validation_data=(x_test, y_test))


# In[5]:


train_acc = hv.Curve((history.epoch, history.history['acc']),
                     'epoch', 'accuracy', label='training')
val_acc = hv.Curve((history.epoch, history.history['val_acc']),
                   'epoch', 'accuracy', label='validation')

layout = (train_acc * val_acc).redim(accuracy=dict(range=(0.4, 1.1)))

layout.opts(
    hv.opts.Curve(width=400, height=300, line_width=3),
    hv.opts.Overlay(legend_position='top_left')
)


# This model shows a huge discrepancy in accuracy between the training and validation data, a sign of overfitting. After epoch 2, additional training is not helping: the model is essentially memorizing the training data and not generalizing at all.

# ### Plotting the Confusion Matrix
# 
# When dealing with models that predict categories, it is helpful to look at the confusion matrix as well. This will show which categories are being predicted poorly, and what kinds of mispredictions are happening.
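# 
# For intuition, entry (i, j) of the confusion matrix counts the test samples whose true class is i and whose predicted class is j, so a perfect classifier produces a purely diagonal matrix. A minimal sketch of that counting logic (illustrative only; we will use the `sklearn` implementation below):

# In[ ]:


# Hand-rolled confusion matrix (illustrative sketch only):
# tally one count per (true class, predicted class) pair.
def confusion_by_hand(y_true, y_pred, n_classes=10):
    counts = np.zeros((n_classes, n_classes), dtype=int)
    for t, p in zip(y_true, y_pred):
        counts[t, p] += 1
    return counts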
# 
# As the confusion matrix is a standard tool in all of machine learning, the `sklearn` package includes a function that computes it from an array of true category IDs and an array of predicted category IDs:

# In[6]:


from sklearn.metrics import confusion_matrix

y_pred = model.predict_classes(x_test)
confuse = confusion_matrix(y_test_true, y_pred)


# In[7]:


# Holoviews hack to tilt the x-axis labels by 45 degrees
from math import pi

def angle_label(plot, element):
    plot.state.xaxis.major_label_orientation = pi / 4


# In[8]:


layout = hv.HeatMap((cifar10_labels, cifar10_labels, confuse)).redim.label(x='true', y='predict')

layout.opts(
    hv.opts.HeatMap(width=500, height=400, tools=['hover'],
                    finalize_hooks=[angle_label]),
)


# From this we can see that dogs, deer, cats, and birds are particularly problematic classes, with the confusion between cats and dogs being especially high. Note that because the test data are already balanced, with an equal number of examples from each class, we do not need to apply any special normalization to the matrix above.

# ### Controlling Overfitting with Dropout
# 
# Overfitting is more or less inevitable if we train long enough; the goal is to control it with tools like regularization or dropout. [Dropout](https://keras.io/layers/core/#dropout) is a surprisingly effective technique in which layer inputs are passed through to the output, with a random subset of the outputs forced to zero during training. The subset of zeroed outputs changes after every batch. When the model is used for prediction after training, the dropout layers have no effect.
# 
# For more details about dropout, see this [paper](http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf).

# In[9]:


model2 = Sequential()
model2.add(Conv2D(32, kernel_size=(3, 3),
                  activation='relu',
                  input_shape=x_train.shape[1:]))
model2.add(Conv2D(64, (3, 3), activation='relu'))
model2.add(MaxPooling2D(pool_size=(2, 2)))
model2.add(Dropout(0.25))
model2.add(Flatten())
model2.add(Dense(128, activation='relu'))
model2.add(Dropout(0.5))
model2.add(Dense(num_classes, activation='softmax'))

model2.compile(loss=keras.losses.categorical_crossentropy,
               optimizer=keras.optimizers.Adadelta(),
               metrics=['accuracy'])


# In[10]:


history2 = model2.fit(x_train, y_train,
                      batch_size=128,
                      epochs=11,
                      verbose=1,
                      validation_data=(x_test, y_test))


# In[11]:


train_acc = hv.Curve((history.epoch, history.history['acc']),
                     'epoch', 'accuracy', label='training without dropout')
val_acc = hv.Curve((history.epoch, history.history['val_acc']),
                   'epoch', 'accuracy', label='validation without dropout')
train_acc2 = hv.Curve((history2.epoch, history2.history['acc']),
                      'epoch', 'accuracy', label='training with dropout')
val_acc2 = hv.Curve((history2.epoch, history2.history['val_acc']),
                    'epoch', 'accuracy', label='validation with dropout')

layout = (train_acc * val_acc * train_acc2 * val_acc2).redim(accuracy=dict(range=(0.4, 1.1)))

layout.opts(
    hv.opts.Curve(width=600, height=450, line_width=3),
    hv.opts.Overlay(legend_position='top_left')
)


# Here we can see some common features of a model with dropout:
# 
# * Training is slower (the noise introduced by dropout interferes with gradient descent).
# * The onset of overfitting is delayed (from epoch 2 to epoch 7).
# * We achieve a slightly higher accuracy on the validation data with additional training.
# 
# Unfortunately, the improvement in this case still amounts to no more than a few percent of accuracy. It looks like we need a more complex model.
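# 
# Before building a bigger model, it can help to reduce the comparison to a single number per model. As a minimal sketch (not a cell from the original notebook), Keras's `evaluate` method returns the test-set loss and accuracy directly; the exact numbers will depend on the training runs above:

# In[ ]:


# Compare final test-set loss and accuracy of the two models
for name, m in [('without dropout', model), ('with dropout', model2)]:
    loss, acc = m.evaluate(x_test, y_test, verbose=0)
    print('%s: loss=%.3f, accuracy=%.3f' % (name, loss, acc))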
# ### A More Complex Model
# 
# To increase the sophistication of this model, we're going to employ a few strategies:
# 
# * Add back the second group of convolutions (more like VGG16)
# * Increase the size of the first dense layer
# 
# Unfortunately, this is the hardest thing to figure out in practice. Sometimes we need more layers, sometimes we need bigger layers, and sometimes we need a different model entirely. Looking at what others have done is your best guide here until you develop some intuition of your own.

# In[12]:


model3 = Sequential()
model3.add(Conv2D(32, kernel_size=(3, 3),
                  padding='same',
                  activation='relu',
                  input_shape=x_train.shape[1:]))
model3.add(Conv2D(32, (3, 3), activation='relu'))
model3.add(MaxPooling2D(pool_size=(2, 2)))
model3.add(Dropout(0.25))

# Second group of convolutions
model3.add(Conv2D(64, kernel_size=(3, 3),
                  padding='same',
                  activation='relu'))
model3.add(Conv2D(64, (3, 3), activation='relu'))
model3.add(MaxPooling2D(pool_size=(2, 2)))
model3.add(Dropout(0.25))

model3.add(Flatten())
model3.add(Dense(512, activation='relu'))
model3.add(Dropout(0.5))
model3.add(Dense(num_classes, activation='softmax'))

model3.compile(loss=keras.losses.categorical_crossentropy,
               optimizer=keras.optimizers.Adadelta(),
               metrics=['accuracy'])

history3 = model3.fit(x_train, y_train,
                      batch_size=128,
                      epochs=15,
                      verbose=1,
                      validation_data=(x_test, y_test))


# In[13]:


val_acc2 = hv.Curve((history2.epoch, history2.history['val_acc']),
                    'epoch', 'accuracy', label='validation (simple model)')
train_acc3 = hv.Curve((history3.epoch, history3.history['acc']),
                      'epoch', 'accuracy', label='training (complex model)')
val_acc3 = hv.Curve((history3.epoch, history3.history['val_acc']),
                    'epoch', 'accuracy', label='validation (complex model)')

layout = (val_acc2 * val_acc3 * train_acc3).redim(accuracy=dict(range=(0.4, 1.1)))

layout.opts(
    hv.opts.Curve(width=600, height=500, line_width=3),
    hv.opts.Overlay(legend_position='top_left')
)


# ## Experiments to Try
# 
# * We changed two things to make the more complex model: extra convolutional layers and a bigger dense layer. Was it necessary to do both?
# * Add a callback to the first fit to end training early once the accuracy on the validation data stops improving (a sketch using `EarlyStopping` appears at the end of this notebook).
# * What does the confusion matrix look like for the more complex model? Do we still have problems with cats and dogs?
# 
# If you screw everything up, you can use File / Revert to Checkpoint to go back to the first version of the notebook and restart the Jupyter kernel with Kernel / Restart.

# In[14]:


model3.summary()
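# As a starting point for the early-stopping experiment above, here is a minimal sketch using Keras's built-in `EarlyStopping` callback. `monitor='val_acc'` matches the history key this version of Keras records, and `patience=2` is an arbitrary starting value; note that calling `fit` again on `model` continues from its already-trained weights.

# In[ ]:


from keras.callbacks import EarlyStopping

# Stop once validation accuracy has not improved for 2 consecutive epochs
early_stop = EarlyStopping(monitor='val_acc', patience=2, verbose=1)
history_es = model.fit(x_train, y_train,
                       batch_size=128,
                       epochs=20,
                       verbose=1,
                       validation_data=(x_test, y_test),
                       callbacks=[early_stop])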