#!/usr/bin/env python
# coding: utf-8

# # Part 3 - Evaluating Models
# 
# In this notebook we will cover the following topics:
# 
# * Evaluating model performance
# * Controlling overfitting with dropout
# * Increasing model complexity

# ## Evaluating Model Performance
# 
# One of the most important parts of the data science workflow is evaluating the performance of a trained model and deciding:
# 
# 1. Is it good enough? (If so, stop!)
# 2. If not, how should it be changed?
# 
# Let's load up Keras and train an overly simple model on the CIFAR10 data.

# In[1]:


import numpy as np
np.warnings.filterwarnings('ignore')  # Hide np.floating warning

import keras
from keras.datasets import cifar10
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D

# Prevent TensorFlow from grabbing all the GPU memory
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

import holoviews as hv
hv.extension('bokeh')


# ### Load the Data
# 
# Same data preparation as before.
# 
# (*Pro tip*: If this weren't a tutorial, we'd move this repetitive code to a Python module and import it in each notebook, ensuring the data is prepared consistently in every experiment.)

# In[2]:


from keras.datasets import cifar10
import keras.utils

(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Save an unmodified copy of y_test for later, flattened to one column
y_test_true = y_test[:, 0].copy()

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

num_classes = 10
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

# The data only has numeric categories, so we also keep the string labels handy
cifar10_labels = np.array(['airplane', 'automobile', 'bird', 'cat', 'deer',
                           'dog', 'frog', 'horse', 'ship', 'truck'])


# ### Create a Basic Model
# 
# This model resembles the one from the previous notebook, but we've removed one of the convolutional groups.

# In[3]:


model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=x_train.shape[1:]))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])


# In[4]:


history = model.fit(x_train, y_train,
                    batch_size=128,
                    epochs=8,
                    verbose=1,
                    validation_data=(x_test, y_test))


# In[5]:


train_acc = hv.Curve((history.epoch, history.history['acc']),
                     'epoch', 'accuracy', label='training')
val_acc = hv.Curve((history.epoch, history.history['val_acc']),
                   'epoch', 'accuracy', label='validation')

layout = (train_acc * val_acc).redim(accuracy=dict(range=(0.4, 1.1)))

layout.opts(
    hv.opts.Curve(width=400, height=300, line_width=3),
    hv.opts.Overlay(legend_position='top_left')
)


# This model shows a huge discrepancy in accuracy between the training and validation data, a sign of overfitting. After epoch 2, additional training is not helping: the model is essentially memorizing the training data and not generalizing at all.

# ### Plotting the Confusion Matrix
# 
# When dealing with models that predict categories, it is helpful to look at the confusion matrix as well. This will show which categories are being predicted poorly, and what kinds of mispredictions are happening.
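# 
# For intuition, entry (i, j) of the confusion matrix counts the test samples whose true class is i and whose predicted class is j, so a perfect classifier produces a purely diagonal matrix. A minimal sketch of that counting logic (illustrative only; we will use the `sklearn` implementation below):

# In[ ]:


# Hand-rolled confusion matrix (illustrative sketch only):
# tally one count per (true class, predicted class) pair.
def confusion_by_hand(y_true, y_pred, n_classes=10):
    counts = np.zeros((n_classes, n_classes), dtype=int)
    for t, p in zip(y_true, y_pred):
        counts[t, p] += 1
    return counts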
# 
# As the confusion matrix is a standard tool in all of machine learning, the `sklearn` package includes a function that computes it from an array of true category IDs and an array of predicted category IDs:

# In[6]:


from sklearn.metrics import confusion_matrix

y_pred = model.predict_classes(x_test)
confuse = confusion_matrix(y_test_true, y_pred)


# In[7]:


# Holoviews hack to tilt the x-axis labels by 45 degrees
from math import pi

def angle_label(plot, element):
    plot.state.xaxis.major_label_orientation = pi / 4


# In[8]:


layout = hv.HeatMap((cifar10_labels, cifar10_labels, confuse)).redim.label(x='true', y='predict')

layout.opts(
    hv.opts.HeatMap(width=500, height=400, tools=['hover'],
                    finalize_hooks=[angle_label]),
)


# From this we can see that dogs, deer, cats, and birds are particularly problematic classes, with the confusion between cats and dogs being especially high. Note that because the test data are already balanced, with an equal number of examples from each class, we do not need to apply any special normalization to the matrix above.

# ### Controlling Overfitting with Dropout
# 
# Overfitting is more or less inevitable if we train long enough; the goal is to control it with tools like regularization or dropout. [Dropout](https://keras.io/layers/core/#dropout) is a surprisingly effective technique in which layer inputs are passed through to the output, with a random subset of the outputs forced to zero during training. The subset of zeroed outputs changes after every batch. When the model is used for prediction after training, the dropout layers have no effect.
# 
# For more details about dropout, see this [paper](http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf).

# In[9]:


model2 = Sequential()
model2.add(Conv2D(32, kernel_size=(3, 3),
                  activation='relu',
                  input_shape=x_train.shape[1:]))
model2.add(Conv2D(64, (3, 3), activation='relu'))
model2.add(MaxPooling2D(pool_size=(2, 2)))
model2.add(Dropout(0.25))
model2.add(Flatten())
model2.add(Dense(128, activation='relu'))
model2.add(Dropout(0.5))
model2.add(Dense(num_classes, activation='softmax'))

model2.compile(loss=keras.losses.categorical_crossentropy,
               optimizer=keras.optimizers.Adadelta(),
               metrics=['accuracy'])


# In[10]:


history2 = model2.fit(x_train, y_train,
                      batch_size=128,
                      epochs=11,
                      verbose=1,
                      validation_data=(x_test, y_test))


# In[11]:


train_acc = hv.Curve((history.epoch, history.history['acc']),
                     'epoch', 'accuracy', label='training without dropout')
val_acc = hv.Curve((history.epoch, history.history['val_acc']),
                   'epoch', 'accuracy', label='validation without dropout')
train_acc2 = hv.Curve((history2.epoch, history2.history['acc']),
                      'epoch', 'accuracy', label='training with dropout')
val_acc2 = hv.Curve((history2.epoch, history2.history['val_acc']),
                    'epoch', 'accuracy', label='validation with dropout')

layout = (train_acc * val_acc * train_acc2 * val_acc2).redim(accuracy=dict(range=(0.4, 1.1)))

layout.opts(
    hv.opts.Curve(width=600, height=450, line_width=3),
    hv.opts.Overlay(legend_position='top_left')
)


# Here we can see some common features of a model with dropout:
# 
# * Training is slower (the noise introduced by dropout interferes with gradient descent).
# * The onset of overfitting is delayed (from epoch 2 to epoch 7).
# * We achieve a slightly higher accuracy on the validation data with additional training.
# 
# Unfortunately, the improvement in this case still amounts to no more than a few percent of accuracy. It looks like we need a more complex model.
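# 
# Before building a bigger model, it can help to reduce the comparison to a single number per model. As a minimal sketch (not a cell from the original notebook), Keras's `evaluate` method returns the test-set loss and accuracy directly; the exact numbers will depend on the training runs above:

# In[ ]:


# Compare final test-set loss and accuracy of the two models
for name, m in [('without dropout', model), ('with dropout', model2)]:
    loss, acc = m.evaluate(x_test, y_test, verbose=0)
    print('%s: loss=%.3f, accuracy=%.3f' % (name, loss, acc))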
# ### A More Complex Model
# 
# To increase the sophistication of this model, we're going to employ a few strategies:
# 
# * Add back the second group of convolutions (more like VGG16)
# * Increase the size of the first dense layer
# 
# Unfortunately, this is the hardest thing to figure out in practice. Sometimes we need more layers, sometimes we need bigger layers, and sometimes we need a different model entirely. Looking at what others have done is your best guide here until you develop some intuition of your own.

# In[12]:


model3 = Sequential()
model3.add(Conv2D(32, kernel_size=(3, 3),
                  padding='same',
                  activation='relu',
                  input_shape=x_train.shape[1:]))
model3.add(Conv2D(32, (3, 3), activation='relu'))
model3.add(MaxPooling2D(pool_size=(2, 2)))
model3.add(Dropout(0.25))

# Second group of convolutions
model3.add(Conv2D(64, kernel_size=(3, 3),
                  padding='same',
                  activation='relu'))
model3.add(Conv2D(64, (3, 3), activation='relu'))
model3.add(MaxPooling2D(pool_size=(2, 2)))
model3.add(Dropout(0.25))

model3.add(Flatten())
model3.add(Dense(512, activation='relu'))
model3.add(Dropout(0.5))
model3.add(Dense(num_classes, activation='softmax'))

model3.compile(loss=keras.losses.categorical_crossentropy,
               optimizer=keras.optimizers.Adadelta(),
               metrics=['accuracy'])

history3 = model3.fit(x_train, y_train,
                      batch_size=128,
                      epochs=15,
                      verbose=1,
                      validation_data=(x_test, y_test))


# In[13]:


val_acc2 = hv.Curve((history2.epoch, history2.history['val_acc']),
                    'epoch', 'accuracy', label='validation (simple model)')
train_acc3 = hv.Curve((history3.epoch, history3.history['acc']),
                      'epoch', 'accuracy', label='training (complex model)')
val_acc3 = hv.Curve((history3.epoch, history3.history['val_acc']),
                    'epoch', 'accuracy', label='validation (complex model)')

layout = (val_acc2 * val_acc3 * train_acc3).redim(accuracy=dict(range=(0.4, 1.1)))

layout.opts(
    hv.opts.Curve(width=600, height=500, line_width=3),
    hv.opts.Overlay(legend_position='top_left')
)


# ## Experiments to Try
# 
# * We changed two things to make the more complex model: extra convolutional layers and a bigger dense layer. Was it necessary to do both?
# * Add a callback to the first fit to end training early once the accuracy on the validation data stops improving (a sketch using `EarlyStopping` appears at the end of this notebook).
# * What does the confusion matrix look like for the more complex model? Do we still have problems with cats and dogs?
# 
# If you screw everything up, you can use File / Revert to Checkpoint to go back to the first version of the notebook and restart the Jupyter kernel with Kernel / Restart.

# In[14]:


model3.summary()
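# As a starting point for the early-stopping experiment above, here is a minimal sketch using Keras's built-in `EarlyStopping` callback. `monitor='val_acc'` matches the history key this version of Keras records, and `patience=2` is an arbitrary starting value; note that calling `fit` again on `model` continues from its already-trained weights.

# In[ ]:


from keras.callbacks import EarlyStopping

# Stop once validation accuracy has not improved for 2 consecutive epochs
early_stop = EarlyStopping(monitor='val_acc', patience=2, verbose=1)
history_es = model.fit(x_train, y_train,
                       batch_size=128,
                       epochs=20,
                       verbose=1,
                       validation_data=(x_test, y_test),
                       callbacks=[early_stop])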