#!/usr/bin/env python
# coding: utf-8

# # Advanced MLP
# - Advanced techniques for training neural networks
# - Weight Initialization
# - Nonlinearity (Activation function)
# - Optimizers
# - Batch Normalization
# - Dropout (Regularization)
# - Model Ensemble

# In[ ]:

import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

# ## Load Dataset
# - MNIST dataset
# - source: http://yann.lecun.com/exdb/mnist/

# In[ ]:

(X_train, y_train), (X_test, y_test) = mnist.load_data()

# In[ ]:

plt.imshow(X_train[0])    # show the first image in the training set
plt.show()
print('Label: ', y_train[0])

# In[ ]:

plt.imshow(X_test[0])     # show the first image in the test set
plt.show()
print('Label: ', y_test[0])

# In[ ]:

# reshaping X data: (n, 28, 28) => (n, 784)
X_train = X_train.reshape((X_train.shape[0], -1))
X_test = X_test.reshape((X_test.shape[0], -1))

# In[ ]:

# converting y data into categorical (one-hot encoding)
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

# In[ ]:

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# ## Basic MLP model
# - A naive MLP model without any of the techniques above

# In[ ]:

from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras import optimizers

# In[ ]:

model = Sequential()

# In[ ]:

model.add(Dense(50, input_shape = (784, )))
model.add(Activation('sigmoid'))
model.add(Dense(50))
model.add(Activation('sigmoid'))
model.add(Dense(50))
model.add(Activation('sigmoid'))
model.add(Dense(50))
model.add(Activation('sigmoid'))
model.add(Dense(10))
model.add(Activation('softmax'))

# In[ ]:

sgd = optimizers.SGD(learning_rate = 0.001)
model.compile(optimizer = sgd, loss = 'categorical_crossentropy', metrics = ['accuracy'])

# In[ ]:

history = model.fit(X_train, y_train, batch_size = 256, validation_split = 0.3, epochs = 100, verbose = 1)

# In[ ]:

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['train acc', 'valid acc', 'train loss', 'valid loss'], loc = 'upper left')
plt.show()

# Training and validation accuracy only begin to improve after around 60 epochs

# In[ ]:

results = model.evaluate(X_test, y_test)

# In[ ]:

print('Test accuracy: ', results[1])
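# Note that the baseline above trains on raw pixel values in [0, 255]. Scaling the inputs to [0, 1] is a common preprocessing step; it is skipped in this notebook so that later sections stay comparable to this baseline, but it would look like this:

# In[ ]:

# optional rescaling (not executed here, to keep the section results comparable)
# X_train = X_train.astype('float32') / 255.
# X_test = X_test.astype('float32') / 255.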
# ## 1. Weight Initialization
# - Changing the weight initialization scheme can sometimes improve training of the model by mitigating the vanishing-gradient problem to some degree
# - He normal and Xavier (Glorot) normal are the most widely used initialization schemes at the moment
# - Doc: https://keras.io/initializers/

# In[ ]:

# from now on, create a function to generate (return) models
def mlp_model():
    model = Sequential()
    model.add(Dense(50, input_shape = (784, ), kernel_initializer = 'he_normal'))    # use he_normal initializer
    model.add(Activation('sigmoid'))
    model.add(Dense(50, kernel_initializer = 'he_normal'))
    model.add(Activation('sigmoid'))
    model.add(Dense(50, kernel_initializer = 'he_normal'))
    model.add(Activation('sigmoid'))
    model.add(Dense(50, kernel_initializer = 'he_normal'))
    model.add(Activation('sigmoid'))
    model.add(Dense(10, kernel_initializer = 'he_normal'))
    model.add(Activation('softmax'))
    sgd = optimizers.SGD(learning_rate = 0.001)
    model.compile(optimizer = sgd, loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return model

# In[ ]:

model = mlp_model()
history = model.fit(X_train, y_train, validation_split = 0.3, epochs = 100, verbose = 1)

# In[ ]:

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['train acc', 'valid acc', 'train loss', 'valid loss'], loc = 'upper left')
plt.show()

# Training and validation accuracy begin to improve after around 60 epochs

# In[ ]:

results = model.evaluate(X_test, y_test)

# In[ ]:

print('Test accuracy: ', results[1])
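# An illustrative aside: He normal draws weights with a standard deviation of roughly sqrt(2 / fan_in), which keeps the activation variance approximately stable across layers. A quick empirical check:

# In[ ]:

import numpy as np
from tensorflow.keras import initializers

w = initializers.he_normal()(shape = (784, 50)).numpy()
print('empirical std: %.4f / expected sqrt(2/784): %.4f' % (w.std(), np.sqrt(2 / 784)))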
# ## 2. Nonlinearity (Activation function)
# - Sigmoid functions suffer from the vanishing-gradient problem, which slows training down
# - There are many choices apart from sigmoid and tanh; try several of them!
# - **'relu'** (rectified linear unit) is one of the most popular choices
# - **'selu'** (scaled exponential linear unit) is one of the most recent ones
# - Doc: https://keras.io/activations/

# **Sigmoid Activation Function** (figure)

# **ReLU Activation Function** (figure)
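# A quick illustration of the difference (an aside, not part of the pipeline): sigmoid saturates for large |z|, so its gradient vanishes there, whereas ReLU keeps a constant gradient of 1 for all z > 0.

# In[ ]:

import numpy as np

z = np.linspace(-6, 6, 200)
plt.plot(z, 1 / (1 + np.exp(-z)), label = 'sigmoid')
plt.plot(z, np.maximum(0, z), label = 'relu')
plt.legend(loc = 'upper left')
plt.show()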
# In[ ]:

def mlp_model():
    model = Sequential()
    model.add(Dense(50, input_shape = (784, )))
    model.add(Activation('relu'))    # use relu
    model.add(Dense(50))
    model.add(Activation('relu'))
    model.add(Dense(50))
    model.add(Activation('relu'))
    model.add(Dense(50))
    model.add(Activation('relu'))
    model.add(Dense(10))
    model.add(Activation('softmax'))
    sgd = optimizers.SGD(learning_rate = 0.001)
    model.compile(optimizer = sgd, loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return model

# In[ ]:

model = mlp_model()
history = model.fit(X_train, y_train, validation_split = 0.3, epochs = 100, verbose = 1)

# In[ ]:

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['train acc', 'valid acc', 'train loss', 'valid loss'], loc = 'upper left')
plt.show()

# Training and validation accuracy improve right from the start, but plateau after around 30 epochs

# In[ ]:

results = model.evaluate(X_test, y_test)

# In[ ]:

print('Test accuracy: ', results[1])
# ## 3. Optimizers
# - Many variants of SGD have been proposed and are widely employed nowadays
# - One of the most popular is Adam (Adaptive Moment Estimation); a sketch of its update rule appears after the figure below
# - Doc: https://keras.io/optimizers/

# **Relative convergence speed of different optimizers** (figure)
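# A minimal numpy sketch of a single Adam parameter update (the published update rule, with Keras' default hyperparameters beta_1 = 0.9, beta_2 = 0.999, epsilon = 1e-7). Illustrative only:

# In[ ]:

import numpy as np

def adam_step(theta, grad, m, v, t, lr = 0.001, beta1 = 0.9, beta2 = 0.999, eps = 1e-7):
    m = beta1 * m + (1 - beta1) * grad           # update biased first moment estimate
    v = beta2 * v + (1 - beta2) * grad ** 2      # update biased second moment estimate
    m_hat = m / (1 - beta1 ** t)                 # bias correction (t is the step count, starting at 1)
    v_hat = v / (1 - beta2 ** t)
    theta = theta - lr * m_hat / (np.sqrt(v_hat) + eps)
    return theta, m, v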

# In[ ]:

def mlp_model():
    model = Sequential()
    model.add(Dense(50, input_shape = (784, )))
    model.add(Activation('sigmoid'))
    model.add(Dense(50))
    model.add(Activation('sigmoid'))
    model.add(Dense(50))
    model.add(Activation('sigmoid'))
    model.add(Dense(50))
    model.add(Activation('sigmoid'))
    model.add(Dense(10))
    model.add(Activation('softmax'))
    adam = optimizers.Adam(learning_rate = 0.001)    # use Adam optimizer
    model.compile(optimizer = adam, loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return model

# In[ ]:

model = mlp_model()
history = model.fit(X_train, y_train, validation_split = 0.3, epochs = 100, verbose = 0)

# In[ ]:

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['train acc', 'valid acc', 'train loss', 'valid loss'], loc = 'upper left')
plt.show()

# Training and validation accuracy improve right from the start, but plateau after around 50 epochs

# In[ ]:

results = model.evaluate(X_test, y_test)

# In[ ]:

print('Test accuracy: ', results[1])

# ## 4. Batch Normalization
# - Batch normalization, one of the methods for mitigating the "internal covariate shift" problem, has proven to be highly effective
# - It normalizes each mini-batch before the nonlinearity
# - A batch normalization layer is usually inserted after the dense/convolution layer and before the nonlinearity; a numpy sketch of the computation follows
# - Doc: https://keras.io/layers/normalization/
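# A minimal numpy sketch of what a batch-norm layer computes at training time (per feature, over the current mini-batch); gamma and beta are the layer's learnable scale and shift parameters, and eps matches Keras' default of 1e-3:

# In[ ]:

import numpy as np

def batch_norm_train(x, gamma, beta, eps = 1e-3):
    mu = x.mean(axis = 0)                      # per-feature mean over the batch
    var = x.var(axis = 0)                      # per-feature variance over the batch
    x_hat = (x - mu) / np.sqrt(var + eps)      # normalize to zero mean, unit variance
    return gamma * x_hat + beta                # learnable scale and shift
    # (at inference, Keras uses moving averages of mu and var instead)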
# In[ ]:

from tensorflow.keras.layers import BatchNormalization

# In[ ]:

def mlp_model():
    model = Sequential()
    model.add(Dense(50, input_shape = (784, )))
    model.add(BatchNormalization())    # add a batch-norm layer before the activation
    model.add(Activation('sigmoid'))
    model.add(Dense(50))
    model.add(BatchNormalization())
    model.add(Activation('sigmoid'))
    model.add(Dense(50))
    model.add(BatchNormalization())
    model.add(Activation('sigmoid'))
    model.add(Dense(50))
    model.add(BatchNormalization())
    model.add(Activation('sigmoid'))
    model.add(Dense(10))
    model.add(Activation('softmax'))
    sgd = optimizers.SGD(learning_rate = 0.001)
    model.compile(optimizer = sgd, loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return model

# In[ ]:

model = mlp_model()
history = model.fit(X_train, y_train, validation_split = 0.3, epochs = 100, verbose = 0)

# In[ ]:

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['train acc', 'valid acc', 'train loss', 'valid loss'], loc = 'upper left')
plt.show()

# Training and validation accuracy improve consistently, but plateau after around 60 epochs

# In[ ]:

results = model.evaluate(X_test, y_test)

# In[ ]:

print('Test accuracy: ', results[1])

# ## 5. Dropout (Regularization)
# - Dropout is one of the most powerful ways to prevent overfitting
# - The idea is simple: randomly disconnect some neurons in each layer during training
# - The probability of each neuron being dropped, the 'dropout rate', has to be specified
# - Doc: https://keras.io/layers/core/#dropout

# In[ ]:

from tensorflow.keras.layers import Dropout

# In[ ]:

def mlp_model():
    model = Sequential()
    model.add(Dense(50, input_shape = (784, )))
    model.add(Activation('sigmoid'))
    model.add(Dropout(0.2))    # dropout layer after the activation
    model.add(Dense(50))
    model.add(Activation('sigmoid'))
    model.add(Dropout(0.2))
    model.add(Dense(50))
    model.add(Activation('sigmoid'))
    model.add(Dropout(0.2))
    model.add(Dense(50))
    model.add(Activation('sigmoid'))
    model.add(Dropout(0.2))
    model.add(Dense(10))
    model.add(Activation('softmax'))
    sgd = optimizers.SGD(learning_rate = 0.001)
    model.compile(optimizer = sgd, loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return model

# In[ ]:

model = mlp_model()
history = model.fit(X_train, y_train, validation_split = 0.3, epochs = 100, verbose = 0)

# In[ ]:

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['train acc', 'valid acc', 'train loss', 'valid loss'], loc = 'upper left')
plt.show()

# The validation results do not improve, since the model was not yet showing signs of overfitting.
# Hence, the key takeaway: apply dropout when you see signs of overfitting.
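# An aside on how the layer behaves: Keras' Dropout is only active when called with training = True; at inference it is the identity. Keras uses "inverted dropout", scaling the kept units by 1 / (1 - rate) during training so that no rescaling is needed at test time:

# In[ ]:

import numpy as np
from tensorflow.keras.layers import Dropout

x = np.ones((1, 10), dtype = 'float32')
drop = Dropout(0.5)
print(drop(x, training = True).numpy())     # roughly half the entries zeroed, the rest scaled to 2.0
print(drop(x, training = False).numpy())    # identity: all ones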
# In[ ]:

results = model.evaluate(X_test, y_test)

# In[ ]:

print('Test accuracy: ', results[1])

# ## 6. Model Ensemble
# - A model ensemble is a reliable and promising way to boost the performance of a model
# - Usually, create 8 to 10 independent networks and merge their results
# - Here, we use the scikit-learn API, **VotingClassifier**
# - Doc: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html

# In[ ]:

import numpy as np
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier    # removed in newer TF releases; see the note after this section
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# In[ ]:

# convert the one-hot targets back to integer labels for scikit-learn
y_train = np.argmax(y_train, axis = 1)
y_test = np.argmax(y_test, axis = 1)

# In[ ]:

def mlp_model():
    model = Sequential()
    model.add(Dense(50, input_shape = (784, )))
    model.add(Activation('sigmoid'))
    model.add(Dense(50))
    model.add(Activation('sigmoid'))
    model.add(Dense(50))
    model.add(Activation('sigmoid'))
    model.add(Dense(50))
    model.add(Activation('sigmoid'))
    model.add(Dense(10))
    model.add(Activation('softmax'))
    sgd = optimizers.SGD(learning_rate = 0.001)
    model.compile(optimizer = sgd, loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return model

# In[ ]:

model1 = KerasClassifier(build_fn = mlp_model, epochs = 100, verbose = 0)
model2 = KerasClassifier(build_fn = mlp_model, epochs = 100, verbose = 0)
model3 = KerasClassifier(build_fn = mlp_model, epochs = 100, verbose = 0)
# mark the wrappers as classifiers so VotingClassifier accepts them
model1._estimator_type = "classifier"
model2._estimator_type = "classifier"
model3._estimator_type = "classifier"

# In[ ]:

ensemble_clf = VotingClassifier(estimators = [('model1', model1), ('model2', model2), ('model3', model3)], voting = 'soft')

# In[ ]:

ensemble_clf.fit(X_train, y_train)

# In[ ]:

y_pred = ensemble_clf.predict(X_test)

# In[ ]:

print('Test accuracy:', accuracy_score(y_test, y_pred))
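# Note: tensorflow.keras.wrappers.scikit_learn has been deprecated and removed in newer TensorFlow releases (the SciKeras package is the usual replacement). Soft voting can also be done by hand: train several models independently and average their predicted class probabilities. A minimal sketch, reusing mlp_model() and the integer labels from above:

# In[ ]:

y_train_onehot = to_categorical(y_train)    # the models expect one-hot targets

models = [mlp_model() for _ in range(3)]
for m in models:
    m.fit(X_train, y_train_onehot, epochs = 100, verbose = 0)

probs = np.mean([m.predict(X_test) for m in models], axis = 0)    # average class probabilities (soft vote)
y_pred_manual = np.argmax(probs, axis = 1)
print('Manual soft-vote accuracy:', accuracy_score(y_test, y_pred_manual))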
# ## Summary
#
# The table below summarizes the evaluation results so far. All of the techniques improve test accuracy on the MNIST dataset over the 0.1134 baseline. Why not try them out all together?
#
# |Model | Baseline | Weight initialization | Activation function | Optimizer | Batch normalization | Regularization | Ensemble |
# |----------------|-------------|------------|-------------|-------------|------------|-----------|------------|
# |Test Accuracy | 0.1134 | 0.8625 | 0.9488 | 0.9465 | 0.9480 | 0.4226 | 0.9002 |
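# As a follow-up to the question above, here is one way to combine the techniques (a sketch only; the architecture and hyperparameters are illustrative, not tuned). Note that y_train / y_test were converted back to integer labels in section 6, so re-encode them with to_categorical (or switch to sparse_categorical_crossentropy) before fitting.

# In[ ]:

def combined_model():
    model = Sequential()
    model.add(Dense(50, input_shape = (784, ), kernel_initializer = 'he_normal'))    # He initialization
    model.add(BatchNormalization())                                                  # batch norm before the nonlinearity
    model.add(Activation('relu'))                                                    # ReLU activation
    model.add(Dropout(0.1))                                                          # light dropout
    model.add(Dense(50, kernel_initializer = 'he_normal'))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.1))
    model.add(Dense(10, kernel_initializer = 'he_normal'))
    model.add(Activation('softmax'))
    adam = optimizers.Adam(learning_rate = 0.001)                                    # Adam optimizer
    model.compile(optimizer = adam, loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return model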