Chapter 11 – Training Deep Neural Networks
This notebook contains all the sample code and solutions to the exercises in chapter 11.
This project requires Python 3.7 or above:
import sys
assert sys.version_info >= (3, 7)
And TensorFlow ≥ 2.8:
from packaging import version
import tensorflow as tf
assert version.parse(tf.__version__) >= version.parse("2.8.0")
As we did in previous chapters, let's define the default font sizes to make the figures prettier:
import matplotlib.pyplot as plt
plt.rc('font', size=14)
plt.rc('axes', labelsize=14, titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
And let's create the images/deep folder (if it doesn't already exist), and define the save_fig() function which is used throughout this notebook to save the figures in high-res for the book:
from pathlib import Path
IMAGES_PATH = Path() / "images" / "deep"
IMAGES_PATH.mkdir(parents=True, exist_ok=True)
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = IMAGES_PATH / f"{fig_id}.{fig_extension}"
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)
# extra code – this cell generates and saves Figure 11–1
import numpy as np
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
z = np.linspace(-5, 5, 200)
plt.plot([-5, 5], [0, 0], 'k-')
plt.plot([-5, 5], [1, 1], 'k--')
plt.plot([0, 0], [-0.2, 1.2], 'k-')
plt.plot([-5, 5], [-3/4, 7/4], 'g--')
plt.plot(z, sigmoid(z), "b-", linewidth=2,
label=r"$\sigma(z) = \dfrac{1}{1+e^{-z}}$")
props = dict(facecolor='black', shrink=0.1)
plt.annotate('Saturating', xytext=(3.5, 0.7), xy=(5, 1), arrowprops=props,
fontsize=14, ha="center")
plt.annotate('Saturating', xytext=(-3.5, 0.3), xy=(-5, 0), arrowprops=props,
fontsize=14, ha="center")
plt.annotate('Linear', xytext=(2, 0.2), xy=(0, 0.5), arrowprops=props,
fontsize=14, ha="center")
plt.grid(True)
plt.axis([-5, 5, -0.2, 1.2])
plt.xlabel("$z$")
plt.legend(loc="upper left", fontsize=16)
save_fig("sigmoid_saturation_plot")
plt.show()
dense = tf.keras.layers.Dense(50, activation="relu",
kernel_initializer="he_normal")
he_avg_init = tf.keras.initializers.VarianceScaling(scale=2., mode="fan_avg",
distribution="uniform")
dense = tf.keras.layers.Dense(50, activation="sigmoid",
kernel_initializer=he_avg_init)
# extra code – this cell generates and saves Figure 11–2
def leaky_relu(z, alpha):
    return np.maximum(alpha * z, z)
z = np.linspace(-5, 5, 200)
plt.plot(z, leaky_relu(z, 0.1), "b-", linewidth=2, label=r"$LeakyReLU(z) = max(\alpha z, z)$")
plt.plot([-5, 5], [0, 0], 'k-')
plt.plot([0, 0], [-1, 3.7], 'k-')
plt.grid(True)
props = dict(facecolor='black', shrink=0.1)
plt.annotate('Leak', xytext=(-3.5, 0.5), xy=(-5, -0.3), arrowprops=props,
fontsize=14, ha="center")
plt.xlabel("$z$")
plt.axis([-5, 5, -1, 3.7])
plt.gca().set_aspect("equal")
plt.legend()
save_fig("leaky_relu_plot")
plt.show()
leaky_relu = tf.keras.layers.LeakyReLU(alpha=0.2) # defaults to alpha=0.3
dense = tf.keras.layers.Dense(50, activation=leaky_relu,
kernel_initializer="he_normal")
model = tf.keras.models.Sequential([
# [...] # more layers
tf.keras.layers.Dense(50, kernel_initializer="he_normal"), # no activation
tf.keras.layers.LeakyReLU(alpha=0.2), # activation as a separate layer
# [...] # more layers
])
Implementing ELU in TensorFlow is trivial: just specify the activation function when building each layer, and use He initialization:
dense = tf.keras.layers.Dense(50, activation="elu",
kernel_initializer="he_normal")
By default, the SELU hyperparameters (scale and alpha) are tuned in such a way that the mean output of each neuron remains close to 0, and the standard deviation remains close to 1 (assuming the inputs are standardized with mean 0 and standard deviation 1 too, and the other constraints are respected, as explained in the book). Using this activation function, even a 1,000-layer-deep neural network preserves roughly mean 0 and standard deviation 1 across all layers, avoiding the exploding/vanishing gradients problem (a quick numerical check follows the next figure):
# extra code – this cell generates and saves Figure 11–3
from scipy.special import erfc
# alpha and scale to self normalize with mean 0 and standard deviation 1
# (see equation 14 in the paper):
alpha_0_1 = -np.sqrt(2 / np.pi) / (erfc(1 / np.sqrt(2)) * np.exp(1 / 2) - 1)
scale_0_1 = (
    (1 - erfc(1 / np.sqrt(2)) * np.sqrt(np.e))
    * np.sqrt(2 * np.pi)
    * (
        2 * erfc(np.sqrt(2)) * np.e ** 2
        + np.pi * erfc(1 / np.sqrt(2)) ** 2 * np.e
        - 2 * (2 + np.pi) * erfc(1 / np.sqrt(2)) * np.sqrt(np.e)
        + np.pi
        + 2
    ) ** (-1 / 2)
)

def elu(z, alpha=1):
    return np.where(z < 0, alpha * (np.exp(z) - 1), z)

def selu(z, scale=scale_0_1, alpha=alpha_0_1):
    return scale * elu(z, alpha)
z = np.linspace(-5, 5, 200)
plt.plot(z, elu(z), "b-", linewidth=2, label=r"ELU$_\alpha(z) = \alpha (e^z - 1)$ if $z < 0$, else $z$")
plt.plot(z, selu(z), "r--", linewidth=2, label=r"SELU$(z) = 1.05 \, $ELU$_{1.67}(z)$")
plt.plot([-5, 5], [0, 0], 'k-')
plt.plot([-5, 5], [-1, -1], 'k:', linewidth=2)
plt.plot([-5, 5], [-1.758, -1.758], 'k:', linewidth=2)
plt.plot([0, 0], [-2.2, 3.2], 'k-')
plt.grid(True)
plt.axis([-5, 5, -2.2, 3.2])
plt.xlabel("$z$")
plt.gca().set_aspect("equal")
plt.legend()
save_fig("elu_selu_plot")
plt.show()
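As a quick numerical check of the self-normalization claim, here is a minimal sketch (an extra addition, not one of the original cells): it propagates standardized inputs through 1,000 dense layers with LeCun-initialized weights, using the selu() function defined above, and prints the final activation statistics, which should stay close to mean 0 and standard deviation 1:
# extra check – self-normalization through 1,000 SELU layers
np.random.seed(42)
Z = np.random.normal(size=(500, 100))  # standardized inputs
for layer in range(1_000):
    W = np.random.normal(size=(100, 100), scale=np.sqrt(1 / 100))  # LeCun init
    Z = selu(Z @ W)
print(f"mean={Z.mean():.2f}, std={Z.std():.2f}")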
Using SELU is straightforward:
dense = tf.keras.layers.Dense(50, activation="selu",
kernel_initializer="lecun_normal")
Extra material – an example of a self-regularized network using SELU
Let's create a neural net for Fashion MNIST with 100 hidden layers, using the SELU activation function:
tf.random.set_seed(42)
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[28, 28]))
for layer in range(100):
    model.add(tf.keras.layers.Dense(100, activation="selu",
                                    kernel_initializer="lecun_normal"))
model.add(tf.keras.layers.Dense(10, activation="softmax"))
model.compile(loss="sparse_categorical_crossentropy",
optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
metrics=["accuracy"])
Now let's train it. Do not forget to scale the inputs to mean 0 and standard deviation 1:
fashion_mnist = tf.keras.datasets.fashion_mnist.load_data()
(X_train_full, y_train_full), (X_test, y_test) = fashion_mnist
X_train, y_train = X_train_full[:-5000], y_train_full[:-5000]
X_valid, y_valid = X_train_full[-5000:], y_train_full[-5000:]
X_train, X_valid, X_test = X_train / 255, X_valid / 255, X_test / 255
class_names = ["T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
"Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"]
pixel_means = X_train.mean(axis=0, keepdims=True)
pixel_stds = X_train.std(axis=0, keepdims=True)
X_train_scaled = (X_train - pixel_means) / pixel_stds
X_valid_scaled = (X_valid - pixel_means) / pixel_stds
X_test_scaled = (X_test - pixel_means) / pixel_stds
history = model.fit(X_train_scaled, y_train, epochs=5,
validation_data=(X_valid_scaled, y_valid))
Epoch 1/5 1719/1719 [==============================] - 13s 7ms/step - loss: 1.3735 - accuracy: 0.4548 - val_loss: 0.9599 - val_accuracy: 0.6444 Epoch 2/5 1719/1719 [==============================] - 12s 7ms/step - loss: 0.7783 - accuracy: 0.7073 - val_loss: 0.6529 - val_accuracy: 0.7664 Epoch 3/5 1719/1719 [==============================] - 12s 7ms/step - loss: 0.6462 - accuracy: 0.7611 - val_loss: 0.6048 - val_accuracy: 0.7748 Epoch 4/5 1719/1719 [==============================] - 11s 6ms/step - loss: 0.5821 - accuracy: 0.7863 - val_loss: 0.5737 - val_accuracy: 0.7944 Epoch 5/5 1719/1719 [==============================] - 12s 7ms/step - loss: 0.5401 - accuracy: 0.8041 - val_loss: 0.5333 - val_accuracy: 0.8046
The network managed to learn, despite how deep it is. Now look at what happens if we try to use the ReLU activation function instead:
tf.random.set_seed(42)
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[28, 28]))
for layer in range(100):
    model.add(tf.keras.layers.Dense(100, activation="relu",
                                    kernel_initializer="he_normal"))
model.add(tf.keras.layers.Dense(10, activation="softmax"))
model.compile(loss="sparse_categorical_crossentropy",
optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
metrics=["accuracy"])
history = model.fit(X_train_scaled, y_train, epochs=5,
validation_data=(X_valid_scaled, y_valid))
Epoch 1/5 1719/1719 [==============================] - 12s 6ms/step - loss: 1.6932 - accuracy: 0.3071 - val_loss: 1.2058 - val_accuracy: 0.5106 Epoch 2/5 1719/1719 [==============================] - 11s 6ms/step - loss: 1.1132 - accuracy: 0.5297 - val_loss: 0.9682 - val_accuracy: 0.5718 Epoch 3/5 1719/1719 [==============================] - 10s 6ms/step - loss: 0.9480 - accuracy: 0.6117 - val_loss: 1.0552 - val_accuracy: 0.5102 Epoch 4/5 1719/1719 [==============================] - 10s 6ms/step - loss: 0.9763 - accuracy: 0.6003 - val_loss: 0.7764 - val_accuracy: 0.7070 Epoch 5/5 1719/1719 [==============================] - 11s 6ms/step - loss: 0.7892 - accuracy: 0.6875 - val_loss: 0.7485 - val_accuracy: 0.7054
Not great at all; we suffered from the vanishing/exploding gradients problem.
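To see the problem more directly, here is a small diagnostic sketch (an addition, not from the original notebook): it computes the loss gradient on one batch and compares the gradient scale in the first hidden layer with that in the output layer; with ReLU at this depth, the early layers typically receive much smaller gradients:
# extra check – compare gradient magnitudes across layers
X_batch = X_train_scaled[:32].astype(np.float32)
y_batch = y_train[:32]
with tf.GradientTape() as tape:
    y_pred = model(X_batch)
    loss = tf.reduce_mean(
        tf.keras.losses.sparse_categorical_crossentropy(y_batch, y_pred))
grads = tape.gradient(loss, model.trainable_variables)
print("first hidden kernel grad std:", tf.math.reduce_std(grads[0]).numpy())
print("output kernel grad std:", tf.math.reduce_std(grads[-2]).numpy())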
# extra code – this cell generates and saves Figure 11–4
def swish(z, beta=1):
    return z * sigmoid(beta * z)

def approx_gelu(z):
    return swish(z, beta=1.702)

def softplus(z):
    return np.log(1 + np.exp(z))

def mish(z):
    return z * np.tanh(softplus(z))
z = np.linspace(-4, 2, 200)
beta = 0.6
plt.plot(z, approx_gelu(z), "b-", linewidth=2,
label=r"GELU$(z) = z\,\Phi(z)$")
plt.plot(z, swish(z), "r--", linewidth=2,
label=r"Swish$(z) = z\,\sigma(z)$")
plt.plot(z, swish(z, beta), "r:", linewidth=2,
label=fr"Swish$_{{\beta={beta}}}(z)=z\,\sigma({beta}\,z)$")
plt.plot(z, mish(z), "g:", linewidth=3,
label=fr"Mish$(z) = z\,\tanh($softplus$(z))$")
plt.plot([-4, 2], [0, 0], 'k-')
plt.plot([0, 0], [-2.2, 3.2], 'k-')
plt.grid(True)
plt.axis([-4, 2, -1, 2])
plt.gca().set_aspect("equal")
plt.xlabel("$z$")
plt.legend(loc="upper left")
save_fig("gelu_swish_mish_plot")
plt.show()
# extra code - clear the name counters and set the random seed
tf.keras.backend.clear_session()
tf.random.set_seed(42)
model = tf.keras.Sequential([
tf.keras.layers.Flatten(input_shape=[28, 28]),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.Dense(300, activation="relu",
kernel_initializer="he_normal"),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.Dense(100, activation="relu",
kernel_initializer="he_normal"),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.Dense(10, activation="softmax")
])
model.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= flatten (Flatten) (None, 784) 0 _________________________________________________________________ batch_normalization (BatchNo (None, 784) 3136 _________________________________________________________________ dense (Dense) (None, 300) 235500 _________________________________________________________________ batch_normalization_1 (Batch (None, 300) 1200 _________________________________________________________________ dense_1 (Dense) (None, 100) 30100 _________________________________________________________________ batch_normalization_2 (Batch (None, 100) 400 _________________________________________________________________ dense_2 (Dense) (None, 10) 1010 ================================================================= Total params: 271,346 Trainable params: 268,978 Non-trainable params: 2,368 _________________________________________________________________
[(var.name, var.trainable) for var in model.layers[1].variables]
[('batch_normalization/gamma:0', True), ('batch_normalization/beta:0', True), ('batch_normalization/moving_mean:0', False), ('batch_normalization/moving_variance:0', False)]
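The two non-trainable variables are the moving averages that Keras updates during training and uses at inference time. Here is a minimal sketch (an illustrative addition) showing the two modes: calling the layer with training=True normalizes using the current batch's statistics, while training=False uses the moving averages:
# extra check – BN behaves differently in training vs. inference mode
bn_layer = model.layers[1]
sample = tf.reshape(tf.constant(X_train[:8], dtype=tf.float32), [8, -1])
out_train = bn_layer(sample, training=True)   # batch statistics (also nudges
                                              # the moving averages slightly)
out_infer = bn_layer(sample, training=False)  # moving averages
print(tf.math.reduce_std(out_train).numpy(), tf.math.reduce_std(out_infer).numpy())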
# extra code – just show that the model works! 😊
model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd",
metrics="accuracy")
model.fit(X_train, y_train, epochs=2, validation_data=(X_valid, y_valid))
Epoch 1/2 1719/1719 [==============================] - 3s 2ms/step - loss: 0.5559 - accuracy: 0.8094 - val_loss: 0.4016 - val_accuracy: 0.8558 Epoch 2/2 1719/1719 [==============================] - 3s 1ms/step - loss: 0.4083 - accuracy: 0.8561 - val_loss: 0.3676 - val_accuracy: 0.8650
<keras.callbacks.History at 0x7fa5d11505b0>
Sometimes applying BN before the activation function works better (there's a debate on this topic). Moreover, the layer before a BatchNormalization layer does not need to have bias terms, since the BatchNormalization layer has some as well; it would be a waste of parameters, so you can set use_bias=False when creating those layers:
# extra code - clear the name counters and set the random seed
tf.keras.backend.clear_session()
tf.random.set_seed(42)
model = tf.keras.Sequential([
tf.keras.layers.Flatten(input_shape=[28, 28]),
tf.keras.layers.Dense(300, kernel_initializer="he_normal", use_bias=False),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.Activation("relu"),
tf.keras.layers.Dense(100, kernel_initializer="he_normal", use_bias=False),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.Activation("relu"),
tf.keras.layers.Dense(10, activation="softmax")
])
# extra code – just show that the model works! 😊
model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd",
metrics="accuracy")
model.fit(X_train, y_train, epochs=2, validation_data=(X_valid, y_valid))
Epoch 1/2 1719/1719 [==============================] - 3s 1ms/step - loss: 0.6063 - accuracy: 0.7993 - val_loss: 0.4296 - val_accuracy: 0.8418 Epoch 2/2 1719/1719 [==============================] - 2s 1ms/step - loss: 0.4275 - accuracy: 0.8500 - val_loss: 0.3752 - val_accuracy: 0.8646
<keras.callbacks.History at 0x7fa5fdd309d0>
All tf.keras.optimizers accept clipnorm or clipvalue arguments:
optimizer = tf.keras.optimizers.SGD(clipvalue=1.0)
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer)
optimizer = tf.keras.optimizers.SGD(clipnorm=1.0)
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer)
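The two options behave differently: clipvalue clips each gradient component independently, which can change the gradient's direction, while clipnorm rescales the whole gradient when its L2 norm exceeds the threshold, preserving its direction. Here is a tiny sketch using the underlying TF ops directly (just for illustration; the optimizer applies equivalent clipping internally):
# extra check – component-wise clipping vs. norm clipping on a toy gradient
grad = tf.constant([0.9, 100.0])
print(tf.clip_by_value(grad, -1.0, 1.0).numpy())  # [0.9, 1.0] – direction changed
print(tf.clip_by_norm(grad, 1.0).numpy())         # ≈[0.009, 1.0] – direction kept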
Let's split the Fashion MNIST training set in two:

- X_train_A: all images of all items except for T-shirts/tops and pullovers (classes 0 and 2).
- X_train_B: a much smaller training set of just the first 200 images of T-shirts/tops and pullovers.

The validation set and the test set are also split this way, but without restricting the number of images.
We will train a model on set A (classification task with 8 classes), and try to reuse it to tackle set B (binary classification). We hope to transfer a little bit of knowledge from task A to task B, since classes in set A (trousers, dresses, coats, sandals, shirts, sneakers, bags, and ankle boots) are somewhat similar to classes in set B (T-shirts/tops and pullovers). However, since we are using Dense layers, only patterns that occur at the same location can be reused (in contrast, convolutional layers will transfer much better, since learned patterns can be detected anywhere on the image, as we will see in Chapter 14).
# extra code – split Fashion MNIST into tasks A and B, then train and save
# model A to "my_model_A".
pos_class_id = class_names.index("Pullover")
neg_class_id = class_names.index("T-shirt/top")
def split_dataset(X, y):
    y_for_B = (y == pos_class_id) | (y == neg_class_id)
    y_A = y[~y_for_B]
    y_B = (y[y_for_B] == pos_class_id).astype(np.float32)
    old_class_ids = list(set(range(10)) - set([neg_class_id, pos_class_id]))
    for old_class_id, new_class_id in zip(old_class_ids, range(8)):
        y_A[y_A == old_class_id] = new_class_id  # reorder class ids for A
    return ((X[~y_for_B], y_A), (X[y_for_B], y_B))
(X_train_A, y_train_A), (X_train_B, y_train_B) = split_dataset(X_train, y_train)
(X_valid_A, y_valid_A), (X_valid_B, y_valid_B) = split_dataset(X_valid, y_valid)
(X_test_A, y_test_A), (X_test_B, y_test_B) = split_dataset(X_test, y_test)
X_train_B = X_train_B[:200]
y_train_B = y_train_B[:200]
tf.random.set_seed(42)
model_A = tf.keras.Sequential([
tf.keras.layers.Flatten(input_shape=[28, 28]),
tf.keras.layers.Dense(100, activation="relu",
kernel_initializer="he_normal"),
tf.keras.layers.Dense(100, activation="relu",
kernel_initializer="he_normal"),
tf.keras.layers.Dense(100, activation="relu",
kernel_initializer="he_normal"),
tf.keras.layers.Dense(8, activation="softmax")
])
model_A.compile(loss="sparse_categorical_crossentropy",
optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
metrics=["accuracy"])
history = model_A.fit(X_train_A, y_train_A, epochs=20,
validation_data=(X_valid_A, y_valid_A))
model_A.save("my_model_A")
Epoch 1/20 1376/1376 [==============================] - 1s 908us/step - loss: 1.1385 - accuracy: 0.6260 - val_loss: 0.7101 - val_accuracy: 0.7603 Epoch 2/20 1376/1376 [==============================] - 1s 869us/step - loss: 0.6221 - accuracy: 0.7911 - val_loss: 0.5293 - val_accuracy: 0.8315 Epoch 3/20 1376/1376 [==============================] - 1s 852us/step - loss: 0.5016 - accuracy: 0.8394 - val_loss: 0.4515 - val_accuracy: 0.8581 Epoch 4/20 1376/1376 [==============================] - 1s 852us/step - loss: 0.4381 - accuracy: 0.8583 - val_loss: 0.4055 - val_accuracy: 0.8669 Epoch 5/20 1376/1376 [==============================] - 1s 844us/step - loss: 0.3979 - accuracy: 0.8692 - val_loss: 0.3748 - val_accuracy: 0.8706 Epoch 6/20 1376/1376 [==============================] - 1s 882us/step - loss: 0.3693 - accuracy: 0.8782 - val_loss: 0.3538 - val_accuracy: 0.8787 Epoch 7/20 1376/1376 [==============================] - 1s 863us/step - loss: 0.3487 - accuracy: 0.8825 - val_loss: 0.3376 - val_accuracy: 0.8834 Epoch 8/20 1376/1376 [==============================] - 2s 1ms/step - loss: 0.3324 - accuracy: 0.8879 - val_loss: 0.3315 - val_accuracy: 0.8847 Epoch 9/20 1376/1376 [==============================] - 1s 1ms/step - loss: 0.3198 - accuracy: 0.8920 - val_loss: 0.3174 - val_accuracy: 0.8879 Epoch 10/20 1376/1376 [==============================] - 2s 1ms/step - loss: 0.3088 - accuracy: 0.8947 - val_loss: 0.3118 - val_accuracy: 0.8904 Epoch 11/20 1376/1376 [==============================] - 1s 1ms/step - loss: 0.2994 - accuracy: 0.8979 - val_loss: 0.3039 - val_accuracy: 0.8925 Epoch 12/20 1376/1376 [==============================] - 1s 837us/step - loss: 0.2918 - accuracy: 0.8999 - val_loss: 0.2998 - val_accuracy: 0.8952 Epoch 13/20 1376/1376 [==============================] - 1s 840us/step - loss: 0.2852 - accuracy: 0.9016 - val_loss: 0.2932 - val_accuracy: 0.8980 Epoch 14/20 1376/1376 [==============================] - 1s 799us/step - loss: 0.2788 - accuracy: 0.9034 - val_loss: 0.2865 - val_accuracy: 0.8990 Epoch 15/20 1376/1376 [==============================] - 1s 922us/step - loss: 0.2736 - accuracy: 0.9052 - val_loss: 0.2824 - val_accuracy: 0.9015 Epoch 16/20 1376/1376 [==============================] - 1s 835us/step - loss: 0.2686 - accuracy: 0.9068 - val_loss: 0.2796 - val_accuracy: 0.9015 Epoch 17/20 1376/1376 [==============================] - 1s 863us/step - loss: 0.2641 - accuracy: 0.9085 - val_loss: 0.2748 - val_accuracy: 0.9015 Epoch 18/20 1376/1376 [==============================] - 1s 913us/step - loss: 0.2596 - accuracy: 0.9101 - val_loss: 0.2729 - val_accuracy: 0.9037 Epoch 19/20 1376/1376 [==============================] - 1s 909us/step - loss: 0.2558 - accuracy: 0.9119 - val_loss: 0.2715 - val_accuracy: 0.9040 Epoch 20/20 1376/1376 [==============================] - 1s 859us/step - loss: 0.2520 - accuracy: 0.9125 - val_loss: 0.2728 - val_accuracy: 0.9027
INFO:tensorflow:Assets written to: my_model_A/assets
# extra code – train and evaluate model B, without reusing model A
tf.random.set_seed(42)
model_B = tf.keras.Sequential([
tf.keras.layers.Flatten(input_shape=[28, 28]),
tf.keras.layers.Dense(100, activation="relu",
kernel_initializer="he_normal"),
tf.keras.layers.Dense(100, activation="relu",
kernel_initializer="he_normal"),
tf.keras.layers.Dense(100, activation="relu",
kernel_initializer="he_normal"),
tf.keras.layers.Dense(1, activation="sigmoid")
])
model_B.compile(loss="binary_crossentropy",
optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
metrics=["accuracy"])
history = model_B.fit(X_train_B, y_train_B, epochs=20,
validation_data=(X_valid_B, y_valid_B))
model_B.evaluate(X_test_B, y_test_B)
Epoch 1/20 7/7 [==============================] - 0s 20ms/step - loss: 0.7167 - accuracy: 0.5450 - val_loss: 0.7052 - val_accuracy: 0.5272 Epoch 2/20 7/7 [==============================] - 0s 7ms/step - loss: 0.6805 - accuracy: 0.5800 - val_loss: 0.6758 - val_accuracy: 0.6004 Epoch 3/20 7/7 [==============================] - 0s 7ms/step - loss: 0.6532 - accuracy: 0.6650 - val_loss: 0.6530 - val_accuracy: 0.6746 Epoch 4/20 7/7 [==============================] - 0s 6ms/step - loss: 0.6289 - accuracy: 0.7150 - val_loss: 0.6317 - val_accuracy: 0.7517 Epoch 5/20 7/7 [==============================] - 0s 7ms/step - loss: 0.6079 - accuracy: 0.7800 - val_loss: 0.6105 - val_accuracy: 0.8091 Epoch 6/20 7/7 [==============================] - 0s 7ms/step - loss: 0.5866 - accuracy: 0.8400 - val_loss: 0.5913 - val_accuracy: 0.8447 Epoch 7/20 7/7 [==============================] - 0s 6ms/step - loss: 0.5670 - accuracy: 0.8850 - val_loss: 0.5728 - val_accuracy: 0.8833 Epoch 8/20 7/7 [==============================] - 0s 7ms/step - loss: 0.5499 - accuracy: 0.8900 - val_loss: 0.5571 - val_accuracy: 0.8971 Epoch 9/20 7/7 [==============================] - 0s 7ms/step - loss: 0.5331 - accuracy: 0.9150 - val_loss: 0.5427 - val_accuracy: 0.9050 Epoch 10/20 7/7 [==============================] - 0s 7ms/step - loss: 0.5180 - accuracy: 0.9250 - val_loss: 0.5290 - val_accuracy: 0.9080 Epoch 11/20 7/7 [==============================] - 0s 6ms/step - loss: 0.5038 - accuracy: 0.9350 - val_loss: 0.5160 - val_accuracy: 0.9189 Epoch 12/20 7/7 [==============================] - 0s 6ms/step - loss: 0.4903 - accuracy: 0.9350 - val_loss: 0.5032 - val_accuracy: 0.9228 Epoch 13/20 7/7 [==============================] - 0s 7ms/step - loss: 0.4770 - accuracy: 0.9400 - val_loss: 0.4925 - val_accuracy: 0.9228 Epoch 14/20 7/7 [==============================] - 0s 6ms/step - loss: 0.4656 - accuracy: 0.9450 - val_loss: 0.4817 - val_accuracy: 0.9258 Epoch 15/20 7/7 [==============================] - 0s 6ms/step - loss: 0.4546 - accuracy: 0.9550 - val_loss: 0.4708 - val_accuracy: 0.9298 Epoch 16/20 7/7 [==============================] - 0s 6ms/step - loss: 0.4435 - accuracy: 0.9550 - val_loss: 0.4608 - val_accuracy: 0.9318 Epoch 17/20 7/7 [==============================] - 0s 6ms/step - loss: 0.4330 - accuracy: 0.9600 - val_loss: 0.4510 - val_accuracy: 0.9337 Epoch 18/20 7/7 [==============================] - 0s 6ms/step - loss: 0.4226 - accuracy: 0.9600 - val_loss: 0.4406 - val_accuracy: 0.9367 Epoch 19/20 7/7 [==============================] - 0s 6ms/step - loss: 0.4119 - accuracy: 0.9600 - val_loss: 0.4311 - val_accuracy: 0.9377 Epoch 20/20 7/7 [==============================] - 0s 7ms/step - loss: 0.4025 - accuracy: 0.9600 - val_loss: 0.4225 - val_accuracy: 0.9367 63/63 [==============================] - 0s 728us/step - loss: 0.4317 - accuracy: 0.9185
[0.43168652057647705, 0.9185000061988831]
Model B reaches 91.85% accuracy on the test set. Now let's try reusing the pretrained model A.
model_A = tf.keras.models.load_model("my_model_A")
model_B_on_A = tf.keras.Sequential(model_A.layers[:-1])
model_B_on_A.add(tf.keras.layers.Dense(1, activation="sigmoid"))
Note that model_B_on_A and model_A actually share layers now, so when we train one, it will update both models. If we want to avoid that, we need to build model_B_on_A on top of a clone of model_A:
tf.random.set_seed(42) # extra code – ensure reproducibility
model_A_clone = tf.keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())
# extra code – creating model_B_on_A just like in the previous cell
model_B_on_A = tf.keras.Sequential(model_A_clone.layers[:-1])
model_B_on_A.add(tf.keras.layers.Dense(1, activation="sigmoid"))
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = False
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)
model_B_on_A.compile(loss="binary_crossentropy", optimizer=optimizer,
metrics=["accuracy"])
history = model_B_on_A.fit(X_train_B, y_train_B, epochs=4,
validation_data=(X_valid_B, y_valid_B))
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = True
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)
model_B_on_A.compile(loss="binary_crossentropy", optimizer=optimizer,
metrics=["accuracy"])
history = model_B_on_A.fit(X_train_B, y_train_B, epochs=16,
validation_data=(X_valid_B, y_valid_B))
Epoch 1/4 7/7 [==============================] - 0s 23ms/step - loss: 1.7893 - accuracy: 0.5550 - val_loss: 1.3324 - val_accuracy: 0.5084 Epoch 2/4 7/7 [==============================] - 0s 7ms/step - loss: 1.1235 - accuracy: 0.5350 - val_loss: 0.9199 - val_accuracy: 0.4807 Epoch 3/4 7/7 [==============================] - 0s 7ms/step - loss: 0.8836 - accuracy: 0.5000 - val_loss: 0.8266 - val_accuracy: 0.4837 Epoch 4/4 7/7 [==============================] - 0s 7ms/step - loss: 0.8202 - accuracy: 0.5250 - val_loss: 0.7795 - val_accuracy: 0.4985 Epoch 1/16 7/7 [==============================] - 0s 21ms/step - loss: 0.7348 - accuracy: 0.6050 - val_loss: 0.6372 - val_accuracy: 0.6914 Epoch 2/16 7/7 [==============================] - 0s 7ms/step - loss: 0.6055 - accuracy: 0.7600 - val_loss: 0.5283 - val_accuracy: 0.8229 Epoch 3/16 7/7 [==============================] - 0s 7ms/step - loss: 0.4992 - accuracy: 0.8400 - val_loss: 0.4742 - val_accuracy: 0.8180 Epoch 4/16 7/7 [==============================] - 0s 6ms/step - loss: 0.4297 - accuracy: 0.8700 - val_loss: 0.4212 - val_accuracy: 0.8773 Epoch 5/16 7/7 [==============================] - 0s 7ms/step - loss: 0.3825 - accuracy: 0.9050 - val_loss: 0.3797 - val_accuracy: 0.9031 Epoch 6/16 7/7 [==============================] - 0s 6ms/step - loss: 0.3438 - accuracy: 0.9250 - val_loss: 0.3534 - val_accuracy: 0.9149 Epoch 7/16 7/7 [==============================] - 0s 7ms/step - loss: 0.3148 - accuracy: 0.9500 - val_loss: 0.3384 - val_accuracy: 0.9001 Epoch 8/16 7/7 [==============================] - 0s 7ms/step - loss: 0.3012 - accuracy: 0.9450 - val_loss: 0.3179 - val_accuracy: 0.9209 Epoch 9/16 7/7 [==============================] - 0s 6ms/step - loss: 0.2767 - accuracy: 0.9650 - val_loss: 0.3043 - val_accuracy: 0.9298 Epoch 10/16 7/7 [==============================] - 0s 6ms/step - loss: 0.2623 - accuracy: 0.9550 - val_loss: 0.2929 - val_accuracy: 0.9308 Epoch 11/16 7/7 [==============================] - 0s 6ms/step - loss: 0.2512 - accuracy: 0.9600 - val_loss: 0.2830 - val_accuracy: 0.9327 Epoch 12/16 7/7 [==============================] - 0s 6ms/step - loss: 0.2397 - accuracy: 0.9600 - val_loss: 0.2744 - val_accuracy: 0.9318 Epoch 13/16 7/7 [==============================] - 0s 6ms/step - loss: 0.2295 - accuracy: 0.9600 - val_loss: 0.2675 - val_accuracy: 0.9327 Epoch 14/16 7/7 [==============================] - 0s 6ms/step - loss: 0.2225 - accuracy: 0.9600 - val_loss: 0.2598 - val_accuracy: 0.9347 Epoch 15/16 7/7 [==============================] - 0s 6ms/step - loss: 0.2147 - accuracy: 0.9600 - val_loss: 0.2542 - val_accuracy: 0.9357 Epoch 16/16 7/7 [==============================] - 0s 7ms/step - loss: 0.2077 - accuracy: 0.9600 - val_loss: 0.2492 - val_accuracy: 0.9377
So, what's the final verdict?
model_B_on_A.evaluate(X_test_B, y_test_B)
63/63 [==============================] - 0s 667us/step - loss: 0.2546 - accuracy: 0.9385
[0.2546142041683197, 0.9384999871253967]
Great! We got a bit of transfer: the model's accuracy went up 2 percentage points, from 91.85% to 93.85%. This means the error rate dropped by almost 25%:
1 - (100 - 93.85) / (100 - 91.85)
0.24539877300613477
# extra code – a little function to test an optimizer on Fashion MNIST
def build_model(seed=42):
    tf.random.set_seed(seed)
    return tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=[28, 28]),
        tf.keras.layers.Dense(100, activation="relu",
                              kernel_initializer="he_normal"),
        tf.keras.layers.Dense(100, activation="relu",
                              kernel_initializer="he_normal"),
        tf.keras.layers.Dense(100, activation="relu",
                              kernel_initializer="he_normal"),
        tf.keras.layers.Dense(10, activation="softmax")
    ])

def build_and_train_model(optimizer):
    model = build_model()
    model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
                  metrics=["accuracy"])
    return model.fit(X_train, y_train, epochs=10,
                     validation_data=(X_valid, y_valid))
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)  # plain SGD baseline
history_sgd = build_and_train_model(optimizer) # extra code
Epoch 1/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.6877 - accuracy: 0.7677 - val_loss: 0.4960 - val_accuracy: 0.8172 Epoch 2/10 1719/1719 [==============================] - 2s 948us/step - loss: 0.4619 - accuracy: 0.8378 - val_loss: 0.4421 - val_accuracy: 0.8404 Epoch 3/10 1719/1719 [==============================] - 1s 868us/step - loss: 0.4179 - accuracy: 0.8525 - val_loss: 0.4188 - val_accuracy: 0.8538 Epoch 4/10 1719/1719 [==============================] - 1s 866us/step - loss: 0.3902 - accuracy: 0.8621 - val_loss: 0.3814 - val_accuracy: 0.8604 Epoch 5/10 1719/1719 [==============================] - 1s 869us/step - loss: 0.3686 - accuracy: 0.8691 - val_loss: 0.3665 - val_accuracy: 0.8656 Epoch 6/10 1719/1719 [==============================] - 2s 925us/step - loss: 0.3553 - accuracy: 0.8732 - val_loss: 0.3643 - val_accuracy: 0.8720 Epoch 7/10 1719/1719 [==============================] - 2s 908us/step - loss: 0.3385 - accuracy: 0.8778 - val_loss: 0.3611 - val_accuracy: 0.8684 Epoch 8/10 1719/1719 [==============================] - 2s 926us/step - loss: 0.3297 - accuracy: 0.8796 - val_loss: 0.3490 - val_accuracy: 0.8726 Epoch 9/10 1719/1719 [==============================] - 2s 893us/step - loss: 0.3200 - accuracy: 0.8850 - val_loss: 0.3625 - val_accuracy: 0.8666 Epoch 10/10 1719/1719 [==============================] - 2s 886us/step - loss: 0.3097 - accuracy: 0.8881 - val_loss: 0.3656 - val_accuracy: 0.8672
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)
history_momentum = build_and_train_model(optimizer) # extra code
Epoch 1/10 1719/1719 [==============================] - 2s 941us/step - loss: 0.6877 - accuracy: 0.7677 - val_loss: 0.4960 - val_accuracy: 0.8172 Epoch 2/10 1719/1719 [==============================] - 2s 878us/step - loss: 0.4619 - accuracy: 0.8378 - val_loss: 0.4421 - val_accuracy: 0.8404 Epoch 3/10 1719/1719 [==============================] - 2s 898us/step - loss: 0.4179 - accuracy: 0.8525 - val_loss: 0.4188 - val_accuracy: 0.8538 Epoch 4/10 1719/1719 [==============================] - 2s 934us/step - loss: 0.3902 - accuracy: 0.8621 - val_loss: 0.3814 - val_accuracy: 0.8604 Epoch 5/10 1719/1719 [==============================] - 2s 910us/step - loss: 0.3686 - accuracy: 0.8691 - val_loss: 0.3665 - val_accuracy: 0.8656 Epoch 6/10 1719/1719 [==============================] - 2s 913us/step - loss: 0.3553 - accuracy: 0.8732 - val_loss: 0.3643 - val_accuracy: 0.8720 Epoch 7/10 1719/1719 [==============================] - 2s 893us/step - loss: 0.3385 - accuracy: 0.8778 - val_loss: 0.3611 - val_accuracy: 0.8684 Epoch 8/10 1719/1719 [==============================] - 2s 968us/step - loss: 0.3297 - accuracy: 0.8796 - val_loss: 0.3490 - val_accuracy: 0.8726 Epoch 9/10 1719/1719 [==============================] - 2s 913us/step - loss: 0.3200 - accuracy: 0.8850 - val_loss: 0.3625 - val_accuracy: 0.8666 Epoch 10/10 1719/1719 [==============================] - 1s 858us/step - loss: 0.3097 - accuracy: 0.8881 - val_loss: 0.3656 - val_accuracy: 0.8672
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9,
nesterov=True)
history_nesterov = build_and_train_model(optimizer) # extra code
Epoch 1/10 1719/1719 [==============================] - 2s 907us/step - loss: 0.6777 - accuracy: 0.7711 - val_loss: 0.4796 - val_accuracy: 0.8260 Epoch 2/10 1719/1719 [==============================] - 2s 898us/step - loss: 0.4570 - accuracy: 0.8398 - val_loss: 0.4358 - val_accuracy: 0.8396 Epoch 3/10 1719/1719 [==============================] - 1s 872us/step - loss: 0.4140 - accuracy: 0.8537 - val_loss: 0.4013 - val_accuracy: 0.8566 Epoch 4/10 1719/1719 [==============================] - 2s 902us/step - loss: 0.3882 - accuracy: 0.8629 - val_loss: 0.3802 - val_accuracy: 0.8616 Epoch 5/10 1719/1719 [==============================] - 2s 913us/step - loss: 0.3666 - accuracy: 0.8703 - val_loss: 0.3689 - val_accuracy: 0.8638 Epoch 6/10 1719/1719 [==============================] - 2s 882us/step - loss: 0.3531 - accuracy: 0.8732 - val_loss: 0.3681 - val_accuracy: 0.8688 Epoch 7/10 1719/1719 [==============================] - 2s 958us/step - loss: 0.3375 - accuracy: 0.8784 - val_loss: 0.3658 - val_accuracy: 0.8670 Epoch 8/10 1719/1719 [==============================] - 2s 895us/step - loss: 0.3278 - accuracy: 0.8815 - val_loss: 0.3598 - val_accuracy: 0.8682 Epoch 9/10 1719/1719 [==============================] - 2s 878us/step - loss: 0.3183 - accuracy: 0.8855 - val_loss: 0.3472 - val_accuracy: 0.8720 Epoch 10/10 1719/1719 [==============================] - 2s 921us/step - loss: 0.3081 - accuracy: 0.8891 - val_loss: 0.3624 - val_accuracy: 0.8708
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.001)
history_adagrad = build_and_train_model(optimizer) # extra code
Epoch 1/10 1719/1719 [==============================] - 2s 1ms/step - loss: 1.0003 - accuracy: 0.6822 - val_loss: 0.6876 - val_accuracy: 0.7744 Epoch 2/10 1719/1719 [==============================] - 2s 912us/step - loss: 0.6389 - accuracy: 0.7904 - val_loss: 0.5837 - val_accuracy: 0.8048 Epoch 3/10 1719/1719 [==============================] - 2s 930us/step - loss: 0.5682 - accuracy: 0.8105 - val_loss: 0.5379 - val_accuracy: 0.8154 Epoch 4/10 1719/1719 [==============================] - 2s 878us/step - loss: 0.5316 - accuracy: 0.8215 - val_loss: 0.5135 - val_accuracy: 0.8244 Epoch 5/10 1719/1719 [==============================] - 1s 855us/step - loss: 0.5076 - accuracy: 0.8295 - val_loss: 0.4937 - val_accuracy: 0.8288 Epoch 6/10 1719/1719 [==============================] - 1s 868us/step - loss: 0.4905 - accuracy: 0.8338 - val_loss: 0.4821 - val_accuracy: 0.8312 Epoch 7/10 1719/1719 [==============================] - 2s 940us/step - loss: 0.4776 - accuracy: 0.8371 - val_loss: 0.4705 - val_accuracy: 0.8348 Epoch 8/10 1719/1719 [==============================] - 2s 966us/step - loss: 0.4674 - accuracy: 0.8409 - val_loss: 0.4611 - val_accuracy: 0.8362 Epoch 9/10 1719/1719 [==============================] - 2s 892us/step - loss: 0.4587 - accuracy: 0.8435 - val_loss: 0.4548 - val_accuracy: 0.8406 Epoch 10/10 1719/1719 [==============================] - 2s 873us/step - loss: 0.4511 - accuracy: 0.8458 - val_loss: 0.4469 - val_accuracy: 0.8424
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)
history_rmsprop = build_and_train_model(optimizer) # extra code
Epoch 1/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.5138 - accuracy: 0.8135 - val_loss: 0.4413 - val_accuracy: 0.8338 Epoch 2/10 1719/1719 [==============================] - 2s 942us/step - loss: 0.3932 - accuracy: 0.8590 - val_loss: 0.4518 - val_accuracy: 0.8370 Epoch 3/10 1719/1719 [==============================] - 2s 948us/step - loss: 0.3711 - accuracy: 0.8692 - val_loss: 0.3914 - val_accuracy: 0.8686 Epoch 4/10 1719/1719 [==============================] - 2s 949us/step - loss: 0.3643 - accuracy: 0.8735 - val_loss: 0.4176 - val_accuracy: 0.8644 Epoch 5/10 1719/1719 [==============================] - 2s 970us/step - loss: 0.3578 - accuracy: 0.8769 - val_loss: 0.3874 - val_accuracy: 0.8696 Epoch 6/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3561 - accuracy: 0.8775 - val_loss: 0.4650 - val_accuracy: 0.8590 Epoch 7/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3528 - accuracy: 0.8783 - val_loss: 0.4122 - val_accuracy: 0.8774 Epoch 8/10 1719/1719 [==============================] - 2s 989us/step - loss: 0.3491 - accuracy: 0.8811 - val_loss: 0.5151 - val_accuracy: 0.8586 Epoch 9/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3479 - accuracy: 0.8829 - val_loss: 0.4457 - val_accuracy: 0.8856 Epoch 10/10 1719/1719 [==============================] - 2s 1000us/step - loss: 0.3437 - accuracy: 0.8830 - val_loss: 0.4781 - val_accuracy: 0.8636
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9,
beta_2=0.999)
history_adam = build_and_train_model(optimizer) # extra code
Epoch 1/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.4949 - accuracy: 0.8220 - val_loss: 0.4110 - val_accuracy: 0.8428 Epoch 2/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3727 - accuracy: 0.8637 - val_loss: 0.4153 - val_accuracy: 0.8370 Epoch 3/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3372 - accuracy: 0.8756 - val_loss: 0.3600 - val_accuracy: 0.8708 Epoch 4/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3126 - accuracy: 0.8833 - val_loss: 0.3498 - val_accuracy: 0.8760 Epoch 5/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.2965 - accuracy: 0.8901 - val_loss: 0.3264 - val_accuracy: 0.8794 Epoch 6/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.2821 - accuracy: 0.8947 - val_loss: 0.3295 - val_accuracy: 0.8782 Epoch 7/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.2672 - accuracy: 0.8993 - val_loss: 0.3473 - val_accuracy: 0.8790 Epoch 8/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.2587 - accuracy: 0.9020 - val_loss: 0.3230 - val_accuracy: 0.8818 Epoch 9/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.2500 - accuracy: 0.9057 - val_loss: 0.3676 - val_accuracy: 0.8744 Epoch 10/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.2428 - accuracy: 0.9073 - val_loss: 0.3879 - val_accuracy: 0.8696
Adamax Optimization
optimizer = tf.keras.optimizers.Adamax(learning_rate=0.001, beta_1=0.9,
beta_2=0.999)
history_adamax = build_and_train_model(optimizer) # extra code
Epoch 1/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.5327 - accuracy: 0.8151 - val_loss: 0.4402 - val_accuracy: 0.8340 Epoch 2/10 1719/1719 [==============================] - 2s 935us/step - loss: 0.3950 - accuracy: 0.8591 - val_loss: 0.3907 - val_accuracy: 0.8512 Epoch 3/10 1719/1719 [==============================] - 2s 933us/step - loss: 0.3563 - accuracy: 0.8715 - val_loss: 0.3730 - val_accuracy: 0.8676 Epoch 4/10 1719/1719 [==============================] - 2s 942us/step - loss: 0.3335 - accuracy: 0.8797 - val_loss: 0.3453 - val_accuracy: 0.8738 Epoch 5/10 1719/1719 [==============================] - 2s 993us/step - loss: 0.3129 - accuracy: 0.8853 - val_loss: 0.3270 - val_accuracy: 0.8792 Epoch 6/10 1719/1719 [==============================] - 2s 926us/step - loss: 0.2986 - accuracy: 0.8913 - val_loss: 0.3396 - val_accuracy: 0.8772 Epoch 7/10 1719/1719 [==============================] - 2s 939us/step - loss: 0.2854 - accuracy: 0.8949 - val_loss: 0.3390 - val_accuracy: 0.8770 Epoch 8/10 1719/1719 [==============================] - 2s 949us/step - loss: 0.2757 - accuracy: 0.8984 - val_loss: 0.3147 - val_accuracy: 0.8854 Epoch 9/10 1719/1719 [==============================] - 2s 952us/step - loss: 0.2662 - accuracy: 0.9020 - val_loss: 0.3341 - val_accuracy: 0.8760 Epoch 10/10 1719/1719 [==============================] - 2s 957us/step - loss: 0.2542 - accuracy: 0.9063 - val_loss: 0.3282 - val_accuracy: 0.8780
Nadam Optimization
optimizer = tf.keras.optimizers.Nadam(learning_rate=0.001, beta_1=0.9,
beta_2=0.999)
history_nadam = build_and_train_model(optimizer) # extra code
Epoch 1/10 1719/1719 [==============================] - 3s 1ms/step - loss: 0.4826 - accuracy: 0.8284 - val_loss: 0.4092 - val_accuracy: 0.8456 Epoch 2/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3610 - accuracy: 0.8667 - val_loss: 0.3893 - val_accuracy: 0.8592 Epoch 3/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3270 - accuracy: 0.8784 - val_loss: 0.3653 - val_accuracy: 0.8712 Epoch 4/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3049 - accuracy: 0.8874 - val_loss: 0.3444 - val_accuracy: 0.8726 Epoch 5/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.2897 - accuracy: 0.8905 - val_loss: 0.3174 - val_accuracy: 0.8810 Epoch 6/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.2753 - accuracy: 0.8981 - val_loss: 0.3389 - val_accuracy: 0.8830 Epoch 7/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.2652 - accuracy: 0.9000 - val_loss: 0.3725 - val_accuracy: 0.8734 Epoch 8/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.2563 - accuracy: 0.9034 - val_loss: 0.3229 - val_accuracy: 0.8828 Epoch 9/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.2463 - accuracy: 0.9079 - val_loss: 0.3353 - val_accuracy: 0.8818 Epoch 10/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.2402 - accuracy: 0.9091 - val_loss: 0.3813 - val_accuracy: 0.8740
AdamW Optimization
Note: Since TF 2.11, AdamW is no longer experimental. It is available as tf.keras.optimizers.AdamW instead of tf.keras.optimizers.experimental.AdamW.
optimizer = tf.keras.optimizers.AdamW(weight_decay=1e-5, learning_rate=0.001,
beta_1=0.9, beta_2=0.999)
history_adamw = build_and_train_model(optimizer) # extra code
Epoch 1/10 1719/1719 [==============================] - 3s 1ms/step - loss: 0.4945 - accuracy: 0.8220 - val_loss: 0.4203 - val_accuracy: 0.8424 Epoch 2/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3735 - accuracy: 0.8629 - val_loss: 0.4014 - val_accuracy: 0.8474 Epoch 3/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3391 - accuracy: 0.8753 - val_loss: 0.3347 - val_accuracy: 0.8760 Epoch 4/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3155 - accuracy: 0.8827 - val_loss: 0.3441 - val_accuracy: 0.8720 Epoch 5/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.2989 - accuracy: 0.8892 - val_loss: 0.3218 - val_accuracy: 0.8786 Epoch 6/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.2862 - accuracy: 0.8931 - val_loss: 0.3423 - val_accuracy: 0.8814 Epoch 7/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.2738 - accuracy: 0.8970 - val_loss: 0.3593 - val_accuracy: 0.8764 Epoch 8/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.2648 - accuracy: 0.8993 - val_loss: 0.3263 - val_accuracy: 0.8856 Epoch 9/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.2583 - accuracy: 0.9035 - val_loss: 0.3642 - val_accuracy: 0.8680 Epoch 10/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.2483 - accuracy: 0.9054 - val_loss: 0.3696 - val_accuracy: 0.8702
# extra code – visualize the learning curves of all the optimizers
for loss in ("loss", "val_loss"):
    plt.figure(figsize=(12, 8))
    opt_names = "SGD Momentum Nesterov AdaGrad RMSProp Adam Adamax Nadam AdamW"
    for history, opt_name in zip((history_sgd, history_momentum, history_nesterov,
                                  history_adagrad, history_rmsprop, history_adam,
                                  history_adamax, history_nadam, history_adamw),
                                 opt_names.split()):
        plt.plot(history.history[loss], label=f"{opt_name}", linewidth=3)
    plt.grid()
    plt.xlabel("Epochs")
    plt.ylabel({"loss": "Training loss", "val_loss": "Validation loss"}[loss])
    plt.legend(loc="upper left")
    plt.axis([0, 9, 0.1, 0.7])
    plt.show()
learning_rate = initial_learning_rate / (1 + step / decay_steps)**power
Keras uses power = 1, so the learning rate is divided by 2 after decay_steps steps, by 3 after twice as many steps, and so on.
Note: The decay argument in optimizers is deprecated. The old optimizers which implement the decay argument are still available in tf.keras.optimizers.legacy, but you should use the schedulers in tf.keras.optimizers.schedules instead.
# DEPRECATED:
optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=0.01, decay=1e-4)
# RECOMMENDED:
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
initial_learning_rate=0.01,
decay_steps=10_000,
decay_rate=1.0,
staircase=False
)
optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schedule)
The InverseTimeDecay scheduler uses learning_rate = initial_learning_rate / (1 + decay_rate * step / decay_steps). If you set staircase=True, then it replaces step / decay_steps with floor(step / decay_steps).
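Since schedule objects are callable with a step index, we can quickly verify this formula (a small check added for illustration):
# extra check – the schedule matches the formula above
for step in (0, 5_000, 10_000, 20_000):
    manual = 0.01 / (1 + 1.0 * step / 10_000)
    print(step, float(lr_schedule(step)), manual)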
history_power_scheduling = build_and_train_model(optimizer) # extra code
Epoch 1/10 1719/1719 [==============================] - 3s 2ms/step - loss: 0.7004 - accuracy: 0.7588 - val_loss: 0.4991 - val_accuracy: 0.8206 Epoch 2/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.4781 - accuracy: 0.8316 - val_loss: 0.4477 - val_accuracy: 0.8372 Epoch 3/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.4293 - accuracy: 0.8487 - val_loss: 0.4177 - val_accuracy: 0.8498 Epoch 4/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.4053 - accuracy: 0.8563 - val_loss: 0.3987 - val_accuracy: 0.8602 Epoch 5/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3864 - accuracy: 0.8633 - val_loss: 0.3859 - val_accuracy: 0.8612 Epoch 6/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3720 - accuracy: 0.8675 - val_loss: 0.3942 - val_accuracy: 0.8584 Epoch 7/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3616 - accuracy: 0.8709 - val_loss: 0.3706 - val_accuracy: 0.8670 Epoch 8/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3529 - accuracy: 0.8741 - val_loss: 0.3758 - val_accuracy: 0.8638 Epoch 9/10 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3452 - accuracy: 0.8765 - val_loss: 0.3587 - val_accuracy: 0.8680 Epoch 10/10 1719/1719 [==============================] - 3s 1ms/step - loss: 0.3379 - accuracy: 0.8793 - val_loss: 0.3569 - val_accuracy: 0.8714
# extra code – this cell plots power scheduling with staircase=True or False
initial_learning_rate = 0.01
decay_rate = 1.0
decay_steps = 10_000
steps = np.arange(100_000)
lrs = initial_learning_rate / (1 + decay_rate * steps / decay_steps)
lrs2 = initial_learning_rate / (1 + decay_rate * np.floor(steps / decay_steps))
plt.plot(steps, lrs, "-", label="staircase=False")
plt.plot(steps, lrs2, "-", label="staircase=True")
plt.axis([0, steps.max(), 0, 0.0105])
plt.xlabel("Step")
plt.ylabel("Learning Rate")
plt.title("Power Scheduling", fontsize=14)
plt.legend()
plt.grid(True)
plt.show()
learning_rate = initial_learning_rate * decay_rate ** (step / decay_steps)
In other words, the learning rate gets multiplied by decay_rate once every decay_steps steps.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
initial_learning_rate=0.01,
decay_steps=20_000,
decay_rate=0.1,
staircase=False
)
optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schedule)
history_exponential_scheduling = build_and_train_model(optimizer) # extra code
Epoch 1/10 1719/1719 [==============================] - 3s 2ms/step - loss: 0.6916 - accuracy: 0.7632 - val_loss: 0.5030 - val_accuracy: 0.8254 Epoch 2/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.4832 - accuracy: 0.8311 - val_loss: 0.4601 - val_accuracy: 0.8358 Epoch 3/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.4372 - accuracy: 0.8449 - val_loss: 0.4256 - val_accuracy: 0.8524 Epoch 4/10 1719/1719 [==============================] - 3s 1ms/step - loss: 0.4131 - accuracy: 0.8546 - val_loss: 0.4037 - val_accuracy: 0.8568 Epoch 5/10 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3952 - accuracy: 0.8596 - val_loss: 0.3950 - val_accuracy: 0.8598 Epoch 6/10 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3825 - accuracy: 0.8640 - val_loss: 0.4010 - val_accuracy: 0.8584 Epoch 7/10 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3739 - accuracy: 0.8667 - val_loss: 0.3851 - val_accuracy: 0.8650 Epoch 8/10 1719/1719 [==============================] - 3s 1ms/step - loss: 0.3664 - accuracy: 0.8696 - val_loss: 0.3811 - val_accuracy: 0.8616 Epoch 9/10 1719/1719 [==============================] - 3s 1ms/step - loss: 0.3606 - accuracy: 0.8720 - val_loss: 0.3749 - val_accuracy: 0.8662 Epoch 10/10 1719/1719 [==============================] - 3s 1ms/step - loss: 0.3555 - accuracy: 0.8743 - val_loss: 0.3706 - val_accuracy: 0.8662
# extra code – this cell plots exponential scheduling
initial_learning_rate = 0.01
decay_rate = 0.1
decay_steps = 20_000
steps = np.arange(100_000)
lrs = initial_learning_rate * decay_rate ** (steps / decay_steps)
lrs2 = initial_learning_rate * decay_rate ** np.floor(steps / decay_steps)
plt.plot(steps, lrs, "-", label="staircase=False")
plt.plot(steps, lrs2, "-", label="staircase=True")
plt.axis([0, steps.max(), 0, 0.0105])
plt.xlabel("Step")
plt.ylabel("Learning Rate")
plt.title("Exponential Scheduling", fontsize=14)
plt.legend()
plt.grid(True)
plt.show()
Keras also provides a LearningRateScheduler callback class that lets you define your own scheduling function. Let's see how you could use it to implement exponential decay. Note that in this case the learning rate only changes at each epoch, not at each step:
def exponential_decay_fn(epoch):
    return 0.01 * 0.1 ** (epoch / 20)

def exponential_decay(lr0, s):
    def exponential_decay_fn(epoch):
        return lr0 * 0.1 ** (epoch / s)
    return exponential_decay_fn

exponential_decay_fn = exponential_decay(lr0=0.01, s=20)
# extra code – build and compile a model for Fashion MNIST
tf.random.set_seed(42)
model = build_model()
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
metrics=["accuracy"])
n_epochs = 25
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(exponential_decay_fn)
history = model.fit(X_train, y_train, epochs=n_epochs,
validation_data=(X_valid, y_valid),
callbacks=[lr_scheduler])
Epoch 1/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.6905 - accuracy: 0.7643 - val_loss: 0.4814 - val_accuracy: 0.8330 - lr: 0.0100 Epoch 2/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.4672 - accuracy: 0.8357 - val_loss: 0.4488 - val_accuracy: 0.8374 - lr: 0.0089 Epoch 3/25 1719/1719 [==============================] - 3s 1ms/step - loss: 0.4212 - accuracy: 0.8503 - val_loss: 0.4118 - val_accuracy: 0.8532 - lr: 0.0079 Epoch 4/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3975 - accuracy: 0.8593 - val_loss: 0.3884 - val_accuracy: 0.8636 - lr: 0.0071 Epoch 5/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3781 - accuracy: 0.8657 - val_loss: 0.3772 - val_accuracy: 0.8642 - lr: 0.0063 Epoch 6/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3634 - accuracy: 0.8710 - val_loss: 0.3779 - val_accuracy: 0.8662 - lr: 0.0056 Epoch 7/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3530 - accuracy: 0.8744 - val_loss: 0.3674 - val_accuracy: 0.8652 - lr: 0.0050 Epoch 8/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3437 - accuracy: 0.8771 - val_loss: 0.3616 - val_accuracy: 0.8686 - lr: 0.0045 Epoch 9/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3359 - accuracy: 0.8801 - val_loss: 0.3509 - val_accuracy: 0.8728 - lr: 0.0040 Epoch 10/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3290 - accuracy: 0.8826 - val_loss: 0.3504 - val_accuracy: 0.8720 - lr: 0.0035 Epoch 11/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3236 - accuracy: 0.8844 - val_loss: 0.3458 - val_accuracy: 0.8736 - lr: 0.0032 Epoch 12/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3186 - accuracy: 0.8869 - val_loss: 0.3459 - val_accuracy: 0.8752 - lr: 0.0028 Epoch 13/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3147 - accuracy: 0.8878 - val_loss: 0.3359 - val_accuracy: 0.8770 - lr: 0.0025 Epoch 14/25 1719/1719 [==============================] - 3s 1ms/step - loss: 0.3109 - accuracy: 0.8890 - val_loss: 0.3404 - val_accuracy: 0.8762 - lr: 0.0022 Epoch 15/25 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3076 - accuracy: 0.8902 - val_loss: 0.3398 - val_accuracy: 0.8790 - lr: 0.0020 Epoch 16/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3043 - accuracy: 0.8915 - val_loss: 0.3331 - val_accuracy: 0.8784 - lr: 0.0018 Epoch 17/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3020 - accuracy: 0.8924 - val_loss: 0.3363 - val_accuracy: 0.8774 - lr: 0.0016 Epoch 18/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2998 - accuracy: 0.8927 - val_loss: 0.3356 - val_accuracy: 0.8778 - lr: 0.0014 Epoch 19/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2979 - accuracy: 0.8935 - val_loss: 0.3309 - val_accuracy: 0.8796 - lr: 0.0013 Epoch 20/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2961 - accuracy: 0.8940 - val_loss: 0.3308 - val_accuracy: 0.8782 - lr: 0.0011 Epoch 21/25 1719/1719 [==============================] - 2s 1ms/step - loss: 0.2944 - accuracy: 0.8951 - val_loss: 0.3286 - val_accuracy: 0.8802 - lr: 0.0010 Epoch 22/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2930 - accuracy: 0.8953 - val_loss: 0.3313 - val_accuracy: 0.8804 - lr: 8.9125e-04 Epoch 23/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2916 
- accuracy: 0.8957 - val_loss: 0.3285 - val_accuracy: 0.8796 - lr: 7.9433e-04 Epoch 24/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2904 - accuracy: 0.8961 - val_loss: 0.3313 - val_accuracy: 0.8786 - lr: 7.0795e-04 Epoch 25/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2896 - accuracy: 0.8962 - val_loss: 0.3296 - val_accuracy: 0.8812 - lr: 6.3096e-04
Alternatively, the schedule function can take the current learning rate as a second argument:
def exponential_decay_fn(epoch, lr):
    return lr * 0.1 ** (1 / 20)
Extra material: if you want to use a custom scheduling function that updates the learning rate at each iteration rather than at each epoch, you can write your own callback class like this:
K = tf.keras.backend
class ExponentialDecay(tf.keras.callbacks.Callback):
    def __init__(self, n_steps=40_000):
        super().__init__()
        self.n_steps = n_steps

    def on_batch_begin(self, batch, logs=None):
        # Note: the `batch` argument is reset at each epoch
        lr = K.get_value(self.model.optimizer.learning_rate)
        new_learning_rate = lr * 0.1 ** (1 / self.n_steps)
        K.set_value(self.model.optimizer.learning_rate, new_learning_rate)

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        logs['lr'] = K.get_value(self.model.optimizer.learning_rate)
lr0 = 0.01
model = build_model()
optimizer = tf.keras.optimizers.SGD(learning_rate=lr0)
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
metrics=["accuracy"])
import math
batch_size = 32
n_steps = n_epochs * math.ceil(len(X_train) / batch_size)
exp_decay = ExponentialDecay(n_steps)
history = model.fit(X_train, y_train, epochs=n_epochs,
validation_data=(X_valid, y_valid),
callbacks=[exp_decay])
Epoch 1/25 1719/1719 [==============================] - 4s 2ms/step - loss: 0.6947 - accuracy: 0.7635 - val_loss: 0.5014 - val_accuracy: 0.8224 - lr: 0.0091 Epoch 2/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.4718 - accuracy: 0.8349 - val_loss: 0.4530 - val_accuracy: 0.8382 - lr: 0.0083 Epoch 3/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.4255 - accuracy: 0.8500 - val_loss: 0.4216 - val_accuracy: 0.8526 - lr: 0.0076 Epoch 4/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.4025 - accuracy: 0.8587 - val_loss: 0.3954 - val_accuracy: 0.8618 - lr: 0.0069 Epoch 5/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3840 - accuracy: 0.8643 - val_loss: 0.3847 - val_accuracy: 0.8612 - lr: 0.0063 Epoch 6/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3696 - accuracy: 0.8689 - val_loss: 0.3908 - val_accuracy: 0.8558 - lr: 0.0058 Epoch 7/25 1719/1719 [==============================] - 4s 2ms/step - loss: 0.3590 - accuracy: 0.8722 - val_loss: 0.3744 - val_accuracy: 0.8670 - lr: 0.0052 Epoch 8/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3498 - accuracy: 0.8749 - val_loss: 0.3754 - val_accuracy: 0.8640 - lr: 0.0048 Epoch 9/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3415 - accuracy: 0.8783 - val_loss: 0.3592 - val_accuracy: 0.8700 - lr: 0.0044 Epoch 10/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3340 - accuracy: 0.8803 - val_loss: 0.3575 - val_accuracy: 0.8724 - lr: 0.0040 Epoch 11/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3281 - accuracy: 0.8833 - val_loss: 0.3573 - val_accuracy: 0.8718 - lr: 0.0036 Epoch 12/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3228 - accuracy: 0.8847 - val_loss: 0.3579 - val_accuracy: 0.8688 - lr: 0.0033 Epoch 13/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3182 - accuracy: 0.8865 - val_loss: 0.3421 - val_accuracy: 0.8756 - lr: 0.0030 Epoch 14/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3138 - accuracy: 0.8882 - val_loss: 0.3468 - val_accuracy: 0.8766 - lr: 0.0028 Epoch 15/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3101 - accuracy: 0.8889 - val_loss: 0.3471 - val_accuracy: 0.8766 - lr: 0.0025 Epoch 16/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3064 - accuracy: 0.8898 - val_loss: 0.3386 - val_accuracy: 0.8752 - lr: 0.0023 Epoch 17/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3035 - accuracy: 0.8903 - val_loss: 0.3417 - val_accuracy: 0.8758 - lr: 0.0021 Epoch 18/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3005 - accuracy: 0.8919 - val_loss: 0.3398 - val_accuracy: 0.8768 - lr: 0.0019 Epoch 19/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2983 - accuracy: 0.8929 - val_loss: 0.3357 - val_accuracy: 0.8766 - lr: 0.0017 Epoch 20/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2959 - accuracy: 0.8939 - val_loss: 0.3370 - val_accuracy: 0.8752 - lr: 0.0016 Epoch 21/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2940 - accuracy: 0.8938 - val_loss: 0.3346 - val_accuracy: 0.8782 - lr: 0.0014 Epoch 22/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2917 - accuracy: 0.8949 - val_loss: 0.3361 - val_accuracy: 0.8766 - lr: 0.0013 Epoch 23/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2902 - 
accuracy: 0.8955 - val_loss: 0.3349 - val_accuracy: 0.8796 - lr: 0.0012 Epoch 24/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2884 - accuracy: 0.8959 - val_loss: 0.3364 - val_accuracy: 0.8796 - lr: 0.0011 Epoch 25/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2871 - accuracy: 0.8969 - val_loss: 0.3352 - val_accuracy: 0.8802 - lr: 1.0000e-03
lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
boundaries=[50_000, 80_000],
values=[0.01, 0.005, 0.001]
)
optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schedule)
history_piecewise_scheduling = build_and_train_model(optimizer) # extra code
Epoch 1/10 1719/1719 [==============================] - 4s 2ms/step - loss: 0.6942 - accuracy: 0.7617 - val_loss: 0.4892 - val_accuracy: 0.8318 Epoch 2/10 1719/1719 [==============================] - 3s 2ms/step - loss: 0.4751 - accuracy: 0.8340 - val_loss: 0.4603 - val_accuracy: 0.8346 Epoch 3/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.4280 - accuracy: 0.8500 - val_loss: 0.4245 - val_accuracy: 0.8542 Epoch 4/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.4035 - accuracy: 0.8581 - val_loss: 0.3867 - val_accuracy: 0.8626 Epoch 5/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3828 - accuracy: 0.8650 - val_loss: 0.3827 - val_accuracy: 0.8634 Epoch 6/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3665 - accuracy: 0.8700 - val_loss: 0.3880 - val_accuracy: 0.8608 Epoch 7/10 1719/1719 [==============================] - 3s 1ms/step - loss: 0.3539 - accuracy: 0.8730 - val_loss: 0.3669 - val_accuracy: 0.8688 Epoch 8/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3423 - accuracy: 0.8773 - val_loss: 0.3583 - val_accuracy: 0.8708 Epoch 9/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3322 - accuracy: 0.8807 - val_loss: 0.3447 - val_accuracy: 0.8758 Epoch 10/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3218 - accuracy: 0.8832 - val_loss: 0.3488 - val_accuracy: 0.8716
# extra code – this cell plots piecewise constant scheduling
boundaries = [50_000, 80_000]
values = [0.01, 0.005, 0.001]
steps = np.arange(100_000)
lrs = np.full(len(steps), values[0])
for boundary, value in zip(boundaries, values[1:]):
lrs[boundary:] = value
plt.plot(steps, lrs, "-")
plt.axis([0, steps.max(), 0, 0.0105])
plt.xlabel("Step")
plt.ylabel("Learning Rate")
plt.title("Piecewise Constant Scheduling", fontsize=14)
plt.grid(True)
plt.show()
Just like we did with exponential scheduling, we could also implement piecewise constant scheduling manually:
def piecewise_constant_fn(epoch):
if epoch < 5:
return 0.01
elif epoch < 15:
return 0.005
else:
return 0.001
# extra code – this cell demonstrates a more general way to define
# piecewise constant scheduling.
def piecewise_constant(boundaries, values):
    # prepend 0 so that boundaries[i] is the first epoch at which values[i] applies
    boundaries = np.array([0] + boundaries)
    values = np.array(values)
    def piecewise_constant_fn(epoch):
        # index of the first boundary above `epoch`, minus 1; once every boundary
        # has been passed, argmax() returns 0 and the -1 wraps to the last value
        return values[(boundaries > epoch).argmax() - 1]
    return piecewise_constant_fn
piecewise_constant_fn = piecewise_constant([5, 15], [0.01, 0.005, 0.001])
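As a quick sanity check, we can look at the rates this function returns just before and after each boundary; with the boundaries above, the expected output is [0.01, 0.01, 0.005, 0.005, 0.001, 0.001]:
# extra code – sanity check around the boundaries
[piecewise_constant_fn(epoch) for epoch in (0, 4, 5, 14, 15, 20)]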
# extra code – use a tf.keras.callbacks.LearningRateScheduler like earlier
n_epochs = 25
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(piecewise_constant_fn)
model = build_model()
optimizer = tf.keras.optimizers.Nadam(learning_rate=lr0)
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=n_epochs,
validation_data=(X_valid, y_valid),
callbacks=[lr_scheduler])
Epoch 1/25 1719/1719 [==============================] - 5s 2ms/step - loss: 0.5433 - accuracy: 0.8087 - val_loss: 0.4586 - val_accuracy: 0.8288 - lr: 0.0100 Epoch 2/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.4487 - accuracy: 0.8439 - val_loss: 0.4608 - val_accuracy: 0.8350 - lr: 0.0100 Epoch 3/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.4263 - accuracy: 0.8502 - val_loss: 0.4234 - val_accuracy: 0.8568 - lr: 0.0100 Epoch 4/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.4241 - accuracy: 0.8537 - val_loss: 0.4359 - val_accuracy: 0.8490 - lr: 0.0100 Epoch 5/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.4080 - accuracy: 0.8584 - val_loss: 0.4165 - val_accuracy: 0.8560 - lr: 0.0100 Epoch 6/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3544 - accuracy: 0.8738 - val_loss: 0.3830 - val_accuracy: 0.8662 - lr: 0.0050 Epoch 7/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3464 - accuracy: 0.8761 - val_loss: 0.4026 - val_accuracy: 0.8652 - lr: 0.0050 Epoch 8/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3426 - accuracy: 0.8772 - val_loss: 0.4212 - val_accuracy: 0.8544 - lr: 0.0050 Epoch 9/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3417 - accuracy: 0.8793 - val_loss: 0.4116 - val_accuracy: 0.8612 - lr: 0.0050 Epoch 10/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3339 - accuracy: 0.8804 - val_loss: 0.4090 - val_accuracy: 0.8618 - lr: 0.0050 Epoch 11/25 1719/1719 [==============================] - 4s 2ms/step - loss: 0.3309 - accuracy: 0.8819 - val_loss: 0.4033 - val_accuracy: 0.8746 - lr: 0.0050 Epoch 12/25 1719/1719 [==============================] - 5s 3ms/step - loss: 0.3270 - accuracy: 0.8826 - val_loss: 0.4518 - val_accuracy: 0.8630 - lr: 0.0050 Epoch 13/25 1719/1719 [==============================] - 4s 2ms/step - loss: 0.3270 - accuracy: 0.8837 - val_loss: 0.3714 - val_accuracy: 0.8674 - lr: 0.0050 Epoch 14/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3247 - accuracy: 0.8844 - val_loss: 0.4026 - val_accuracy: 0.8652 - lr: 0.0050 Epoch 15/25 1719/1719 [==============================] - 4s 2ms/step - loss: 0.3204 - accuracy: 0.8852 - val_loss: 0.3993 - val_accuracy: 0.8724 - lr: 0.0050 Epoch 16/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2859 - accuracy: 0.8963 - val_loss: 0.3930 - val_accuracy: 0.8736 - lr: 0.0010 Epoch 17/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2781 - accuracy: 0.8978 - val_loss: 0.4021 - val_accuracy: 0.8714 - lr: 0.0010 Epoch 18/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2743 - accuracy: 0.8984 - val_loss: 0.3955 - val_accuracy: 0.8754 - lr: 0.0010 Epoch 19/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2704 - accuracy: 0.8999 - val_loss: 0.4015 - val_accuracy: 0.8756 - lr: 0.0010 Epoch 20/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2683 - accuracy: 0.9015 - val_loss: 0.4161 - val_accuracy: 0.8756 - lr: 0.0010 Epoch 21/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2655 - accuracy: 0.9020 - val_loss: 0.4207 - val_accuracy: 0.8740 - lr: 0.0010 Epoch 22/25 1719/1719 [==============================] - 4s 2ms/step - loss: 0.2646 - accuracy: 0.9020 - val_loss: 0.4497 - val_accuracy: 0.8746 - lr: 0.0010 Epoch 23/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2626 - 
accuracy: 0.9032 - val_loss: 0.4429 - val_accuracy: 0.8762 - lr: 0.0010 Epoch 24/25 1719/1719 [==============================] - 4s 2ms/step - loss: 0.2608 - accuracy: 0.9038 - val_loss: 0.4566 - val_accuracy: 0.8748 - lr: 0.0010 Epoch 25/25 1719/1719 [==============================] - 4s 2ms/step - loss: 0.2587 - accuracy: 0.9038 - val_loss: 0.4726 - val_accuracy: 0.8770 - lr: 0.0010
We've looked at InverseTimeDecay, ExponentialDecay, and PiecewiseConstantDecay. A few more schedulers are available in tf.keras.optimizers.schedules; here is the full list:
for name in sorted(dir(tf.keras.optimizers.schedules)):
    if name[0] == name[0].lower():  # class names start with a capital letter
        continue  # skip everything else (functions, modules, etc.)
    scheduler_class = getattr(tf.keras.optimizers.schedules, name)
    print(f"• {name} – {scheduler_class.__doc__.splitlines()[0]}")
• CosineDecay – A LearningRateSchedule that uses a cosine decay with optional warmup.
• CosineDecayRestarts – A LearningRateSchedule that uses a cosine decay schedule with restarts.
• ExponentialDecay – A LearningRateSchedule that uses an exponential decay schedule.
• InverseTimeDecay – A LearningRateSchedule that uses an inverse time decay schedule.
• LearningRateSchedule – The learning rate schedule base class.
• PiecewiseConstantDecay – A LearningRateSchedule that uses a piecewise constant decay schedule.
• PolynomialDecay – A LearningRateSchedule that uses a polynomial decay schedule.
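For example, here is a minimal sketch (not from the book) using PolynomialDecay; with power=1.0, the learning rate simply decays linearly from initial_learning_rate to end_learning_rate over decay_steps steps:
# extra code – a minimal sketch of PolynomialDecay (linear decay when power=1.0)
lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=0.01,
    decay_steps=100_000,
    end_learning_rate=0.001,
    power=1.0
)
optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schedule)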
# extra code – build and compile the model
model = build_model()
optimizer = tf.keras.optimizers.SGD(learning_rate=lr0)
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
metrics=["accuracy"])
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5)
history = model.fit(X_train, y_train, epochs=n_epochs,
validation_data=(X_valid, y_valid),
callbacks=[lr_scheduler])
Epoch 1/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.6807 - accuracy: 0.7679 - val_loss: 0.4814 - val_accuracy: 0.8310 - lr: 0.0100 Epoch 2/25 1719/1719 [==============================] - 2s 1ms/step - loss: 0.4659 - accuracy: 0.8343 - val_loss: 0.4615 - val_accuracy: 0.8306 - lr: 0.0100 Epoch 3/25 1719/1719 [==============================] - 3s 1ms/step - loss: 0.4201 - accuracy: 0.8505 - val_loss: 0.4199 - val_accuracy: 0.8490 - lr: 0.0100 Epoch 4/25 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3957 - accuracy: 0.8590 - val_loss: 0.3845 - val_accuracy: 0.8614 - lr: 0.0100 Epoch 5/25 1719/1719 [==============================] - 2s 1ms/step - loss: 0.3754 - accuracy: 0.8658 - val_loss: 0.3742 - val_accuracy: 0.8614 - lr: 0.0100 Epoch 6/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3588 - accuracy: 0.8709 - val_loss: 0.3853 - val_accuracy: 0.8628 - lr: 0.0100 Epoch 7/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3469 - accuracy: 0.8740 - val_loss: 0.3627 - val_accuracy: 0.8690 - lr: 0.0100 Epoch 8/25 1719/1719 [==============================] - 4s 2ms/step - loss: 0.3346 - accuracy: 0.8785 - val_loss: 0.3574 - val_accuracy: 0.8680 - lr: 0.0100 Epoch 9/25 1719/1719 [==============================] - 4s 2ms/step - loss: 0.3244 - accuracy: 0.8828 - val_loss: 0.3410 - val_accuracy: 0.8748 - lr: 0.0100 Epoch 10/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3149 - accuracy: 0.8850 - val_loss: 0.3410 - val_accuracy: 0.8720 - lr: 0.0100 Epoch 11/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.3074 - accuracy: 0.8879 - val_loss: 0.3629 - val_accuracy: 0.8678 - lr: 0.0100 Epoch 12/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2990 - accuracy: 0.8920 - val_loss: 0.3379 - val_accuracy: 0.8746 - lr: 0.0100 Epoch 13/25 1719/1719 [==============================] - 3s 1ms/step - loss: 0.2929 - accuracy: 0.8938 - val_loss: 0.3223 - val_accuracy: 0.8808 - lr: 0.0100 Epoch 14/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2867 - accuracy: 0.8947 - val_loss: 0.3405 - val_accuracy: 0.8754 - lr: 0.0100 Epoch 15/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2807 - accuracy: 0.8972 - val_loss: 0.3480 - val_accuracy: 0.8730 - lr: 0.0100 Epoch 16/25 1719/1719 [==============================] - 4s 2ms/step - loss: 0.2743 - accuracy: 0.8998 - val_loss: 0.3350 - val_accuracy: 0.8766 - lr: 0.0100 Epoch 17/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2694 - accuracy: 0.9019 - val_loss: 0.3421 - val_accuracy: 0.8764 - lr: 0.0100 Epoch 18/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2631 - accuracy: 0.9032 - val_loss: 0.3360 - val_accuracy: 0.8772 - lr: 0.0100 Epoch 19/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2445 - accuracy: 0.9110 - val_loss: 0.3162 - val_accuracy: 0.8874 - lr: 0.0050 Epoch 20/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2410 - accuracy: 0.9131 - val_loss: 0.3221 - val_accuracy: 0.8812 - lr: 0.0050 Epoch 21/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2380 - accuracy: 0.9137 - val_loss: 0.3166 - val_accuracy: 0.8828 - lr: 0.0050 Epoch 22/25 1719/1719 [==============================] - 4s 2ms/step - loss: 0.2351 - accuracy: 0.9148 - val_loss: 0.3146 - val_accuracy: 0.8854 - lr: 0.0050 Epoch 23/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2330 - 
accuracy: 0.9160 - val_loss: 0.3191 - val_accuracy: 0.8836 - lr: 0.0050 Epoch 24/25 1719/1719 [==============================] - 3s 1ms/step - loss: 0.2300 - accuracy: 0.9161 - val_loss: 0.3175 - val_accuracy: 0.8878 - lr: 0.0050 Epoch 25/25 1719/1719 [==============================] - 3s 2ms/step - loss: 0.2276 - accuracy: 0.9174 - val_loss: 0.3205 - val_accuracy: 0.8868 - lr: 0.0050
# extra code – this cell plots performance scheduling
plt.plot(history.epoch, history.history["lr"], "bo-")
plt.xlabel("Epoch")
plt.ylabel("Learning Rate", color='b')
plt.tick_params('y', colors='b')
plt.gca().set_xlim(0, n_epochs - 1)
plt.grid(True)
ax2 = plt.gca().twinx()
ax2.plot(history.epoch, history.history["val_loss"], "r^-")
ax2.set_ylabel('Validation Loss', color='r')
ax2.tick_params('y', colors='r')
plt.title("Reduce LR on Plateau", fontsize=14)
plt.show()
The ExponentialLearningRate custom callback updates the learning rate at the end of each batch, multiplying it by a constant factor. It also records the learning rate and loss at each batch. Since logs["loss"] is actually the mean loss since the start of the epoch, and we want the batch loss instead, we multiply that mean by the number of batches seen so far to get the total loss so far, then subtract the previous total to get the current batch's loss.
K = tf.keras.backend
class ExponentialLearningRate(tf.keras.callbacks.Callback):
def __init__(self, factor):
self.factor = factor
self.rates = []
self.losses = []
def on_epoch_begin(self, epoch, logs=None):
self.sum_of_epoch_losses = 0
def on_batch_end(self, batch, logs=None):
mean_epoch_loss = logs["loss"] # the epoch's mean loss so far
new_sum_of_epoch_losses = mean_epoch_loss * (batch + 1)
batch_loss = new_sum_of_epoch_losses - self.sum_of_epoch_losses
self.sum_of_epoch_losses = new_sum_of_epoch_losses
self.rates.append(K.get_value(self.model.optimizer.learning_rate))
self.losses.append(batch_loss)
K.set_value(self.model.optimizer.learning_rate,
self.model.optimizer.learning_rate * self.factor)
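To see why this arithmetic recovers the batch loss, here is a tiny worked example with made-up numbers (just a sanity check, not from the book): if batch 0 has loss 1.0 and batch 1 has loss 0.5, Keras reports running means 1.0 and then (1.0 + 0.5) / 2 = 0.75:
# extra code – hypothetical numbers, just to check the batch-loss arithmetic
total_after_batch_0 = 1.0 * 1   # mean 1.0 times 1 batch
total_after_batch_1 = 0.75 * 2  # mean 0.75 times 2 batches = 1.5
total_after_batch_1 - total_after_batch_0  # 0.5, batch 1's own loss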
The find_learning_rate() function trains the model using the ExponentialLearningRate callback and returns the learning rates and corresponding batch losses. At the end, it restores the model and its optimizer to their initial state.
import math  # used by math.ceil() below (harmless if already imported earlier)

def find_learning_rate(model, X, y, epochs=1, batch_size=32, min_rate=1e-4,
max_rate=1):
init_weights = model.get_weights()
iterations = math.ceil(len(X) / batch_size) * epochs
factor = (max_rate / min_rate) ** (1 / iterations)
init_lr = K.get_value(model.optimizer.learning_rate)
K.set_value(model.optimizer.learning_rate, min_rate)
exp_lr = ExponentialLearningRate(factor)
history = model.fit(X, y, epochs=epochs, batch_size=batch_size,
callbacks=[exp_lr])
K.set_value(model.optimizer.learning_rate, init_lr)
model.set_weights(init_weights)
return exp_lr.rates, exp_lr.losses
The plot_lr_vs_loss() function plots the learning rates versus the losses. The optimal learning rate to use as the maximum learning rate in 1cycle is near the bottom of the curve.
def plot_lr_vs_loss(rates, losses):
plt.plot(rates, losses, "b")
plt.gca().set_xscale('log')
max_loss = losses[0] + min(losses)
plt.hlines(min(losses), min(rates), max(rates), color="k")
plt.axis([min(rates), max(rates), 0, max_loss])
plt.xlabel("Learning rate")
plt.ylabel("Loss")
plt.grid()
Let's build a simple Fashion MNIST model and compile it:
model = build_model()
model.compile(loss="sparse_categorical_crossentropy",
optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
metrics=["accuracy"])
Now let's find the optimal max learning rate for 1cycle:
batch_size = 128
rates, losses = find_learning_rate(model, X_train, y_train, epochs=1,
batch_size=batch_size)
plot_lr_vs_loss(rates, losses)
430/430 [==============================] - 1s 1ms/step - loss: 1.7725 - accuracy: 0.4122
Looks like the max learning rate to use for 1cycle is around 10⁻¹.
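One possible way to automate this reading is a hypothetical helper (not from the book) that picks the rate at the lowest point of the curve and halves it, since the loss usually starts climbing very fast just past that point:
# extra code – hypothetical helper: half the rate that reached the lowest loss
def suggest_max_lr(rates, losses):
    return rates[np.argmin(losses)] / 2

suggest_max_lr(rates, losses)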
The OneCycleScheduler custom callback updates the learning rate at the beginning of each batch. It applies the logic described in the book: increase the learning rate linearly during roughly the first half of training, then reduce it linearly back to the initial learning rate, and finally reduce it linearly to close to zero during the very last part of training.
class OneCycleScheduler(tf.keras.callbacks.Callback):
def __init__(self, iterations, max_lr=1e-3, start_lr=None,
last_iterations=None, last_lr=None):
self.iterations = iterations
self.max_lr = max_lr
self.start_lr = start_lr or max_lr / 10
self.last_iterations = last_iterations or iterations // 10 + 1
self.half_iteration = (iterations - self.last_iterations) // 2
self.last_lr = last_lr or self.start_lr / 1000
self.iteration = 0
def _interpolate(self, iter1, iter2, lr1, lr2):
return (lr2 - lr1) * (self.iteration - iter1) / (iter2 - iter1) + lr1
def on_batch_begin(self, batch, logs):
if self.iteration < self.half_iteration:
lr = self._interpolate(0, self.half_iteration, self.start_lr,
self.max_lr)
elif self.iteration < 2 * self.half_iteration:
lr = self._interpolate(self.half_iteration, 2 * self.half_iteration,
self.max_lr, self.start_lr)
else:
lr = self._interpolate(2 * self.half_iteration, self.iterations,
self.start_lr, self.last_lr)
self.iteration += 1
K.set_value(self.model.optimizer.learning_rate, lr)
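Before training with it, we can replay the scheduler's interpolation logic without a model, just to visualize the shape of the schedule (a quick sketch, not from the book):
# extra code – sketch: re-compute the 1cycle schedule standalone to see its shape
sched = OneCycleScheduler(iterations=1000, max_lr=0.1)
lrs = []
for i in range(sched.iterations):
    sched.iteration = i
    if i < sched.half_iteration:
        lr = sched._interpolate(0, sched.half_iteration, sched.start_lr,
                                sched.max_lr)
    elif i < 2 * sched.half_iteration:
        lr = sched._interpolate(sched.half_iteration, 2 * sched.half_iteration,
                                sched.max_lr, sched.start_lr)
    else:
        lr = sched._interpolate(2 * sched.half_iteration, sched.iterations,
                                sched.start_lr, sched.last_lr)
    lrs.append(lr)
plt.plot(lrs)
plt.xlabel("Iteration")
plt.ylabel("Learning Rate")
plt.title("1cycle schedule", fontsize=14)
plt.grid(True)
plt.show()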
Let's build and compile a simple Fashion MNIST model, then train it using the OneCycleScheduler callback:
model = build_model()
model.compile(loss="sparse_categorical_crossentropy",
optimizer=tf.keras.optimizers.SGD(),
metrics=["accuracy"])
n_epochs = 25
onecycle = OneCycleScheduler(math.ceil(len(X_train) / batch_size) * n_epochs,
max_lr=0.1)
history = model.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size,
validation_data=(X_valid, y_valid),
callbacks=[onecycle])
Epoch 1/25 430/430 [==============================] - 1s 2ms/step - loss: 0.9502 - accuracy: 0.6913 - val_loss: 0.6003 - val_accuracy: 0.7874 Epoch 2/25 430/430 [==============================] - 1s 1ms/step - loss: 0.5695 - accuracy: 0.8025 - val_loss: 0.4918 - val_accuracy: 0.8248 Epoch 3/25 430/430 [==============================] - 1s 1ms/step - loss: 0.4954 - accuracy: 0.8252 - val_loss: 0.4762 - val_accuracy: 0.8264 Epoch 4/25 430/430 [==============================] - 1s 1ms/step - loss: 0.4515 - accuracy: 0.8402 - val_loss: 0.4261 - val_accuracy: 0.8478 Epoch 5/25 430/430 [==============================] - 1s 1ms/step - loss: 0.4225 - accuracy: 0.8492 - val_loss: 0.4066 - val_accuracy: 0.8486 Epoch 6/25 430/430 [==============================] - 1s 1ms/step - loss: 0.3958 - accuracy: 0.8571 - val_loss: 0.4787 - val_accuracy: 0.8224 Epoch 7/25 430/430 [==============================] - 1s 1ms/step - loss: 0.3787 - accuracy: 0.8626 - val_loss: 0.3917 - val_accuracy: 0.8566 Epoch 8/25 430/430 [==============================] - 1s 1ms/step - loss: 0.3630 - accuracy: 0.8683 - val_loss: 0.4719 - val_accuracy: 0.8296 Epoch 9/25 430/430 [==============================] - 1s 1ms/step - loss: 0.3512 - accuracy: 0.8724 - val_loss: 0.3673 - val_accuracy: 0.8652 Epoch 10/25 430/430 [==============================] - 1s 1ms/step - loss: 0.3360 - accuracy: 0.8766 - val_loss: 0.4957 - val_accuracy: 0.8466 Epoch 11/25 430/430 [==============================] - 1s 1ms/step - loss: 0.3287 - accuracy: 0.8786 - val_loss: 0.4187 - val_accuracy: 0.8370 Epoch 12/25 430/430 [==============================] - 1s 1ms/step - loss: 0.3173 - accuracy: 0.8815 - val_loss: 0.3425 - val_accuracy: 0.8728 Epoch 13/25 430/430 [==============================] - 1s 1ms/step - loss: 0.2961 - accuracy: 0.8910 - val_loss: 0.3217 - val_accuracy: 0.8792 Epoch 14/25 430/430 [==============================] - 1s 1ms/step - loss: 0.2818 - accuracy: 0.8958 - val_loss: 0.3734 - val_accuracy: 0.8692 Epoch 15/25 430/430 [==============================] - 1s 1ms/step - loss: 0.2675 - accuracy: 0.9003 - val_loss: 0.3261 - val_accuracy: 0.8844 Epoch 16/25 430/430 [==============================] - 1s 1ms/step - loss: 0.2558 - accuracy: 0.9055 - val_loss: 0.3205 - val_accuracy: 0.8820 Epoch 17/25 430/430 [==============================] - 1s 1ms/step - loss: 0.2464 - accuracy: 0.9091 - val_loss: 0.3089 - val_accuracy: 0.8894 Epoch 18/25 430/430 [==============================] - 1s 1ms/step - loss: 0.2368 - accuracy: 0.9115 - val_loss: 0.3130 - val_accuracy: 0.8870 Epoch 19/25 430/430 [==============================] - 1s 1ms/step - loss: 0.2292 - accuracy: 0.9145 - val_loss: 0.3078 - val_accuracy: 0.8854 Epoch 20/25 430/430 [==============================] - 1s 1ms/step - loss: 0.2205 - accuracy: 0.9186 - val_loss: 0.3092 - val_accuracy: 0.8886 Epoch 21/25 430/430 [==============================] - 1s 1ms/step - loss: 0.2138 - accuracy: 0.9209 - val_loss: 0.3022 - val_accuracy: 0.8914 Epoch 22/25 430/430 [==============================] - 1s 1ms/step - loss: 0.2073 - accuracy: 0.9232 - val_loss: 0.3054 - val_accuracy: 0.8914 Epoch 23/25 430/430 [==============================] - 1s 1ms/step - loss: 0.2020 - accuracy: 0.9261 - val_loss: 0.3026 - val_accuracy: 0.8896 Epoch 24/25 430/430 [==============================] - 1s 1ms/step - loss: 0.1989 - accuracy: 0.9273 - val_loss: 0.3020 - val_accuracy: 0.8922 Epoch 25/25 430/430 [==============================] - 1s 1ms/step - loss: 0.1967 - accuracy: 0.9276 - val_loss: 0.3016 - 
val_accuracy: 0.8920
layer = tf.keras.layers.Dense(100, activation="relu",
kernel_initializer="he_normal",
kernel_regularizer=tf.keras.regularizers.l2(0.01))
Or use l1(0.1) for ℓ1 regularization with a factor of 0.1, or l1_l2(0.1, 0.01) for both ℓ1 and ℓ2 regularization, with factors 0.1 and 0.01 respectively.
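For example, here is the same layer as above, but with both kinds of regularization (a one-liner sketch):
# extra code – a Dense layer with both ℓ1 and ℓ2 regularization
layer = tf.keras.layers.Dense(
    100, activation="relu", kernel_initializer="he_normal",
    kernel_regularizer=tf.keras.regularizers.l1_l2(0.1, 0.01))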
tf.random.set_seed(42) # extra code – for reproducibility
from functools import partial
RegularizedDense = partial(tf.keras.layers.Dense,
activation="relu",
kernel_initializer="he_normal",
kernel_regularizer=tf.keras.regularizers.l2(0.01))
model = tf.keras.Sequential([
tf.keras.layers.Flatten(input_shape=[28, 28]),
RegularizedDense(100),
RegularizedDense(100),
RegularizedDense(10, activation="softmax")
])
# extra code – compile and train the model
optimizer = tf.keras.optimizers.SGD(learning_rate=0.02)
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=2,
validation_data=(X_valid, y_valid))
Epoch 1/2 1719/1719 [==============================] - 2s 878us/step - loss: 3.1224 - accuracy: 0.7748 - val_loss: 1.8602 - val_accuracy: 0.8264 Epoch 2/2 1719/1719 [==============================] - 1s 814us/step - loss: 1.4263 - accuracy: 0.8159 - val_loss: 1.1269 - val_accuracy: 0.8182
tf.random.set_seed(42) # extra code – for reproducibility
model = tf.keras.Sequential([
tf.keras.layers.Flatten(input_shape=[28, 28]),
tf.keras.layers.Dropout(rate=0.2),
tf.keras.layers.Dense(100, activation="relu",
kernel_initializer="he_normal"),
tf.keras.layers.Dropout(rate=0.2),
tf.keras.layers.Dense(100, activation="relu",
kernel_initializer="he_normal"),
tf.keras.layers.Dropout(rate=0.2),
tf.keras.layers.Dense(10, activation="softmax")
])
# extra code – compile and train the model
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10,
validation_data=(X_valid, y_valid))
Epoch 1/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.6703 - accuracy: 0.7536 - val_loss: 0.4498 - val_accuracy: 0.8342 Epoch 2/10 1719/1719 [==============================] - 2s 996us/step - loss: 0.5103 - accuracy: 0.8136 - val_loss: 0.4401 - val_accuracy: 0.8296 Epoch 3/10 1719/1719 [==============================] - 2s 998us/step - loss: 0.4712 - accuracy: 0.8263 - val_loss: 0.3806 - val_accuracy: 0.8554 Epoch 4/10 1719/1719 [==============================] - 2s 977us/step - loss: 0.4488 - accuracy: 0.8337 - val_loss: 0.3711 - val_accuracy: 0.8608 Epoch 5/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.4342 - accuracy: 0.8409 - val_loss: 0.3672 - val_accuracy: 0.8606 Epoch 6/10 1719/1719 [==============================] - 2s 983us/step - loss: 0.4245 - accuracy: 0.8427 - val_loss: 0.3706 - val_accuracy: 0.8600 Epoch 7/10 1719/1719 [==============================] - 2s 995us/step - loss: 0.4131 - accuracy: 0.8467 - val_loss: 0.3582 - val_accuracy: 0.8650 Epoch 8/10 1719/1719 [==============================] - 2s 959us/step - loss: 0.4074 - accuracy: 0.8484 - val_loss: 0.3478 - val_accuracy: 0.8708 Epoch 9/10 1719/1719 [==============================] - 2s 997us/step - loss: 0.4024 - accuracy: 0.8533 - val_loss: 0.3556 - val_accuracy: 0.8690 Epoch 10/10 1719/1719 [==============================] - 2s 998us/step - loss: 0.3903 - accuracy: 0.8552 - val_loss: 0.3453 - val_accuracy: 0.8732
The training accuracy looks like it's lower than the validation accuracy, but that's just because dropout is only active during training. If we evaluate the model on the training set after training (i.e., with dropout turned off), we get the "real" training accuracy, which is very slightly higher than the validation accuracy and the test accuracy:
model.evaluate(X_train, y_train)
1719/1719 [==============================] - 1s 578us/step - loss: 0.3082 - accuracy: 0.8849
[0.30816400051116943, 0.8849090933799744]
model.evaluate(X_test, y_test)
313/313 [==============================] - 0s 588us/step - loss: 0.3629 - accuracy: 0.8700
[0.3628920316696167, 0.8700000047683716]
Note: make sure to use AlphaDropout instead of Dropout if you want to build a self-normalizing neural net using SELU.
tf.random.set_seed(42) # extra code – for reproducibility
y_probas = np.stack([model(X_test, training=True)
for sample in range(100)])
y_proba = y_probas.mean(axis=0)
model.predict(X_test[:1]).round(3)
array([[0. , 0. , 0. , 0. , 0. , 0.024, 0. , 0.132, 0. , 0.844]], dtype=float32)
y_proba[0].round(3)
array([0. , 0. , 0. , 0. , 0. , 0.067, 0. , 0.209, 0.001, 0.723], dtype=float32)
y_std = y_probas.std(axis=0)
y_std[0].round(3)
array([0. , 0. , 0. , 0.001, 0. , 0.096, 0. , 0.162, 0.001, 0.183], dtype=float32)
y_pred = y_proba.argmax(axis=1)
accuracy = (y_pred == y_test).sum() / len(y_test)
accuracy
0.8717
class MCDropout(tf.keras.layers.Dropout):
def call(self, inputs, training=None):
return super().call(inputs, training=True)
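Similarly, for a self-normalizing SELU network regularized with AlphaDropout, we could define an MCAlphaDropout variant following the same pattern (a sketch, used again in the exercises below):
# extra code – the same trick for AlphaDropout (useful for SELU networks)
class MCAlphaDropout(tf.keras.layers.AlphaDropout):
    def call(self, inputs, training=None):
        return super().call(inputs, training=True)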
# extra code – shows how to convert Dropout to MCDropout in a Sequential model
Dropout = tf.keras.layers.Dropout
mc_model = tf.keras.Sequential([
MCDropout(layer.rate) if isinstance(layer, Dropout) else layer
for layer in model.layers
])
mc_model.set_weights(model.get_weights())
mc_model.summary()
Model: "sequential_25" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= flatten_22 (Flatten) (None, 784) 0 _________________________________________________________________ mc_dropout (MCDropout) (None, 784) 0 _________________________________________________________________ dense_89 (Dense) (None, 100) 78500 _________________________________________________________________ mc_dropout_1 (MCDropout) (None, 100) 0 _________________________________________________________________ dense_90 (Dense) (None, 100) 10100 _________________________________________________________________ mc_dropout_2 (MCDropout) (None, 100) 0 _________________________________________________________________ dense_91 (Dense) (None, 10) 1010 ================================================================= Total params: 89,610 Trainable params: 89,610 Non-trainable params: 0 _________________________________________________________________
Now we can use the model with MC Dropout:
# extra code – shows that the model works without retraining
tf.random.set_seed(42)
np.mean([mc_model.predict(X_test[:1])
for sample in range(100)], axis=0).round(2)
array([[0. , 0. , 0. , 0. , 0. , 0.07, 0. , 0.17, 0. , 0.76]], dtype=float32)
dense = tf.keras.layers.Dense(
100, activation="relu", kernel_initializer="he_normal",
kernel_constraint=tf.keras.constraints.max_norm(1.))
# extra code – shows how to apply max norm to every hidden layer in a model
MaxNormDense = partial(tf.keras.layers.Dense,
activation="relu", kernel_initializer="he_normal",
kernel_constraint=tf.keras.constraints.max_norm(1.))
tf.random.set_seed(42)
model = tf.keras.Sequential([
tf.keras.layers.Flatten(input_shape=[28, 28]),
MaxNormDense(100),
MaxNormDense(100),
tf.keras.layers.Dense(10, activation="softmax")
])
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10,
validation_data=(X_valid, y_valid))
Epoch 1/10 1719/1719 [==============================] - 2s 1ms/step - loss: 0.5500 - accuracy: 0.8015 - val_loss: 0.4510 - val_accuracy: 0.8242 Epoch 2/10 1719/1719 [==============================] - 2s 960us/step - loss: 0.4089 - accuracy: 0.8499 - val_loss: 0.3956 - val_accuracy: 0.8504 Epoch 3/10 1719/1719 [==============================] - 2s 974us/step - loss: 0.3777 - accuracy: 0.8604 - val_loss: 0.3693 - val_accuracy: 0.8680 Epoch 4/10 1719/1719 [==============================] - 2s 943us/step - loss: 0.3581 - accuracy: 0.8690 - val_loss: 0.3517 - val_accuracy: 0.8716 Epoch 5/10 1719/1719 [==============================] - 2s 949us/step - loss: 0.3416 - accuracy: 0.8729 - val_loss: 0.3433 - val_accuracy: 0.8682 Epoch 6/10 1719/1719 [==============================] - 2s 951us/step - loss: 0.3368 - accuracy: 0.8756 - val_loss: 0.4045 - val_accuracy: 0.8582 Epoch 7/10 1719/1719 [==============================] - 2s 935us/step - loss: 0.3293 - accuracy: 0.8767 - val_loss: 0.4168 - val_accuracy: 0.8476 Epoch 8/10 1719/1719 [==============================] - 2s 951us/step - loss: 0.3258 - accuracy: 0.8779 - val_loss: 0.3570 - val_accuracy: 0.8674 Epoch 9/10 1719/1719 [==============================] - 2s 970us/step - loss: 0.3269 - accuracy: 0.8787 - val_loss: 0.3702 - val_accuracy: 0.8578 Epoch 10/10 1719/1719 [==============================] - 2s 948us/step - loss: 0.3169 - accuracy: 0.8809 - val_loss: 0.3907 - val_accuracy: 0.8578
If you set the momentum hyperparameter too close to 1 (e.g., 0.99999) when using an SGD optimizer, then the algorithm will likely pick up a lot of speed, hopefully moving roughly toward the global minimum, but its momentum will carry it right past the minimum. Then it will slow down and come back, accelerate again, overshoot again, and so on. It may oscillate this way many times before converging, so overall it will take much longer to converge than with a smaller momentum value.
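For example, the following sketch (hypothetical values, just for illustration) shows two otherwise-identical optimizers, one with a reasonable momentum and one with a value that is almost certain to cause this oscillating behavior:
# extra code – hypothetical comparison: reasonable vs. excessive momentum
good_optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)
bad_optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.99999)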
Exercise: Build a DNN with 20 hidden layers of 100 neurons each (that's too many, but it's the point of this exercise). Use He initialization and the Swish activation function.
tf.random.set_seed(42)
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[32, 32, 3]))
for _ in range(20):
model.add(tf.keras.layers.Dense(100,
activation="swish",
kernel_initializer="he_normal"))
Exercise: Using Nadam optimization and early stopping, train the network on the CIFAR10 dataset. You can load it with tf.keras.datasets.cifar10.load_data(). The dataset is composed of 60,000 32 × 32–pixel color images (50,000 for training, 10,000 for testing) with 10 classes, so you'll need a softmax output layer with 10 neurons. Remember to search for the right learning rate each time you change the model's architecture or hyperparameters.
Let's add the output layer to the model:
model.add(tf.keras.layers.Dense(10, activation="softmax"))
Let's use a Nadam optimizer with a learning rate of 5e-5. I tried learning rates 1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 3e-3 and 1e-2, and I compared their learning curves for 10 epochs each (using the TensorBoard callback, below). The learning rates 3e-5 and 1e-4 were pretty good, so I tried 5e-5, which turned out to be slightly better.
optimizer = tf.keras.optimizers.Nadam(learning_rate=5e-5)
model.compile(loss="sparse_categorical_crossentropy",
optimizer=optimizer,
metrics=["accuracy"])
Let's load the CIFAR10 dataset. We also want to use early stopping, so we need a validation set. Let's use the first 5,000 images of the original training set as the validation set:
cifar10 = tf.keras.datasets.cifar10.load_data()
(X_train_full, y_train_full), (X_test, y_test) = cifar10
X_train = X_train_full[5000:]
y_train = y_train_full[5000:]
X_valid = X_train_full[:5000]
y_valid = y_train_full[:5000]
Now we can create the callbacks we need and train the model:
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=20,
restore_best_weights=True)
model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("my_cifar10_model",
save_best_only=True)
run_index = 1 # increment every time you train the model
run_logdir = Path() / "my_cifar10_logs" / f"run_{run_index:03d}"
tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)
callbacks = [early_stopping_cb, model_checkpoint_cb, tensorboard_cb]
%load_ext tensorboard
%tensorboard --logdir=./my_cifar10_logs
model.fit(X_train, y_train, epochs=100,
validation_data=(X_valid, y_valid),
callbacks=callbacks)
Epoch 1/100 1404/1407 [============================>.] - ETA: 0s - loss: 4.0493 - accuracy: 0.1598INFO:tensorflow:Assets written to: my_cifar10_model/assets 1407/1407 [==============================] - 17s 10ms/step - loss: 4.0462 - accuracy: 0.1597 - val_loss: 2.1441 - val_accuracy: 0.2036 Epoch 2/100 1407/1407 [==============================] - ETA: 0s - loss: 2.0667 - accuracy: 0.2320INFO:tensorflow:Assets written to: my_cifar10_model/assets 1407/1407 [==============================] - 12s 9ms/step - loss: 2.0667 - accuracy: 0.2320 - val_loss: 2.0134 - val_accuracy: 0.2472 Epoch 3/100 1407/1407 [==============================] - ETA: 0s - loss: 1.9472 - accuracy: 0.2819INFO:tensorflow:Assets written to: my_cifar10_model/assets 1407/1407 [==============================] - 14s 10ms/step - loss: 1.9472 - accuracy: 0.2819 - val_loss: 1.9427 - val_accuracy: 0.2796 Epoch 4/100 1405/1407 [============================>.] - ETA: 0s - loss: 1.8636 - accuracy: 0.3182INFO:tensorflow:Assets written to: my_cifar10_model/assets 1407/1407 [==============================] - 14s 10ms/step - loss: 1.8637 - accuracy: 0.3182 - val_loss: 1.8934 - val_accuracy: 0.3222 Epoch 5/100 1402/1407 [============================>.] - ETA: 0s - loss: 1.7975 - accuracy: 0.3464INFO:tensorflow:Assets written to: my_cifar10_model/assets 1407/1407 [==============================] - 14s 10ms/step - loss: 1.7974 - accuracy: 0.3465 - val_loss: 1.8389 - val_accuracy: 0.3284 Epoch 6/100 1407/1407 [==============================] - 9s 7ms/step - loss: 1.7446 - accuracy: 0.3664 - val_loss: 2.0006 - val_accuracy: 0.3030 Epoch 7/100 1407/1407 [==============================] - ETA: 0s - loss: 1.6974 - accuracy: 0.3852INFO:tensorflow:Assets written to: my_cifar10_model/assets 1407/1407 [==============================] - 12s 8ms/step - loss: 1.6974 - accuracy: 0.3852 - val_loss: 1.7075 - val_accuracy: 0.3738 Epoch 8/100 1405/1407 [============================>.] - ETA: 0s - loss: 1.6605 - accuracy: 0.3984INFO:tensorflow:Assets written to: my_cifar10_model/assets 1407/1407 [==============================] - 12s 8ms/step - loss: 1.6604 - accuracy: 0.3984 - val_loss: 1.6788 - val_accuracy: 0.3836 Epoch 9/100 1405/1407 [============================>.] - ETA: 0s - loss: 1.6322 - accuracy: 0.4114INFO:tensorflow:Assets written to: my_cifar10_model/assets 1407/1407 [==============================] - 14s 10ms/step - loss: 1.6321 - accuracy: 0.4114 - val_loss: 1.6477 - val_accuracy: 0.4014 Epoch 10/100 1407/1407 [==============================] - 12s 8ms/step - loss: 1.6065 - accuracy: 0.4205 - val_loss: 1.6623 - val_accuracy: 0.3980 Epoch 11/100 1401/1407 [============================>.] - ETA: 0s - loss: 1.5843 - accuracy: 0.4287INFO:tensorflow:Assets written to: my_cifar10_model/assets 1407/1407 [==============================] - 14s 10ms/step - loss: 1.5845 - accuracy: 0.4285 - val_loss: 1.6032 - val_accuracy: 0.4198 Epoch 12/100 1407/1407 [==============================] - 9s 6ms/step - loss: 1.5634 - accuracy: 0.4367 - val_loss: 1.6063 - val_accuracy: 0.4258 Epoch 13/100 1401/1407 [============================>.] 
- ETA: 0s - loss: 1.5443 - accuracy: 0.4420INFO:tensorflow:Assets written to: my_cifar10_model/assets <<47 more lines>> 1407/1407 [==============================] - 12s 8ms/step - loss: 1.3247 - accuracy: 0.5256 - val_loss: 1.5130 - val_accuracy: 0.4616 Epoch 33/100 1407/1407 [==============================] - 13s 9ms/step - loss: 1.3164 - accuracy: 0.5286 - val_loss: 1.5284 - val_accuracy: 0.4686 Epoch 34/100 1407/1407 [==============================] - 12s 9ms/step - loss: 1.3091 - accuracy: 0.5303 - val_loss: 1.5208 - val_accuracy: 0.4682 Epoch 35/100 1407/1407 [==============================] - 12s 8ms/step - loss: 1.3026 - accuracy: 0.5319 - val_loss: 1.5479 - val_accuracy: 0.4604 Epoch 36/100 1407/1407 [==============================] - 11s 8ms/step - loss: 1.2930 - accuracy: 0.5378 - val_loss: 1.5443 - val_accuracy: 0.4580 Epoch 37/100 1407/1407 [==============================] - 12s 8ms/step - loss: 1.2833 - accuracy: 0.5406 - val_loss: 1.5165 - val_accuracy: 0.4710 Epoch 38/100 1407/1407 [==============================] - 11s 8ms/step - loss: 1.2763 - accuracy: 0.5433 - val_loss: 1.5345 - val_accuracy: 0.4672 Epoch 39/100 1407/1407 [==============================] - 12s 9ms/step - loss: 1.2687 - accuracy: 0.5437 - val_loss: 1.5162 - val_accuracy: 0.4712 Epoch 40/100 1407/1407 [==============================] - 11s 7ms/step - loss: 1.2623 - accuracy: 0.5490 - val_loss: 1.5717 - val_accuracy: 0.4566 Epoch 41/100 1407/1407 [==============================] - 14s 10ms/step - loss: 1.2580 - accuracy: 0.5467 - val_loss: 1.5296 - val_accuracy: 0.4738 Epoch 42/100 1407/1407 [==============================] - 13s 9ms/step - loss: 1.2469 - accuracy: 0.5532 - val_loss: 1.5179 - val_accuracy: 0.4690 Epoch 43/100 1407/1407 [==============================] - 11s 8ms/step - loss: 1.2404 - accuracy: 0.5542 - val_loss: 1.5542 - val_accuracy: 0.4566 Epoch 44/100 1407/1407 [==============================] - 12s 8ms/step - loss: 1.2292 - accuracy: 0.5605 - val_loss: 1.5536 - val_accuracy: 0.4608 Epoch 45/100 1407/1407 [==============================] - 12s 9ms/step - loss: 1.2276 - accuracy: 0.5606 - val_loss: 1.5522 - val_accuracy: 0.4624 Epoch 46/100 1407/1407 [==============================] - 13s 9ms/step - loss: 1.2200 - accuracy: 0.5637 - val_loss: 1.5339 - val_accuracy: 0.4794 Epoch 47/100 1407/1407 [==============================] - 13s 9ms/step - loss: 1.2080 - accuracy: 0.5677 - val_loss: 1.5451 - val_accuracy: 0.4688 Epoch 48/100 1407/1407 [==============================] - 15s 10ms/step - loss: 1.2050 - accuracy: 0.5675 - val_loss: 1.5209 - val_accuracy: 0.4770 Epoch 49/100 1407/1407 [==============================] - 10s 7ms/step - loss: 1.1947 - accuracy: 0.5718 - val_loss: 1.5435 - val_accuracy: 0.4736
<keras.callbacks.History at 0x7fb9f02fc070>
model.evaluate(X_valid, y_valid)
157/157 [==============================] - 0s 2ms/step - loss: 1.5062 - accuracy: 0.4676
[1.5061508417129517, 0.4675999879837036]
The model with the lowest validation loss gets about 46.8% accuracy on the validation set. It took 29 epochs to reach the lowest validation loss, with roughly 10 seconds per epoch on my laptop (without a GPU). Let's see if we can improve the model using Batch Normalization.
Exercise: Now try adding Batch Normalization and compare the learning curves: Is it converging faster than before? Does it produce a better model? How does it affect training speed?
The code below is very similar to the code above, with a few changes: a BatchNormalization layer is added after every hidden Dense layer (before the activation function), the learning rate is raised to 5e-4, the run directories are named run_bn_*, and the model is saved as my_cifar10_bn_model.
tf.random.set_seed(42)
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[32, 32, 3]))
for _ in range(20):
model.add(tf.keras.layers.Dense(100, kernel_initializer="he_normal"))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Activation("swish"))
model.add(tf.keras.layers.Dense(10, activation="softmax"))
optimizer = tf.keras.optimizers.Nadam(learning_rate=5e-4)
model.compile(loss="sparse_categorical_crossentropy",
optimizer=optimizer,
metrics=["accuracy"])
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=20,
restore_best_weights=True)
model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("my_cifar10_bn_model",
save_best_only=True)
run_index = 1 # increment every time you train the model
run_logdir = Path() / "my_cifar10_logs" / f"run_bn_{run_index:03d}"
tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)
callbacks = [early_stopping_cb, model_checkpoint_cb, tensorboard_cb]
model.fit(X_train, y_train, epochs=100,
validation_data=(X_valid, y_valid),
callbacks=callbacks)
model.evaluate(X_valid, y_valid)
Epoch 1/100 1403/1407 [============================>.] - ETA: 0s - loss: 2.0377 - accuracy: 0.2523INFO:tensorflow:Assets written to: my_cifar10_bn_model/assets 1407/1407 [==============================] - 32s 18ms/step - loss: 2.0374 - accuracy: 0.2525 - val_loss: 1.8766 - val_accuracy: 0.3154 Epoch 2/100 1407/1407 [==============================] - 17s 12ms/step - loss: 1.7874 - accuracy: 0.3542 - val_loss: 1.8784 - val_accuracy: 0.3268 Epoch 3/100 1407/1407 [==============================] - 20s 15ms/step - loss: 1.6806 - accuracy: 0.3969 - val_loss: 1.9764 - val_accuracy: 0.3252 Epoch 4/100 1403/1407 [============================>.] - ETA: 0s - loss: 1.6111 - accuracy: 0.4229INFO:tensorflow:Assets written to: my_cifar10_bn_model/assets 1407/1407 [==============================] - 24s 17ms/step - loss: 1.6112 - accuracy: 0.4228 - val_loss: 1.7087 - val_accuracy: 0.3750 Epoch 5/100 1402/1407 [============================>.] - ETA: 0s - loss: 1.5520 - accuracy: 0.4478INFO:tensorflow:Assets written to: my_cifar10_bn_model/assets 1407/1407 [==============================] - 21s 15ms/step - loss: 1.5521 - accuracy: 0.4476 - val_loss: 1.6272 - val_accuracy: 0.4176 Epoch 6/100 1406/1407 [============================>.] - ETA: 0s - loss: 1.5030 - accuracy: 0.4659INFO:tensorflow:Assets written to: my_cifar10_bn_model/assets 1407/1407 [==============================] - 23s 16ms/step - loss: 1.5030 - accuracy: 0.4660 - val_loss: 1.5401 - val_accuracy: 0.4452 Epoch 7/100 1407/1407 [==============================] - 15s 11ms/step - loss: 1.4559 - accuracy: 0.4812 - val_loss: 1.6990 - val_accuracy: 0.3952 Epoch 8/100 1403/1407 [============================>.] - ETA: 0s - loss: 1.4169 - accuracy: 0.4987INFO:tensorflow:Assets written to: my_cifar10_bn_model/assets 1407/1407 [==============================] - 21s 15ms/step - loss: 1.4168 - accuracy: 0.4987 - val_loss: 1.5078 - val_accuracy: 0.4652 Epoch 9/100 1407/1407 [==============================] - 15s 11ms/step - loss: 1.3863 - accuracy: 0.5123 - val_loss: 1.5513 - val_accuracy: 0.4470 Epoch 10/100 1407/1407 [==============================] - 17s 12ms/step - loss: 1.3514 - accuracy: 0.5216 - val_loss: 1.5208 - val_accuracy: 0.4562 Epoch 11/100 1407/1407 [==============================] - 16s 12ms/step - loss: 1.3220 - accuracy: 0.5314 - val_loss: 1.7301 - val_accuracy: 0.4206 Epoch 12/100 1404/1407 [============================>.] 
- ETA: 0s - loss: 1.2933 - accuracy: 0.5410INFO:tensorflow:Assets written to: my_cifar10_bn_model/assets 1407/1407 [==============================] - 25s 18ms/step - loss: 1.2931 - accuracy: 0.5410 - val_loss: 1.4909 - val_accuracy: 0.4734 Epoch 13/100 1407/1407 [==============================] - 16s 11ms/step - loss: 1.2702 - accuracy: 0.5490 - val_loss: 1.5256 - val_accuracy: 0.4636 Epoch 14/100 1407/1407 [==============================] - 17s 12ms/step - loss: 1.2424 - accuracy: 0.5591 - val_loss: 1.5569 - val_accuracy: 0.4624 Epoch 15/100 <<12 more lines>> Epoch 21/100 1407/1407 [==============================] - 15s 11ms/step - loss: 1.1174 - accuracy: 0.6066 - val_loss: 1.5241 - val_accuracy: 0.4828 Epoch 22/100 1407/1407 [==============================] - 18s 13ms/step - loss: 1.0978 - accuracy: 0.6128 - val_loss: 1.5313 - val_accuracy: 0.4772 Epoch 23/100 1407/1407 [==============================] - 17s 12ms/step - loss: 1.0844 - accuracy: 0.6198 - val_loss: 1.4993 - val_accuracy: 0.4924 Epoch 24/100 1407/1407 [==============================] - 17s 12ms/step - loss: 1.0677 - accuracy: 0.6244 - val_loss: 1.4622 - val_accuracy: 0.5078 Epoch 25/100 1407/1407 [==============================] - 18s 13ms/step - loss: 1.0571 - accuracy: 0.6297 - val_loss: 1.4917 - val_accuracy: 0.4990 Epoch 26/100 1407/1407 [==============================] - 19s 14ms/step - loss: 1.0395 - accuracy: 0.6327 - val_loss: 1.4888 - val_accuracy: 0.4896 Epoch 27/100 1407/1407 [==============================] - 18s 13ms/step - loss: 1.0298 - accuracy: 0.6370 - val_loss: 1.5358 - val_accuracy: 0.5024 Epoch 28/100 1407/1407 [==============================] - 18s 13ms/step - loss: 1.0150 - accuracy: 0.6444 - val_loss: 1.5219 - val_accuracy: 0.5030 Epoch 29/100 1407/1407 [==============================] - 16s 12ms/step - loss: 1.0100 - accuracy: 0.6456 - val_loss: 1.4933 - val_accuracy: 0.5098 Epoch 30/100 1407/1407 [==============================] - 20s 14ms/step - loss: 0.9956 - accuracy: 0.6492 - val_loss: 1.4756 - val_accuracy: 0.5012 Epoch 31/100 1407/1407 [==============================] - 16s 11ms/step - loss: 0.9787 - accuracy: 0.6576 - val_loss: 1.5181 - val_accuracy: 0.4936 Epoch 32/100 1407/1407 [==============================] - 17s 12ms/step - loss: 0.9710 - accuracy: 0.6565 - val_loss: 1.7510 - val_accuracy: 0.4568 Epoch 33/100 1407/1407 [==============================] - 20s 14ms/step - loss: 0.9613 - accuracy: 0.6628 - val_loss: 1.5576 - val_accuracy: 0.4910 Epoch 34/100 1407/1407 [==============================] - 19s 14ms/step - loss: 0.9530 - accuracy: 0.6651 - val_loss: 1.5087 - val_accuracy: 0.5046 Epoch 35/100 1407/1407 [==============================] - 19s 13ms/step - loss: 0.9388 - accuracy: 0.6701 - val_loss: 1.5534 - val_accuracy: 0.4950 Epoch 36/100 1407/1407 [==============================] - 17s 12ms/step - loss: 0.9331 - accuracy: 0.6743 - val_loss: 1.5033 - val_accuracy: 0.5046 Epoch 37/100 1407/1407 [==============================] - 19s 14ms/step - loss: 0.9144 - accuracy: 0.6808 - val_loss: 1.5679 - val_accuracy: 0.5028 157/157 [==============================] - 0s 2ms/step - loss: 1.4236 - accuracy: 0.5074
[1.4236289262771606, 0.5073999762535095]
Exercise: Try replacing Batch Normalization with SELU, and make the necessary adjustments to ensure the network self-normalizes (i.e., standardize the input features, use LeCun normal initialization, make sure the DNN contains only a sequence of dense layers, etc.).
tf.random.set_seed(42)
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[32, 32, 3]))
for _ in range(20):
model.add(tf.keras.layers.Dense(100,
kernel_initializer="lecun_normal",
activation="selu"))
model.add(tf.keras.layers.Dense(10, activation="softmax"))
optimizer = tf.keras.optimizers.Nadam(learning_rate=7e-4)
model.compile(loss="sparse_categorical_crossentropy",
optimizer=optimizer,
metrics=["accuracy"])
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
patience=20, restore_best_weights=True)
model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
"my_cifar10_selu_model", save_best_only=True)
run_index = 1 # increment every time you train the model
run_logdir = Path() / "my_cifar10_logs" / f"run_selu_{run_index:03d}"
tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)
callbacks = [early_stopping_cb, model_checkpoint_cb, tensorboard_cb]
X_means = X_train.mean(axis=0)
X_stds = X_train.std(axis=0)
X_train_scaled = (X_train - X_means) / X_stds
X_valid_scaled = (X_valid - X_means) / X_stds
X_test_scaled = (X_test - X_means) / X_stds
model.fit(X_train_scaled, y_train, epochs=100,
validation_data=(X_valid_scaled, y_valid),
callbacks=callbacks)
model.evaluate(X_valid_scaled, y_valid)
Epoch 1/100 1403/1407 [============================>.] - ETA: 0s - loss: 1.9386 - accuracy: 0.3045INFO:tensorflow:Assets written to: my_cifar10_selu_model/assets 1407/1407 [==============================] - 20s 13ms/step - loss: 1.9385 - accuracy: 0.3046 - val_loss: 1.8175 - val_accuracy: 0.3510 Epoch 2/100 1405/1407 [============================>.] - ETA: 0s - loss: 1.7241 - accuracy: 0.3869INFO:tensorflow:Assets written to: my_cifar10_selu_model/assets 1407/1407 [==============================] - 16s 11ms/step - loss: 1.7241 - accuracy: 0.3869 - val_loss: 1.7677 - val_accuracy: 0.3614 Epoch 3/100 1407/1407 [==============================] - ETA: 0s - loss: 1.6272 - accuracy: 0.4263INFO:tensorflow:Assets written to: my_cifar10_selu_model/assets 1407/1407 [==============================] - 18s 13ms/step - loss: 1.6272 - accuracy: 0.4263 - val_loss: 1.6878 - val_accuracy: 0.4054 Epoch 4/100 1406/1407 [============================>.] - ETA: 0s - loss: 1.5644 - accuracy: 0.4492INFO:tensorflow:Assets written to: my_cifar10_selu_model/assets 1407/1407 [==============================] - 18s 13ms/step - loss: 1.5643 - accuracy: 0.4492 - val_loss: 1.6589 - val_accuracy: 0.4304 Epoch 5/100 1404/1407 [============================>.] - ETA: 0s - loss: 1.5080 - accuracy: 0.4712INFO:tensorflow:Assets written to: my_cifar10_selu_model/assets 1407/1407 [==============================] - 16s 11ms/step - loss: 1.5080 - accuracy: 0.4712 - val_loss: 1.5651 - val_accuracy: 0.4538 Epoch 6/100 1404/1407 [============================>.] - ETA: 0s - loss: 1.4611 - accuracy: 0.4873INFO:tensorflow:Assets written to: my_cifar10_selu_model/assets 1407/1407 [==============================] - 17s 12ms/step - loss: 1.4613 - accuracy: 0.4872 - val_loss: 1.5305 - val_accuracy: 0.4678 Epoch 7/100 1407/1407 [==============================] - 17s 12ms/step - loss: 1.4174 - accuracy: 0.5077 - val_loss: 1.5346 - val_accuracy: 0.4558 Epoch 8/100 1406/1407 [============================>.] - ETA: 0s - loss: 1.3781 - accuracy: 0.5175INFO:tensorflow:Assets written to: my_cifar10_selu_model/assets 1407/1407 [==============================] - 17s 12ms/step - loss: 1.3781 - accuracy: 0.5175 - val_loss: 1.4773 - val_accuracy: 0.4882 Epoch 9/100 1407/1407 [==============================] - 16s 11ms/step - loss: 1.3413 - accuracy: 0.5345 - val_loss: 1.5021 - val_accuracy: 0.4764 Epoch 10/100 1407/1407 [==============================] - 15s 10ms/step - loss: 1.3182 - accuracy: 0.5422 - val_loss: 1.5709 - val_accuracy: 0.4762 Epoch 11/100 1407/1407 [==============================] - 15s 11ms/step - loss: 1.2832 - accuracy: 0.5571 - val_loss: 1.5345 - val_accuracy: 0.4868 Epoch 12/100 1407/1407 [==============================] - 15s 11ms/step - loss: 1.2557 - accuracy: 0.5667 - val_loss: 1.5024 - val_accuracy: 0.4900 Epoch 13/100 1407/1407 [==============================] - 15s 11ms/step - loss: 1.2373 - accuracy: 0.5710 - val_loss: 1.5114 - val_accuracy: 0.5028 Epoch 14/100 1404/1407 [============================>.] 
- ETA: 0s - loss: 1.2071 - accuracy: 0.5846INFO:tensorflow:Assets written to: my_cifar10_selu_model/assets 1407/1407 [==============================] - 17s 12ms/step - loss: 1.2073 - accuracy: 0.5847 - val_loss: 1.4608 - val_accuracy: 0.5026 Epoch 15/100 1407/1407 [==============================] - 16s 11ms/step - loss: 1.1843 - accuracy: 0.5940 - val_loss: 1.4962 - val_accuracy: 0.5038 Epoch 16/100 1407/1407 [==============================] - 16s 12ms/step - loss: 1.1617 - accuracy: 0.6026 - val_loss: 1.5255 - val_accuracy: 0.5062 Epoch 17/100 1407/1407 [==============================] - 16s 11ms/step - loss: 1.1452 - accuracy: 0.6084 - val_loss: 1.5057 - val_accuracy: 0.5036 Epoch 18/100 1407/1407 [==============================] - 17s 12ms/step - loss: 1.1297 - accuracy: 0.6145 - val_loss: 1.5097 - val_accuracy: 0.5010 Epoch 19/100 1407/1407 [==============================] - 16s 12ms/step - loss: 1.1004 - accuracy: 0.6245 - val_loss: 1.5218 - val_accuracy: 0.5014 Epoch 20/100 1407/1407 [==============================] - 15s 11ms/step - loss: 1.0971 - accuracy: 0.6304 - val_loss: 1.5253 - val_accuracy: 0.5090 Epoch 21/100 1407/1407 [==============================] - 16s 11ms/step - loss: 1.0670 - accuracy: 0.6345 - val_loss: 1.5006 - val_accuracy: 0.5034 Epoch 22/100 1407/1407 [==============================] - 16s 11ms/step - loss: 1.0544 - accuracy: 0.6407 - val_loss: 1.5244 - val_accuracy: 0.5010 Epoch 23/100 1407/1407 [==============================] - 15s 11ms/step - loss: 1.0338 - accuracy: 0.6502 - val_loss: 1.5355 - val_accuracy: 0.5096 Epoch 24/100 1407/1407 [==============================] - 14s 10ms/step - loss: 1.0281 - accuracy: 0.6514 - val_loss: 1.5257 - val_accuracy: 0.5164 Epoch 25/100 1407/1407 [==============================] - 14s 10ms/step - loss: 1.4097 - accuracy: 0.6478 - val_loss: 1.8203 - val_accuracy: 0.3514 Epoch 26/100 1407/1407 [==============================] - 16s 11ms/step - loss: 1.3733 - accuracy: 0.5157 - val_loss: 1.5600 - val_accuracy: 0.4664 Epoch 27/100 1407/1407 [==============================] - 15s 11ms/step - loss: 1.2032 - accuracy: 0.5814 - val_loss: 1.5367 - val_accuracy: 0.4944 Epoch 28/100 1407/1407 [==============================] - 16s 11ms/step - loss: 1.1291 - accuracy: 0.6121 - val_loss: 1.5333 - val_accuracy: 0.4852 Epoch 29/100 1407/1407 [==============================] - 15s 11ms/step - loss: 1.0734 - accuracy: 0.6317 - val_loss: 1.5475 - val_accuracy: 0.5032 Epoch 30/100 1407/1407 [==============================] - 15s 11ms/step - loss: 1.0294 - accuracy: 0.6469 - val_loss: 1.5400 - val_accuracy: 0.5052 Epoch 31/100 1407/1407 [==============================] - 15s 11ms/step - loss: 1.0081 - accuracy: 0.6605 - val_loss: 1.5617 - val_accuracy: 0.4856 Epoch 32/100 1407/1407 [==============================] - 15s 11ms/step - loss: 1.0109 - accuracy: 0.6603 - val_loss: 1.5727 - val_accuracy: 0.5124 Epoch 33/100 1407/1407 [==============================] - 17s 12ms/step - loss: 0.9646 - accuracy: 0.6762 - val_loss: 1.5333 - val_accuracy: 0.5174 Epoch 34/100 1407/1407 [==============================] - 16s 11ms/step - loss: 0.9597 - accuracy: 0.6789 - val_loss: 1.5601 - val_accuracy: 0.5016 157/157 [==============================] - 0s 1ms/step - loss: 1.4608 - accuracy: 0.5026
[1.4607702493667603, 0.5026000142097473]
This model reached the first model's best validation loss in just 8 epochs. After 14 epochs, it reached its lowest validation loss, with about 50.3% accuracy, which is better than the original model (46.8%), but not quite as good as the model using batch normalization (50.7%). Each epoch took roughly 15 seconds in the run above, noticeably faster than the batch normalization model, making this the quickest model so far to reach a good validation loss.
Exercise: Try regularizing the model with alpha dropout. Then, without retraining your model, see if you can achieve better accuracy using MC Dropout.
tf.random.set_seed(42)
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[32, 32, 3]))
for _ in range(20):
model.add(tf.keras.layers.Dense(100,
kernel_initializer="lecun_normal",
activation="selu"))
model.add(tf.keras.layers.AlphaDropout(rate=0.1))
model.add(tf.keras.layers.Dense(10, activation="softmax"))
optimizer = tf.keras.optimizers.Nadam(learning_rate=5e-4)
model.compile(loss="sparse_categorical_crossentropy",
optimizer=optimizer,
metrics=["accuracy"])
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
patience=20, restore_best_weights=True)
model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
"my_cifar10_alpha_dropout_model", save_best_only=True)
run_index = 1 # increment every time you train the model
run_logdir = Path() / "my_cifar10_logs" / f"run_alpha_dropout_{run_index:03d}"
tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)
callbacks = [early_stopping_cb, model_checkpoint_cb, tensorboard_cb]
X_means = X_train.mean(axis=0)
X_stds = X_train.std(axis=0)
X_train_scaled = (X_train - X_means) / X_stds
X_valid_scaled = (X_valid - X_means) / X_stds
X_test_scaled = (X_test - X_means) / X_stds
model.fit(X_train_scaled, y_train, epochs=100,
validation_data=(X_valid_scaled, y_valid),
callbacks=callbacks)
model.evaluate(X_valid_scaled, y_valid)
Epoch 1/100
1407/1407 [==============================] - 18s 11ms/step - loss: 1.8950 - accuracy: 0.3239 - val_loss: 1.7556 - val_accuracy: 0.3812
Epoch 2/100
1407/1407 [==============================] - 16s 11ms/step - loss: 1.6618 - accuracy: 0.4130 - val_loss: 1.6563 - val_accuracy: 0.4114
[...]
Epoch 8/100
1407/1407 [==============================] - 15s 11ms/step - loss: 1.3296 - accuracy: 0.5378 - val_loss: 1.4780 - val_accuracy: 0.4982
[...]
Epoch 28/100
1407/1407 [==============================] - 12s 8ms/step - loss: 0.8622 - accuracy: 0.7090 - val_loss: 1.7567 - val_accuracy: 0.5184
157/157 [==============================] - 0s 1ms/step - loss: 1.4780 - accuracy: 0.4982
[1.4779616594314575, 0.498199999332428]
The model reaches 49.8% accuracy on the validation set, which is a bit worse than without dropout (50.3%). An extensive hyperparameter search might do slightly better (I tried dropout rates of 5%, 10%, 20%, and 40%, and learning rates of 1e-4, 3e-4, 5e-4, and 1e-3), but probably not by much in this case.
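If you'd like to reproduce such a search, here is one possible sketch (ours, not part of the original solution; the helper name val_accuracy_for is made up, and each run is capped at 10 epochs purely to bound the runtime):
def val_accuracy_for(dropout_rate, learning_rate, n_epochs=10):
    # Rebuild the same 20-layer SELU architecture as above, with the given
    # alpha dropout rate and Nadam learning rate, and return the best
    # validation accuracy reached during a short training run.
    tf.random.set_seed(42)
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Flatten(input_shape=[32, 32, 3]))
    for _ in range(20):
        model.add(tf.keras.layers.Dense(100, kernel_initializer="lecun_normal",
                                        activation="selu"))
    model.add(tf.keras.layers.AlphaDropout(rate=dropout_rate))
    model.add(tf.keras.layers.Dense(10, activation="softmax"))
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=tf.keras.optimizers.Nadam(learning_rate=learning_rate),
                  metrics=["accuracy"])
    history = model.fit(X_train_scaled, y_train, epochs=n_epochs, verbose=0,
                        validation_data=(X_valid_scaled, y_valid))
    return max(history.history["val_accuracy"])

results = {(rate, lr): val_accuracy_for(rate, lr)
           for rate in (0.05, 0.1, 0.2, 0.4)
           for lr in (1e-4, 3e-4, 5e-4, 1e-3)}
The best-scoring (rate, lr) pair can then be retrained properly with early stopping, as above.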
Let's use MC Dropout now. We will need the MCAlphaDropout
class we used earlier, so let's just copy it here for convenience:
class MCAlphaDropout(tf.keras.layers.AlphaDropout):
    def call(self, inputs):
        # force training=True so dropout remains active at inference time
        return super().call(inputs, training=True)
Now let's create a new model, identical to the one we just trained (sharing the same weights), but with MCAlphaDropout layers instead of AlphaDropout layers:
mc_model = tf.keras.Sequential([
    (
        MCAlphaDropout(layer.rate)
        if isinstance(layer, tf.keras.layers.AlphaDropout)
        else layer
    )
    for layer in model.layers
])
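Because the Dense layers are reused as-is, the new model shares the trained weights; only the dropout layers were swapped. As a quick sanity check (ours, not part of the original solution), two forward passes should now produce different outputs, since dropout stays active at inference time:
probas_1 = mc_model.predict(X_valid_scaled[:1])
probas_2 = mc_model.predict(X_valid_scaled[:1])
np.allclose(probas_1, probas_2)  # most likely False: dropout is active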
Then let's add a couple utility functions. The first will run the model many times (10 by default) and it will return the mean predicted class probabilities. The second will use these mean probabilities to predict the most likely class for each instance:
def mc_dropout_predict_probas(mc_model, X, n_samples=10):
    Y_probas = [mc_model.predict(X) for _ in range(n_samples)]
    return np.mean(Y_probas, axis=0)

def mc_dropout_predict_classes(mc_model, X, n_samples=10):
    Y_probas = mc_dropout_predict_probas(mc_model, X, n_samples)
    return Y_probas.argmax(axis=1)
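As an aside, the spread of the MC samples also gives a rough per-class uncertainty estimate, which is the main benefit of MC Dropout over a single deterministic pass. A small sketch (ours, using the same mc_model and X_valid_scaled as above):
# Estimate prediction uncertainty for the first validation instance: a large
# standard deviation across MC samples signals that the model is unsure.
Y_samples = np.stack([mc_model.predict(X_valid_scaled[:1])
                      for _ in range(10)])
print(Y_samples.mean(axis=0).round(2))  # mean class probabilities
print(Y_samples.std(axis=0).round(2))   # per-class uncertainty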
Now let's make predictions for all the instances in the validation set, and compute the accuracy:
tf.random.set_seed(42)
y_pred = mc_dropout_predict_classes(mc_model, X_valid_scaled)
accuracy = (y_pred == y_valid[:, 0]).mean()
accuracy
0.4984
MC Dropout doesn't really help in this case: we get 49.8% accuracy, essentially the same as the regular dropout model, and still slightly below the model without dropout (50.3%).
So the best model we got in this exercise is the Batch Normalization model.
Exercise: Retrain your model using 1cycle scheduling and see if it improves training speed and model accuracy.
tf.random.set_seed(42)
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[32, 32, 3]))
for _ in range(20):
    model.add(tf.keras.layers.Dense(100,
                                    kernel_initializer="lecun_normal",
                                    activation="selu"))
model.add(tf.keras.layers.AlphaDropout(rate=0.1))
model.add(tf.keras.layers.Dense(10, activation="softmax"))
optimizer = tf.keras.optimizers.SGD()
model.compile(loss="sparse_categorical_crossentropy",
optimizer=optimizer,
metrics=["accuracy"])
batch_size = 128
rates, losses = find_learning_rate(model, X_train_scaled, y_train, epochs=1,
batch_size=batch_size)
plot_lr_vs_loss(rates, losses)
352/352 [==============================] - 3s 8ms/step - loss: nan - accuracy: 0.1706
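The loss eventually shoots up and overflows to nan, so we should pick a maximum learning rate somewhat below the point where the loss starts climbing. One rough heuristic (our assumption, not a prescription from this notebook) is to take a fraction of the rate that achieved the lowest recorded loss:
# Rough heuristic sketch: half the learning rate at the minimum recorded loss.
# Compare the result with the max_lr=0.05 used below.
max_lr_candidate = rates[np.nanargmin(losses)] / 2
Based on the plot, we'll train with an initial learning rate of 2e-2 and a maximum 1cycle rate of 0.05.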
tf.random.set_seed(42)
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[32, 32, 3]))
for _ in range(20):
    model.add(tf.keras.layers.Dense(100,
                                    kernel_initializer="lecun_normal",
                                    activation="selu"))
model.add(tf.keras.layers.AlphaDropout(rate=0.1))
model.add(tf.keras.layers.Dense(10, activation="softmax"))
optimizer = tf.keras.optimizers.SGD(learning_rate=2e-2)
model.compile(loss="sparse_categorical_crossentropy",
optimizer=optimizer,
metrics=["accuracy"])
n_epochs = 15
n_iterations = math.ceil(len(X_train_scaled) / batch_size) * n_epochs
onecycle = OneCycleScheduler(n_iterations, max_lr=0.05)
history = model.fit(X_train_scaled, y_train, epochs=n_epochs, batch_size=batch_size,
validation_data=(X_valid_scaled, y_valid),
callbacks=[onecycle])
Epoch 1/15
352/352 [==============================] - 3s 9ms/step - loss: 2.0559 - accuracy: 0.2839 - val_loss: 1.7917 - val_accuracy: 0.3768
Epoch 2/15
352/352 [==============================] - 3s 8ms/step - loss: 1.7596 - accuracy: 0.3797 - val_loss: 1.6566 - val_accuracy: 0.4258
[...]
Epoch 15/15
352/352 [==============================] - 3s 9ms/step - loss: 0.8989 - accuracy: 0.6805 - val_loss: 1.5835 - val_accuracy: 0.5242
One cycle allowed us to train the model in just 15 epochs, each taking only about 3 seconds (thanks to the larger batch size). This is several times faster than the fastest model we trained so far. Moreover, the model's validation accuracy improved, from about 50% to 52.4%.
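For reference, 1cycle ramps the learning rate linearly up to max_lr during roughly the first half of training, then linearly back down during the second half (typically a third, final cool-down phase drops it much lower). Here is a minimal two-phase sketch of such a callback (ours, for illustration only; the actual OneCycleScheduler used above was defined earlier in this notebook):
class MinimalOneCycle(tf.keras.callbacks.Callback):
    # Illustrative sketch of a 1cycle schedule – not the OneCycleScheduler
    # actually used for training above.
    def __init__(self, n_iterations, max_lr, start_lr=None):
        self.n_iterations = n_iterations
        self.max_lr = max_lr
        self.start_lr = start_lr or max_lr / 10
        self.iteration = 0

    def on_batch_begin(self, batch, logs=None):
        half = self.n_iterations // 2
        if self.iteration < half:  # phase 1: linear ramp-up to max_lr
            lr = self.start_lr + (self.max_lr - self.start_lr) * self.iteration / half
        else:  # phase 2: linear ramp-down back to start_lr
            lr = self.max_lr - (self.max_lr - self.start_lr) * (self.iteration - half) / half
        tf.keras.backend.set_value(self.model.optimizer.learning_rate, lr)
        self.iteration += 1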