Chapter 11 – Deep Learning
This notebook contains all the sample code and solutions to the exercises in chapter 11.
Run in Google Colab |
Warning: this is the code for the 1st edition of the book. Please visit https://github.com/ageron/handson-ml2 for the 2nd edition code, with up-to-date notebooks using the latest library versions. In particular, the 1st edition is based on TensorFlow 1, while the 2nd edition uses TensorFlow 2, which is much simpler to use.
First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals
# Common imports
import numpy as np
import os
try:
# %tensorflow_version only exists in Colab.
%tensorflow_version 1.x
except Exception:
pass
# to make this notebook's output stable across runs
def reset_graph(seed=42):
    """Start from a fresh default TensorFlow graph with fixed random seeds.

    Resets the default graph, then applies ``seed`` to TensorFlow and to
    NumPy's global random generator, so that the notebook's output is stable
    across runs.
    """
    tf.reset_default_graph()
    # Seed TensorFlow first, then NumPy, both with the same value.
    for apply_seed in (tf.set_random_seed, np.random.seed):
        apply_seed(seed)
# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "deep"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current Matplotlib figure under IMAGES_PATH.

    fig_id: base file name (without extension); it is also printed.
    tight_layout: when true, ``plt.tight_layout()`` is called before saving.
    fig_extension: file extension, also passed to ``savefig`` as the format.
    resolution: dpi passed to ``savefig``.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)
def logit(z):
    """Return 1 / (1 + exp(-z)), computed element-wise with NumPy.

    Despite the name, this is the logistic sigmoid (the inverse of the logit
    function); the name is kept because the plotting code calls it.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator
# Plot the sigmoid over [-5, 5] and annotate its saturating and linear regions.
# NOTE: z is reused by the later activation-function plots.
z = np.linspace(-5, 5, 200)
# Reference lines: y = 0, y = 1 (dashed) and the vertical axis.
plt.plot([-5, 5], [0, 0], 'k-')
plt.plot([-5, 5], [1, 1], 'k--')
plt.plot([0, 0], [-0.2, 1.2], 'k-')
# Dashed green line through (0, 0.5) with slope 1/4 (relies on true division).
plt.plot([-5, 5], [-3/4, 7/4], 'g--')
plt.plot(z, logit(z), "b-", linewidth=2)
# Shared arrow style for the three annotations below.
props = dict(facecolor='black', shrink=0.1)
plt.annotate('Saturating', xytext=(3.5, 0.7), xy=(5, 1), arrowprops=props, fontsize=14, ha="center")
plt.annotate('Saturating', xytext=(-3.5, 0.3), xy=(-5, 0), arrowprops=props, fontsize=14, ha="center")
plt.annotate('Linear', xytext=(2, 0.2), xy=(0, 0.5), arrowprops=props, fontsize=14, ha="center")
plt.grid(True)
plt.title("Sigmoid activation function", fontsize=14)
plt.axis([-5, 5, -0.2, 1.2])
# Write the figure to disk before showing it.
save_fig("sigmoid_saturation_plot")
plt.show()
Saving figure sigmoid_saturation_plot
Note: the book uses tensorflow.contrib.layers.fully_connected() rather than tf.layers.dense() (which did not exist when this chapter was written). It is now preferable to use tf.layers.dense(), because anything in the contrib module may change or be deleted without notice. The dense() function is almost identical to the fully_connected() function. The main differences relevant to this chapter are:
- several parameters are renamed: scope becomes name, activation_fn becomes activation (and similarly the _fn suffix is removed from other parameters such as normalizer_fn), weights_initializer becomes kernel_initializer, etc.
- the default activation is now None rather than tf.nn.relu.
- it is not compatible with tensorflow.contrib.framework.arg_scope() (introduced later in chapter 11).
import tensorflow as tf
# Build a single ReLU hidden layer whose weights use He initialization.
reset_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")

# He initialization uses a weight variance of 2 / fan_in. The initializer's
# default scale is 1.0, so scale=2.0 has to be requested explicitly.
he_init = tf.variance_scaling_initializer(scale=2.0)
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu,
                          kernel_initializer=he_init, name="hidden1")
WARNING:tensorflow:From <ipython-input-6-da109dac52d3>:3: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version. Instructions for updating: Use keras.layers.Dense instead. WARNING:tensorflow:From /Users/ageron/miniconda3/envs/tf1/lib/python3.7/site-packages/tensorflow_core/python/layers/core.py:187: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version. Instructions for updating: Please use `layer.__call__` method instead.
def leaky_relu(z, alpha=0.01):
    """Return the element-wise maximum of ``alpha * z`` and ``z`` (NumPy).

    z: scalar or array of pre-activation values.
    alpha: factor applied to ``z`` for the leaky branch.
    """
    leaked = alpha * z
    return np.maximum(leaked, z)
# Plot the NumPy leaky ReLU with alpha=0.05, reusing z from the sigmoid plot.
plt.plot(z, leaky_relu(z, 0.05), "b-", linewidth=2)
# Horizontal and vertical axes.
plt.plot([-5, 5], [0, 0], 'k-')
plt.plot([0, 0], [-0.5, 4.2], 'k-')
plt.grid(True)
props = dict(facecolor='black', shrink=0.1)
# Arrow pointing at the small negative output at the left edge of the plot.
plt.annotate('Leak', xytext=(-3.5, 0.5), xy=(-5, -0.2), arrowprops=props, fontsize=14, ha="center")
plt.title("Leaky ReLU activation function", fontsize=14)
plt.axis([-5, 5, -0.5, 4.2])
save_fig("leaky_relu_plot")
plt.show()
Saving figure leaky_relu_plot
Implementing Leaky ReLU in TensorFlow:
reset_graph()
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
def leaky_relu(z, name=None):
    """Leaky ReLU built from TensorFlow ops, with a fixed leak factor of 0.01.

    z: tensor of pre-activation values.
    name: optional name given to the resulting ``tf.maximum`` op.

    NOTE: this redefinition replaces the earlier NumPy ``leaky_relu``.
    """
    leaked = 0.01 * z
    return tf.maximum(leaked, z, name=name)
hidden1 = tf.layers.dense(X, n_hidden1, activation=leaky_relu, name="hidden1")
Let's train a neural network on MNIST using the Leaky ReLU. First let's create the graph:
# Construction phase: two leaky-ReLU hidden layers and a 10-way output layer.
reset_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=leaky_relu, name="hidden1")
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=leaky_relu, name="hidden2")
    # No activation here: the loss below expects raw logits.
    logits = tf.layers.dense(hidden2, n_outputs, name="outputs")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

learning_rate = 0.01

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)
WARNING:tensorflow:From /Users/ageron/miniconda3/envs/tf1/lib/python3.7/site-packages/tensorflow_core/python/ops/math_grad.py:1424: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version. Instructions for updating: Use tf.where in 2.0, which has the same broadcast rule as np.where
# Accuracy: fraction of instances whose target class has the highest logit.
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()
Let's load the data:
Warning: tf.examples.tutorials.mnist
is deprecated. We will use tf.keras.datasets.mnist
instead.
# Load MNIST through Keras and prepare train / validation / test arrays.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
# Flatten each image to 28*28 features and divide by 255.0
# (presumably pixel values are 0-255, giving inputs in [0, 1] -- TODO confirm).
X_train = X_train.astype(np.float32).reshape(-1, 28*28) / 255.0
X_test = X_test.astype(np.float32).reshape(-1, 28*28) / 255.0
# Labels are cast to int32 to match the y placeholder's dtype.
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)
# Hold out the first 5,000 training instances as the validation set.
X_valid, X_train = X_train[:5000], X_train[5000:]
y_valid, y_train = y_train[:5000], y_train[5000:]
def shuffle_batch(X, y, batch_size):
    """Yield (X_batch, y_batch) mini-batches covering X and y once, shuffled.

    The instance indices are shuffled with NumPy's global random generator and
    split into ``len(X) // batch_size`` nearly equal groups by
    ``np.array_split``. A batch can therefore hold slightly more than
    ``batch_size`` instances when the length is not an exact multiple, and
    every instance appears in exactly one batch.

    X, y: arrays of the same length that accept an integer index array.
    batch_size: target number of instances per batch.

    When there are fewer instances than ``batch_size``, one batch holding all
    of them is yielded. When there are none, nothing is yielded.
    """
    n_instances = len(X)
    if n_instances == 0:
        return
    rnd_idx = np.random.permutation(n_instances)
    # np.array_split rejects a section count of 0, so always ask for at least 1.
    n_batches = max(1, n_instances // batch_size)
    for batch_idx in np.array_split(rnd_idx, n_batches):
        yield X[batch_idx], y[batch_idx]
# Execution phase: train for n_epochs and report accuracy every 5 epochs.
n_epochs = 40
batch_size = 50

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        if epoch % 5 == 0:
            # Batch accuracy is measured on the last mini-batch of the epoch.
            acc_batch = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
            acc_valid = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
            print(epoch, "Batch accuracy:", acc_batch, "Validation accuracy:", acc_valid)

    # Saved once, after the last epoch, while the session is still open.
    save_path = saver.save(sess, "./my_model_final.ckpt")
0 Batch accuracy: 0.86 Validation accuracy: 0.9044 5 Batch accuracy: 0.94 Validation accuracy: 0.9496 10 Batch accuracy: 0.92 Validation accuracy: 0.9654 15 Batch accuracy: 0.94 Validation accuracy: 0.971 20 Batch accuracy: 1.0 Validation accuracy: 0.9764 25 Batch accuracy: 1.0 Validation accuracy: 0.9778 30 Batch accuracy: 0.98 Validation accuracy: 0.978 35 Batch accuracy: 1.0 Validation accuracy: 0.9788
def elu(z, alpha=1):
    """Return the ELU of ``z`` element-wise, computed with NumPy.

    Where ``z < 0`` the result is ``alpha * (exp(z) - 1)``; elsewhere it is
    ``z`` itself.
    """
    negative_branch = alpha * (np.exp(z) - 1)
    return np.where(z < 0, negative_branch, z)
# Plot the NumPy ELU with its default alpha, reusing z from the sigmoid plot.
plt.plot(z, elu(z), "b-", linewidth=2)
plt.plot([-5, 5], [0, 0], 'k-')
# Dashed line at y = -1, the value alpha * (exp(z) - 1) approaches for very negative z when alpha = 1.
plt.plot([-5, 5], [-1, -1], 'k--')
plt.plot([0, 0], [-2.2, 3.2], 'k-')
plt.grid(True)
plt.title(r"ELU activation function ($\alpha=1$)", fontsize=14)
plt.axis([-5, 5, -2.2, 3.2])
save_fig("elu_plot")
plt.show()
Saving figure elu_plot
Implementing ELU in TensorFlow is trivial, just specify the activation function when building each layer:
reset_graph()
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.elu, name="hidden1")
The SELU (Scaled ELU) activation function was proposed in this great paper ("Self-Normalizing Neural Networks") by Günter Klambauer, Thomas Unterthiner and Andreas Mayr, published in June 2017. During training, a neural network composed exclusively of a stack of dense layers using the SELU activation function and LeCun initialization will self-normalize: the output of each layer will tend to preserve the same mean and variance during training, which solves the vanishing/exploding gradients problem. As a result, this activation function outperforms the other activation functions very significantly for such neural nets, so you should really try it out. Unfortunately, the self-normalizing property of the SELU activation function is easily broken: you cannot use ℓ1 or ℓ2 regularization, regular dropout, max-norm, skip connections or other non-sequential topologies (so recurrent neural networks won't self-normalize). However, in practice it works quite well with sequential CNNs. If you break self-normalization, SELU will not necessarily outperform other activation functions.
from scipy.special import erfc
# alpha and scale to self normalize with mean 0 and standard deviation 1
# (see equation 14 in the paper):
# These closed-form expressions should evaluate to roughly alpha = 1.6733 and
# scale = 1.0507. They rely on true division (1/2 == 0.5, -1/2 == -0.5).
alpha_0_1 = -np.sqrt(2 / np.pi) / (erfc(1/np.sqrt(2)) * np.exp(1/2) - 1)
scale_0_1 = (1 - erfc(1 / np.sqrt(2)) * np.sqrt(np.e)) * np.sqrt(2 * np.pi) * (2 * erfc(np.sqrt(2))*np.e**2 + np.pi*erfc(1/np.sqrt(2))**2*np.e - 2*(2+np.pi)*erfc(1/np.sqrt(2))*np.sqrt(np.e)+np.pi+2)**(-1/2)
def selu(z, scale=scale_0_1, alpha=alpha_0_1):
    """Return the SELU of ``z`` (NumPy): ``scale`` times ``elu(z, alpha)``.

    The defaults are the constants computed for self-normalization with
    mean 0 and standard deviation 1.
    """
    activated = elu(z, alpha)
    return scale * activated
# Plot the NumPy SELU with its default constants, reusing z from the sigmoid plot.
plt.plot(z, selu(z), "b-", linewidth=2)
plt.plot([-5, 5], [0, 0], 'k-')
# Dashed line at -1.758, approximately -scale * alpha, the lower asymptote.
plt.plot([-5, 5], [-1.758, -1.758], 'k--')
plt.plot([0, 0], [-2.2, 3.2], 'k-')
plt.grid(True)
plt.title(r"SELU activation function", fontsize=14)
plt.axis([-5, 5, -2.2, 3.2])
save_fig("selu_plot")
plt.show()
Saving figure selu_plot
By default, the SELU hyperparameters (scale
and alpha
) are tuned in such a way that the mean output of each neuron remains close to 0, and the standard deviation remains close to 1 (assuming the inputs are standardized with mean 0 and standard deviation 1 too). Using this activation function, even a 1,000 layer deep neural network preserves roughly mean 0 and standard deviation 1 across all layers, avoiding the exploding/vanishing gradients problem:
# Push standardized random inputs through 1,000 random dense SELU layers and
# report the mean and standard deviation of the activations every 100 layers.
np.random.seed(42)
Z = np.random.normal(size=(500, 100))  # standardized inputs
for layer in range(1000):
    W = np.random.normal(size=(100, 100), scale=np.sqrt(1 / 100))  # LeCun initialization
    Z = selu(np.dot(Z, W))
    # Per-unit statistics over the 500 rows, averaged over the 100 units.
    means = np.mean(Z, axis=0).mean()
    stds = np.std(Z, axis=0).mean()
    if layer % 100 == 0:
        print("Layer {}: mean {:.2f}, std deviation {:.2f}".format(layer, means, stds))
Layer 0: mean -0.00, std deviation 1.00 Layer 100: mean 0.02, std deviation 0.96 Layer 200: mean 0.01, std deviation 0.90 Layer 300: mean -0.02, std deviation 0.92 Layer 400: mean 0.05, std deviation 0.89 Layer 500: mean 0.01, std deviation 0.93 Layer 600: mean 0.02, std deviation 0.92 Layer 700: mean -0.02, std deviation 0.90 Layer 800: mean 0.05, std deviation 0.83 Layer 900: mean 0.02, std deviation 1.00
The tf.nn.selu()
function was added in TensorFlow 1.4. For earlier versions, you can use the following implementation:
def selu(z, scale=scale_0_1, alpha=alpha_0_1):
    """SELU built from TensorFlow ops, for versions without ``tf.nn.selu``.

    Computes ``scale * z`` where ``z >= 0`` and ``scale * alpha * elu(z)``
    elsewhere.

    z: tensor of pre-activation values.
    scale: overall multiplier (defaults to ``scale_0_1``).
    alpha: multiplier of the negative ELU branch (defaults to ``alpha_0_1``).

    The defaults must not be swapped: with ``scale=alpha_0_1`` positive
    inputs would be multiplied by alpha instead of scale, which is not SELU.
    """
    return scale * tf.where(z >= 0.0, z, alpha * tf.nn.elu(z))
However, the SELU activation function cannot be used along with regular Dropout (this would cancel the SELU activation function's self-normalizing property). Fortunately, there is a Dropout variant called Alpha Dropout proposed in the same paper. It is available in tf.contrib.nn.alpha_dropout()
since TF 1.4 (or check out this implementation by the Institute of Bioinformatics, Johannes Kepler University Linz).
Let's create a neural net for MNIST using the SELU activation function:
# Construction phase for the SELU network: same layout as the leaky-ReLU
# model, with selu as the hidden-layer activation.
reset_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=selu, name="hidden1")
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=selu, name="hidden2")
    logits = tf.layers.dense(hidden2, n_outputs, name="outputs")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

learning_rate = 0.01

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_epochs = 40
batch_size = 50
Now let's train it. Do not forget to scale the inputs to mean 0 and standard deviation 1:
# Standardize every input feature with statistics from the training set only.
means = X_train.mean(axis=0, keepdims=True)
# The small constant avoids dividing by zero for features that never vary.
stds = X_train.std(axis=0, keepdims=True) + 1e-10
X_val_scaled = (X_valid - means) / stds

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            X_batch_scaled = (X_batch - means) / stds
            sess.run(training_op, feed_dict={X: X_batch_scaled, y: y_batch})
        if epoch % 5 == 0:
            # Batch accuracy is measured on the last mini-batch of the epoch.
            acc_batch = accuracy.eval(feed_dict={X: X_batch_scaled, y: y_batch})
            acc_valid = accuracy.eval(feed_dict={X: X_val_scaled, y: y_valid})
            print(epoch, "Batch accuracy:", acc_batch, "Validation accuracy:", acc_valid)

    save_path = saver.save(sess, "./my_model_final_selu.ckpt")
0 Batch accuracy: 0.88 Validation accuracy: 0.923 5 Batch accuracy: 0.98 Validation accuracy: 0.9578 10 Batch accuracy: 1.0 Validation accuracy: 0.9664 15 Batch accuracy: 0.96 Validation accuracy: 0.9682 20 Batch accuracy: 1.0 Validation accuracy: 0.9694 25 Batch accuracy: 1.0 Validation accuracy: 0.9688 30 Batch accuracy: 1.0 Validation accuracy: 0.9694 35 Batch accuracy: 1.0 Validation accuracy: 0.97
Note: the book uses tensorflow.contrib.layers.batch_norm() rather than tf.layers.batch_normalization() (which did not exist when this chapter was written). It is now preferable to use tf.layers.batch_normalization(), because anything in the contrib module may change or be deleted without notice. Instead of passing the batch_norm() function as the normalizer_fn parameter of the fully_connected() function, we now use batch_normalization() and we explicitly create a distinct layer. The parameters are a bit different, in particular:
- decay is renamed to momentum,
- is_training is renamed to training,
- updates_collections is removed: the update operations needed by batch normalization are added to the UPDATE_OPS collection and you need to explicitly run these operations during training (see the execution phase below),
- there is no need to specify scale=True, as that is the default.
Also note that in order to run batch norm just before each hidden layer's activation function, we apply the ELU activation function manually, right after the batch norm layer.
Note: since the tf.layers.dense()
function is incompatible with tf.contrib.layers.arg_scope()
(which is used in the book), we now use python's functools.partial()
function instead. It makes it easy to create a my_dense_layer()
function that just calls tf.layers.dense()
with the desired parameters automatically set (unless they are overridden when calling my_dense_layer()
). As you can see, the code remains very similar.
# Build a two-hidden-layer network with an explicit batch-normalization layer
# after every dense layer, including the output layer.
reset_graph()
import tensorflow as tf
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
# Defaults to False, so only training steps need to feed True.
training = tf.placeholder_with_default(False, shape=(), name='training')
# The dense layers have no activation: ELU is applied manually after batch norm.
hidden1 = tf.layers.dense(X, n_hidden1, name="hidden1")
bn1 = tf.layers.batch_normalization(hidden1, training=training, momentum=0.9)
bn1_act = tf.nn.elu(bn1)
hidden2 = tf.layers.dense(bn1_act, n_hidden2, name="hidden2")
bn2 = tf.layers.batch_normalization(hidden2, training=training, momentum=0.9)
bn2_act = tf.nn.elu(bn2)
# The output logits are batch-normalized too, with no activation afterwards.
logits_before_bn = tf.layers.dense(bn2_act, n_outputs, name="outputs")
logits = tf.layers.batch_normalization(logits_before_bn, training=training,
momentum=0.9)
WARNING:tensorflow:From <ipython-input-32-2f36d39a8af9>:15: batch_normalization (from tensorflow.python.layers.normalization) is deprecated and will be removed in a future version. Instructions for updating: Use keras.layers.BatchNormalization instead. In particular, `tf.control_dependencies(tf.GraphKeys.UPDATE_OPS)` should not be used (consult the `tf.keras.layers.batch_normalization` documentation).
reset_graph()
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
training = tf.placeholder_with_default(False, shape=(), name='training')
To avoid repeating the same parameters over and over again, we can use Python's partial()
function:
from functools import partial
# Same network as the explicit version, with the repeated batch-norm
# arguments (training, momentum) bound once through functools.partial.
my_batch_norm_layer = partial(tf.layers.batch_normalization,
training=training, momentum=0.9)
hidden1 = tf.layers.dense(X, n_hidden1, name="hidden1")
bn1 = my_batch_norm_layer(hidden1)
bn1_act = tf.nn.elu(bn1)
hidden2 = tf.layers.dense(bn1_act, n_hidden2, name="hidden2")
bn2 = my_batch_norm_layer(hidden2)
bn2_act = tf.nn.elu(bn2)
logits_before_bn = tf.layers.dense(bn2_act, n_outputs, name="outputs")
logits = my_batch_norm_layer(logits_before_bn)
Let's build a neural net for MNIST, using the ELU activation function and Batch Normalization at each layer:
# Construction phase: MNIST network with He-initialized dense layers, batch
# normalization after every dense layer, and ELU on the hidden layers.
reset_graph()

batch_norm_momentum = 0.9

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")
# Defaults to False, so only training steps need to feed True.
training = tf.placeholder_with_default(False, shape=(), name='training')

with tf.name_scope("dnn"):
    # He initialization uses a weight variance of 2 / fan_in. The initializer's
    # default scale is 1.0, so scale=2.0 has to be requested explicitly.
    he_init = tf.variance_scaling_initializer(scale=2.0)

    my_batch_norm_layer = partial(
        tf.layers.batch_normalization,
        training=training,
        momentum=batch_norm_momentum)

    my_dense_layer = partial(
        tf.layers.dense,
        kernel_initializer=he_init)

    hidden1 = my_dense_layer(X, n_hidden1, name="hidden1")
    bn1 = tf.nn.elu(my_batch_norm_layer(hidden1))
    hidden2 = my_dense_layer(bn1, n_hidden2, name="hidden2")
    bn2 = tf.nn.elu(my_batch_norm_layer(hidden2))
    logits_before_bn = my_dense_layer(bn2, n_outputs, name="outputs")
    logits = my_batch_norm_layer(logits_before_bn)

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("train"):
    # learning_rate is not set in this cell; it is reused from an earlier cell.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()
Note: since we are using tf.layers.batch_normalization() rather than tf.contrib.layers.batch_norm() (as in the book), we need to explicitly run the extra update operations needed by batch normalization (sess.run([training_op, extra_update_ops], ...)).
# Execution phase for the batch-norm network.
n_epochs = 20
batch_size = 200

# Batch normalization's update operations are not run automatically: they are
# collected here and evaluated together with every training step.
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run([training_op, extra_update_ops],
                     feed_dict={training: True, X: X_batch, y: y_batch})
        # `training` is not fed here, so it falls back to its default (False).
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./my_model_final.ckpt")
0 Validation accuracy: 0.8952 1 Validation accuracy: 0.9202 2 Validation accuracy: 0.9318 3 Validation accuracy: 0.9422 4 Validation accuracy: 0.9468 5 Validation accuracy: 0.954 6 Validation accuracy: 0.9568 7 Validation accuracy: 0.96 8 Validation accuracy: 0.962 9 Validation accuracy: 0.9638 10 Validation accuracy: 0.9662 11 Validation accuracy: 0.9682 12 Validation accuracy: 0.9672 13 Validation accuracy: 0.9696 14 Validation accuracy: 0.9706 15 Validation accuracy: 0.9704 16 Validation accuracy: 0.9718 17 Validation accuracy: 0.9726 18 Validation accuracy: 0.9738 19 Validation accuracy: 0.9742
What!? That's not a great accuracy for MNIST. Of course, if you train for longer it will get much better accuracy, but with such a shallow network, Batch Norm and ELU are unlikely to have very positive impact: they shine mostly for much deeper nets.
Note that you could also make the training operation depend on the update operations:
with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    # Creating training_op under these control dependencies makes the update
    # operations run whenever training_op is evaluated.
    with tf.control_dependencies(extra_update_ops):
        training_op = optimizer.minimize(loss)
This way, you would just have to evaluate the training_op
during training, TensorFlow would automatically run the update operations as well:
sess.run(training_op, feed_dict={training: True, X: X_batch, y: y_batch})
One more thing: notice that the list of trainable variables is shorter than the list of all global variables. This is because the moving averages are non-trainable variables. If you want to reuse a pretrained neural network (see below), you must not forget these non-trainable variables.
[v.name for v in tf.trainable_variables()]
['hidden1/kernel:0', 'hidden1/bias:0', 'batch_normalization/gamma:0', 'batch_normalization/beta:0', 'hidden2/kernel:0', 'hidden2/bias:0', 'batch_normalization_1/gamma:0', 'batch_normalization_1/beta:0', 'outputs/kernel:0', 'outputs/bias:0', 'batch_normalization_2/gamma:0', 'batch_normalization_2/beta:0']
[v.name for v in tf.global_variables()]
['hidden1/kernel:0', 'hidden1/bias:0', 'batch_normalization/gamma:0', 'batch_normalization/beta:0', 'batch_normalization/moving_mean:0', 'batch_normalization/moving_variance:0', 'hidden2/kernel:0', 'hidden2/bias:0', 'batch_normalization_1/gamma:0', 'batch_normalization_1/beta:0', 'batch_normalization_1/moving_mean:0', 'batch_normalization_1/moving_variance:0', 'outputs/kernel:0', 'outputs/bias:0', 'batch_normalization_2/gamma:0', 'batch_normalization_2/beta:0', 'batch_normalization_2/moving_mean:0', 'batch_normalization_2/moving_variance:0']
Let's create a simple neural net for MNIST and add gradient clipping. The first part is the same as earlier (except we added a few more layers to demonstrate reusing pretrained models, see below):
# Construction phase: five ReLU hidden layers followed by a 10-way output.
reset_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300
n_hidden2 = 50
n_hidden3 = 50
n_hidden4 = 50
n_hidden5 = 50
n_outputs = 10

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2")
    hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, name="hidden3")
    hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name="hidden4")
    hidden5 = tf.layers.dense(hidden4, n_hidden5, activation=tf.nn.relu, name="hidden5")
    logits = tf.layers.dense(hidden5, n_outputs, name="outputs")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

learning_rate = 0.01
Now we apply gradient clipping. For this, we need to get the gradients, use the clip_by_value()
function to clip them, then apply them:
# Gradient clipping: compute the gradients, clip them, then apply them,
# instead of calling optimizer.minimize() directly.
threshold = 1.0
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
grads_and_vars = optimizer.compute_gradients(loss)
# Each gradient is clipped to [-threshold, threshold] and stays paired with its variable.
# NOTE(review): presumably every variable receives a gradient here; a None
# gradient would not be accepted by clip_by_value -- confirm if the graph changes.
capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
for grad, var in grads_and_vars]
training_op = optimizer.apply_gradients(capped_gvs)
The rest is the same as usual:
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    # Named so that the tensor can later be fetched as "eval/accuracy:0".
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_epochs = 20
batch_size = 200

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./my_model_final.ckpt")
0 Validation accuracy: 0.288 1 Validation accuracy: 0.7936 2 Validation accuracy: 0.8798 3 Validation accuracy: 0.906 4 Validation accuracy: 0.9164 5 Validation accuracy: 0.9218 6 Validation accuracy: 0.9296 7 Validation accuracy: 0.9358 8 Validation accuracy: 0.9382 9 Validation accuracy: 0.9414 10 Validation accuracy: 0.9456 11 Validation accuracy: 0.9474 12 Validation accuracy: 0.9478 13 Validation accuracy: 0.9534 14 Validation accuracy: 0.9568 15 Validation accuracy: 0.9566 16 Validation accuracy: 0.9574 17 Validation accuracy: 0.959 18 Validation accuracy: 0.9622 19 Validation accuracy: 0.9612
First you need to load the graph's structure. The import_meta_graph()
function does just that, loading the graph's operations into the default graph, and returning a Saver
that you can then use to restore the model's state. Note that by default, a Saver
saves the structure of the graph into a .meta
file, so that's the file you should load:
reset_graph()
saver = tf.train.import_meta_graph("./my_model_final.ckpt.meta")
Next you need to get a handle on all the operations you will need for training. If you don't know the graph's structure, you can list all the operations:
# Print the name of every operation in the default (just imported) graph.
for op in tf.get_default_graph().get_operations():
    print(op.name)
X y hidden1/kernel/Initializer/random_uniform/shape hidden1/kernel/Initializer/random_uniform/min hidden1/kernel/Initializer/random_uniform/max hidden1/kernel/Initializer/random_uniform/RandomUniform hidden1/kernel/Initializer/random_uniform/sub hidden1/kernel/Initializer/random_uniform/mul hidden1/kernel/Initializer/random_uniform hidden1/kernel hidden1/kernel/Assign hidden1/kernel/read hidden1/bias/Initializer/zeros hidden1/bias hidden1/bias/Assign hidden1/bias/read dnn/hidden1/MatMul dnn/hidden1/BiasAdd dnn/hidden1/Relu hidden2/kernel/Initializer/random_uniform/shape hidden2/kernel/Initializer/random_uniform/min hidden2/kernel/Initializer/random_uniform/max hidden2/kernel/Initializer/random_uniform/RandomUniform hidden2/kernel/Initializer/random_uniform/sub hidden2/kernel/Initializer/random_uniform/mul hidden2/kernel/Initializer/random_uniform hidden2/kernel hidden2/kernel/Assign hidden2/kernel/read hidden2/bias/Initializer/zeros hidden2/bias hidden2/bias/Assign hidden2/bias/read dnn/hidden2/MatMul dnn/hidden2/BiasAdd <<210 more lines>> GradientDescent/update_hidden4/bias/ApplyGradientDescent GradientDescent/update_hidden5/kernel/ApplyGradientDescent GradientDescent/update_hidden5/bias/ApplyGradientDescent GradientDescent/update_outputs/kernel/ApplyGradientDescent GradientDescent/update_outputs/bias/ApplyGradientDescent GradientDescent eval/in_top_k/InTopKV2/k eval/in_top_k/InTopKV2 eval/Cast eval/Const eval/accuracy init save/filename/input save/filename save/Const save/SaveV2/tensor_names save/SaveV2/shape_and_slices save/SaveV2 save/control_dependency save/RestoreV2/tensor_names save/RestoreV2/shape_and_slices save/RestoreV2 save/Assign save/Assign_1 save/Assign_2 save/Assign_3 save/Assign_4 save/Assign_5 save/Assign_6 save/Assign_7 save/Assign_8 save/Assign_9 save/Assign_10 save/Assign_11 save/restore_all
Oops, that's a lot of operations! It's much easier to use TensorBoard to visualize the graph:
from datetime import datetime
root_logdir = os.path.join(os.curdir, "tf_logs")
def make_log_subdir(run_id=None):
    """Return the log directory path for one run under ``root_logdir``.

    run_id: identifier used in the directory name. When None, the current
        UTC time formatted as YYYYmmddHHMMSS is used instead.
    """
    suffix = run_id if run_id is not None else datetime.utcnow().strftime("%Y%m%d%H%M%S")
    return "{}/run-{}/".format(root_logdir, suffix)
def save_graph(graph=None, run_id=None):
    """Write a graph definition to a fresh log directory and return that directory.

    graph defaults to the current default graph; run_id is passed on to make_log_subdir().
    """
    target_graph = tf.get_default_graph() if graph is None else graph
    logdir = make_log_subdir(run_id)
    # The writer is only needed to dump the graph, so it is closed right away.
    writer = tf.summary.FileWriter(logdir, graph=target_graph)
    writer.close()
    return logdir


save_graph()
'./tf_logs/run-20210325200138/'
%load_ext tensorboard
%tensorboard --logdir {root_logdir}
Reusing TensorBoard on port 6007 (pid 46883), started 0:09:56 ago. (Use '!kill 46883' to kill it.)
Once you know which operations you need, you can get a handle on them using the graph's `get_operation_by_name()` or `get_tensor_by_name()` methods:
# Tensors are looked up with their ":0" output suffix, the training operation by its bare name.
X, y, accuracy = (
    tf.get_default_graph().get_tensor_by_name(tensor_name)
    for tensor_name in ("X:0", "y:0", "eval/accuracy:0")
)
training_op = tf.get_default_graph().get_operation_by_name("GradientDescent")
If you are the author of the original model, you could make things easier for people who will reuse your model by giving operations very clear names and documenting them. Another approach is to create a collection containing all the important operations that people will want to get a handle on:
# Register all four handles under a single collection key, in a fixed order.
for handle in (X, y, accuracy, training_op):
    tf.add_to_collection("my_important_ops", handle)
This way people who reuse your model will be able to simply write:
# Fetch all handles at once; the unpacking expects exactly four entries under this key.
X, y, accuracy, training_op = tf.get_collection("my_important_ops")
Now you can start a session, restore the model's state and continue training on your data:
with tf.Session() as sess:
    # Load the checkpointed variable values into this session.
    saver.restore(sess, save_path="./my_model_final.ckpt")
    # continue training the model...
INFO:tensorflow:Restoring parameters from ./my_model_final.ckpt
Actually, let's test this for real!
with tf.Session() as sess:
    # Start from the checkpointed weights rather than from a fresh initialization.
    saver.restore(sess, "./my_model_final.ckpt")

    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            training_op.run(feed_dict={X: X_batch, y: y_batch})
        # Report validation accuracy once per epoch.
        accuracy_val = sess.run(accuracy, feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./my_new_model_final.ckpt")
INFO:tensorflow:Restoring parameters from ./my_model_final.ckpt 0 Validation accuracy: 0.9636 1 Validation accuracy: 0.9632 2 Validation accuracy: 0.9658 3 Validation accuracy: 0.9652 4 Validation accuracy: 0.9646 5 Validation accuracy: 0.965 6 Validation accuracy: 0.969 7 Validation accuracy: 0.9682 8 Validation accuracy: 0.9682 9 Validation accuracy: 0.9684 10 Validation accuracy: 0.9704 11 Validation accuracy: 0.971 12 Validation accuracy: 0.9668 13 Validation accuracy: 0.97 14 Validation accuracy: 0.9712 15 Validation accuracy: 0.9726 16 Validation accuracy: 0.9718 17 Validation accuracy: 0.971 18 Validation accuracy: 0.9712 19 Validation accuracy: 0.9712
Alternatively, if you have access to the Python code that built the original graph, you can use it instead of import_meta_graph()
:
reset_graph()

# Layer sizes: 784 inputs, five hidden layers, 10 output classes.
n_inputs = 28 * 28  # MNIST
n_hidden1 = 300
n_hidden2 = 50
n_hidden3 = 50
n_hidden4 = 50
n_hidden5 = 50
n_outputs = 10

X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")

# Five ReLU layers followed by a linear output layer producing the logits.
with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(inputs=X, units=n_hidden1,
                              activation=tf.nn.relu, name="hidden1")
    hidden2 = tf.layers.dense(inputs=hidden1, units=n_hidden2,
                              activation=tf.nn.relu, name="hidden2")
    hidden3 = tf.layers.dense(inputs=hidden2, units=n_hidden3,
                              activation=tf.nn.relu, name="hidden3")
    hidden4 = tf.layers.dense(inputs=hidden3, units=n_hidden4,
                              activation=tf.nn.relu, name="hidden4")
    hidden5 = tf.layers.dense(inputs=hidden4, units=n_hidden5,
                              activation=tf.nn.relu, name="hidden5")
    logits = tf.layers.dense(inputs=hidden5, units=n_outputs, name="outputs")

# Mean cross entropy over the batch.
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y, logits=logits)
    loss = tf.reduce_mean(input_tensor=xentropy, name="loss")

# Fraction of instances whose label is the top-1 prediction.
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(predictions=logits, targets=y, k=1)
    accuracy = tf.reduce_mean(input_tensor=tf.cast(correct, dtype=tf.float32),
                              name="accuracy")
learning_rate = 0.01
threshold = 1.0  # every gradient component is clipped to [-threshold, threshold]

# Gradient clipping needs the two-step form of minimize(): compute, clip, then apply.
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
grads_and_vars = optimizer.compute_gradients(loss)
# compute_gradients() returns (None, var) for variables the loss does not depend on;
# tf.clip_by_value() cannot handle None, so such pairs are skipped.
capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
              for grad, var in grads_and_vars
              if grad is not None]
training_op = optimizer.apply_gradients(capped_gvs)

saver = tf.train.Saver()
And continue training:
with tf.Session() as sess:
    # Load the checkpointed values into the rebuilt graph's variables.
    saver.restore(sess, "./my_model_final.ckpt")

    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            training_op.run(feed_dict={X: X_batch, y: y_batch})
        accuracy_val = sess.run(accuracy, feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./my_new_model_final.ckpt")
INFO:tensorflow:Restoring parameters from ./my_model_final.ckpt 0 Validation accuracy: 0.9642 1 Validation accuracy: 0.963 2 Validation accuracy: 0.9656 3 Validation accuracy: 0.9652 4 Validation accuracy: 0.9642 5 Validation accuracy: 0.965 6 Validation accuracy: 0.9686 7 Validation accuracy: 0.9686 8 Validation accuracy: 0.9684 9 Validation accuracy: 0.9684 10 Validation accuracy: 0.9702 11 Validation accuracy: 0.9716 12 Validation accuracy: 0.9676 13 Validation accuracy: 0.97 14 Validation accuracy: 0.9706 15 Validation accuracy: 0.9724 16 Validation accuracy: 0.972 17 Validation accuracy: 0.9712 18 Validation accuracy: 0.9712 19 Validation accuracy: 0.9708
In general you will want to reuse only the lower layers. If you are using import_meta_graph()
it will load the whole graph, but you can simply ignore the parts you do not need. In this example, we add a new 4th hidden layer on top of the pretrained 3rd layer (ignoring the old 4th hidden layer). We also build a new output layer, the loss for this new output, and a new optimizer to minimize it. We also need another saver to save the whole graph (containing both the entire old graph plus the new operations), and an initialization operation to initialize all the new variables:
reset_graph()

n_hidden4 = 20  # new layer
n_outputs = 10  # new layer

# Load the saved graph structure; `saver` is the saver returned for that imported graph.
saver = tf.train.import_meta_graph("./my_model_final.ckpt.meta")

X = tf.get_default_graph().get_tensor_by_name("X:0")
y = tf.get_default_graph().get_tensor_by_name("y:0")

# Output of the pretrained 3rd hidden layer; the new layers are stacked on top of it.
hidden3 = tf.get_default_graph().get_tensor_by_name("dnn/hidden3/Relu:0")

new_hidden4 = tf.layers.dense(inputs=hidden3, units=n_hidden4,
                              activation=tf.nn.relu, name="new_hidden4")
new_logits = tf.layers.dense(inputs=new_hidden4, units=n_outputs, name="new_outputs")

with tf.name_scope("new_loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y, logits=new_logits)
    loss = tf.reduce_mean(input_tensor=xentropy, name="loss")

with tf.name_scope("new_eval"):
    correct = tf.nn.in_top_k(predictions=new_logits, targets=y, k=1)
    accuracy = tf.reduce_mean(input_tensor=tf.cast(correct, dtype=tf.float32),
                              name="accuracy")

with tf.name_scope("new_train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
# Second saver, created after the new layers exist, used to save the new model.
new_saver = tf.train.Saver()
And we can train this new model:
with tf.Session() as sess:
    # Initialize everything first, then let the restore overwrite the pretrained variables.
    sess.run(init)
    saver.restore(sess, "./my_model_final.ckpt")

    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            training_op.run(feed_dict={X: X_batch, y: y_batch})
        accuracy_val = sess.run(accuracy, feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = new_saver.save(sess, "./my_new_model_final.ckpt")
INFO:tensorflow:Restoring parameters from ./my_model_final.ckpt 0 Validation accuracy: 0.9126 1 Validation accuracy: 0.9374 2 Validation accuracy: 0.946 3 Validation accuracy: 0.9498 4 Validation accuracy: 0.953 5 Validation accuracy: 0.9528 6 Validation accuracy: 0.9564 7 Validation accuracy: 0.96 8 Validation accuracy: 0.9616 9 Validation accuracy: 0.9612 10 Validation accuracy: 0.9634 11 Validation accuracy: 0.9626 12 Validation accuracy: 0.9648 13 Validation accuracy: 0.9656 14 Validation accuracy: 0.9664 15 Validation accuracy: 0.967 16 Validation accuracy: 0.968 17 Validation accuracy: 0.9678 18 Validation accuracy: 0.9684 19 Validation accuracy: 0.9678
If you have access to the Python code that built the original graph, you can just reuse the parts you need and drop the rest:
reset_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300  # reused
n_hidden2 = 50  # reused
n_hidden3 = 50  # reused
n_hidden4 = 20  # new!
n_outputs = 10  # new!

X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")

# Layers 1-3 are the ones to be reused; hidden4 and outputs have new sizes.
with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(inputs=X, units=n_hidden1,
                              activation=tf.nn.relu, name="hidden1")  # reused
    hidden2 = tf.layers.dense(inputs=hidden1, units=n_hidden2,
                              activation=tf.nn.relu, name="hidden2")  # reused
    hidden3 = tf.layers.dense(inputs=hidden2, units=n_hidden3,
                              activation=tf.nn.relu, name="hidden3")  # reused
    hidden4 = tf.layers.dense(inputs=hidden3, units=n_hidden4,
                              activation=tf.nn.relu, name="hidden4")  # new!
    logits = tf.layers.dense(inputs=hidden4, units=n_outputs, name="outputs")  # new!

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y, logits=logits)
    loss = tf.reduce_mean(input_tensor=xentropy, name="loss")

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(predictions=logits, targets=y, k=1)
    accuracy = tf.reduce_mean(input_tensor=tf.cast(correct, dtype=tf.float32),
                              name="accuracy")

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)
However, you must create one Saver
to restore the pretrained model (giving it the list of variables to restore, or else it will complain that the graphs don't match), and another Saver
to save the new model, once it is trained:
# Select only the variables whose names match the pattern (hidden layers 1 to 3).
reuse_vars = tf.get_collection(key=tf.GraphKeys.GLOBAL_VARIABLES,
                               scope="hidden[123]")  # regular expression
restore_saver = tf.train.Saver(var_list=reuse_vars)  # to restore layers 1-3

init = tf.global_variables_initializer()
saver = tf.train.Saver()  # no variable list given

with tf.Session() as sess:
    # Initialize all variables, then overwrite layers 1-3 with the checkpointed values.
    sess.run(init)
    restore_saver.restore(sess, "./my_model_final.ckpt")

    # The training loop itself is not shown in the book.
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            training_op.run(feed_dict={X: X_batch, y: y_batch})
        accuracy_val = sess.run(accuracy, feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./my_new_model_final.ckpt")
INFO:tensorflow:Restoring parameters from ./my_model_final.ckpt 0 Validation accuracy: 0.9024 1 Validation accuracy: 0.9332 2 Validation accuracy: 0.943 3 Validation accuracy: 0.947 4 Validation accuracy: 0.9516 5 Validation accuracy: 0.9532 6 Validation accuracy: 0.9558 7 Validation accuracy: 0.9592 8 Validation accuracy: 0.9586 9 Validation accuracy: 0.9608 10 Validation accuracy: 0.9626 11 Validation accuracy: 0.962 12 Validation accuracy: 0.964 13 Validation accuracy: 0.9662 14 Validation accuracy: 0.966 15 Validation accuracy: 0.9662 16 Validation accuracy: 0.9672 17 Validation accuracy: 0.9674 18 Validation accuracy: 0.9682 19 Validation accuracy: 0.9678
In this example, for each variable we want to reuse, we find its initializer's assignment operation, and we get its second input, which corresponds to the initialization value. When we run the initializer, we replace the initialization values with the ones we want, using a `feed_dict`:
reset_graph()

n_inputs = 2
n_hidden1 = 3

original_w = [[1., 2., 3.], [4., 5., 6.]]  # Load the weights from the other framework
original_b = [7., 8., 9.]                  # Load the biases from the other framework

X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
hidden1 = tf.layers.dense(inputs=X, units=n_hidden1, activation=tf.nn.relu, name="hidden1")
# [...] Build the rest of the model

# Get a handle on the assignment nodes for the hidden1 variables
graph = tf.get_default_graph()
assign_kernel = graph.get_operation_by_name("hidden1/kernel/Assign")
assign_bias = graph.get_operation_by_name("hidden1/bias/Assign")
# Input 1 of each assignment node is the value it assigns to the variable.
init_kernel = assign_kernel.inputs[1]
init_bias = assign_bias.inputs[1]

init = tf.global_variables_initializer()

with tf.Session() as sess:
    # Feeding these tensors substitutes our own values for the initialization values.
    init.run(feed_dict={init_kernel: original_w, init_bias: original_b})
    # [...] Train the model on your new task
    print(sess.run(hidden1, feed_dict={X: [[10.0, 11.0]]}))  # not shown in the book
[[ 61. 83. 105.]]
Note: the weights variable created by the `tf.layers.dense()` function is called `"kernel"` (instead of `"weights"`, as it is when using `tf.contrib.layers.fully_connected()` in the book), and the biases variable is called `bias` instead of `biases`.
Another approach (initially used in the book) would be to create dedicated assignment nodes and dedicated placeholders. This is more verbose and less efficient, but you may find this more explicit:
reset_graph()

n_inputs = 2
n_hidden1 = 3

original_w = [[1., 2., 3.], [4., 5., 6.]]  # Load the weights from the other framework
original_b = [7., 8., 9.]                  # Load the biases from the other framework

X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
hidden1 = tf.layers.dense(inputs=X, units=n_hidden1, activation=tf.nn.relu, name="hidden1")
# [...] Build the rest of the model

# Get a handle on the variables of layer hidden1
with tf.variable_scope("", default_name="", reuse=True):  # root scope
    hidden1_weights = tf.get_variable("hidden1/kernel")
    hidden1_biases = tf.get_variable("hidden1/bias")

# Create dedicated placeholders and assignment nodes
original_weights = tf.placeholder(dtype=tf.float32, shape=(n_inputs, n_hidden1))
original_biases = tf.placeholder(dtype=tf.float32, shape=n_hidden1)
assign_hidden1_weights = tf.assign(ref=hidden1_weights, value=original_weights)
assign_hidden1_biases = tf.assign(ref=hidden1_biases, value=original_biases)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    # Initialize normally, then overwrite hidden1's variables through the assignment nodes.
    init.run()
    sess.run(assign_hidden1_weights, feed_dict={original_weights: original_w})
    sess.run(assign_hidden1_biases, feed_dict={original_biases: original_b})
    # [...] Train the model on your new task
    print(sess.run(hidden1, feed_dict={X: [[10.0, 11.0]]}))
[[ 61. 83. 105.]]
Note that we could also get a handle on the variables using get_collection()
and specifying the scope
:
# List the global variables selected by the "hidden1" scope filter.
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="hidden1")
[<tf.Variable 'hidden1/kernel:0' shape=(2, 3) dtype=float32_ref>, <tf.Variable 'hidden1/bias:0' shape=(3,) dtype=float32_ref>]
Or we could use the graph's get_tensor_by_name()
method:
# Look up hidden1's kernel directly by its tensor name.
tf.get_default_graph().get_tensor_by_name("hidden1/kernel:0")
<tf.Tensor 'hidden1/kernel:0' shape=(2, 3) dtype=float32_ref>
# Look up hidden1's bias directly by its tensor name.
tf.get_default_graph().get_tensor_by_name("hidden1/bias:0")
<tf.Tensor 'hidden1/bias:0' shape=(3,) dtype=float32_ref>
reset_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300  # reused
n_hidden2 = 50  # reused
n_hidden3 = 50  # reused
n_hidden4 = 20  # new!
n_outputs = 10  # new!

X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(inputs=X, units=n_hidden1,
                              activation=tf.nn.relu, name="hidden1")  # reused
    hidden2 = tf.layers.dense(inputs=hidden1, units=n_hidden2,
                              activation=tf.nn.relu, name="hidden2")  # reused
    hidden3 = tf.layers.dense(inputs=hidden2, units=n_hidden3,
                              activation=tf.nn.relu, name="hidden3")  # reused
    hidden4 = tf.layers.dense(inputs=hidden3, units=n_hidden4,
                              activation=tf.nn.relu, name="hidden4")  # new!
    logits = tf.layers.dense(inputs=hidden4, units=n_outputs, name="outputs")  # new!

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y, logits=logits)
    loss = tf.reduce_mean(input_tensor=xentropy, name="loss")

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(predictions=logits, targets=y, k=1)
    accuracy = tf.reduce_mean(input_tensor=tf.cast(correct, dtype=tf.float32),
                              name="accuracy")

with tf.name_scope("train"):  # not shown in the book
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)  # not shown
    # Only variables matching this pattern are handed to the optimizer, so hidden1 and
    # hidden2 are never updated.
    train_vars = tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope="hidden[34]|outputs")
    training_op = optimizer.minimize(loss, var_list=train_vars)
# Variables of hidden layers 1-3 are the ones loaded from the checkpoint.
reuse_vars = tf.get_collection(key=tf.GraphKeys.GLOBAL_VARIABLES,
                               scope="hidden[123]")  # regular expression
restore_saver = tf.train.Saver(var_list=reuse_vars)  # to restore layers 1-3

init = tf.global_variables_initializer()
saver = tf.train.Saver()

with tf.Session() as sess:
    # Initialize all variables, then overwrite layers 1-3 with the checkpointed values.
    sess.run(init)
    restore_saver.restore(sess, "./my_model_final.ckpt")

    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            training_op.run(feed_dict={X: X_batch, y: y_batch})
        accuracy_val = sess.run(accuracy, feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./my_new_model_final.ckpt")
INFO:tensorflow:Restoring parameters from ./my_model_final.ckpt 0 Validation accuracy: 0.8964 1 Validation accuracy: 0.9298 2 Validation accuracy: 0.94 3 Validation accuracy: 0.9442 4 Validation accuracy: 0.948 5 Validation accuracy: 0.951 6 Validation accuracy: 0.9508 7 Validation accuracy: 0.9538 8 Validation accuracy: 0.9554 9 Validation accuracy: 0.957 10 Validation accuracy: 0.9562 11 Validation accuracy: 0.9566 12 Validation accuracy: 0.9572 13 Validation accuracy: 0.9578 14 Validation accuracy: 0.959 15 Validation accuracy: 0.9576 16 Validation accuracy: 0.9574 17 Validation accuracy: 0.9602 18 Validation accuracy: 0.9592 19 Validation accuracy: 0.9602
reset_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300  # reused
n_hidden2 = 50  # reused
n_hidden3 = 50  # reused
n_hidden4 = 20  # new!
n_outputs = 10  # new!

X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(inputs=X, units=n_hidden1, activation=tf.nn.relu,
                              name="hidden1")  # reused frozen
    hidden2 = tf.layers.dense(inputs=hidden1, units=n_hidden2, activation=tf.nn.relu,
                              name="hidden2")  # reused frozen
    # Gradients are blocked here, which is what freezes hidden1 and hidden2.
    hidden2_stop = tf.stop_gradient(hidden2)
    hidden3 = tf.layers.dense(inputs=hidden2_stop, units=n_hidden3, activation=tf.nn.relu,
                              name="hidden3")  # reused, not frozen
    hidden4 = tf.layers.dense(inputs=hidden3, units=n_hidden4, activation=tf.nn.relu,
                              name="hidden4")  # new!
    logits = tf.layers.dense(inputs=hidden4, units=n_outputs, name="outputs")  # new!

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y, logits=logits)
    loss = tf.reduce_mean(input_tensor=xentropy, name="loss")

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(predictions=logits, targets=y, k=1)
    accuracy = tf.reduce_mean(input_tensor=tf.cast(correct, dtype=tf.float32),
                              name="accuracy")

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)
The training code is exactly the same as earlier:
# Variables of hidden layers 1-3 are the ones loaded from the checkpoint.
reuse_vars = tf.get_collection(key=tf.GraphKeys.GLOBAL_VARIABLES,
                               scope="hidden[123]")  # regular expression
restore_saver = tf.train.Saver(var_list=reuse_vars)  # to restore layers 1-3

init = tf.global_variables_initializer()
saver = tf.train.Saver()

with tf.Session() as sess:
    # Initialize all variables, then overwrite layers 1-3 with the checkpointed values.
    sess.run(init)
    restore_saver.restore(sess, "./my_model_final.ckpt")

    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            training_op.run(feed_dict={X: X_batch, y: y_batch})
        accuracy_val = sess.run(accuracy, feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./my_new_model_final.ckpt")
INFO:tensorflow:Restoring parameters from ./my_model_final.ckpt 0 Validation accuracy: 0.902 1 Validation accuracy: 0.9302 2 Validation accuracy: 0.9438 3 Validation accuracy: 0.9478 4 Validation accuracy: 0.9514 5 Validation accuracy: 0.9522 6 Validation accuracy: 0.9524 7 Validation accuracy: 0.9556 8 Validation accuracy: 0.9556 9 Validation accuracy: 0.9558 10 Validation accuracy: 0.957 11 Validation accuracy: 0.9552 12 Validation accuracy: 0.9572 13 Validation accuracy: 0.9582 14 Validation accuracy: 0.9582 15 Validation accuracy: 0.957 16 Validation accuracy: 0.9566 17 Validation accuracy: 0.9578 18 Validation accuracy: 0.9594 19 Validation accuracy: 0.958
reset_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300  # reused
n_hidden2 = 50  # reused
n_hidden3 = 50  # reused
n_hidden4 = 20  # new!
n_outputs = 10  # new!

X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(inputs=X, units=n_hidden1, activation=tf.nn.relu,
                              name="hidden1")  # reused frozen
    hidden2 = tf.layers.dense(inputs=hidden1, units=n_hidden2, activation=tf.nn.relu,
                              name="hidden2")  # reused frozen & cached
    # Gradients are blocked here, which is what freezes hidden1 and hidden2.
    hidden2_stop = tf.stop_gradient(hidden2)
    hidden3 = tf.layers.dense(inputs=hidden2_stop, units=n_hidden3, activation=tf.nn.relu,
                              name="hidden3")  # reused, not frozen
    hidden4 = tf.layers.dense(inputs=hidden3, units=n_hidden4, activation=tf.nn.relu,
                              name="hidden4")  # new!
    logits = tf.layers.dense(inputs=hidden4, units=n_outputs, name="outputs")  # new!

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y, logits=logits)
    loss = tf.reduce_mean(input_tensor=xentropy, name="loss")

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(predictions=logits, targets=y, k=1)
    accuracy = tf.reduce_mean(input_tensor=tf.cast(correct, dtype=tf.float32),
                              name="accuracy")

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

# Variables of hidden layers 1-3 are the ones loaded from the checkpoint.
reuse_vars = tf.get_collection(key=tf.GraphKeys.GLOBAL_VARIABLES,
                               scope="hidden[123]")  # regular expression
restore_saver = tf.train.Saver(var_list=reuse_vars)  # to restore layers 1-3

init = tf.global_variables_initializer()
saver = tf.train.Saver()
import numpy as np
n_batches = len(X_train) // batch_size

with tf.Session() as sess:
    sess.run(init)
    restore_saver.restore(sess, "./my_model_final.ckpt")

    # Compute hidden2's output once for the whole training and validation sets.
    h2_cache = hidden2.eval(feed_dict={X: X_train})
    h2_cache_valid = hidden2.eval(feed_dict={X: X_valid})  # not shown in the book

    for epoch in range(n_epochs):
        # Shuffle cached activations and labels with the same permutation, then split into batches.
        shuffled_idx = np.random.permutation(len(X_train))
        hidden2_batches = np.array_split(h2_cache[shuffled_idx], n_batches)
        y_batches = np.array_split(y_train[shuffled_idx], n_batches)
        for hidden2_batch, y_batch in zip(hidden2_batches, y_batches):
            # hidden2 itself is fed, so the cached values are used in place of its computed output.
            training_op.run(feed_dict={hidden2: hidden2_batch, y: y_batch})

        accuracy_val = sess.run(accuracy,
                                feed_dict={hidden2: h2_cache_valid, y: y_valid})  # not shown
        print(epoch, "Validation accuracy:", accuracy_val)  # not shown

    save_path = saver.save(sess, "./my_new_model_final.ckpt")
INFO:tensorflow:Restoring parameters from ./my_model_final.ckpt 0 Validation accuracy: 0.902 1 Validation accuracy: 0.9302 2 Validation accuracy: 0.9438 3 Validation accuracy: 0.9478 4 Validation accuracy: 0.9514 5 Validation accuracy: 0.9522 6 Validation accuracy: 0.9524 7 Validation accuracy: 0.9556 8 Validation accuracy: 0.9556 9 Validation accuracy: 0.9558 10 Validation accuracy: 0.957 11 Validation accuracy: 0.9552 12 Validation accuracy: 0.9572 13 Validation accuracy: 0.9582 14 Validation accuracy: 0.9582 15 Validation accuracy: 0.957 16 Validation accuracy: 0.9566 17 Validation accuracy: 0.9578 18 Validation accuracy: 0.9594 19 Validation accuracy: 0.958
# Alternative optimizers. Each statement rebinds `optimizer`, so when they are run in
# sequence only the last one remains bound.

# Momentum optimization
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                       momentum=0.9)
# Momentum optimization with Nesterov enabled
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                       momentum=0.9, use_nesterov=True)
# AdaGrad
optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate)
# RMSProp
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                      momentum=0.9, decay=0.9, epsilon=1e-10)
# Adam
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
reset_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300
n_hidden2 = 50
n_outputs = 10

X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(inputs=X, units=n_hidden1,
                              activation=tf.nn.relu, name="hidden1")
    hidden2 = tf.layers.dense(inputs=hidden1, units=n_hidden2,
                              activation=tf.nn.relu, name="hidden2")
    logits = tf.layers.dense(inputs=hidden2, units=n_outputs, name="outputs")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y, logits=logits)
    loss = tf.reduce_mean(input_tensor=xentropy, name="loss")

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(predictions=logits, targets=y, k=1)
    accuracy = tf.reduce_mean(input_tensor=tf.cast(correct, dtype=tf.float32),
                              name="accuracy")

with tf.name_scope("train"):  # not shown in the book
    initial_learning_rate = 0.1
    decay_steps = 10000
    decay_rate = 1/10
    # Non-trainable step counter; it is passed to minimize() below and drives the decay.
    global_step = tf.Variable(0, trainable=False, name="global_step")
    # From here on learning_rate is a tensor derived from global_step, not a Python float.
    learning_rate = tf.train.exponential_decay(
        learning_rate=initial_learning_rate, global_step=global_step,
        decay_steps=decay_steps, decay_rate=decay_rate)
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9)
    training_op = optimizer.minimize(loss, global_step=global_step)

init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_epochs = 5
batch_size = 50

with tf.Session() as sess:
    # Training starts from scratch here: no checkpoint is restored.
    sess.run(init)

    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            training_op.run(feed_dict={X: X_batch, y: y_batch})
        accuracy_val = sess.run(accuracy, feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./my_model_final.ckpt")
0 Validation accuracy: 0.959 1 Validation accuracy: 0.9688 2 Validation accuracy: 0.9726 3 Validation accuracy: 0.9804 4 Validation accuracy: 0.982
Let's implement $\ell_1$ regularization manually. First, we create the model, as usual (with just one hidden layer this time, for simplicity):
reset_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300
n_outputs = 10

X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")

# One ReLU hidden layer followed by a linear output layer.
with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(inputs=X, units=n_hidden1,
                              activation=tf.nn.relu, name="hidden1")
    logits = tf.layers.dense(inputs=hidden1, units=n_outputs, name="outputs")
Next, we get a handle on the layer weights, and we compute the total loss, which is equal to the sum of the usual cross entropy loss and the $\ell_1$ loss (i.e., the sum of the absolute values of the weights, multiplied by the regularization hyperparameter):
# Kernels (weight matrices) of the two dense layers, looked up by name.
W1 = tf.get_default_graph().get_tensor_by_name("hidden1/kernel:0")
W2 = tf.get_default_graph().get_tensor_by_name("outputs/kernel:0")

scale = 0.001  # l1 regularization hyperparameter

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y, logits=logits)
    base_loss = tf.reduce_mean(input_tensor=xentropy, name="avg_xentropy")
    # Sum of the absolute values of both kernels; the biases are not included.
    reg_losses = tf.reduce_sum(tf.abs(W1)) + tf.reduce_sum(tf.abs(W2))
    loss = tf.add(x=base_loss, y=scale * reg_losses, name="loss")
The rest is just as usual:
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(predictions=logits, targets=y, k=1)
    accuracy = tf.reduce_mean(input_tensor=tf.cast(correct, dtype=tf.float32),
                              name="accuracy")

learning_rate = 0.01

# The optimizer minimizes the total loss (cross entropy plus the l1 term).
with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_epochs = 20
batch_size = 200

with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            training_op.run(feed_dict={X: X_batch, y: y_batch})
        accuracy_val = sess.run(accuracy, feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./my_model_final.ckpt")
0 Validation accuracy: 0.831 1 Validation accuracy: 0.871 2 Validation accuracy: 0.8838 3 Validation accuracy: 0.8934 4 Validation accuracy: 0.8966 5 Validation accuracy: 0.8988 6 Validation accuracy: 0.9016 7 Validation accuracy: 0.9044 8 Validation accuracy: 0.9058 9 Validation accuracy: 0.906 10 Validation accuracy: 0.9068 11 Validation accuracy: 0.9054 12 Validation accuracy: 0.907 13 Validation accuracy: 0.9084 14 Validation accuracy: 0.9088 15 Validation accuracy: 0.9064 16 Validation accuracy: 0.9066 17 Validation accuracy: 0.9066 18 Validation accuracy: 0.9066 19 Validation accuracy: 0.9052
Alternatively, we can pass a regularization function to the tf.layers.dense()
function, which will use it to create operations that will compute the regularization loss, and it adds these operations to the collection of regularization losses. The beginning is the same as above:
reset_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300
n_hidden2 = 50
n_outputs = 10

# Input features and integer class labels.
X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")
Next, we will use Python's partial()
function to avoid repeating the same arguments over and over again. Note that we set the kernel_regularizer
argument:
scale = 0.001

# Dense-layer factory with the shared defaults: ReLU activation and an l1 kernel regularizer.
my_dense_layer = partial(
    tf.layers.dense,
    activation=tf.nn.relu,
    kernel_regularizer=tf.contrib.layers.l1_regularizer(scale=scale),
)

with tf.name_scope("dnn"):
    hidden1 = my_dense_layer(inputs=X, units=n_hidden1, name="hidden1")
    hidden2 = my_dense_layer(inputs=hidden1, units=n_hidden2, name="hidden2")
    # The output layer overrides the default activation so that it yields raw logits.
    logits = my_dense_layer(inputs=hidden2, units=n_outputs, activation=None,
                            name="outputs")
Next we must add the regularization losses to the base loss:
with tf.name_scope("loss"):  # not shown in the book
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(  # not shown
        labels=y, logits=logits)  # not shown
    base_loss = tf.reduce_mean(input_tensor=xentropy, name="avg_xentropy")  # not shown
    # Collect the regularization losses registered in the graph and add them to the base loss.
    reg_losses = tf.get_collection(key=tf.GraphKeys.REGULARIZATION_LOSSES)
    loss = tf.add_n(inputs=[base_loss] + reg_losses, name="loss")
And the rest is the same as usual:
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(predictions=logits, targets=y, k=1)
    accuracy = tf.reduce_mean(input_tensor=tf.cast(correct, dtype=tf.float32),
                              name="accuracy")

learning_rate = 0.01

# The optimizer minimizes the total loss (cross entropy plus regularization losses).
with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_epochs = 20
batch_size = 200

with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            training_op.run(feed_dict={X: X_batch, y: y_batch})
        accuracy_val = sess.run(accuracy, feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./my_model_final.ckpt")
0 Validation accuracy: 0.8274 1 Validation accuracy: 0.8766 2 Validation accuracy: 0.8952 3 Validation accuracy: 0.9016 4 Validation accuracy: 0.908 5 Validation accuracy: 0.9096 6 Validation accuracy: 0.9126 7 Validation accuracy: 0.9154 8 Validation accuracy: 0.9178 9 Validation accuracy: 0.919 10 Validation accuracy: 0.92 11 Validation accuracy: 0.9224 12 Validation accuracy: 0.9212 13 Validation accuracy: 0.9228 14 Validation accuracy: 0.9224 15 Validation accuracy: 0.9216 16 Validation accuracy: 0.9218 17 Validation accuracy: 0.9228 18 Validation accuracy: 0.9216 19 Validation accuracy: 0.9214
Note: the book uses tf.contrib.layers.dropout()
rather than tf.layers.dropout()
(which did not exist when this chapter was written). It is now preferable to use tf.layers.dropout()
, because anything in the contrib module may change or be deleted without notice. The tf.layers.dropout()
function is almost identical to the tf.contrib.layers.dropout()
function, except for a few minor differences. Most importantly:
you must specify the dropout rate (rate
) rather than the keep probability (keep_prob
), where rate
is simply equal to 1 - keep_prob
, and the is_training
parameter is renamed to training
.reset_graph()
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

# Defaults to False, so dropout is only active when the feed dict explicitly
# sets this placeholder to True (as the training loop does).
training = tf.placeholder_with_default(False, shape=(), name='training')

dropout_rate = 0.5  # == 1 - keep_prob
# Dropout is applied to the inputs and to the output of each hidden layer.
X_drop = tf.layers.dropout(X, dropout_rate, training=training)

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X_drop, n_hidden1, activation=tf.nn.relu,
                              name="hidden1")
    hidden1_drop = tf.layers.dropout(hidden1, dropout_rate, training=training)
    hidden2 = tf.layers.dense(hidden1_drop, n_hidden2, activation=tf.nn.relu,
                              name="hidden2")
    hidden2_drop = tf.layers.dropout(hidden2, dropout_rate, training=training)
    logits = tf.layers.dense(hidden2_drop, n_outputs, name="outputs")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

# `learning_rate` is not set in this cell; it reuses the value assigned earlier
# in the notebook.
with tf.name_scope("train"):
    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_epochs = 20
batch_size = 50

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            # training=True switches dropout on for the training step only.
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch, training: True})
        # `training` is not fed here, so it falls back to its default (False)
        # and the validation accuracy is computed without dropout.
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./my_model_final.ckpt")
0 Validation accuracy: 0.9264 1 Validation accuracy: 0.9446 2 Validation accuracy: 0.9488 3 Validation accuracy: 0.9556 4 Validation accuracy: 0.9612 5 Validation accuracy: 0.9598 6 Validation accuracy: 0.9616 7 Validation accuracy: 0.9674 8 Validation accuracy: 0.967 9 Validation accuracy: 0.9706 10 Validation accuracy: 0.9674 11 Validation accuracy: 0.9678 12 Validation accuracy: 0.9698 13 Validation accuracy: 0.97 14 Validation accuracy: 0.971 15 Validation accuracy: 0.9702 16 Validation accuracy: 0.9718 17 Validation accuracy: 0.9716 18 Validation accuracy: 0.9734 19 Validation accuracy: 0.972
Let's go back to a plain and simple neural net for MNIST with just 2 hidden layers:
# Start from an empty default graph before rebuilding the model.
reset_graph()

n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 50
n_outputs = 10

learning_rate = 0.01
momentum = 0.9

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

# Two ReLU hidden layers followed by a linear output layer (the logits).
with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2")
    logits = tf.layers.dense(hidden2, n_outputs, name="outputs")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("train"):
    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
Next, let's get a handle on the first hidden layer's weight and create an operation that will compute the clipped weights using the clip_by_norm()
function. Then we create an assignment operation to assign the clipped weights to the weights variable:
threshold = 1.0
# Look up the first hidden layer's kernel variable by its tensor name.
weights = tf.get_default_graph().get_tensor_by_name("hidden1/kernel:0")
# Rescale the weights so the L2 norm computed along axis 1 is at most `threshold`.
clipped_weights = tf.clip_by_norm(weights, clip_norm=threshold, axes=1)
# Running this op writes the clipped values back into the variable.
clip_weights = tf.assign(weights, clipped_weights)
We can do this as well for the second hidden layer:
# Same clipping setup for the second hidden layer's kernel, reusing `threshold`.
weights2 = tf.get_default_graph().get_tensor_by_name("hidden2/kernel:0")
clipped_weights2 = tf.clip_by_norm(weights2, clip_norm=threshold, axes=1)
clip_weights2 = tf.assign(weights2, clipped_weights2)
Let's add an initializer and a saver:
# Op that initializes every global variable, and a saver for checkpointing.
init = tf.global_variables_initializer()
saver = tf.train.Saver()
And now we can train the model. It's pretty much as usual, except that right after running the training_op
, we run the clip_weights
and clip_weights2
operations:
n_epochs = 20
batch_size = 50

with tf.Session() as sess:                                              # not shown in the book
    init.run()                                                          # not shown
    for epoch in range(n_epochs):                                       # not shown
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):  # not shown
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
            # Re-impose the max-norm constraint right after each update.
            clip_weights.eval()
            clip_weights2.eval()                                        # not shown
        acc_valid = accuracy.eval(feed_dict={X: X_valid, y: y_valid})   # not shown
        print(epoch, "Validation accuracy:", acc_valid)                 # not shown

    save_path = saver.save(sess, "./my_model_final.ckpt")               # not shown
0 Validation accuracy: 0.9568 1 Validation accuracy: 0.9696 2 Validation accuracy: 0.972 3 Validation accuracy: 0.9768 4 Validation accuracy: 0.9784 5 Validation accuracy: 0.9786 6 Validation accuracy: 0.9816 7 Validation accuracy: 0.9808 8 Validation accuracy: 0.981 9 Validation accuracy: 0.983 10 Validation accuracy: 0.9822 11 Validation accuracy: 0.9854 12 Validation accuracy: 0.9822 13 Validation accuracy: 0.9842 14 Validation accuracy: 0.984 15 Validation accuracy: 0.9852 16 Validation accuracy: 0.984 17 Validation accuracy: 0.9844 18 Validation accuracy: 0.9844 19 Validation accuracy: 0.9844
The implementation above is straightforward and it works fine, but it is a bit messy. A better approach is to define a max_norm_regularizer()
function:
def max_norm_regularizer(threshold, axes=1, name="max_norm",
                         collection="max_norm"):
    """Return a function usable as a layer's `kernel_regularizer` that sets up
    max-norm clipping for the weights it receives.

    The returned function adds no loss term. For each weights variable it is
    called with, it creates an op that assigns the norm-clipped weights back
    to the variable and stores that op in the graph collection `collection`,
    so the training loop can fetch and run all clipping ops.

    threshold -- value passed as `clip_norm` to `tf.clip_by_norm`.
    axes -- value passed as `axes` to `tf.clip_by_norm`.
    name -- name given to each assignment op.
    collection -- graph collection the assignment ops are added to.
    """
    def max_norm(weights):
        assign_clipped = tf.assign(
            weights,
            tf.clip_by_norm(weights, clip_norm=threshold, axes=axes),
            name=name)
        tf.add_to_collection(collection, assign_clipped)
        # Falls through and returns None: there is no regularization loss term.

    return max_norm
Then you can call this function to get a max norm regularizer (with the threshold you want). When you create a hidden layer, you can pass this regularizer to the kernel_regularizer
argument:
reset_graph()

n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 50
n_outputs = 10

learning_rate = 0.01
momentum = 0.9

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

# One regularizer function is shared by both hidden layers; each layer calls it
# with its own kernel, which registers one clipping op per layer.
max_norm_reg = max_norm_regularizer(threshold=1.0)

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu,
                              kernel_regularizer=max_norm_reg, name="hidden1")
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu,
                              kernel_regularizer=max_norm_reg, name="hidden2")
    logits = tf.layers.dense(hidden2, n_outputs, name="outputs")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("train"):
    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()
Training is as usual, except you must run the weights clipping operations after each training operation:
n_epochs = 20
batch_size = 50

# All the clipping ops that were added to the "max_norm" collection.
clip_all_weights = tf.get_collection("max_norm")

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
            # Clip the weights right after each training step.
            sess.run(clip_all_weights)
        acc_valid = accuracy.eval(feed_dict={X: X_valid, y: y_valid})  # not shown
        print(epoch, "Validation accuracy:", acc_valid)                # not shown

    save_path = saver.save(sess, "./my_model_final.ckpt")              # not shown
0 Validation accuracy: 0.9556 1 Validation accuracy: 0.9698 2 Validation accuracy: 0.9726 3 Validation accuracy: 0.9744 4 Validation accuracy: 0.9762 5 Validation accuracy: 0.9772 6 Validation accuracy: 0.979 7 Validation accuracy: 0.9816 8 Validation accuracy: 0.9814 9 Validation accuracy: 0.9812 10 Validation accuracy: 0.9818 11 Validation accuracy: 0.9816 12 Validation accuracy: 0.9802 13 Validation accuracy: 0.9822 14 Validation accuracy: 0.982 15 Validation accuracy: 0.9812 16 Validation accuracy: 0.9824 17 Validation accuracy: 0.9836 18 Validation accuracy: 0.9824 19 Validation accuracy: 0.9826
See appendix A.
Exercise: Build a DNN with five hidden layers of 100 neurons each, He initialization, and the ELU activation function.
We will need similar DNNs in the next exercises, so let's create a function to build this DNN:
# Initializer used as the default for the layers built below.
he_init = tf.variance_scaling_initializer()


def dnn(inputs, n_hidden_layers=5, n_neurons=100, name=None,
        activation=tf.nn.elu, initializer=he_init):
    """Stack `n_hidden_layers` dense layers on top of `inputs`.

    Every layer has `n_neurons` units, uses `activation` and `initializer`,
    and is named "hidden1", "hidden2", ... The layers are created inside a
    variable scope called `name` (default scope name "dnn" when `name` is
    None). Returns the output tensor of the last layer, or `inputs` itself
    when `n_hidden_layers` is 0.
    """
    layer_output = inputs
    with tf.variable_scope(name, "dnn"):
        for layer_number in range(1, n_hidden_layers + 1):
            layer_name = "hidden%d" % layer_number
            layer_output = tf.layers.dense(
                layer_output, n_neurons,
                activation=activation,
                kernel_initializer=initializer,
                name=layer_name)
    return layer_output
n_inputs = 28 * 28  # MNIST
n_outputs = 5       # five output classes

reset_graph()

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

# Hidden layers built by dnn() with its default arguments.
dnn_outputs = dnn(X)

# Linear output layer, plus a softmax op that exposes class probabilities.
logits = tf.layers.dense(dnn_outputs, n_outputs, kernel_initializer=he_init, name="logits")
Y_proba = tf.nn.softmax(logits, name="Y_proba")
Exercise: Using Adam optimization and early stopping, try training it on MNIST but only on digits 0 to 4, as we will use transfer learning for digits 5 to 9 in the next exercise. You will need a softmax output layer with five neurons, and as always make sure to save checkpoints at regular intervals and save the final model so you can reuse it later.
Let's complete the graph with the cost function, the training op, and all the other usual components:
learning_rate = 0.01

# Mean cross-entropy between the integer labels and the logits.
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")

# Adam optimizer; the training op is given an explicit name.
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss, name="training_op")

# Accuracy: fraction of instances whose target class has the highest logit.
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

init = tf.global_variables_initializer()
saver = tf.train.Saver()
Now let's create the training, validation and test sets (we need the validation set to implement early stopping):
# Keep only the instances whose label is below 5 (digits 0 to 4) in each split.
X_train1, y_train1 = X_train[y_train < 5], y_train[y_train < 5]
X_valid1, y_valid1 = X_valid[y_valid < 5], y_valid[y_valid < 5]
X_test1, y_test1 = X_test[y_test < 5], y_test[y_test < 5]
n_epochs = 1000
batch_size = 20

# Early-stopping state: stop once the validation loss has failed to improve
# for more than `max_checks_without_progress` consecutive epochs.
max_checks_without_progress = 20
checks_without_progress = 0
best_loss = np.infty

with tf.Session() as sess:
    init.run()

    for epoch in range(n_epochs):
        # Shuffle the training indices and split them into mini-batches.
        rnd_idx = np.random.permutation(len(X_train1))
        for rnd_indices in np.array_split(rnd_idx, len(X_train1) // batch_size):
            X_batch, y_batch = X_train1[rnd_indices], y_train1[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={X: X_valid1, y: y_valid1})
        if loss_val < best_loss:
            # New best validation loss: checkpoint the model and reset the counter.
            save_path = saver.save(sess, "./my_mnist_model_0_to_4.ckpt")
            best_loss = loss_val
            checks_without_progress = 0
        else:
            checks_without_progress += 1
            if checks_without_progress > max_checks_without_progress:
                print("Early stopping!")
                # The break happens before the print below, so the epoch that
                # triggers early stopping is not reported.
                break
        print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
            epoch, loss_val, best_loss, acc_val * 100))

# Reload the checkpoint written at the best validation loss and evaluate it
# on the test set.
with tf.Session() as sess:
    saver.restore(sess, "./my_mnist_model_0_to_4.ckpt")
    acc_test = accuracy.eval(feed_dict={X: X_test1, y: y_test1})
    print("Final test accuracy: {:.2f}%".format(acc_test * 100))
0 Validation loss: 0.116407 Best loss: 0.116407 Accuracy: 97.58% 1 Validation loss: 0.180534 Best loss: 0.116407 Accuracy: 97.11% 2 Validation loss: 0.227535 Best loss: 0.116407 Accuracy: 93.86% 3 Validation loss: 0.107346 Best loss: 0.107346 Accuracy: 97.54% 4 Validation loss: 0.302668 Best loss: 0.107346 Accuracy: 95.35% 5 Validation loss: 1.631054 Best loss: 0.107346 Accuracy: 22.01% 6 Validation loss: 1.635262 Best loss: 0.107346 Accuracy: 18.73% 7 Validation loss: 1.671200 Best loss: 0.107346 Accuracy: 22.01% 8 Validation loss: 1.695277 Best loss: 0.107346 Accuracy: 19.27% 9 Validation loss: 1.744607 Best loss: 0.107346 Accuracy: 20.91% 10 Validation loss: 1.629857 Best loss: 0.107346 Accuracy: 22.01% 11 Validation loss: 1.810803 Best loss: 0.107346 Accuracy: 22.01% 12 Validation loss: 1.675703 Best loss: 0.107346 Accuracy: 18.73% 13 Validation loss: 1.633233 Best loss: 0.107346 Accuracy: 20.91% 14 Validation loss: 1.652905 Best loss: 0.107346 Accuracy: 20.91% 15 Validation loss: 1.635937 Best loss: 0.107346 Accuracy: 20.91% 16 Validation loss: 1.718919 Best loss: 0.107346 Accuracy: 19.08% 17 Validation loss: 1.682458 Best loss: 0.107346 Accuracy: 19.27% 18 Validation loss: 1.675366 Best loss: 0.107346 Accuracy: 18.73% 19 Validation loss: 1.645800 Best loss: 0.107346 Accuracy: 19.08% 20 Validation loss: 1.722334 Best loss: 0.107346 Accuracy: 22.01% 21 Validation loss: 1.656418 Best loss: 0.107346 Accuracy: 22.01% 22 Validation loss: 1.643529 Best loss: 0.107346 Accuracy: 18.73% 23 Validation loss: 1.644233 Best loss: 0.107346 Accuracy: 19.27% Early stopping! INFO:tensorflow:Restoring parameters from ./my_mnist_model_0_to_4.ckpt Final test accuracy: 97.26%
This test accuracy is not too bad, but let's see if we can do better by tuning the hyperparameters.
Exercise: Tune the hyperparameters using cross-validation and see what precision you can achieve.
Let's create a DNNClassifier
class, compatible with Scikit-Learn's RandomizedSearchCV
class, to perform hyperparameter tuning. Here are the key points of this implementation:
The __init__()
method (constructor) does nothing more than create instance variables for each of the hyperparameters. The fit()
method creates the graph, starts a session and trains the model: it calls the _build_graph()
method to build the graph (much like the graph we defined earlier). Once this method is done creating the graph, it saves all the important operations as instance variables for easy access by other methods. The _dnn()
method builds the hidden layers, just like the dnn()
function above, but also with support for batch normalization and dropout (for the next exercises). If the fit()
method is given a validation set (X_valid
and y_valid
), then it implements early stopping. This implementation does not save the best model to disk, but rather to memory: it uses the _get_model_params()
method to get all the graph's variables and their values, and the _restore_model_params()
method to restore the variable values (of the best model found). This trick helps speed up training. After the fit()
method has finished training the model, it keeps the session open so that predictions can be made quickly, without having to save a model to disk and restore it for every prediction. You can close the session by calling the close_session()
method. The predict_proba()
method uses the trained model to predict the class probabilities. The predict()
method calls predict_proba()
and returns the class with the highest probability, for each instance.from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import NotFittedError
class DNNClassifier(BaseEstimator, ClassifierMixin):
    """Scikit-Learn compatible DNN classifier built with the TensorFlow 1.x graph API.

    The constructor only stores the hyperparameters. `fit()` builds a fresh
    graph, opens a session and trains the network, with early stopping when a
    validation set is supplied. The session is kept open after training so
    that `predict_proba()` and `predict()` can reuse it; call
    `close_session()` to close it.
    """

    def __init__(self, n_hidden_layers=5, n_neurons=100, optimizer_class=tf.train.AdamOptimizer,
                 learning_rate=0.01, batch_size=20, activation=tf.nn.elu, initializer=he_init,
                 batch_norm_momentum=None, dropout_rate=None, random_state=None):
        """Initialize the DNNClassifier by simply storing all the hyperparameters."""
        self.n_hidden_layers = n_hidden_layers
        self.n_neurons = n_neurons
        self.optimizer_class = optimizer_class
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.activation = activation
        self.initializer = initializer
        self.batch_norm_momentum = batch_norm_momentum
        self.dropout_rate = dropout_rate
        self.random_state = random_state
        self._session = None

    def _dnn(self, inputs):
        """Build the hidden layers, with support for batch normalization and dropout."""
        for layer in range(self.n_hidden_layers):
            # Dropout (if enabled) is applied to each layer's input.
            if self.dropout_rate:
                inputs = tf.layers.dropout(inputs, self.dropout_rate, training=self._training)
            # The dense layer has no activation: it is applied below, after the
            # optional batch normalization.
            inputs = tf.layers.dense(inputs, self.n_neurons,
                                     kernel_initializer=self.initializer,
                                     name="hidden%d" % (layer + 1))
            if self.batch_norm_momentum:
                inputs = tf.layers.batch_normalization(inputs, momentum=self.batch_norm_momentum,
                                                       training=self._training)
            inputs = self.activation(inputs, name="hidden%d_out" % (layer + 1))
        return inputs

    def _build_graph(self, n_inputs, n_outputs):
        """Build the model in the current default graph and keep handles to its key ops.

        n_inputs -- number of input features.
        n_outputs -- number of classes (size of the output layer).
        """
        if self.random_state is not None:
            tf.set_random_seed(self.random_state)
            np.random.seed(self.random_state)

        X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
        y = tf.placeholder(tf.int32, shape=(None), name="y")

        # The `training` placeholder only exists when a layer needs it.
        if self.batch_norm_momentum or self.dropout_rate:
            self._training = tf.placeholder_with_default(False, shape=(), name='training')
        else:
            self._training = None

        dnn_outputs = self._dnn(X)

        # NOTE(review): the output layer uses the module-level `he_init`, not
        # `self.initializer` -- confirm whether this is intended.
        logits = tf.layers.dense(dnn_outputs, n_outputs, kernel_initializer=he_init, name="logits")
        Y_proba = tf.nn.softmax(logits, name="Y_proba")

        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
                                                                  logits=logits)
        loss = tf.reduce_mean(xentropy, name="loss")

        optimizer = self.optimizer_class(learning_rate=self.learning_rate)
        training_op = optimizer.minimize(loss)

        correct = tf.nn.in_top_k(logits, y, 1)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        # Make the important operations available easily through instance variables
        self._X, self._y = X, y
        self._Y_proba, self._loss = Y_proba, loss
        self._training_op, self._accuracy = training_op, accuracy
        self._init, self._saver = init, saver

    def close_session(self):
        """Close the TensorFlow session, if one was opened."""
        if self._session:
            self._session.close()

    def _get_model_params(self):
        """Get all variable values (used for early stopping, faster than saving to disk)"""
        with self._graph.as_default():
            gvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        return {gvar.op.name: value for gvar, value in zip(gvars, self._session.run(gvars))}

    def _restore_model_params(self, model_params):
        """Set all variables to the given values (for early stopping, faster than loading from disk)"""
        gvar_names = list(model_params.keys())
        # Reuse each variable's existing "<name>/Assign" op: feed the saved
        # value in place of the op's second input, then run the op.
        assign_ops = {gvar_name: self._graph.get_operation_by_name(gvar_name + "/Assign")
                      for gvar_name in gvar_names}
        init_values = {gvar_name: assign_op.inputs[1] for gvar_name, assign_op in assign_ops.items()}
        feed_dict = {init_values[gvar_name]: model_params[gvar_name] for gvar_name in gvar_names}
        self._session.run(assign_ops, feed_dict=feed_dict)

    def _labels_to_indices(self, labels):
        """Translate class labels into their indices in `self.classes_`.

        Raises ValueError if a label was not present in the training labels.
        """
        try:
            return np.array([self.class_to_index_[label] for label in labels],
                            dtype=np.int32)
        except KeyError as err:
            raise ValueError("Unknown class label (absent from the training set): %r"
                             % (err.args[0],))

    def fit(self, X, y, n_epochs=100, X_valid=None, y_valid=None):
        """Fit the model to the training set. If X_valid and y_valid are provided, use early stopping.

        X, y -- training instances and their class labels.
        n_epochs -- maximum number of passes over the training set.
        X_valid, y_valid -- optional validation set; its labels must all appear in `y`.

        Returns self. Raises ValueError if `y_valid` contains a label absent from `y`.
        """
        self.close_session()

        # infer n_inputs and n_outputs from the training set.
        n_inputs = X.shape[1]
        self.classes_ = np.unique(y)
        n_outputs = len(self.classes_)

        # Translate the labels vector to a vector of sorted class indices, containing
        # integers from 0 to n_outputs - 1.
        # For example, if y is equal to [8, 8, 9, 5, 7, 6, 6, 6], then the sorted class
        # labels (self.classes_) will be equal to [5, 6, 7, 8, 9], and the labels vector
        # will be translated to [3, 3, 4, 0, 2, 1, 1, 1]
        self.class_to_index_ = {label: index
                                for index, label in enumerate(self.classes_)}
        y = self._labels_to_indices(y)

        # The validation labels must go through the same translation; otherwise
        # the loss and accuracy are computed against raw labels, which is wrong
        # (or out of range) whenever the classes are not exactly 0..n_outputs-1.
        use_validation = X_valid is not None and y_valid is not None
        if use_validation:
            y_valid = self._labels_to_indices(y_valid)

        self._graph = tf.Graph()
        with self._graph.as_default():
            self._build_graph(n_inputs, n_outputs)
            # extra ops for batch normalization
            extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        # needed in case of early stopping
        max_checks_without_progress = 20
        checks_without_progress = 0
        best_loss = np.infty
        best_params = None

        # np.array_split() rejects 0 sections, so always use at least one batch
        # (this matters when the training set is smaller than batch_size).
        n_batches = max(1, len(X) // self.batch_size)

        # Now train the model!
        self._session = tf.Session(graph=self._graph)
        with self._session.as_default() as sess:
            self._init.run()
            for epoch in range(n_epochs):
                rnd_idx = np.random.permutation(len(X))
                for rnd_indices in np.array_split(rnd_idx, n_batches):
                    X_batch, y_batch = X[rnd_indices], y[rnd_indices]
                    feed_dict = {self._X: X_batch, self._y: y_batch}
                    if self._training is not None:
                        feed_dict[self._training] = True
                    sess.run(self._training_op, feed_dict=feed_dict)
                    # The batch-norm update ops are run in a separate call,
                    # after the training step.
                    if extra_update_ops:
                        sess.run(extra_update_ops, feed_dict=feed_dict)
                if use_validation:
                    loss_val, acc_val = sess.run([self._loss, self._accuracy],
                                                 feed_dict={self._X: X_valid,
                                                            self._y: y_valid})
                    if loss_val < best_loss:
                        # Snapshot the best parameters in memory.
                        best_params = self._get_model_params()
                        best_loss = loss_val
                        checks_without_progress = 0
                    else:
                        checks_without_progress += 1
                    print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
                        epoch, loss_val, best_loss, acc_val * 100))
                    if checks_without_progress > max_checks_without_progress:
                        print("Early stopping!")
                        break
                else:
                    # No validation set: report the metrics on the last training batch.
                    loss_train, acc_train = sess.run([self._loss, self._accuracy],
                                                     feed_dict={self._X: X_batch,
                                                                self._y: y_batch})
                    print("{}\tLast training batch loss: {:.6f}\tAccuracy: {:.2f}%".format(
                        epoch, loss_train, acc_train * 100))
            # If we used early stopping then rollback to the best model found
            if best_params:
                self._restore_model_params(best_params)
            return self

    def predict_proba(self, X):
        """Return the class probabilities estimated for each instance in X.

        Raises NotFittedError if no session exists yet (fit() was never called).
        """
        if not self._session:
            raise NotFittedError("This %s instance is not fitted yet" % self.__class__.__name__)
        with self._session.as_default() as sess:
            return self._Y_proba.eval(feed_dict={self._X: X})

    def predict(self, X):
        """Return the most probable class label for each instance, as an int32 column vector."""
        class_indices = np.argmax(self.predict_proba(X), axis=1)
        # Map the class indices back to the original labels (one row per instance).
        return np.array([[self.classes_[class_index]]
                         for class_index in class_indices], np.int32)

    def save(self, path):
        """Save the session's variables to a checkpoint at `path`."""
        self._saver.save(self._session, path)
Let's see if we get the exact same accuracy as earlier using this class (without dropout or batch norm):
# Train with the default hyperparameters; passing a validation set enables
# early stopping.
dnn_clf = DNNClassifier(random_state=42)
dnn_clf.fit(X_train1, y_train1, n_epochs=1000, X_valid=X_valid1, y_valid=y_valid1)
0 Validation loss: 0.116407 Best loss: 0.116407 Accuracy: 97.58% 1 Validation loss: 0.180534 Best loss: 0.116407 Accuracy: 97.11% 2 Validation loss: 0.227535 Best loss: 0.116407 Accuracy: 93.86% 3 Validation loss: 0.107346 Best loss: 0.107346 Accuracy: 97.54% 4 Validation loss: 0.302668 Best loss: 0.107346 Accuracy: 95.35% 5 Validation loss: 1.631054 Best loss: 0.107346 Accuracy: 22.01% 6 Validation loss: 1.635262 Best loss: 0.107346 Accuracy: 18.73% 7 Validation loss: 1.671200 Best loss: 0.107346 Accuracy: 22.01% 8 Validation loss: 1.695277 Best loss: 0.107346 Accuracy: 19.27% 9 Validation loss: 1.744607 Best loss: 0.107346 Accuracy: 20.91% 10 Validation loss: 1.629857 Best loss: 0.107346 Accuracy: 22.01% 11 Validation loss: 1.810803 Best loss: 0.107346 Accuracy: 22.01% 12 Validation loss: 1.675703 Best loss: 0.107346 Accuracy: 18.73% 13 Validation loss: 1.633233 Best loss: 0.107346 Accuracy: 20.91% 14 Validation loss: 1.652905 Best loss: 0.107346 Accuracy: 20.91% 15 Validation loss: 1.635937 Best loss: 0.107346 Accuracy: 20.91% 16 Validation loss: 1.718919 Best loss: 0.107346 Accuracy: 19.08% 17 Validation loss: 1.682458 Best loss: 0.107346 Accuracy: 19.27% 18 Validation loss: 1.675366 Best loss: 0.107346 Accuracy: 18.73% 19 Validation loss: 1.645800 Best loss: 0.107346 Accuracy: 19.08% 20 Validation loss: 1.722334 Best loss: 0.107346 Accuracy: 22.01% 21 Validation loss: 1.656418 Best loss: 0.107346 Accuracy: 22.01% 22 Validation loss: 1.643529 Best loss: 0.107346 Accuracy: 18.73% 23 Validation loss: 1.644233 Best loss: 0.107346 Accuracy: 19.27% 24 Validation loss: 1.690035 Best loss: 0.107346 Accuracy: 18.73% Early stopping!
DNNClassifier(activation=<function elu at 0x1243639d8>, batch_norm_momentum=None, batch_size=20, dropout_rate=None, initializer=<tensorflow.python.ops.init_ops.VarianceScaling object at 0x117bf5828>, learning_rate=0.01, n_hidden_layers=5, n_neurons=100, optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>, random_state=42)
The model is trained, let's see if it gets the same accuracy as earlier:
from sklearn.metrics import accuracy_score
# Accuracy of the trained classifier on the test set (digits 0 to 4).
y_pred = dnn_clf.predict(X_test1)
accuracy_score(y_test1, y_pred)
0.9725627553998832
Yep! Working fine. Now we can use Scikit-Learn's RandomizedSearchCV
class to search for better hyperparameters (this may take over an hour, depending on your system):
from sklearn.model_selection import RandomizedSearchCV
def leaky_relu(alpha=0.01):
    """Return a leaky ReLU activation function with its slope fixed to `alpha`.

    The returned function takes a tensor `z` and an optional op `name`, and
    computes the element-wise maximum of `alpha * z` and `z`.
    """
    def parametrized_leaky_relu(z, name=None):
        scaled = alpha * z
        return tf.maximum(scaled, z, name=name)

    return parametrized_leaky_relu
# Candidate values for each hyperparameter; RandomizedSearchCV samples
# combinations from these lists.
param_distribs = {
    "n_neurons": [10, 30, 50, 70, 90, 100, 120, 140, 160],
    "batch_size": [10, 50, 100, 500],
    "learning_rate": [0.01, 0.02, 0.05, 0.1],
    "activation": [tf.nn.relu, tf.nn.elu, leaky_relu(alpha=0.01), leaky_relu(alpha=0.1)],
    # you could also try exploring different numbers of hidden layers, different optimizers, etc.
    #"n_hidden_layers": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    #"optimizer_class": [tf.train.AdamOptimizer, partial(tf.train.MomentumOptimizer, momentum=0.95)],
}

# 50 sampled combinations x 3 cross-validation folds.
rnd_search = RandomizedSearchCV(DNNClassifier(random_state=42), param_distribs, n_iter=50,
                                cv=3, random_state=42, verbose=2)
# The extra keyword arguments are forwarded to DNNClassifier.fit().
rnd_search.fit(X_train1, y_train1, X_valid=X_valid1, y_valid=y_valid1, n_epochs=1000)

# If you have Scikit-Learn 0.18 or earlier, you should upgrade, or use the fit_params argument:
# fit_params = dict(X_valid=X_valid1, y_valid=y_valid1, n_epochs=1000)
# rnd_search = RandomizedSearchCV(DNNClassifier(random_state=42), param_distribs, n_iter=50,
#                                 fit_params=fit_params, random_state=42, verbose=2)
# rnd_search.fit(X_train1, y_train1)
Fitting 3 folds for each of 50 candidates, totalling 150 fits [CV] n_neurons=10, learning_rate=0.05, batch_size=100, activation=<function elu at 0x1243639d8>
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
0 Validation loss: 0.143224 Best loss: 0.143224 Accuracy: 95.82% 1 Validation loss: 0.143304 Best loss: 0.143224 Accuracy: 96.60% 2 Validation loss: 0.106488 Best loss: 0.106488 Accuracy: 96.95% 3 Validation loss: 0.307107 Best loss: 0.106488 Accuracy: 92.34% 4 Validation loss: 0.157948 Best loss: 0.106488 Accuracy: 95.50% 5 Validation loss: 0.131002 Best loss: 0.106488 Accuracy: 96.40% 6 Validation loss: 0.931847 Best loss: 0.106488 Accuracy: 58.29% 7 Validation loss: 0.872748 Best loss: 0.106488 Accuracy: 57.97% 8 Validation loss: 0.699336 Best loss: 0.106488 Accuracy: 58.29% 9 Validation loss: 0.853343 Best loss: 0.106488 Accuracy: 57.27% 10 Validation loss: 0.738493 Best loss: 0.106488 Accuracy: 59.19% 11 Validation loss: 0.670431 Best loss: 0.106488 Accuracy: 59.23% 12 Validation loss: 0.717334 Best loss: 0.106488 Accuracy: 59.11% 13 Validation loss: 0.718714 Best loss: 0.106488 Accuracy: 56.57% 14 Validation loss: 0.679313 Best loss: 0.106488 Accuracy: 59.07% 15 Validation loss: 0.732966 Best loss: 0.106488 Accuracy: 58.41% 16 Validation loss: 0.666333 Best loss: 0.106488 Accuracy: 60.48% 17 Validation loss: 0.677045 Best loss: 0.106488 Accuracy: 61.18% 18 Validation loss: 0.666103 Best loss: 0.106488 Accuracy: 59.97% 19 Validation loss: 0.710005 Best loss: 0.106488 Accuracy: 63.21% 20 Validation loss: 1.037921 Best loss: 0.106488 Accuracy: 64.03% 21 Validation loss: 1.626959 Best loss: 0.106488 Accuracy: 19.27% 22 Validation loss: 1.615710 Best loss: 0.106488 Accuracy: 18.73% 23 Validation loss: 1.609028 Best loss: 0.106488 Accuracy: 20.91% Early stopping! [CV] n_neurons=10, learning_rate=0.05, batch_size=100, activation=<function elu at 0x1243639d8>, total= 4.7s [CV] n_neurons=10, learning_rate=0.05, batch_size=100, activation=<function elu at 0x1243639d8>
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 4.8s remaining: 0.0s
0 Validation loss: 0.137274 Best loss: 0.137274 Accuracy: 96.33% 1 Validation loss: 0.145733 Best loss: 0.137274 Accuracy: 95.97% 2 Validation loss: 0.171077 Best loss: 0.137274 Accuracy: 95.90% 3 Validation loss: 0.139310 Best loss: 0.137274 Accuracy: 96.79% <<5140 more lines>> 51 Validation loss: 0.400818 Best loss: 0.265362 Accuracy: 97.11% 52 Validation loss: 0.509595 Best loss: 0.265362 Accuracy: 96.99% Early stopping! [CV] n_neurons=90, learning_rate=0.1, batch_size=500, activation=<function leaky_relu.<locals>.parametrized_leaky_relu at 0x13eb49f28>, total= 11.3s
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 46.9min finished
0 Validation loss: 0.069587 Best loss: 0.069587 Accuracy: 98.12% 1 Validation loss: 0.045462 Best loss: 0.045462 Accuracy: 98.48% 2 Validation loss: 0.046439 Best loss: 0.045462 Accuracy: 98.40% 3 Validation loss: 0.037278 Best loss: 0.037278 Accuracy: 98.59% 4 Validation loss: 0.039989 Best loss: 0.037278 Accuracy: 98.51% 5 Validation loss: 0.039621 Best loss: 0.037278 Accuracy: 98.79% 6 Validation loss: 0.035959 Best loss: 0.035959 Accuracy: 99.06% 7 Validation loss: 0.033321 Best loss: 0.033321 Accuracy: 99.06% 8 Validation loss: 0.044559 Best loss: 0.033321 Accuracy: 98.87% 9 Validation loss: 0.035999 Best loss: 0.033321 Accuracy: 99.10% 10 Validation loss: 0.042629 Best loss: 0.033321 Accuracy: 98.98% 11 Validation loss: 0.059839 Best loss: 0.033321 Accuracy: 98.71% 12 Validation loss: 0.044683 Best loss: 0.033321 Accuracy: 98.87% 13 Validation loss: 0.051294 Best loss: 0.033321 Accuracy: 98.75% 14 Validation loss: 0.050140 Best loss: 0.033321 Accuracy: 98.98% 15 Validation loss: 0.051109 Best loss: 0.033321 Accuracy: 98.79% 16 Validation loss: 0.072444 Best loss: 0.033321 Accuracy: 97.97% 17 Validation loss: 0.063308 Best loss: 0.033321 Accuracy: 98.71% 18 Validation loss: 0.051853 Best loss: 0.033321 Accuracy: 98.87% 19 Validation loss: 0.058982 Best loss: 0.033321 Accuracy: 98.91% 20 Validation loss: 0.046894 Best loss: 0.033321 Accuracy: 99.06% 21 Validation loss: 0.039036 Best loss: 0.033321 Accuracy: 99.02% 22 Validation loss: 0.057221 Best loss: 0.033321 Accuracy: 98.32% 23 Validation loss: 0.054618 Best loss: 0.033321 Accuracy: 98.75% 24 Validation loss: 0.039252 Best loss: 0.033321 Accuracy: 99.14% 25 Validation loss: 0.111809 Best loss: 0.033321 Accuracy: 98.05% 26 Validation loss: 0.060662 Best loss: 0.033321 Accuracy: 98.98% 27 Validation loss: 0.073774 Best loss: 0.033321 Accuracy: 99.02% 28 Validation loss: 0.048667 Best loss: 0.033321 Accuracy: 99.18% Early stopping!
RandomizedSearchCV(cv='warn', error_score='raise-deprecating', estimator=DNNClassifier(activation=<function elu at 0x1243639d8>, batch_norm_momentum=None, batch_size=20, dropout_rate=None, initializer=<tensorflow.python.ops.init_ops.VarianceScaling object at 0x117bf5828>, learning_rate=0.01, n_hidden_layers=5, n_neurons=100, optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>, random_state=42), fit_params=None, iid='warn', n_iter=50, n_jobs=None, param_distributions={'n_neurons': [10, 30, 50, 70, 90, 100, 120, 140, 160], 'batch_size': [10, 50, 100, 500], 'learning_rate': [0.01, 0.02, 0.05, 0.1], 'activation': [<function relu at 0x124366d08>, <function elu at 0x1243639d8>, <function leaky_relu.<locals>.parametrized_leaky_relu at 0x133807c80>, <function leaky_relu.<locals>.parametrized_leaky_relu at 0x13eb49f28>]}, pre_dispatch='2*n_jobs', random_state=42, refit=True, return_train_score='warn', scoring=None, verbose=2)
# Hyperparameter combination selected by the search.
rnd_search.best_params_
{'n_neurons': 90, 'learning_rate': 0.01, 'batch_size': 500, 'activation': <function __main__.leaky_relu.<locals>.parametrized_leaky_relu(z, name=None)>}
# Test-set accuracy of the model selected by the search.
y_pred = rnd_search.predict(X_test1)
accuracy_score(y_test1, y_pred)
0.9891029383148473
Wonderful! Tuning the hyperparameters got us up to 98.91% accuracy! It may not sound like a great improvement to go from 97.26% to 98.91% accuracy, but consider the error rate: it went from roughly 2.7% to 1.1%. That's almost a 60% reduction in the number of errors this model will produce!
It's a good idea to save this model:
# Save the best estimator's variables through DNNClassifier.save().
rnd_search.best_estimator_.save("./my_best_mnist_model_0_to_4")
Exercise: Now try adding Batch Normalization and compare the learning curves: is it converging faster than before? Does it produce a better model?
Let's train the best model found, once again, to see how fast it converges (alternatively, you could tweak the code above to make it write summaries for TensorBoard, so you can visualize the learning curve):
# Train a single DNNClassifier with fixed hyperparameters. The validation set is handed to
# fit() (presumably for early stopping: the logged run ends with "Early stopping!").
# NOTE(review): rnd_search.best_params_ above reports n_neurons=90, but this call uses
# n_neurons=140, while the batch-norm and dropout variants below use 90 -- confirm which
# width is intended before comparing these models.
dnn_clf = DNNClassifier(activation=leaky_relu(alpha=0.1), batch_size=500, learning_rate=0.01,
n_neurons=140, random_state=42)
dnn_clf.fit(X_train1, y_train1, n_epochs=1000, X_valid=X_valid1, y_valid=y_valid1)
0 Validation loss: 0.083541 Best loss: 0.083541 Accuracy: 97.54% 1 Validation loss: 0.052198 Best loss: 0.052198 Accuracy: 98.40% 2 Validation loss: 0.044553 Best loss: 0.044553 Accuracy: 98.71% 3 Validation loss: 0.051113 Best loss: 0.044553 Accuracy: 98.48% 4 Validation loss: 0.046304 Best loss: 0.044553 Accuracy: 98.75% 5 Validation loss: 0.037796 Best loss: 0.037796 Accuracy: 98.91% 6 Validation loss: 0.048525 Best loss: 0.037796 Accuracy: 98.67% 7 Validation loss: 0.039877 Best loss: 0.037796 Accuracy: 98.75% 8 Validation loss: 0.038729 Best loss: 0.037796 Accuracy: 98.98% 9 Validation loss: 0.064167 Best loss: 0.037796 Accuracy: 98.24% 10 Validation loss: 0.057274 Best loss: 0.037796 Accuracy: 98.79% 11 Validation loss: 0.064388 Best loss: 0.037796 Accuracy: 98.55% 12 Validation loss: 0.056382 Best loss: 0.037796 Accuracy: 98.63% 13 Validation loss: 0.049408 Best loss: 0.037796 Accuracy: 98.91% 14 Validation loss: 0.038494 Best loss: 0.037796 Accuracy: 99.10% 15 Validation loss: 0.064619 Best loss: 0.037796 Accuracy: 98.67% 16 Validation loss: 0.055027 Best loss: 0.037796 Accuracy: 98.91% 17 Validation loss: 0.054773 Best loss: 0.037796 Accuracy: 98.91% 18 Validation loss: 0.076131 Best loss: 0.037796 Accuracy: 98.71% 19 Validation loss: 0.063031 Best loss: 0.037796 Accuracy: 98.59% 20 Validation loss: 0.120501 Best loss: 0.037796 Accuracy: 98.55% 21 Validation loss: 3.922006 Best loss: 0.037796 Accuracy: 94.14% 22 Validation loss: 0.395737 Best loss: 0.037796 Accuracy: 96.83% 23 Validation loss: 0.237014 Best loss: 0.037796 Accuracy: 96.56% 24 Validation loss: 0.159249 Best loss: 0.037796 Accuracy: 97.07% 25 Validation loss: 0.228444 Best loss: 0.037796 Accuracy: 95.74% 26 Validation loss: 0.134490 Best loss: 0.037796 Accuracy: 96.99% Early stopping!
DNNClassifier(activation=<function leaky_relu.<locals>.parametrized_leaky_relu at 0x13e25ea60>, batch_norm_momentum=None, batch_size=500, dropout_rate=None, initializer=<tensorflow.python.ops.init_ops.VarianceScaling object at 0x117bf5828>, learning_rate=0.01, n_hidden_layers=5, n_neurons=140, optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>, random_state=42)
The best loss is reached at epoch 5.
Let's check that we do indeed get 98.9% accuracy on the test set:
y_pred = dnn_clf.predict(X_test1)
accuracy_score(y_test1, y_pred)
0.9898812998637867
Good, now let's use the exact same model, but this time with batch normalization:
# Batch-normalized variant: same activation, batch size, learning rate and seed as dnn_clf,
# plus batch_norm_momentum=0.95.
# NOTE(review): dnn_clf above was built with n_neurons=140, not 90, so the two models differ
# in width as well as in batch normalization -- confirm the comparison is the intended one.
dnn_clf_bn = DNNClassifier(activation=leaky_relu(alpha=0.1), batch_size=500, learning_rate=0.01,
n_neurons=90, random_state=42,
batch_norm_momentum=0.95)
dnn_clf_bn.fit(X_train1, y_train1, n_epochs=1000, X_valid=X_valid1, y_valid=y_valid1)
0 Validation loss: 0.046685 Best loss: 0.046685 Accuracy: 98.63% 1 Validation loss: 0.040820 Best loss: 0.040820 Accuracy: 98.79% 2 Validation loss: 0.046557 Best loss: 0.040820 Accuracy: 98.67% 3 Validation loss: 0.032236 Best loss: 0.032236 Accuracy: 98.94% 4 Validation loss: 0.056148 Best loss: 0.032236 Accuracy: 98.44% 5 Validation loss: 0.035988 Best loss: 0.032236 Accuracy: 98.98% 6 Validation loss: 0.037958 Best loss: 0.032236 Accuracy: 98.94% 7 Validation loss: 0.034588 Best loss: 0.032236 Accuracy: 99.02% 8 Validation loss: 0.031261 Best loss: 0.031261 Accuracy: 99.34% 9 Validation loss: 0.050791 Best loss: 0.031261 Accuracy: 98.79% 10 Validation loss: 0.035324 Best loss: 0.031261 Accuracy: 99.02% 11 Validation loss: 0.039875 Best loss: 0.031261 Accuracy: 98.98% 12 Validation loss: 0.048575 Best loss: 0.031261 Accuracy: 98.94% 13 Validation loss: 0.028059 Best loss: 0.028059 Accuracy: 99.18% 14 Validation loss: 0.044112 Best loss: 0.028059 Accuracy: 99.14% 15 Validation loss: 0.039050 Best loss: 0.028059 Accuracy: 99.22% 16 Validation loss: 0.033278 Best loss: 0.028059 Accuracy: 99.14% 17 Validation loss: 0.031734 Best loss: 0.028059 Accuracy: 99.18% 18 Validation loss: 0.034500 Best loss: 0.028059 Accuracy: 99.14% 19 Validation loss: 0.032757 Best loss: 0.028059 Accuracy: 99.26% 20 Validation loss: 0.023842 Best loss: 0.023842 Accuracy: 99.53% 21 Validation loss: 0.026727 Best loss: 0.023842 Accuracy: 99.41% 22 Validation loss: 0.027016 Best loss: 0.023842 Accuracy: 99.41% 23 Validation loss: 0.033038 Best loss: 0.023842 Accuracy: 99.34% 24 Validation loss: 0.035490 Best loss: 0.023842 Accuracy: 99.18% 25 Validation loss: 0.060346 Best loss: 0.023842 Accuracy: 98.75% 26 Validation loss: 0.051341 Best loss: 0.023842 Accuracy: 99.26% 27 Validation loss: 0.033108 Best loss: 0.023842 Accuracy: 99.26% 28 Validation loss: 0.042162 Best loss: 0.023842 Accuracy: 99.18% 29 Validation loss: 0.036313 Best loss: 0.023842 Accuracy: 99.26% 30 Validation loss: 0.033812 
Best loss: 0.023842 Accuracy: 99.26% 31 Validation loss: 0.038173 Best loss: 0.023842 Accuracy: 99.26% 32 Validation loss: 0.029853 Best loss: 0.023842 Accuracy: 99.37% 33 Validation loss: 0.026557 Best loss: 0.023842 Accuracy: 99.37% 34 Validation loss: 0.035003 Best loss: 0.023842 Accuracy: 99.37% 35 Validation loss: 0.027140 Best loss: 0.023842 Accuracy: 99.34% 36 Validation loss: 0.038988 Best loss: 0.023842 Accuracy: 99.34% 37 Validation loss: 0.048149 Best loss: 0.023842 Accuracy: 98.98% 38 Validation loss: 0.049070 Best loss: 0.023842 Accuracy: 99.02% 39 Validation loss: 0.041233 Best loss: 0.023842 Accuracy: 99.26% 40 Validation loss: 0.038571 Best loss: 0.023842 Accuracy: 99.26% 41 Validation loss: 0.036886 Best loss: 0.023842 Accuracy: 99.34% Early stopping!
DNNClassifier(activation=<function leaky_relu.<locals>.parametrized_leaky_relu at 0x14030a378>, batch_norm_momentum=0.95, batch_size=500, dropout_rate=None, initializer=<tensorflow.python.ops.init_ops.VarianceScaling object at 0x117bf5828>, learning_rate=0.01, n_hidden_layers=5, n_neurons=90, optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>, random_state=42)
The best params are reached during epoch 20; that's actually slower convergence than earlier. Let's check the accuracy:
y_pred = dnn_clf_bn.predict(X_test1)
accuracy_score(y_test1, y_pred)
0.9941622883829538
Great, batch normalization improved accuracy! Let's see if we can find a good set of hyperparameters that will work even better with batch normalization:
from sklearn.model_selection import RandomizedSearchCV
# Candidate values sampled by the randomized search. Compared with the earlier search,
# the only added key is "batch_norm_momentum".
param_distribs = {
"n_neurons": [10, 30, 50, 70, 90, 100, 120, 140, 160],
"batch_size": [10, 50, 100, 500],
"learning_rate": [0.01, 0.02, 0.05, 0.1],
"activation": [tf.nn.relu, tf.nn.elu, leaky_relu(alpha=0.01), leaky_relu(alpha=0.1)],
# you could also try exploring different numbers of hidden layers, different optimizers, etc.
# (the optimizer_class example uses `partial`, presumably functools.partial -- import it first)
#"n_hidden_layers": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
#"optimizer_class": [tf.train.AdamOptimizer, partial(tf.train.MomentumOptimizer, momentum=0.95)],
"batch_norm_momentum": [0.9, 0.95, 0.98, 0.99, 0.999],
}
# n_iter=50 sampled candidates x cv=3 folds = 150 fits; random_state fixes the sampling.
rnd_search_bn = RandomizedSearchCV(DNNClassifier(random_state=42), param_distribs, n_iter=50, cv=3,
random_state=42, verbose=2)
# The extra keyword arguments (validation set, n_epochs) are passed through to the
# estimator's fit() for each candidate.
rnd_search_bn.fit(X_train1, y_train1, X_valid=X_valid1, y_valid=y_valid1, n_epochs=1000)
# If you have Scikit-Learn 0.18 or earlier, you should upgrade, or use the fit_params argument:
# fit_params = dict(X_valid=X_valid1, y_valid=y_valid1, n_epochs=1000)
# rnd_search_bn = RandomizedSearchCV(DNNClassifier(random_state=42), param_distribs, n_iter=50,
# fit_params=fit_params, random_state=42, verbose=2)
# rnd_search_bn.fit(X_train1, y_train1)
Fitting 3 folds for each of 50 candidates, totalling 150 fits [CV] n_neurons=70, learning_rate=0.01, batch_size=50, batch_norm_momentum=0.99, activation=<function relu at 0x124366d08>
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
0 Validation loss: 0.098522 Best loss: 0.098522 Accuracy: 97.81% 1 Validation loss: 0.080233 Best loss: 0.080233 Accuracy: 98.08% 2 Validation loss: 0.068767 Best loss: 0.068767 Accuracy: 98.01% 3 Validation loss: 0.057095 Best loss: 0.057095 Accuracy: 98.28% 4 Validation loss: 0.067008 Best loss: 0.057095 Accuracy: 98.12% 5 Validation loss: 0.058910 Best loss: 0.057095 Accuracy: 98.55% 6 Validation loss: 0.038421 Best loss: 0.038421 Accuracy: 98.91% 7 Validation loss: 0.071075 Best loss: 0.038421 Accuracy: 98.36% 8 Validation loss: 0.063073 Best loss: 0.038421 Accuracy: 98.28% 9 Validation loss: 0.057488 Best loss: 0.038421 Accuracy: 98.75% 10 Validation loss: 0.049557 Best loss: 0.038421 Accuracy: 98.75% 11 Validation loss: 0.039810 Best loss: 0.038421 Accuracy: 99.06% 12 Validation loss: 0.061837 Best loss: 0.038421 Accuracy: 98.55% 13 Validation loss: 0.062008 Best loss: 0.038421 Accuracy: 98.51% 14 Validation loss: 0.075937 Best loss: 0.038421 Accuracy: 98.44% 15 Validation loss: 0.053910 Best loss: 0.038421 Accuracy: 98.71% 16 Validation loss: 0.051419 Best loss: 0.038421 Accuracy: 98.94% 17 Validation loss: 0.049013 Best loss: 0.038421 Accuracy: 98.98% 18 Validation loss: 0.048979 Best loss: 0.038421 Accuracy: 99.10% 19 Validation loss: 0.058969 Best loss: 0.038421 Accuracy: 98.59% 20 Validation loss: 0.060048 Best loss: 0.038421 Accuracy: 98.79% 21 Validation loss: 0.088256 Best loss: 0.038421 Accuracy: 98.32% 22 Validation loss: 0.055535 Best loss: 0.038421 Accuracy: 98.59% 23 Validation loss: 0.054632 Best loss: 0.038421 Accuracy: 98.94% 24 Validation loss: 0.092021 Best loss: 0.038421 Accuracy: 98.20% 25 Validation loss: 0.042263 Best loss: 0.038421 Accuracy: 99.02% 26 Validation loss: 0.041139 Best loss: 0.038421 Accuracy: 99.30% 27 Validation loss: 0.054255 Best loss: 0.038421 Accuracy: 99.06% Early stopping! 
[CV] n_neurons=70, learning_rate=0.01, batch_size=50, batch_norm_momentum=0.99, activation=<function relu at 0x124366d08>, total= 39.0s [CV] n_neurons=70, learning_rate=0.01, batch_size=50, batch_norm_momentum=0.99, activation=<function relu at 0x124366d08>
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 39.1s remaining: 0.0s
<<7081 more lines>> 36 Validation loss: 0.032222 Best loss: 0.021774 Accuracy: 99.14% 37 Validation loss: 0.025638 Best loss: 0.021774 Accuracy: 99.34% 38 Validation loss: 0.031702 Best loss: 0.021774 Accuracy: 99.02% 39 Validation loss: 0.027012 Best loss: 0.021774 Accuracy: 99.30% 40 Validation loss: 0.027163 Best loss: 0.021774 Accuracy: 99.30% 41 Validation loss: 0.029205 Best loss: 0.021774 Accuracy: 99.26% 42 Validation loss: 0.024973 Best loss: 0.021774 Accuracy: 99.41% 43 Validation loss: 0.036898 Best loss: 0.021774 Accuracy: 98.94% 44 Validation loss: 0.040366 Best loss: 0.021774 Accuracy: 99.14% 45 Validation loss: 0.033711 Best loss: 0.021774 Accuracy: 99.02% 46 Validation loss: 0.046615 Best loss: 0.021774 Accuracy: 98.79% 47 Validation loss: 0.032732 Best loss: 0.021774 Accuracy: 99.26% 48 Validation loss: 0.020177 Best loss: 0.020177 Accuracy: 99.45% 49 Validation loss: 0.031700 Best loss: 0.020177 Accuracy: 99.37% 50 Validation loss: 0.035962 Best loss: 0.020177 Accuracy: 99.14% 51 Validation loss: 0.031128 Best loss: 0.020177 Accuracy: 99.18% 52 Validation loss: 0.038107 Best loss: 0.020177 Accuracy: 99.14% 53 Validation loss: 0.036671 Best loss: 0.020177 Accuracy: 99.18% 54 Validation loss: 0.029867 Best loss: 0.020177 Accuracy: 99.30% 55 Validation loss: 0.039179 Best loss: 0.020177 Accuracy: 99.10% 56 Validation loss: 0.028410 Best loss: 0.020177 Accuracy: 99.10% 57 Validation loss: 0.037625 Best loss: 0.020177 Accuracy: 99.06% 58 Validation loss: 0.035516 Best loss: 0.020177 Accuracy: 99.22% 59 Validation loss: 0.030096 Best loss: 0.020177 Accuracy: 99.37% 60 Validation loss: 0.032056 Best loss: 0.020177 Accuracy: 99.22% 61 Validation loss: 0.026143 Best loss: 0.020177 Accuracy: 99.37% 62 Validation loss: 0.022387 Best loss: 0.020177 Accuracy: 99.45% 63 Validation loss: 0.026331 Best loss: 0.020177 Accuracy: 99.41% 64 Validation loss: 0.034930 Best loss: 0.020177 Accuracy: 99.10% 65 Validation loss: 0.029928 Best loss: 0.020177 Accuracy: 99.30% 
66 Validation loss: 0.028943 Best loss: 0.020177 Accuracy: 99.30% 67 Validation loss: 0.034912 Best loss: 0.020177 Accuracy: 99.18% 68 Validation loss: 0.037118 Best loss: 0.020177 Accuracy: 99.18% 69 Validation loss: 0.034165 Best loss: 0.020177 Accuracy: 99.37% Early stopping!
RandomizedSearchCV(cv='warn', error_score='raise-deprecating', estimator=DNNClassifier(activation=<function elu at 0x1243639d8>, batch_norm_momentum=None, batch_size=20, dropout_rate=None, initializer=<tensorflow.python.ops.init_ops.VarianceScaling object at 0x117bf5828>, learning_rate=0.01, n_hidden_layers=5, n_neurons=100, optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>, random_state=42), fit_params={'X_valid': array([[0., 0., ..., 0., 0.], [0., 0., ..., 0., 0.], ..., [0., 0., ..., 0., 0.], [0., 0., ..., 0., 0.]], dtype=float32), 'y_valid': array([0, 4, ..., 1, 2], dtype=int32), 'n_epochs': 1000}, iid='warn', n_iter=50, n_jobs=None, param_distributions={'n_neurons': [10, 30, 50, 70, 90, 100, 120, 140, 160], 'batch_size': [10, 50, 100, 500], 'learning_rate': [0.01, 0.02, 0.05, 0.1], 'activation': [<function relu at 0x124366d08>, <function elu at 0x1243639d8>, <function leaky_relu.<locals>.parametrized_leaky_relu at 0x1500bd2f0>, <function leaky_relu.<locals>.parametrized_leaky_relu at 0x1500bd378>], 'batch_norm_momentum': [0.9, 0.95, 0.98, 0.99, 0.999]}, pre_dispatch='2*n_jobs', random_state=42, refit=True, return_train_score='warn', scoring=None, verbose=2)
rnd_search_bn.best_params_
{'n_neurons': 160, 'learning_rate': 0.01, 'batch_size': 10, 'batch_norm_momentum': 0.98, 'activation': <function tensorflow.python.ops.gen_nn_ops.relu(features, name=None)>}
y_pred = rnd_search_bn.predict(X_test1)
accuracy_score(y_test1, y_pred)
0.9949406499318934
Slightly better than earlier: 99.49% vs 99.42%. Let's see if dropout can do better.
Exercise: is the model overfitting the training set? Try adding dropout to every layer and try again. Does it help?
Let's go back to the model we trained earlier and see how it performs on the training set:
y_pred = dnn_clf.predict(X_train1)
accuracy_score(y_train1, y_pred)
0.9950781082816178
The model performs significantly better on the training set than on the test set (99.51% vs 98.99%), which means it is overfitting the training set. A bit of regularization may help. Let's try adding dropout with a 50% dropout rate:
# Dropout variant: 90-neuron leaky-ReLU network with dropout_rate=0.5 and no
# batch_norm_momentum argument. The validation set is handed to fit() as before.
dnn_clf_dropout = DNNClassifier(activation=leaky_relu(alpha=0.1), batch_size=500, learning_rate=0.01,
n_neurons=90, random_state=42,
dropout_rate=0.5)
dnn_clf_dropout.fit(X_train1, y_train1, n_epochs=1000, X_valid=X_valid1, y_valid=y_valid1)
0 Validation loss: 0.131152 Best loss: 0.131152 Accuracy: 96.91% 1 Validation loss: 0.105306 Best loss: 0.105306 Accuracy: 97.46% 2 Validation loss: 0.091219 Best loss: 0.091219 Accuracy: 97.73% 3 Validation loss: 0.089638 Best loss: 0.089638 Accuracy: 97.85% 4 Validation loss: 0.091288 Best loss: 0.089638 Accuracy: 97.69% 5 Validation loss: 0.081112 Best loss: 0.081112 Accuracy: 98.05% 6 Validation loss: 0.075575 Best loss: 0.075575 Accuracy: 98.24% 7 Validation loss: 0.084841 Best loss: 0.075575 Accuracy: 97.77% 8 Validation loss: 0.075269 Best loss: 0.075269 Accuracy: 97.65% 9 Validation loss: 0.076625 Best loss: 0.075269 Accuracy: 98.12% 10 Validation loss: 0.072509 Best loss: 0.072509 Accuracy: 97.97% 11 Validation loss: 0.071006 Best loss: 0.071006 Accuracy: 98.44% 12 Validation loss: 0.073272 Best loss: 0.071006 Accuracy: 98.08% 13 Validation loss: 0.076293 Best loss: 0.071006 Accuracy: 98.16% 14 Validation loss: 0.074955 Best loss: 0.071006 Accuracy: 98.05% 15 Validation loss: 0.066207 Best loss: 0.066207 Accuracy: 98.20% 16 Validation loss: 0.067388 Best loss: 0.066207 Accuracy: 98.08% 17 Validation loss: 0.061916 Best loss: 0.061916 Accuracy: 98.40% 18 Validation loss: 0.064908 Best loss: 0.061916 Accuracy: 98.40% 19 Validation loss: 0.064921 Best loss: 0.061916 Accuracy: 98.40% 20 Validation loss: 0.069939 Best loss: 0.061916 Accuracy: 98.40% 21 Validation loss: 0.069870 Best loss: 0.061916 Accuracy: 98.32% 22 Validation loss: 0.062807 Best loss: 0.061916 Accuracy: 98.24% 23 Validation loss: 0.065312 Best loss: 0.061916 Accuracy: 98.44% 24 Validation loss: 0.067044 Best loss: 0.061916 Accuracy: 98.44% 25 Validation loss: 0.072251 Best loss: 0.061916 Accuracy: 98.16% 26 Validation loss: 0.064444 Best loss: 0.061916 Accuracy: 98.20% 27 Validation loss: 0.069022 Best loss: 0.061916 Accuracy: 98.44% 28 Validation loss: 0.069079 Best loss: 0.061916 Accuracy: 98.28% 29 Validation loss: 0.148266 Best loss: 0.061916 Accuracy: 96.52% 30 Validation loss: 0.119943 
Best loss: 0.061916 Accuracy: 96.72% 31 Validation loss: 0.167303 Best loss: 0.061916 Accuracy: 96.68% 32 Validation loss: 0.131897 Best loss: 0.061916 Accuracy: 96.52% 33 Validation loss: 0.146681 Best loss: 0.061916 Accuracy: 95.43% 34 Validation loss: 0.125731 Best loss: 0.061916 Accuracy: 96.64% 35 Validation loss: 0.099879 Best loss: 0.061916 Accuracy: 97.89% 36 Validation loss: 0.096915 Best loss: 0.061916 Accuracy: 97.73% 37 Validation loss: 0.096422 Best loss: 0.061916 Accuracy: 97.85% 38 Validation loss: 0.108040 Best loss: 0.061916 Accuracy: 97.54% Early stopping!
DNNClassifier(activation=<function leaky_relu.<locals>.parametrized_leaky_relu at 0x14e8501e0>, batch_norm_momentum=None, batch_size=500, dropout_rate=0.5, initializer=<tensorflow.python.ops.init_ops.VarianceScaling object at 0x117bf5828>, learning_rate=0.01, n_hidden_layers=5, n_neurons=90, optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>, random_state=42)
The best params are reached during epoch 17. Dropout somewhat slowed down convergence.
Let's check the accuracy:
y_pred = dnn_clf_dropout.predict(X_test1)
accuracy_score(y_test1, y_pred)
0.9861840825063242
We are out of luck, dropout does not seem to help. Let's try tuning the hyperparameters, perhaps we can squeeze a bit more performance out of this model:
from sklearn.model_selection import RandomizedSearchCV
# Candidate values sampled by the randomized search; "dropout_rate" is the
# regularization knob being explored here.
param_distribs = {
"n_neurons": [10, 30, 50, 70, 90, 100, 120, 140, 160],
"batch_size": [10, 50, 100, 500],
"learning_rate": [0.01, 0.02, 0.05, 0.1],
"activation": [tf.nn.relu, tf.nn.elu, leaky_relu(alpha=0.01), leaky_relu(alpha=0.1)],
# you could also try exploring different numbers of hidden layers, different optimizers, etc.
# (the optimizer_class example uses `partial`, presumably functools.partial -- import it first)
#"n_hidden_layers": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
#"optimizer_class": [tf.train.AdamOptimizer, partial(tf.train.MomentumOptimizer, momentum=0.95)],
"dropout_rate": [0.2, 0.3, 0.4, 0.5, 0.6],
}
# n_iter=50 sampled candidates x cv=3 folds = 150 fits; random_state fixes the sampling.
rnd_search_dropout = RandomizedSearchCV(DNNClassifier(random_state=42), param_distribs, n_iter=50,
cv=3, random_state=42, verbose=2)
# The extra keyword arguments (validation set, n_epochs) are passed through to the
# estimator's fit() for each candidate.
rnd_search_dropout.fit(X_train1, y_train1, X_valid=X_valid1, y_valid=y_valid1, n_epochs=1000)
# If you have Scikit-Learn 0.18 or earlier, you should upgrade, or use the fit_params argument:
# fit_params = dict(X_valid=X_valid1, y_valid=y_valid1, n_epochs=1000)
# rnd_search_dropout = RandomizedSearchCV(DNNClassifier(random_state=42), param_distribs, n_iter=50,
# fit_params=fit_params, random_state=42, verbose=2)
# rnd_search_dropout.fit(X_train1, y_train1)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Fitting 3 folds for each of 50 candidates, totalling 150 fits [CV] n_neurons=70, learning_rate=0.01, dropout_rate=0.5, batch_size=100, activation=<function relu at 0x124366d08> 0 Validation loss: 0.218595 Best loss: 0.218595 Accuracy: 93.63% 1 Validation loss: 0.210470 Best loss: 0.210470 Accuracy: 94.61% 2 Validation loss: 0.224635 Best loss: 0.210470 Accuracy: 95.50% 3 Validation loss: 0.200494 Best loss: 0.200494 Accuracy: 94.84% 4 Validation loss: 0.184056 Best loss: 0.184056 Accuracy: 95.58% 5 Validation loss: 0.187698 Best loss: 0.184056 Accuracy: 96.33% 6 Validation loss: 0.151692 Best loss: 0.151692 Accuracy: 96.17% 7 Validation loss: 0.176633 Best loss: 0.151692 Accuracy: 96.21% 8 Validation loss: 0.187090 Best loss: 0.151692 Accuracy: 96.01% 9 Validation loss: 0.204406 Best loss: 0.151692 Accuracy: 96.40% 10 Validation loss: 0.193938 Best loss: 0.151692 Accuracy: 95.74% 11 Validation loss: 0.190056 Best loss: 0.151692 Accuracy: 96.21% 12 Validation loss: 0.183601 Best loss: 0.151692 Accuracy: 96.05% 13 Validation loss: 0.179737 Best loss: 0.151692 Accuracy: 96.25% 14 Validation loss: 0.289718 Best loss: 0.151692 Accuracy: 96.29% 15 Validation loss: 0.188605 Best loss: 0.151692 Accuracy: 95.86% 16 Validation loss: 0.195911 Best loss: 0.151692 Accuracy: 96.01% 17 Validation loss: 0.158151 Best loss: 0.151692 Accuracy: 96.25% 18 Validation loss: 0.168049 Best loss: 0.151692 Accuracy: 96.25% 19 Validation loss: 0.170637 Best loss: 0.151692 Accuracy: 96.40% 20 Validation loss: 0.192890 Best loss: 0.151692 Accuracy: 96.21% 21 Validation loss: 0.178800 Best loss: 0.151692 Accuracy: 95.97% 22 Validation loss: 0.185295 Best loss: 0.151692 Accuracy: 96.44% 23 Validation loss: 0.150369 Best loss: 0.150369 Accuracy: 96.91% 24 Validation loss: 0.161164 Best loss: 0.150369 Accuracy: 96.52% 25 Validation loss: 0.180860 Best loss: 0.150369 Accuracy: 96.13% 26 Validation loss: 0.182730 Best loss: 0.150369 Accuracy: 96.52% 27 Validation loss: 0.184583 Best loss: 0.150369 
Accuracy: 96.09% 28 Validation loss: 0.183952 Best loss: 0.150369 Accuracy: 95.39% 29 Validation loss: 0.211111 Best loss: 0.150369 Accuracy: 95.54% 30 Validation loss: 0.225760 Best loss: 0.150369 Accuracy: 95.97% 31 Validation loss: 0.170313 Best loss: 0.150369 Accuracy: 96.91% <<5625 more lines>> 8 Validation loss: 0.086624 Best loss: 0.065724 Accuracy: 98.01% 9 Validation loss: 0.069571 Best loss: 0.065724 Accuracy: 98.44% 10 Validation loss: 0.094720 Best loss: 0.065724 Accuracy: 98.20% 11 Validation loss: 0.070504 Best loss: 0.065724 Accuracy: 98.51% 12 Validation loss: 0.090169 Best loss: 0.065724 Accuracy: 98.24% 13 Validation loss: 0.080667 Best loss: 0.065724 Accuracy: 98.20% 14 Validation loss: 0.120917 Best loss: 0.065724 Accuracy: 96.60% 15 Validation loss: 0.105030 Best loss: 0.065724 Accuracy: 97.62% 16 Validation loss: 0.138571 Best loss: 0.065724 Accuracy: 97.85% 17 Validation loss: 0.078942 Best loss: 0.065724 Accuracy: 97.97% 18 Validation loss: 0.081645 Best loss: 0.065724 Accuracy: 97.89% 19 Validation loss: 0.054128 Best loss: 0.054128 Accuracy: 98.44% 20 Validation loss: 0.051510 Best loss: 0.051510 Accuracy: 98.44% 21 Validation loss: 0.071159 Best loss: 0.051510 Accuracy: 98.67% 22 Validation loss: 0.084647 Best loss: 0.051510 Accuracy: 98.28% 23 Validation loss: 0.081601 Best loss: 0.051510 Accuracy: 98.36% 24 Validation loss: 0.152964 Best loss: 0.051510 Accuracy: 97.93% 25 Validation loss: 0.173249 Best loss: 0.051510 Accuracy: 97.03% 26 Validation loss: 0.128901 Best loss: 0.051510 Accuracy: 96.13% 27 Validation loss: 0.110458 Best loss: 0.051510 Accuracy: 97.93% 28 Validation loss: 0.108197 Best loss: 0.051510 Accuracy: 97.30% 29 Validation loss: 0.104204 Best loss: 0.051510 Accuracy: 97.85% 30 Validation loss: 0.126637 Best loss: 0.051510 Accuracy: 98.32% 31 Validation loss: 0.142045 Best loss: 0.051510 Accuracy: 97.62% 32 Validation loss: 0.103701 Best loss: 0.051510 Accuracy: 97.69% 33 Validation loss: 0.120295 Best loss: 0.051510 
Accuracy: 97.42% 34 Validation loss: 0.151388 Best loss: 0.051510 Accuracy: 97.85% 35 Validation loss: 0.096931 Best loss: 0.051510 Accuracy: 97.58% 36 Validation loss: 0.153569 Best loss: 0.051510 Accuracy: 97.11% 37 Validation loss: 0.120552 Best loss: 0.051510 Accuracy: 98.05% 38 Validation loss: 0.076677 Best loss: 0.051510 Accuracy: 98.55% 39 Validation loss: 0.071904 Best loss: 0.051510 Accuracy: 98.55% 40 Validation loss: 0.072618 Best loss: 0.051510 Accuracy: 98.12% 41 Validation loss: 0.086680 Best loss: 0.051510 Accuracy: 98.08% Early stopping!
RandomizedSearchCV(cv='warn', error_score='raise-deprecating', estimator=DNNClassifier(activation=<function elu at 0x1243639d8>, batch_norm_momentum=None, batch_size=20, dropout_rate=None, initializer=<tensorflow.python.ops.init_ops.VarianceScaling object at 0x117bf5828>, learning_rate=0.01, n_hidden_layers=5, n_neurons=100, optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>, random_state=42), fit_params={'X_valid': array([[0., 0., ..., 0., 0.], [0., 0., ..., 0., 0.], ..., [0., 0., ..., 0., 0.], [0., 0., ..., 0., 0.]], dtype=float32), 'y_valid': array([0, 4, ..., 1, 2], dtype=int32), 'n_epochs': 1000}, iid='warn', n_iter=50, n_jobs=None, param_distributions={'n_neurons': [10, 30, 50, 70, 90, 100, 120, 140, 160], 'batch_size': [10, 50, 100, 500], 'learning_rate': [0.01, 0.02, 0.05, 0.1], 'activation': [<function relu at 0x124366d08>, <function elu at 0x1243639d8>, <function leaky_relu.<locals>.parametrized_leaky_relu at 0x14e850620>, <function leaky_relu.<locals>.parametrized_leaky_relu at 0x14e850d08>], 'dropout_rate': [0.2, 0.3, 0.4, 0.5, 0.6]}, pre_dispatch='2*n_jobs', random_state=42, refit=True, return_train_score='warn', scoring=None, verbose=2)
rnd_search_dropout.best_params_
{'n_neurons': 160, 'learning_rate': 0.01, 'dropout_rate': 0.2, 'batch_size': 100, 'activation': <function tensorflow.python.ops.gen_nn_ops.relu(features, name=None)>}
y_pred = rnd_search_dropout.predict(X_test1)
accuracy_score(y_test1, y_pred)
0.9889083479276124
Oh well, dropout did not improve the model. Better luck next time! :)
But that's okay, we have ourselves a nice DNN that achieves 99.49% accuracy on the test set using Batch Normalization, or 98.91% without BN. Let's see if some of this expertise on digits 0 to 4 can be transferred to the task of classifying digits 5 to 9. For the sake of simplicity we will reuse the DNN without BN.
Exercise: create a new DNN that reuses all the pretrained hidden layers of the previous model, freezes them, and replaces the softmax output layer with a new one.
Let's load the best model's graph and get a handle on all the important operations we will need. Note that instead of creating a new softmax output layer, we will just reuse the existing one (since the new task has the same number of classes as the original one, the existing layer already has the right number of outputs). We will reinitialize its parameters before training.
# Start from a fresh default graph, then load the saved model's graph structure into it.
reset_graph()
# import_meta_graph returns a Saver that can later restore the checkpointed variable values.
restore_saver = tf.train.import_meta_graph("./my_best_mnist_model_0_to_4.meta")
# Look up the tensors we need by name in the imported graph.
X = tf.get_default_graph().get_tensor_by_name("X:0")
y = tf.get_default_graph().get_tensor_by_name("y:0")
loss = tf.get_default_graph().get_tensor_by_name("loss:0")
Y_proba = tf.get_default_graph().get_tensor_by_name("Y_proba:0")
# The logits are taken as the first input of the op that produces Y_proba
# (presumably the softmax op -- verify against the DNNClassifier graph definition).
logits = Y_proba.op.inputs[0]
accuracy = tf.get_default_graph().get_tensor_by_name("accuracy:0")
To freeze the lower layers, we will exclude their variables from the optimizer's list of trainable variables, keeping only the output layer's trainable variables:
learning_rate = 0.01
# Only trainable variables under the "logits" scope are handed to the optimizer, so
# minimize() builds updates for the output layer alone and the hidden layers stay frozen.
output_layer_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="logits")
# Explicit name "Adam2": presumably avoids clashing with the optimizer already present in
# the imported graph -- TODO confirm.
optimizer = tf.train.AdamOptimizer(learning_rate, name="Adam2")
training_op = optimizer.minimize(loss, var_list=output_layer_vars)
# Rebuild the accuracy op from the logits; this rebinds `accuracy`, replacing the handle
# previously fetched from the imported graph.
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")
# Created after the optimizer so that the initializer also covers the variables the new
# optimizer added to the graph.
init = tf.global_variables_initializer()
five_frozen_saver = tf.train.Saver()
Exercise: train this new DNN on digits 5 to 9, using only 100 images per digit, and time how long it takes. Despite this small number of examples, can you achieve high precision?
Let's create the training, validation and test sets. We need to subtract 5 from the labels because TensorFlow expects integers from 0 to n_classes-1.
# Keep only the instances whose label is >= 5 and shift those labels down by 5,
# so that the labels of the new task start at 0.
X_train2_full = X_train[y_train >= 5]
y_train2_full = y_train[y_train >= 5] - 5
X_valid2_full = X_valid[y_valid >= 5]
y_valid2_full = y_valid[y_valid >= 5] - 5
X_test2 = X_test[y_test >= 5]
y_test2 = y_test[y_test >= 5] - 5
Also, for the purpose of this exercise, we want to keep only 100 instances per class in the training set (and let's keep only 30 instances per class in the validation set). Let's create a small function to do that:
def sample_n_instances_per_class(X, y, n=100):
    """Return at most `n` instances of each class, grouped by class.

    For every distinct label in `y` (in sorted order, as returned by
    np.unique), the first `n` matching rows of `X` and entries of `y` are
    kept, in their original relative order. The per-class pieces are then
    concatenated, so the result is ordered class by class.

    Returns a tuple (X_sampled, y_sampled).
    """
    # One boolean mask per class, selecting the rows that carry that label.
    class_masks = [y == label for label in np.unique(y)]
    X_parts = [X[mask][:n] for mask in class_masks]
    y_parts = [y[mask][:n] for mask in class_masks]
    return np.concatenate(X_parts), np.concatenate(y_parts)
# Small task-specific sets: 100 instances per class for training, 30 per class for validation.
X_train2, y_train2 = sample_n_instances_per_class(X_train2_full, y_train2_full, n=100)
X_valid2, y_valid2 = sample_n_instances_per_class(X_valid2_full, y_valid2_full, n=30)
Now let's train the model. This is the same training code as earlier, using early stopping, except for the initialization: we first initialize all the variables, then we restore the best model trained earlier (on digits 0 to 4), and finally we reinitialize the output layer variables.
import time

# Fine-tune only the output layer on digits 5 to 9, with early stopping on the
# validation loss.
n_epochs = 1000
batch_size = 20

# Stop once the validation loss has failed to improve for more than this many
# consecutive epochs.
max_checks_without_progress = 20
checks_without_progress = 0
best_loss = np.inf  # np.inf instead of np.infty: the latter alias was removed in NumPy 2.0

with tf.Session() as sess:
    # Initialization happens in three steps:
    #  1. give every variable a value (this includes the new optimizer's variables,
    #     which are not in the checkpoint),
    #  2. overwrite the pretrained model's variables with the checkpointed values,
    #  3. reinitialize the output layer, so that training does not start from the
    #     softmax weights that were learned for digits 0 to 4.
    init.run()
    restore_saver.restore(sess, "./my_best_mnist_model_0_to_4")
    for var in output_layer_vars:
        var.initializer.run()

    t0 = time.time()

    for epoch in range(n_epochs):
        # Shuffle the training indices, then cut them into len // batch_size chunks.
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            X_batch, y_batch = X_train2[rnd_indices], y_train2[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={X: X_valid2, y: y_valid2})
        if loss_val < best_loss:
            # New best validation loss: checkpoint the model and reset the patience counter.
            save_path = five_frozen_saver.save(sess, "./my_mnist_model_5_to_9_five_frozen")
            best_loss = loss_val
            checks_without_progress = 0
        else:
            checks_without_progress += 1
            if checks_without_progress > max_checks_without_progress:
                print("Early stopping!")
                break
        print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
            epoch, loss_val, best_loss, acc_val * 100))

    t1 = time.time()
    print("Total training time: {:.1f}s".format(t1 - t0))

# Evaluate the best checkpoint (not the last epoch's weights) on the test set.
with tf.Session() as sess:
    five_frozen_saver.restore(sess, "./my_mnist_model_5_to_9_five_frozen")
    acc_test = accuracy.eval(feed_dict={X: X_test2, y: y_test2})
    print("Final test accuracy: {:.2f}%".format(acc_test * 100))
INFO:tensorflow:Restoring parameters from ./my_best_mnist_model_0_to_4 0 Validation loss: 1.361167 Best loss: 1.361167 Accuracy: 43.33% 1 Validation loss: 1.154602 Best loss: 1.154602 Accuracy: 57.33% 2 Validation loss: 1.054218 Best loss: 1.054218 Accuracy: 53.33% 3 Validation loss: 0.981128 Best loss: 0.981128 Accuracy: 62.67% 4 Validation loss: 0.995353 Best loss: 0.981128 Accuracy: 59.33% 5 Validation loss: 0.967000 Best loss: 0.967000 Accuracy: 65.33% 6 Validation loss: 0.955700 Best loss: 0.955700 Accuracy: 61.33% 7 Validation loss: 1.015331 Best loss: 0.955700 Accuracy: 58.67% 8 Validation loss: 0.978280 Best loss: 0.955700 Accuracy: 62.00% 9 Validation loss: 0.923389 Best loss: 0.923389 Accuracy: 69.33% 10 Validation loss: 0.996236 Best loss: 0.923389 Accuracy: 63.33% 11 Validation loss: 0.976757 Best loss: 0.923389 Accuracy: 62.67% 12 Validation loss: 0.969096 Best loss: 0.923389 Accuracy: 63.33% 13 Validation loss: 1.023069 Best loss: 0.923389 Accuracy: 63.33% 14 Validation loss: 1.104664 Best loss: 0.923389 Accuracy: 55.33% 15 Validation loss: 0.950175 Best loss: 0.923389 Accuracy: 65.33% 16 Validation loss: 1.002944 Best loss: 0.923389 Accuracy: 63.33% 17 Validation loss: 0.895543 Best loss: 0.895543 Accuracy: 70.67% 18 Validation loss: 0.961151 Best loss: 0.895543 Accuracy: 66.67% 19 Validation loss: 0.896372 Best loss: 0.895543 Accuracy: 67.33% 20 Validation loss: 0.911938 Best loss: 0.895543 Accuracy: 69.33% 21 Validation loss: 0.929007 Best loss: 0.895543 Accuracy: 68.00% 22 Validation loss: 0.939231 Best loss: 0.895543 Accuracy: 65.33% 23 Validation loss: 0.919057 Best loss: 0.895543 Accuracy: 68.67% 24 Validation loss: 0.994529 Best loss: 0.895543 Accuracy: 65.33% 25 Validation loss: 0.901279 Best loss: 0.895543 Accuracy: 68.67% 26 Validation loss: 0.916238 Best loss: 0.895543 Accuracy: 68.67% 27 Validation loss: 1.007434 Best loss: 0.895543 Accuracy: 65.33% 28 Validation loss: 0.924729 Best loss: 0.895543 Accuracy: 70.00% 29 Validation loss: 
0.974399 Best loss: 0.895543 Accuracy: 66.00% 30 Validation loss: 0.899418 Best loss: 0.895543 Accuracy: 68.00% 31 Validation loss: 0.940563 Best loss: 0.895543 Accuracy: 66.00% 32 Validation loss: 0.920235 Best loss: 0.895543 Accuracy: 68.00% 33 Validation loss: 0.929848 Best loss: 0.895543 Accuracy: 68.67% 34 Validation loss: 0.930288 Best loss: 0.895543 Accuracy: 66.67% 35 Validation loss: 0.943884 Best loss: 0.895543 Accuracy: 64.67% 36 Validation loss: 0.939372 Best loss: 0.895543 Accuracy: 68.00% 37 Validation loss: 0.894239 Best loss: 0.894239 Accuracy: 67.33% 38 Validation loss: 0.888806 Best loss: 0.888806 Accuracy: 69.33% 39 Validation loss: 0.933829 Best loss: 0.888806 Accuracy: 66.00% 40 Validation loss: 0.911836 Best loss: 0.888806 Accuracy: 72.67% 41 Validation loss: 0.896729 Best loss: 0.888806 Accuracy: 70.00% 42 Validation loss: 0.929394 Best loss: 0.888806 Accuracy: 68.00% 43 Validation loss: 0.919418 Best loss: 0.888806 Accuracy: 69.33% 44 Validation loss: 0.907830 Best loss: 0.888806 Accuracy: 65.33% 45 Validation loss: 1.004304 Best loss: 0.888806 Accuracy: 71.33% 46 Validation loss: 0.871899 Best loss: 0.871899 Accuracy: 74.00% 47 Validation loss: 0.904889 Best loss: 0.871899 Accuracy: 67.33% 48 Validation loss: 0.914138 Best loss: 0.871899 Accuracy: 66.00% 49 Validation loss: 0.930001 Best loss: 0.871899 Accuracy: 69.33% 50 Validation loss: 0.962153 Best loss: 0.871899 Accuracy: 68.67% 51 Validation loss: 0.925021 Best loss: 0.871899 Accuracy: 65.33% 52 Validation loss: 0.974412 Best loss: 0.871899 Accuracy: 67.33% 53 Validation loss: 0.897499 Best loss: 0.871899 Accuracy: 68.67% 54 Validation loss: 0.933581 Best loss: 0.871899 Accuracy: 60.67% 55 Validation loss: 0.988574 Best loss: 0.871899 Accuracy: 68.67% 56 Validation loss: 0.927290 Best loss: 0.871899 Accuracy: 66.67% 57 Validation loss: 1.018713 Best loss: 0.871899 Accuracy: 64.00% 58 Validation loss: 0.964709 Best loss: 0.871899 Accuracy: 66.00% 59 Validation loss: 1.004696 Best loss: 
0.871899 Accuracy: 59.33% 60 Validation loss: 1.008746 Best loss: 0.871899 Accuracy: 58.67% 61 Validation loss: 0.948558 Best loss: 0.871899 Accuracy: 68.00% 62 Validation loss: 0.966037 Best loss: 0.871899 Accuracy: 64.00% 63 Validation loss: 0.922541 Best loss: 0.871899 Accuracy: 68.00% 64 Validation loss: 0.892541 Best loss: 0.871899 Accuracy: 72.00% 65 Validation loss: 0.890340 Best loss: 0.871899 Accuracy: 70.67% 66 Validation loss: 0.957904 Best loss: 0.871899 Accuracy: 66.00% Early stopping! Total training time: 1.9s INFO:tensorflow:Restoring parameters from ./my_mnist_model_5_to_9_five_frozen Final test accuracy: 64.02%
Well that's not a great accuracy, is it? Of course with such a tiny training set, and with only one layer to tweak, we should not expect miracles.
Exercise: try caching the frozen layers, and train the model again: how much faster is it now?
Let's start by getting a handle on the output of the last frozen layer:
hidden5_out = tf.get_default_graph().get_tensor_by_name("hidden5_out:0")
Now let's train the model using roughly the same code as earlier. The difference is that we compute the output of the top frozen layer at the beginning (both for the training set and the validation set), and we cache it. This makes training roughly 1.5 to 3 times faster in this example (this may vary greatly, depending on your system):
# Train only the layers above hidden5, feeding them cached hidden5 activations,
# with early stopping on the validation loss; then report test accuracy.
import time

n_epochs = 1000
batch_size = 20

# Early-stopping state: stop once the validation loss has failed to improve
# more than `max_checks_without_progress` epochs in a row.
max_checks_without_progress = 20
checks_without_progress = 0
best_loss = np.inf  # `np.infty` was removed in NumPy 2.0; `np.inf` works on every version

with tf.Session() as sess:
    init.run()
    restore_saver.restore(sess, "./my_best_mnist_model_0_to_4")
    t0 = time.time()

    # Evaluate the frozen part of the network once for the whole training and
    # validation sets, so that no epoch has to recompute it.
    hidden5_train = hidden5_out.eval(feed_dict={X: X_train2, y: y_train2})
    hidden5_valid = hidden5_out.eval(feed_dict={X: X_valid2, y: y_valid2})

    for epoch in range(n_epochs):
        # Reshuffle the training set every epoch and cut it into mini-batches.
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            h5_batch, y_batch = hidden5_train[rnd_indices], y_train2[rnd_indices]
            # Feed the cached activations directly in place of hidden5_out.
            sess.run(training_op, feed_dict={hidden5_out: h5_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={hidden5_out: hidden5_valid, y: y_valid2})
        if loss_val < best_loss:
            # New best validation loss: checkpoint the model and reset the counter.
            save_path = five_frozen_saver.save(sess, "./my_mnist_model_5_to_9_five_frozen")
            best_loss = loss_val
            checks_without_progress = 0
        else:
            checks_without_progress += 1
            if checks_without_progress > max_checks_without_progress:
                print("Early stopping!")
                break
        print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
            epoch, loss_val, best_loss, acc_val * 100))

    t1 = time.time()
    print("Total training time: {:.1f}s".format(t1 - t0))

# Reload the best checkpoint and measure accuracy on the test set.
with tf.Session() as sess:
    five_frozen_saver.restore(sess, "./my_mnist_model_5_to_9_five_frozen")
    acc_test = accuracy.eval(feed_dict={X: X_test2, y: y_test2})
    print("Final test accuracy: {:.2f}%".format(acc_test * 100))
INFO:tensorflow:Restoring parameters from ./my_best_mnist_model_0_to_4 0 Validation loss: 1.416103 Best loss: 1.416103 Accuracy: 44.00% 1 Validation loss: 1.099216 Best loss: 1.099216 Accuracy: 53.33% 2 Validation loss: 1.024954 Best loss: 1.024954 Accuracy: 59.33% 3 Validation loss: 0.969193 Best loss: 0.969193 Accuracy: 60.00% 4 Validation loss: 0.973461 Best loss: 0.969193 Accuracy: 64.67% 5 Validation loss: 0.949333 Best loss: 0.949333 Accuracy: 64.67% 6 Validation loss: 0.922953 Best loss: 0.922953 Accuracy: 66.67% 7 Validation loss: 0.957186 Best loss: 0.922953 Accuracy: 62.67% 8 Validation loss: 0.950264 Best loss: 0.922953 Accuracy: 68.00% 9 Validation loss: 1.053465 Best loss: 0.922953 Accuracy: 59.33% 10 Validation loss: 1.069949 Best loss: 0.922953 Accuracy: 54.00% 11 Validation loss: 0.965197 Best loss: 0.922953 Accuracy: 62.67% 12 Validation loss: 0.949233 Best loss: 0.922953 Accuracy: 63.33% 13 Validation loss: 0.926229 Best loss: 0.922953 Accuracy: 63.33% 14 Validation loss: 0.922854 Best loss: 0.922854 Accuracy: 67.33% 15 Validation loss: 0.965205 Best loss: 0.922854 Accuracy: 66.67% 16 Validation loss: 1.050026 Best loss: 0.922854 Accuracy: 59.33% 17 Validation loss: 0.946699 Best loss: 0.922854 Accuracy: 64.67% 18 Validation loss: 0.973966 Best loss: 0.922854 Accuracy: 64.00% 19 Validation loss: 0.902573 Best loss: 0.902573 Accuracy: 66.67% 20 Validation loss: 0.933625 Best loss: 0.902573 Accuracy: 65.33% 21 Validation loss: 0.938296 Best loss: 0.902573 Accuracy: 64.00% 22 Validation loss: 0.938790 Best loss: 0.902573 Accuracy: 66.67% 23 Validation loss: 0.936572 Best loss: 0.902573 Accuracy: 68.00% 24 Validation loss: 1.039109 Best loss: 0.902573 Accuracy: 65.33% 25 Validation loss: 1.146837 Best loss: 0.902573 Accuracy: 59.33% 26 Validation loss: 0.958702 Best loss: 0.902573 Accuracy: 68.67% 27 Validation loss: 0.915434 Best loss: 0.902573 Accuracy: 70.67% 28 Validation loss: 0.915402 Best loss: 0.902573 Accuracy: 66.00% 29 Validation loss: 
0.920591 Best loss: 0.902573 Accuracy: 70.67% 30 Validation loss: 1.029216 Best loss: 0.902573 Accuracy: 64.67% 31 Validation loss: 1.039922 Best loss: 0.902573 Accuracy: 55.33% 32 Validation loss: 0.925041 Best loss: 0.902573 Accuracy: 64.00% 33 Validation loss: 0.944033 Best loss: 0.902573 Accuracy: 67.33% 34 Validation loss: 0.941914 Best loss: 0.902573 Accuracy: 66.67% 35 Validation loss: 0.866297 Best loss: 0.866297 Accuracy: 69.33% 36 Validation loss: 0.900787 Best loss: 0.866297 Accuracy: 70.67% 37 Validation loss: 0.889670 Best loss: 0.866297 Accuracy: 66.67% 38 Validation loss: 0.968139 Best loss: 0.866297 Accuracy: 62.00% 39 Validation loss: 0.929764 Best loss: 0.866297 Accuracy: 66.00% 40 Validation loss: 0.889130 Best loss: 0.866297 Accuracy: 68.00% 41 Validation loss: 0.940024 Best loss: 0.866297 Accuracy: 70.00% 42 Validation loss: 0.896472 Best loss: 0.866297 Accuracy: 69.33% 43 Validation loss: 0.893887 Best loss: 0.866297 Accuracy: 67.33% 44 Validation loss: 0.925727 Best loss: 0.866297 Accuracy: 68.67% 45 Validation loss: 0.945748 Best loss: 0.866297 Accuracy: 66.00% 46 Validation loss: 0.897087 Best loss: 0.866297 Accuracy: 70.00% 47 Validation loss: 0.923855 Best loss: 0.866297 Accuracy: 68.67% 48 Validation loss: 0.944244 Best loss: 0.866297 Accuracy: 66.67% 49 Validation loss: 0.975582 Best loss: 0.866297 Accuracy: 66.67% 50 Validation loss: 0.889869 Best loss: 0.866297 Accuracy: 68.67% 51 Validation loss: 0.895552 Best loss: 0.866297 Accuracy: 69.33% 52 Validation loss: 0.943707 Best loss: 0.866297 Accuracy: 66.00% 53 Validation loss: 0.902883 Best loss: 0.866297 Accuracy: 70.67% 54 Validation loss: 0.958292 Best loss: 0.866297 Accuracy: 68.67% 55 Validation loss: 0.917368 Best loss: 0.866297 Accuracy: 67.33% Early stopping! Total training time: 1.1s INFO:tensorflow:Restoring parameters from ./my_mnist_model_5_to_9_five_frozen Final test accuracy: 61.16%
Exercise: try again reusing just four hidden layers instead of five. Can you achieve a higher accuracy?
Let's load the best model again, but this time we will create a new softmax output layer on top of the 4th hidden layer:
# Rebuild the graph from the saved meta-graph and attach a new output layer
# on top of the 4th hidden layer.
reset_graph()
# Number of output units of the new layer (the checkpoint names suggest digits 5 to 9).
n_outputs = 5
restore_saver = tf.train.import_meta_graph("./my_best_mnist_model_0_to_4.meta")
# Retrieve the tensors we need from the imported graph by name.
X = tf.get_default_graph().get_tensor_by_name("X:0")
y = tf.get_default_graph().get_tensor_by_name("y:0")
hidden4_out = tf.get_default_graph().get_tensor_by_name("hidden4_out:0")
# New dense layer named "new_logits"; the name is used later to select its variables.
logits = tf.layers.dense(hidden4_out, n_outputs, kernel_initializer=he_init, name="new_logits")
Y_proba = tf.nn.softmax(logits)
# Mean sparse softmax cross-entropy between the integer labels and the new logits.
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy)
# Accuracy: fraction of instances whose label is the top-1 logit.
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")
And now let's create the training operation. We want to freeze all the layers except for the new output layer:
# Training operation that updates only the new output layer.
learning_rate = 0.01
# Select the trainable variables under the "new_logits" scope; passing them as
# var_list restricts the optimizer to those variables.
output_layer_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="new_logits")
# NOTE(review): the explicit name "Adam2" presumably avoids a clash with an
# optimizer already present in the imported graph - confirm.
optimizer = tf.train.AdamOptimizer(learning_rate, name="Adam2")
training_op = optimizer.minimize(loss, var_list=output_layer_vars)
init = tf.global_variables_initializer()
four_frozen_saver = tf.train.Saver()
And once again we train the model with the same code as earlier. Note: we could of course write a function once and use it multiple times, rather than copying almost the same training code over and over again, but as we keep tweaking the code slightly, the function would need multiple arguments and if
statements, and it would have to be at the beginning of the notebook, where it would not make much sense to readers. In short it would be very confusing, so we're better off with copy & paste.
# Train the new output layer on top of the four reused hidden layers, with
# early stopping on the validation loss; then report test accuracy.
n_epochs = 1000
batch_size = 20

# Early-stopping state: stop once the validation loss has failed to improve
# more than `max_checks_without_progress` epochs in a row.
max_checks_without_progress = 20
checks_without_progress = 0
best_loss = np.inf  # `np.infty` was removed in NumPy 2.0; `np.inf` works on every version

with tf.Session() as sess:
    # Initialize every variable, then overwrite with the values stored in the
    # pretrained checkpoint.
    init.run()
    restore_saver.restore(sess, "./my_best_mnist_model_0_to_4")

    for epoch in range(n_epochs):
        # Reshuffle the training set every epoch and cut it into mini-batches.
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            X_batch, y_batch = X_train2[rnd_indices], y_train2[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={X: X_valid2, y: y_valid2})
        if loss_val < best_loss:
            # New best validation loss: checkpoint the model and reset the counter.
            save_path = four_frozen_saver.save(sess, "./my_mnist_model_5_to_9_four_frozen")
            best_loss = loss_val
            checks_without_progress = 0
        else:
            checks_without_progress += 1
            if checks_without_progress > max_checks_without_progress:
                print("Early stopping!")
                break
        print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
            epoch, loss_val, best_loss, acc_val * 100))

# Reload the best checkpoint and measure accuracy on the test set.
with tf.Session() as sess:
    four_frozen_saver.restore(sess, "./my_mnist_model_5_to_9_four_frozen")
    acc_test = accuracy.eval(feed_dict={X: X_test2, y: y_test2})
    print("Final test accuracy: {:.2f}%".format(acc_test * 100))
INFO:tensorflow:Restoring parameters from ./my_best_mnist_model_0_to_4 0 Validation loss: 1.073254 Best loss: 1.073254 Accuracy: 51.33% 1 Validation loss: 1.039487 Best loss: 1.039487 Accuracy: 64.00% 2 Validation loss: 0.991418 Best loss: 0.991418 Accuracy: 59.33% 3 Validation loss: 0.902691 Best loss: 0.902691 Accuracy: 64.67% 4 Validation loss: 0.919874 Best loss: 0.902691 Accuracy: 63.33% 5 Validation loss: 0.879734 Best loss: 0.879734 Accuracy: 72.00% 6 Validation loss: 0.877940 Best loss: 0.877940 Accuracy: 70.67% 7 Validation loss: 0.899513 Best loss: 0.877940 Accuracy: 71.33% 8 Validation loss: 0.879717 Best loss: 0.877940 Accuracy: 67.33% 9 Validation loss: 0.826527 Best loss: 0.826527 Accuracy: 75.33% 10 Validation loss: 0.890165 Best loss: 0.826527 Accuracy: 67.33% 11 Validation loss: 0.876235 Best loss: 0.826527 Accuracy: 68.67% 12 Validation loss: 0.877598 Best loss: 0.826527 Accuracy: 71.33% 13 Validation loss: 0.898070 Best loss: 0.826527 Accuracy: 74.67% 14 Validation loss: 0.923526 Best loss: 0.826527 Accuracy: 68.00% 15 Validation loss: 0.859624 Best loss: 0.826527 Accuracy: 70.00% 16 Validation loss: 0.896264 Best loss: 0.826527 Accuracy: 67.33% 17 Validation loss: 0.800813 Best loss: 0.800813 Accuracy: 73.33% 18 Validation loss: 0.811318 Best loss: 0.800813 Accuracy: 74.00% 19 Validation loss: 0.809687 Best loss: 0.800813 Accuracy: 75.33% 20 Validation loss: 0.807125 Best loss: 0.800813 Accuracy: 72.67% 21 Validation loss: 0.819150 Best loss: 0.800813 Accuracy: 71.33% 22 Validation loss: 0.849812 Best loss: 0.800813 Accuracy: 76.67% 23 Validation loss: 0.801709 Best loss: 0.800813 Accuracy: 74.67% 24 Validation loss: 0.832877 Best loss: 0.800813 Accuracy: 74.00% 25 Validation loss: 0.792853 Best loss: 0.792853 Accuracy: 72.67% 26 Validation loss: 0.842031 Best loss: 0.792853 Accuracy: 76.00% 27 Validation loss: 0.872236 Best loss: 0.792853 Accuracy: 71.33% 28 Validation loss: 0.782557 Best loss: 0.782557 Accuracy: 78.00% 29 Validation loss: 
0.802515 Best loss: 0.782557 Accuracy: 73.33% 30 Validation loss: 0.812652 Best loss: 0.782557 Accuracy: 72.67% 31 Validation loss: 0.825467 Best loss: 0.782557 Accuracy: 76.00% 32 Validation loss: 0.791320 Best loss: 0.782557 Accuracy: 76.67% 33 Validation loss: 0.785207 Best loss: 0.782557 Accuracy: 77.33% 34 Validation loss: 0.815450 Best loss: 0.782557 Accuracy: 76.67% 35 Validation loss: 0.865081 Best loss: 0.782557 Accuracy: 71.33% 36 Validation loss: 0.852323 Best loss: 0.782557 Accuracy: 74.67% 37 Validation loss: 0.836967 Best loss: 0.782557 Accuracy: 72.00% 38 Validation loss: 0.807404 Best loss: 0.782557 Accuracy: 77.33% 39 Validation loss: 0.821566 Best loss: 0.782557 Accuracy: 75.33% 40 Validation loss: 0.817326 Best loss: 0.782557 Accuracy: 76.00% 41 Validation loss: 0.807987 Best loss: 0.782557 Accuracy: 70.67% 42 Validation loss: 0.838029 Best loss: 0.782557 Accuracy: 74.00% 43 Validation loss: 0.820425 Best loss: 0.782557 Accuracy: 76.00% 44 Validation loss: 0.785871 Best loss: 0.782557 Accuracy: 76.00% 45 Validation loss: 0.844337 Best loss: 0.782557 Accuracy: 78.67% 46 Validation loss: 0.764127 Best loss: 0.764127 Accuracy: 78.67% 47 Validation loss: 0.789726 Best loss: 0.764127 Accuracy: 77.33% 48 Validation loss: 0.839190 Best loss: 0.764127 Accuracy: 72.67% 49 Validation loss: 0.849353 Best loss: 0.764127 Accuracy: 75.33% 50 Validation loss: 0.869818 Best loss: 0.764127 Accuracy: 74.00% 51 Validation loss: 0.805526 Best loss: 0.764127 Accuracy: 76.67% 52 Validation loss: 0.850749 Best loss: 0.764127 Accuracy: 72.67% 53 Validation loss: 0.838693 Best loss: 0.764127 Accuracy: 71.33% 54 Validation loss: 0.791396 Best loss: 0.764127 Accuracy: 75.33% 55 Validation loss: 0.846888 Best loss: 0.764127 Accuracy: 76.00% 56 Validation loss: 0.826717 Best loss: 0.764127 Accuracy: 74.67% 57 Validation loss: 0.878286 Best loss: 0.764127 Accuracy: 70.67% 58 Validation loss: 0.878869 Best loss: 0.764127 Accuracy: 72.67% 59 Validation loss: 0.822241 Best loss: 
0.764127 Accuracy: 72.67% 60 Validation loss: 0.864925 Best loss: 0.764127 Accuracy: 73.33% 61 Validation loss: 0.804545 Best loss: 0.764127 Accuracy: 73.33% 62 Validation loss: 0.891784 Best loss: 0.764127 Accuracy: 72.67% 63 Validation loss: 0.810186 Best loss: 0.764127 Accuracy: 74.00% 64 Validation loss: 0.810786 Best loss: 0.764127 Accuracy: 74.67% 65 Validation loss: 0.818044 Best loss: 0.764127 Accuracy: 74.00% 66 Validation loss: 0.853420 Best loss: 0.764127 Accuracy: 74.67% Early stopping! INFO:tensorflow:Restoring parameters from ./my_mnist_model_5_to_9_four_frozen Final test accuracy: 69.10%
Still not fantastic, but much better.
Exercise: now unfreeze the top two hidden layers and continue training: can you get the model to perform even better?
# Training operation that updates hidden layers 3 and 4 plus the new output layer.
learning_rate = 0.01
# The scope argument is a regular expression: it selects the trainable variables
# under the "hidden3", "hidden4" and "new_logits" scopes.
unfrozen_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="hidden[34]|new_logits")
# A fresh optimizer with its own name ("Adam3") is created for this training phase.
optimizer = tf.train.AdamOptimizer(learning_rate, name="Adam3")
training_op = optimizer.minimize(loss, var_list=unfrozen_vars)
init = tf.global_variables_initializer()
two_frozen_saver = tf.train.Saver()
# Continue training from the "four frozen" checkpoint with the top two hidden
# layers unfrozen, with early stopping on the validation loss; then report
# test accuracy.
n_epochs = 1000
batch_size = 20

# Early-stopping state: stop once the validation loss has failed to improve
# more than `max_checks_without_progress` epochs in a row.
max_checks_without_progress = 20
checks_without_progress = 0
best_loss = np.inf  # `np.infty` was removed in NumPy 2.0; `np.inf` works on every version

with tf.Session() as sess:
    # Initialize every variable, then overwrite with the values stored in the
    # checkpoint produced by the previous training phase.
    init.run()
    four_frozen_saver.restore(sess, "./my_mnist_model_5_to_9_four_frozen")

    for epoch in range(n_epochs):
        # Reshuffle the training set every epoch and cut it into mini-batches.
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            X_batch, y_batch = X_train2[rnd_indices], y_train2[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={X: X_valid2, y: y_valid2})
        if loss_val < best_loss:
            # New best validation loss: checkpoint the model and reset the counter.
            save_path = two_frozen_saver.save(sess, "./my_mnist_model_5_to_9_two_frozen")
            best_loss = loss_val
            checks_without_progress = 0
        else:
            checks_without_progress += 1
            if checks_without_progress > max_checks_without_progress:
                print("Early stopping!")
                break
        print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
            epoch, loss_val, best_loss, acc_val * 100))

# Reload the best checkpoint and measure accuracy on the test set.
with tf.Session() as sess:
    two_frozen_saver.restore(sess, "./my_mnist_model_5_to_9_two_frozen")
    acc_test = accuracy.eval(feed_dict={X: X_test2, y: y_test2})
    print("Final test accuracy: {:.2f}%".format(acc_test * 100))
INFO:tensorflow:Restoring parameters from ./my_mnist_model_5_to_9_four_frozen 0 Validation loss: 1.054859 Best loss: 1.054859 Accuracy: 74.00% 1 Validation loss: 0.812410 Best loss: 0.812410 Accuracy: 78.00% 2 Validation loss: 0.750377 Best loss: 0.750377 Accuracy: 80.67% 3 Validation loss: 0.570973 Best loss: 0.570973 Accuracy: 84.67% 4 Validation loss: 0.805442 Best loss: 0.570973 Accuracy: 79.33% 5 Validation loss: 0.920925 Best loss: 0.570973 Accuracy: 80.00% 6 Validation loss: 0.817471 Best loss: 0.570973 Accuracy: 81.33% 7 Validation loss: 0.777876 Best loss: 0.570973 Accuracy: 84.00% 8 Validation loss: 1.030498 Best loss: 0.570973 Accuracy: 74.67% 9 Validation loss: 1.074356 Best loss: 0.570973 Accuracy: 81.33% 10 Validation loss: 0.912521 Best loss: 0.570973 Accuracy: 83.33% 11 Validation loss: 1.356695 Best loss: 0.570973 Accuracy: 79.33% 12 Validation loss: 0.918798 Best loss: 0.570973 Accuracy: 82.00% 13 Validation loss: 0.971029 Best loss: 0.570973 Accuracy: 82.67% 14 Validation loss: 0.860108 Best loss: 0.570973 Accuracy: 83.33% 15 Validation loss: 1.074813 Best loss: 0.570973 Accuracy: 82.00% 16 Validation loss: 0.867760 Best loss: 0.570973 Accuracy: 84.00% 17 Validation loss: 0.858290 Best loss: 0.570973 Accuracy: 85.33% 18 Validation loss: 0.996560 Best loss: 0.570973 Accuracy: 85.33% 19 Validation loss: 1.304507 Best loss: 0.570973 Accuracy: 83.33% 20 Validation loss: 1.134808 Best loss: 0.570973 Accuracy: 80.67% 21 Validation loss: 1.189581 Best loss: 0.570973 Accuracy: 82.00% 22 Validation loss: 1.131344 Best loss: 0.570973 Accuracy: 81.33% 23 Validation loss: 1.240507 Best loss: 0.570973 Accuracy: 82.67% Early stopping! INFO:tensorflow:Restoring parameters from ./my_mnist_model_5_to_9_two_frozen Final test accuracy: 78.09%
Let's check what accuracy we can get by unfreezing all layers:
# Training operation with no frozen layers: minimize() is called without a
# var_list, so the optimizer is not restricted to a subset of variables.
learning_rate = 0.01
# A fresh optimizer with its own name ("Adam4") is created for this training phase.
optimizer = tf.train.AdamOptimizer(learning_rate, name="Adam4")
training_op = optimizer.minimize(loss)
init = tf.global_variables_initializer()
no_frozen_saver = tf.train.Saver()
# Continue training from the "two frozen" checkpoint with every layer
# unfrozen, with early stopping on the validation loss; then report test
# accuracy.
n_epochs = 1000
batch_size = 20

# Early-stopping state: stop once the validation loss has failed to improve
# more than `max_checks_without_progress` epochs in a row.
max_checks_without_progress = 20
checks_without_progress = 0
best_loss = np.inf  # `np.infty` was removed in NumPy 2.0; `np.inf` works on every version

with tf.Session() as sess:
    # Initialize every variable, then overwrite with the values stored in the
    # checkpoint produced by the previous training phase.
    init.run()
    two_frozen_saver.restore(sess, "./my_mnist_model_5_to_9_two_frozen")

    for epoch in range(n_epochs):
        # Reshuffle the training set every epoch and cut it into mini-batches.
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            X_batch, y_batch = X_train2[rnd_indices], y_train2[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={X: X_valid2, y: y_valid2})
        if loss_val < best_loss:
            # New best validation loss: checkpoint the model and reset the counter.
            save_path = no_frozen_saver.save(sess, "./my_mnist_model_5_to_9_no_frozen")
            best_loss = loss_val
            checks_without_progress = 0
        else:
            checks_without_progress += 1
            if checks_without_progress > max_checks_without_progress:
                print("Early stopping!")
                break
        print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
            epoch, loss_val, best_loss, acc_val * 100))

# Reload the best checkpoint and measure accuracy on the test set.
with tf.Session() as sess:
    no_frozen_saver.restore(sess, "./my_mnist_model_5_to_9_no_frozen")
    acc_test = accuracy.eval(feed_dict={X: X_test2, y: y_test2})
    print("Final test accuracy: {:.2f}%".format(acc_test * 100))
INFO:tensorflow:Restoring parameters from ./my_mnist_model_5_to_9_two_frozen 0 Validation loss: 0.863416 Best loss: 0.863416 Accuracy: 86.00% 1 Validation loss: 0.695079 Best loss: 0.695079 Accuracy: 90.00% 2 Validation loss: 0.402921 Best loss: 0.402921 Accuracy: 92.00% 3 Validation loss: 0.606936 Best loss: 0.402921 Accuracy: 92.00% 4 Validation loss: 0.354645 Best loss: 0.354645 Accuracy: 90.67% 5 Validation loss: 0.376935 Best loss: 0.354645 Accuracy: 90.67% 6 Validation loss: 0.593208 Best loss: 0.354645 Accuracy: 90.00% 7 Validation loss: 0.388302 Best loss: 0.354645 Accuracy: 92.67% 8 Validation loss: 0.503276 Best loss: 0.354645 Accuracy: 91.33% 9 Validation loss: 1.440716 Best loss: 0.354645 Accuracy: 80.00% 10 Validation loss: 0.464323 Best loss: 0.354645 Accuracy: 92.00% 11 Validation loss: 0.410302 Best loss: 0.354645 Accuracy: 93.33% 12 Validation loss: 1.131754 Best loss: 0.354645 Accuracy: 88.00% 13 Validation loss: 0.511544 Best loss: 0.354645 Accuracy: 92.00% 14 Validation loss: 0.402083 Best loss: 0.354645 Accuracy: 94.00% 15 Validation loss: 1.149943 Best loss: 0.354645 Accuracy: 92.00% 16 Validation loss: 0.405171 Best loss: 0.354645 Accuracy: 94.00% 17 Validation loss: 0.304346 Best loss: 0.304346 Accuracy: 94.67% 18 Validation loss: 0.386952 Best loss: 0.304346 Accuracy: 94.67% 19 Validation loss: 0.387063 Best loss: 0.304346 Accuracy: 94.67% 20 Validation loss: 0.384417 Best loss: 0.304346 Accuracy: 94.67% 21 Validation loss: 0.381116 Best loss: 0.304346 Accuracy: 94.67% 22 Validation loss: 0.379346 Best loss: 0.304346 Accuracy: 94.67% 23 Validation loss: 0.378128 Best loss: 0.304346 Accuracy: 94.67% 24 Validation loss: 0.376642 Best loss: 0.304346 Accuracy: 94.67% 25 Validation loss: 0.375432 Best loss: 0.304346 Accuracy: 94.67% 26 Validation loss: 0.374804 Best loss: 0.304346 Accuracy: 94.67% 27 Validation loss: 0.373952 Best loss: 0.304346 Accuracy: 94.67% 28 Validation loss: 0.373471 Best loss: 0.304346 Accuracy: 94.67% 29 Validation 
loss: 0.373027 Best loss: 0.304346 Accuracy: 94.67% 30 Validation loss: 0.373124 Best loss: 0.304346 Accuracy: 94.67% 31 Validation loss: 0.373098 Best loss: 0.304346 Accuracy: 94.67% 32 Validation loss: 0.373206 Best loss: 0.304346 Accuracy: 94.67% 33 Validation loss: 0.372812 Best loss: 0.304346 Accuracy: 94.67% 34 Validation loss: 0.373109 Best loss: 0.304346 Accuracy: 94.67% 35 Validation loss: 0.372616 Best loss: 0.304346 Accuracy: 94.67% 36 Validation loss: 0.372491 Best loss: 0.304346 Accuracy: 94.67% 37 Validation loss: 0.372270 Best loss: 0.304346 Accuracy: 94.67% Early stopping! INFO:tensorflow:Restoring parameters from ./my_mnist_model_5_to_9_no_frozen Final test accuracy: 91.34%
Let's compare that to a DNN trained from scratch:
# Baseline for comparison: a 4-hidden-layer DNNClassifier fitted on the same
# training set, with the validation set passed to fit().
dnn_clf_5_to_9 = DNNClassifier(n_hidden_layers=4, random_state=42)
dnn_clf_5_to_9.fit(X_train2, y_train2, n_epochs=1000, X_valid=X_valid2, y_valid=y_valid2)
0 Validation loss: 0.674618 Best loss: 0.674618 Accuracy: 80.67% 1 Validation loss: 0.584845 Best loss: 0.584845 Accuracy: 88.67% 2 Validation loss: 0.647296 Best loss: 0.584845 Accuracy: 84.00% 3 Validation loss: 0.530389 Best loss: 0.530389 Accuracy: 87.33% 4 Validation loss: 0.683215 Best loss: 0.530389 Accuracy: 90.67% 5 Validation loss: 0.538040 Best loss: 0.530389 Accuracy: 89.33% 6 Validation loss: 0.670196 Best loss: 0.530389 Accuracy: 90.67% 7 Validation loss: 0.836470 Best loss: 0.530389 Accuracy: 85.33% 8 Validation loss: 0.837684 Best loss: 0.530389 Accuracy: 92.67% 9 Validation loss: 0.588950 Best loss: 0.530389 Accuracy: 88.00% 10 Validation loss: 0.643213 Best loss: 0.530389 Accuracy: 90.67% 11 Validation loss: 1.010521 Best loss: 0.530389 Accuracy: 88.00% 12 Validation loss: 0.931423 Best loss: 0.530389 Accuracy: 90.00% 13 Validation loss: 1.563524 Best loss: 0.530389 Accuracy: 88.67% 14 Validation loss: 2.340119 Best loss: 0.530389 Accuracy: 89.33% 15 Validation loss: 1.402095 Best loss: 0.530389 Accuracy: 88.00% 16 Validation loss: 1.269974 Best loss: 0.530389 Accuracy: 86.00% 17 Validation loss: 1.036325 Best loss: 0.530389 Accuracy: 89.33% 18 Validation loss: 1.578565 Best loss: 0.530389 Accuracy: 88.67% 19 Validation loss: 0.993890 Best loss: 0.530389 Accuracy: 93.33% 20 Validation loss: 0.958130 Best loss: 0.530389 Accuracy: 87.33% 21 Validation loss: 1.505322 Best loss: 0.530389 Accuracy: 88.67% 22 Validation loss: 1.378772 Best loss: 0.530389 Accuracy: 89.33% 23 Validation loss: 0.999445 Best loss: 0.530389 Accuracy: 88.00% 24 Validation loss: 2.366345 Best loss: 0.530389 Accuracy: 90.00% Early stopping!
DNNClassifier(activation=<function elu at 0x1243639d8>, batch_norm_momentum=None, batch_size=20, dropout_rate=None, initializer=<tensorflow.python.ops.init_ops.VarianceScaling object at 0x117bf5828>, learning_rate=0.01, n_hidden_layers=4, n_neurons=100, optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>, random_state=42)
# Score the baseline classifier's predictions on the test set.
y_pred = dnn_clf_5_to_9.predict(X_test2)
accuracy_score(y_test2, y_pred)
0.8481793869574161
Transfer learning allowed us to go from 84.8% accuracy to 91.3%. Not too bad!
In this exercise you will build a DNN that compares two MNIST digit images and predicts whether they represent the same digit or not. Then you will reuse the lower layers of this network to train an MNIST classifier using very little training data.
Exercise: Start by building two DNNs (let's call them DNN A and B), both similar to the one you built earlier but without the output layer: each DNN should have five hidden layers of 100 neurons each, He initialization, and ELU activation. Next, add one more hidden layer with 10 units on top of both DNNs. You should use TensorFlow's concat()
function with axis=1
to concatenate the outputs of both DNNs along the horizontal axis, then feed the result to the hidden layer. Finally, add an output layer with a single neuron using the logistic activation function.
Warning! There was an error in the book for this exercise: there was no instruction to add a top hidden layer. Without it, the neural network generally fails to start learning. If you have the latest version of the book, this error has been fixed.
You could have two input placeholders, X1
and X2
, one for the images that should be fed to the first DNN, and the other for the images that should be fed to the second DNN. It would work fine. However, another option is to have a single input placeholder to hold both sets of images (each row will hold a pair of images), and use tf.unstack()
to split this tensor into two separate tensors, like this:
# Single input placeholder holding image pairs, split into one tensor per image.
n_inputs = 28 * 28 # MNIST
reset_graph()
# Each row of X holds 2 images of n_inputs values each.
X = tf.placeholder(tf.float32, shape=(None, 2, n_inputs), name="X")
# Split along axis 1 (the pair axis): X1 is the first image of each pair, X2 the second.
X1, X2 = tf.unstack(X, axis=1)
We also need the labels placeholder. Each label will be 0 if the images represent different digits, or 1 if they represent the same digit:
# Labels placeholder: one integer per pair, stored as a column (shape [None, 1]).
y = tf.placeholder(tf.int32, shape=[None, 1])
Now let's feed these inputs through two separate DNNs:
# Build two networks with the same dnn() helper under different names:
# one fed with the first image of each pair, the other with the second.
dnn1 = dnn(X1, name="DNN_A")
dnn2 = dnn(X2, name="DNN_B")
And let's concatenate their outputs:
# Join both networks' outputs side by side (along axis 1, the feature axis).
dnn_outputs = tf.concat([dnn1, dnn2], axis=1)
Each DNN outputs 100 activations (per instance), so the shape is [None, 100]
:
# Display the static shape of DNN A's output tensor.
dnn1.shape
TensorShape([Dimension(None), Dimension(100)])
# Display the static shape of DNN B's output tensor.
dnn2.shape
TensorShape([Dimension(None), Dimension(100)])
And of course the concatenated outputs have a shape of [None, 200]
:
# Display the static shape of the concatenated outputs.
dnn_outputs.shape
TensorShape([Dimension(None), Dimension(200)])
Now let's add an extra hidden layer with just 10 neurons, and the output layer, with a single neuron:
# Top of the network: a 10-unit ELU hidden layer over the concatenated outputs,
# then a single-unit output layer producing one logit per pair.
hidden = tf.layers.dense(dnn_outputs, units=10, activation=tf.nn.elu, kernel_initializer=he_init)
logits = tf.layers.dense(hidden, units=1, kernel_initializer=he_init)
# Sigmoid of the logit.
y_proba = tf.nn.sigmoid(logits)
The whole network predicts 1
if y_proba >= 0.5
(i.e. the network predicts that the images represent the same digit), or 0
otherwise. We compute instead logits >= 0
, which is equivalent but faster to compute:
# Predicted class as an int32: 1 where the logit is >= 0, else 0.
y_pred = tf.cast(tf.greater_equal(logits, 0), tf.int32)
Now let's add the cost function:
# Cost function: mean sigmoid cross-entropy between the labels and the logits.
# The int32 labels are cast to float32 before being passed to the loss op.
y_as_float = tf.cast(y, tf.float32)
xentropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_as_float, logits=logits)
loss = tf.reduce_mean(xentropy)
And we can now create the training operation using an optimizer:
# Training operation: momentum optimizer with Nesterov updates enabled,
# minimizing the loss.
learning_rate = 0.01
momentum = 0.95
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=True)
training_op = optimizer.minimize(loss)
We will want to measure our classifier's accuracy.
# Accuracy: mean of the element-wise equality between predictions and labels.
y_pred_correct = tf.equal(y_pred, y)
accuracy = tf.reduce_mean(tf.cast(y_pred_correct, tf.float32))
And the usual init
and saver
:
# Op that initializes all global variables, and a saver for checkpointing.
init = tf.global_variables_initializer()
saver = tf.train.Saver()
Exercise: split the MNIST training set in two sets: split #1 should contain 55,000 images, and split #2 should contain 5,000 images. Create a function that generates a training batch where each instance is a pair of MNIST images picked from split #1. Half of the training instances should be pairs of images that belong to the same class, while the other half should be images from different classes. For each pair, the training label should be 1 if the images are from the same class, or 0 if they are from different classes.
The MNIST dataset returned by TensorFlow's input_data()
function is already split into 3 parts: a training set (55,000 instances), a validation set (5,000 instances) and a test set (10,000 instances). Let's use the first set to generate the training set composed of image pairs, and we will use the second set for the second phase of the exercise (to train a regular MNIST classifier). We will use the third set as the test set for both phases.
# Name the splits used by the two phases: the training set becomes split #1,
# the validation set becomes split #2, and the test set keeps its own name.
X_train1, y_train1 = X_train, y_train
X_train2, y_train2 = X_valid, y_valid
X_test, y_test = X_test, y_test
Let's write a function that generates pairs of images: 50% representing the same digit, and 50% representing different digits. There are many ways to implement this. In this implementation, we first decide how many "same" pairs (i.e. pairs of images representing the same digit) we will generate, and how many "different" pairs (i.e. pairs of images representing different digits). We could just use batch_size // 2
but we want to handle the case where it is odd (granted, that might be overkill!). Then we generate random pairs and we pick the right number of "same" pairs, then we generate the right number of "different" pairs. Finally we shuffle the batch and return it:
def generate_batch(images, labels, batch_size):
    """Return a shuffled batch of image pairs, half "same class", half "different class".

    When `batch_size` is odd, the extra pair is randomly assigned to either
    the "same" or the "different" half.

    Args:
        images: indexable collection of instances (e.g. a 2D array with one
            row per image).
        labels: class labels aligned with `images`.
        batch_size: number of pairs to generate (must be >= 0).

    Returns:
        A tuple `(X, y)`. For `batch_size > 0`, `X` stacks the pairs along
        the first axis (so `X[:, 0]` and `X[:, 1]` are the two images of each
        pair) and `y` has shape `(batch_size, 1)`, holding 1 for a same-class
        pair and 0 for a different-class pair.

    Raises:
        ValueError: if `batch_size` is negative, or if `labels` cannot
            produce the requested kind of pair (no class with at least two
            instances, or fewer than two distinct classes). Without this
            check the rejection-sampling loops below would never terminate.
    """
    if batch_size < 0:
        raise ValueError(
            "batch_size must be non-negative, got {}".format(batch_size))

    n_same = batch_size // 2
    n_diff = batch_size - n_same
    # Odd batch size: flip a coin to decide which half gets the extra pair.
    if n_same != n_diff and np.random.rand() > 0.5:
        n_same, n_diff = n_diff, n_same

    # Fail fast on inputs for which rejection sampling cannot succeed.
    # np.unique does not touch the random state, so for valid inputs the
    # sequence of random draws is unchanged.
    _, class_counts = np.unique(labels, return_counts=True)
    if n_same > 0 and not np.any(class_counts >= 2):
        raise ValueError(
            "cannot build same-class pairs: no class has at least 2 instances")
    if n_diff > 0 and len(class_counts) < 2:
        raise ValueError(
            "cannot build different-class pairs: fewer than 2 distinct classes")

    X = []
    y = []
    # Same-class pairs: two distinct instances sharing a label.
    while len(X) < n_same:
        rnd_idx1, rnd_idx2 = np.random.randint(0, len(images), 2)
        if rnd_idx1 != rnd_idx2 and labels[rnd_idx1] == labels[rnd_idx2]:
            X.append(np.array([images[rnd_idx1], images[rnd_idx2]]))
            y.append([1])
    # Different-class pairs fill the rest of the batch.
    while len(X) < batch_size:
        rnd_idx1, rnd_idx2 = np.random.randint(0, len(images), 2)
        if labels[rnd_idx1] != labels[rnd_idx2]:
            X.append(np.array([images[rnd_idx1], images[rnd_idx2]]))
            y.append([0])

    # Shuffle so "same" and "different" pairs are interleaved.
    rnd_indices = np.random.permutation(batch_size)
    return np.array(X)[rnd_indices], np.array(y)[rnd_indices]
Let's test it to generate a small batch of 5 image pairs:
# Small batch, just to inspect what the generator produces.
batch_size = 5
X_batch, y_batch = generate_batch(images=X_train1, labels=y_train1,
                                  batch_size=batch_size)
Each row in `X_batch` contains a pair of images:
# Notebook display: shape and dtype of the generated batch of pairs.
X_batch.shape, X_batch.dtype
((5, 2, 784), dtype('float32'))
Let's look at these pairs:
# Show the pairs side by side: first images of each pair in the left column,
# second images in the right column, stacked vertically.
plt.figure(figsize=(3, 3 * batch_size))
for pair_col, subplot_pos in enumerate((121, 122)):
    plt.subplot(subplot_pos)
    stacked = X_batch[:, pair_col].reshape(28 * batch_size, 28)
    plt.imshow(stacked, cmap="binary", interpolation="nearest")
    plt.axis('off')
plt.show()
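And let's look at the labels (0 means "different", 1 means "same"; note that this is the reverse of the convention suggested in the exercise statement, which is fine as long as we use it consistently):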
# Notebook display: one label per generated pair.
y_batch
array([[1], [0], [0], [1], [0]])
Perfect!
Exercise: train the DNN on this training set. For each image pair, you can simultaneously feed the first image to DNN A and the second image to DNN B. The whole network will gradually learn to tell whether two images belong to the same class or not.
Let's generate a test set composed of many pairs of images pulled from the MNIST test set:
# Build the evaluation pairs once, from the test set, with as many pairs as
# there are test images, so every accuracy measurement uses the same pairs.
X_test1, y_test1 = generate_batch(X_test, y_test, batch_size=len(X_test))
And now, let's train the model. There's really nothing special about this step, except for the fact that we need a fairly large `batch_size`, otherwise the model fails to learn anything and ends up with an accuracy of 50%:
n_epochs = 100
batch_size = 500

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        # One "epoch" = as many freshly generated pair batches as would
        # cover split #1 once.
        for iteration in range(len(X_train1) // batch_size):
            X_batch, y_batch = generate_batch(X_train1, y_train1, batch_size)
            loss_val, _ = sess.run(
                [loss, training_op],
                feed_dict={X: X_batch, y: y_batch},
            )
        # Loss of the last batch of the epoch.
        print(epoch, "Train loss:", loss_val)
        # Evaluate on the fixed test pairs every 5 epochs.
        if epoch % 5 == 0:
            acc_test = sess.run(accuracy, feed_dict={X: X_test1, y: y_test1})
            print(epoch, "Test accuracy:", acc_test)

    # Checkpoint the trained comparison model once training is over.
    save_path = saver.save(sess, "./my_digit_comparison_model.ckpt")
0 Train loss: 0.69103277 0 Test accuracy: 0.542 1 Train loss: 0.6035354 2 Train loss: 0.54946035 3 Train loss: 0.47047246 4 Train loss: 0.4060757 5 Train loss: 0.38308156 5 Test accuracy: 0.824 6 Train loss: 0.39047274 7 Train loss: 0.3390794 8 Train loss: 0.3210671 9 Train loss: 0.31792685 10 Train loss: 0.24494292 10 Test accuracy: 0.8881 11 Train loss: 0.2929235 12 Train loss: 0.23225449 13 Train loss: 0.23180929 14 Train loss: 0.19877923 15 Train loss: 0.20065464 15 Test accuracy: 0.9203 16 Train loss: 0.19700499 17 Train loss: 0.18893136 18 Train loss: 0.19965452 19 Train loss: 0.24071647 20 Train loss: 0.18882024 20 Test accuracy: 0.9367 21 Train loss: 0.12419197 22 Train loss: 0.14013417 23 Train loss: 0.120789476 24 Train loss: 0.15721135 25 Train loss: 0.11507861 25 Test accuracy: 0.948 26 Train loss: 0.13891116 27 Train loss: 0.1526081 28 Train loss: 0.123436704 <<50 more lines>> 70 Test accuracy: 0.9743 71 Train loss: 0.019732744 72 Train loss: 0.039464083 73 Train loss: 0.04187814 74 Train loss: 0.05303406 75 Train loss: 0.052625064 75 Test accuracy: 0.9756 76 Train loss: 0.038283084 77 Train loss: 0.026332883 78 Train loss: 0.07060841 79 Train loss: 0.03239444 80 Train loss: 0.03136283 80 Test accuracy: 0.9731 81 Train loss: 0.04390848 82 Train loss: 0.015268046 83 Train loss: 0.04875638 84 Train loss: 0.029360933 85 Train loss: 0.0418443 85 Test accuracy: 0.9759 86 Train loss: 0.018274888 87 Train loss: 0.038872603 88 Train loss: 0.02969683 89 Train loss: 0.020990817 90 Train loss: 0.045234833 90 Test accuracy: 0.9769 91 Train loss: 0.039237432 92 Train loss: 0.031329047 93 Train loss: 0.033414133 94 Train loss: 0.025883088 95 Train loss: 0.019567214 95 Test accuracy: 0.9765 96 Train loss: 0.020650322 97 Train loss: 0.0339851 98 Train loss: 0.047079965 99 Train loss: 0.03125228
All right, we reach 97.6% accuracy on this digit comparison task. That's not too bad, this model knows a thing or two about comparing handwritten digits!
Let's see if some of that knowledge can be useful for the regular MNIST classification task.
Exercise: now create a new DNN by reusing and freezing the hidden layers of DNN A and adding a softmax output layer on top with 10 neurons. Train this network on split #2 and see if you can achieve high performance despite having only 500 images per class.
Let's create the model, it is pretty straightforward. There are many ways to freeze the lower layers, as explained in the book. In this example, we chose to use the `tf.stop_gradient()` function. Note that we need one `Saver` to restore the pretrained DNN A, and another `Saver` to save the final model:
# Start from a fresh default graph with fixed random seeds.
reset_graph()

n_inputs = 28 * 28 # MNIST
n_outputs = 10

# Single-image classifier inputs: flattened images and integer class labels.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

# Rebuild the hidden layers under the "DNN_A" name so that they can be
# matched by name when restoring (dnn() is defined earlier in the notebook).
dnn_outputs = dnn(X, name="DNN_A")
# Freeze DNN A: no gradient flows back through this tensor, so only the new
# output layer below gets trained.
frozen_outputs = tf.stop_gradient(dnn_outputs)

# New output layer with one logit per class.
logits = tf.layers.dense(frozen_outputs, n_outputs, kernel_initializer=he_init)
Y_proba = tf.nn.softmax(logits)

xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")

# learning_rate and momentum are defined earlier in the notebook.
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=True)
training_op = optimizer.minimize(loss)

# A prediction is correct when the true label has the highest logit.
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()

# restore_saver only covers the variables under the "DNN_A" scope, keyed by
# their op names; saver covers every variable, to save the final model.
dnn_A_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="DNN_A")
restore_saver = tf.train.Saver(var_list={var.op.name: var for var in dnn_A_vars})
saver = tf.train.Saver()
Now on to training! We first initialize all variables (including the variables in the new output layer), then we restore the pretrained DNN A. Next, we just train the model on the small MNIST dataset (containing just 5,000 images):
n_epochs = 100
batch_size = 50

with tf.Session() as sess:
    # Initialize everything first, then overwrite DNN A's variables with the
    # pretrained values from the digit comparison checkpoint.
    sess.run(init)
    restore_saver.restore(sess, "./my_digit_comparison_model.ckpt")

    for epoch in range(n_epochs):
        # Reshuffle the small training set and cut it into mini-batches.
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            X_batch = X_train2[rnd_indices]
            y_batch = y_train2[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        # Report test accuracy every 10 epochs.
        if epoch % 10 == 0:
            acc_test = sess.run(accuracy, feed_dict={X: X_test, y: y_test})
            print(epoch, "Test accuracy:", acc_test)

    save_path = saver.save(sess, "./my_mnist_model_final.ckpt")
INFO:tensorflow:Restoring parameters from ./my_digit_comparison_model.ckpt 0 Test accuracy: 0.9455 10 Test accuracy: 0.9634 20 Test accuracy: 0.9659 30 Test accuracy: 0.9656 40 Test accuracy: 0.9655 50 Test accuracy: 0.9656 60 Test accuracy: 0.9655 70 Test accuracy: 0.9656 80 Test accuracy: 0.9654 90 Test accuracy: 0.9654
Well, 96.5% accuracy, that's not the best MNIST model we have trained so far, but recall that we are only using a small training set (just 500 images per digit). Let's compare this result with the same DNN trained from scratch, without using transfer learning:
# Start from a fresh default graph with fixed random seeds.
reset_graph()

n_inputs = 28 * 28 # MNIST
n_outputs = 10

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

# Same architecture as the transfer-learning model, but the output layer is
# fed dnn_outputs directly (no tf.stop_gradient), so every layer is trained.
dnn_outputs = dnn(X, name="DNN_A")

logits = tf.layers.dense(dnn_outputs, n_outputs, kernel_initializer=he_init)
Y_proba = tf.nn.softmax(logits)

xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")

optimizer = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=True)
training_op = optimizer.minimize(loss)

# A prediction is correct when the true label has the highest logit.
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()

# NOTE(review): dnn_A_vars and restore_saver are built here but the training
# cell that follows never restores a checkpoint, so they appear to be unused.
dnn_A_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="DNN_A")
restore_saver = tf.train.Saver(var_list={var.op.name: var for var in dnn_A_vars})
saver = tf.train.Saver()
n_epochs = 150
batch_size = 50

with tf.Session() as sess:
    # Training from scratch: variables are only initialized, nothing is restored.
    sess.run(init)

    for epoch in range(n_epochs):
        # Reshuffle the small training set and cut it into mini-batches.
        rnd_idx = np.random.permutation(len(X_train2))
        for rnd_indices in np.array_split(rnd_idx, len(X_train2) // batch_size):
            X_batch = X_train2[rnd_indices]
            y_batch = y_train2[rnd_indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        # Report test accuracy every 10 epochs.
        if epoch % 10 == 0:
            acc_test = sess.run(accuracy, feed_dict={X: X_test, y: y_test})
            print(epoch, "Test accuracy:", acc_test)

    # NOTE(review): this is the same path the transfer-learning run saves to,
    # so that checkpoint gets overwritten by this baseline model.
    save_path = saver.save(sess, "./my_mnist_model_final.ckpt")
0 Test accuracy: 0.8694 10 Test accuracy: 0.9276 20 Test accuracy: 0.9299 30 Test accuracy: 0.935 40 Test accuracy: 0.942 50 Test accuracy: 0.9435 60 Test accuracy: 0.9442 70 Test accuracy: 0.9447 80 Test accuracy: 0.9448 90 Test accuracy: 0.945 100 Test accuracy: 0.945 110 Test accuracy: 0.9458 120 Test accuracy: 0.9456 130 Test accuracy: 0.9458 140 Test accuracy: 0.9458
Only 94.6% accuracy... So transfer learning helped us reduce the error rate from 5.4% to 3.5% (that's over 35% error reduction). Moreover, the model using transfer learning reached over 96% accuracy in less than 10 epochs.
Bottom line: transfer learning does not always work, but when it does it can make a big difference. So try it out!