Chapter 15 – Autoencoders
This notebook contains all the sample code and solutions to the exercises in chapter 15.
Warning: this is the code for the 1st edition of the book. Please visit https://github.com/ageron/handson-ml2 for the 2nd edition code, with up-to-date notebooks using the latest library versions. In particular, the 1st edition is based on TensorFlow 1, while the 2nd edition uses TensorFlow 2, which is much simpler to use.
First, let's make sure this notebook works well in both Python 2 and 3, import a few common modules, ensure Matplotlib plots figures inline, and prepare a function to save the figures:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals
# Common imports
import numpy as np
import os
import sys
try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 1.x
except Exception:
    pass
# to make this notebook's output stable across runs
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)
# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "autoencoders"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)
A couple of utility functions to plot grayscale 28×28 images:
def plot_image(image, shape=[28, 28]):
    plt.imshow(image.reshape(shape), cmap="Greys", interpolation="nearest")
    plt.axis("off")
def plot_multiple_images(images, n_rows, n_cols, pad=2):
    images = images - images.min()  # make the minimum == 0, so the padding looks white
    w, h = images.shape[1:]
    image = np.zeros(((w + pad) * n_rows + pad, (h + pad) * n_cols + pad))
    for y in range(n_rows):
        for x in range(n_cols):
            image[(y * (h + pad) + pad):(y * (h + pad) + pad + h), (x * (w + pad) + pad):(x * (w + pad) + pad + w)] = images[y * n_cols + x]
    plt.imshow(image, cmap="Greys", interpolation="nearest")
    plt.axis("off")
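For example, here is a quick, purely illustrative way to exercise these helpers on random noise (the fake images are meaningless, they just show the tiling):

# Illustrative only: 6 random 28x28 "images" arranged in a 2x3 grid
sample_images = np.random.rand(6, 28, 28)
plot_multiple_images(sample_images, n_rows=2, n_cols=3)
plt.show()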
Build 3D dataset:
import numpy.random as rnd
rnd.seed(4)
m = 200
w1, w2 = 0.1, 0.3
noise = 0.1
angles = rnd.rand(m) * 3 * np.pi / 2 - 0.5
data = np.empty((m, 3))
data[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * rnd.randn(m) / 2
data[:, 1] = np.sin(angles) * 0.7 + noise * rnd.randn(m) / 2
data[:, 2] = data[:, 0] * w1 + data[:, 1] * w2 + noise * rnd.randn(m)
Normalize the data (fit the scaler on the training set only, then use it to transform both sets):
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(data[:100])
X_test = scaler.transform(data[100:])
Now let's build the Autoencoder...
Note: instead of using the fully_connected() function from the tensorflow.contrib.layers module (as in the book), we now use the dense() function from the tf.layers module, which did not exist when this chapter was written. This is preferable because anything in contrib may change or be deleted without notice, while tf.layers is part of the official API. As you will see, the code is mostly the same.
The main differences relevant to this chapter are:
* the scope parameter was renamed to name, and the _fn suffix was removed from all the parameters that had it (for example, the activation_fn parameter was renamed to activation),
* the weights parameter was renamed to kernel, and the weights variable is now named "kernel" rather than "weights",
* the bias variable is now named "bias" rather than "biases",
* the default activation is now None instead of tf.nn.relu.
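To make these renames concrete, here is a side-by-side sketch (commented out since TensorFlow is only imported in the next cell; X and n_hidden are placeholders for illustration):

# Book's original style, using tensorflow.contrib.layers:
# hidden = tf.contrib.layers.fully_connected(X, n_hidden, activation_fn=tf.nn.relu, scope="hidden")
# Equivalent tf.layers style -- scope= becomes name=, activation_fn= becomes activation=,
# and since the default activation is now None, we must pass it explicitly:
# hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.relu, name="hidden")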
import tensorflow as tf
reset_graph()
n_inputs = 3
n_hidden = 2 # codings
n_outputs = n_inputs
learning_rate = 0.01
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden = tf.layers.dense(X, n_hidden)
outputs = tf.layers.dense(hidden, n_outputs)
reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(reconstruction_loss)
init = tf.global_variables_initializer()
n_iterations = 1000
codings = hidden
with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        training_op.run(feed_dict={X: X_train})
    codings_val = codings.eval(feed_dict={X: X_test})
fig = plt.figure(figsize=(4,3))
plt.plot(codings_val[:,0], codings_val[:, 1], "b.")
plt.xlabel("$z_1$", fontsize=18)
plt.ylabel("$z_2$", fontsize=18, rotation=0)
save_fig("linear_autoencoder_pca_plot")
plt.show()
Saving figure linear_autoencoder_pca_plot
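Note that since this autoencoder uses only linear activations and minimizes the MSE, it ends up performing PCA: the codings above are a projection of the data onto the same 2D subspace that PCA would find.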
Let's use MNIST:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/")
Extracting /tmp/data/train-images-idx3-ubyte.gz Extracting /tmp/data/train-labels-idx1-ubyte.gz Extracting /tmp/data/t10k-images-idx3-ubyte.gz Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
Let's build a stacked Autoencoder with 3 hidden layers and 1 output layer (i.e., two stacked Autoencoders). We will use ELU activation, He initialization, and L2 regularization.
Note: since the tf.layers.dense() function is incompatible with tf.contrib.layers.arg_scope() (which is used in the book), we now use Python's functools.partial() function instead. It makes it easy to create a my_dense_layer() function that just calls tf.layers.dense() with the desired parameters automatically set (unless they are overridden when calling my_dense_layer()).
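If you have not used functools.partial() before, here is a tiny self-contained illustration (the greet function is made up for the example):

from functools import partial

def greet(greeting, name):
    return "{}, {}!".format(greeting, name)

hello = partial(greet, "Hello")  # pre-binds the greeting argument
print(hello("world"))            # Hello, world!
print(hello(name="TensorFlow"))  # Hello, TensorFlow!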
reset_graph()
from functools import partial
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 150 # codings
n_hidden3 = n_hidden1
n_outputs = n_inputs
learning_rate = 0.01
l2_reg = 0.0001
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
he_init = tf.contrib.layers.variance_scaling_initializer() # He initialization
#Equivalent to:
#he_init = lambda shape, dtype=tf.float32: tf.truncated_normal(shape, 0., stddev=np.sqrt(2/shape[0]))
l2_regularizer = tf.contrib.layers.l2_regularizer(l2_reg)
my_dense_layer = partial(tf.layers.dense,
activation=tf.nn.elu,
kernel_initializer=he_init,
kernel_regularizer=l2_regularizer)
hidden1 = my_dense_layer(X, n_hidden1)
hidden2 = my_dense_layer(hidden1, n_hidden2)
hidden3 = my_dense_layer(hidden2, n_hidden3)
outputs = my_dense_layer(hidden3, n_outputs, activation=None)
reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))
reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
loss = tf.add_n([reconstruction_loss] + reg_losses)
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
init = tf.global_variables_initializer()
saver = tf.train.Saver() # not shown in the book
Now let's train it! Note that we don't feed target values (y_batch is not used): this is unsupervised training.
n_epochs = 5
batch_size = 150
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = mnist.train.num_examples // batch_size
        for iteration in range(n_batches):
            print("\r{}%".format(100 * iteration // n_batches), end="")  # not shown in the book
            sys.stdout.flush()  # not shown
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch})
        loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})  # not shown
        print("\r{}".format(epoch), "Train MSE:", loss_train)  # not shown
        saver.save(sess, "./my_model_all_layers.ckpt")  # not shown
0 Train MSE: 0.0204011
1 Train MSE: 0.0114192
2 Train MSE: 0.0102221
3 Train MSE: 0.00989991
4 Train MSE: 0.0103724
This function loads the model, evaluates it on the test set (measuring the reconstruction error), then displays the original images and their reconstructions:
def show_reconstructed_digits(X, outputs, model_path=None, n_test_digits=2):
    with tf.Session() as sess:
        if model_path:
            saver.restore(sess, model_path)
        X_test = mnist.test.images[:n_test_digits]
        outputs_val = outputs.eval(feed_dict={X: X_test})

    fig = plt.figure(figsize=(8, 3 * n_test_digits))
    for digit_index in range(n_test_digits):
        plt.subplot(n_test_digits, 2, digit_index * 2 + 1)
        plot_image(X_test[digit_index])
        plt.subplot(n_test_digits, 2, digit_index * 2 + 2)
        plot_image(outputs_val[digit_index])
show_reconstructed_digits(X, outputs, "./my_model_all_layers.ckpt")
save_fig("reconstruction_plot")
INFO:tensorflow:Restoring parameters from ./my_model_all_layers.ckpt
Saving figure reconstruction_plot
It is common to tie the weights of the encoder and the decoder (weights_decoder = tf.transpose(weights_encoder)). Unfortunately this makes it impossible (or very tricky) to use the tf.layers.dense() function, so we need to build the Autoencoder manually:
reset_graph()
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 150 # codings
n_hidden3 = n_hidden1
n_outputs = n_inputs
learning_rate = 0.01
l2_reg = 0.0005
activation = tf.nn.elu
regularizer = tf.contrib.layers.l2_regularizer(l2_reg)
initializer = tf.contrib.layers.variance_scaling_initializer()
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
weights1_init = initializer([n_inputs, n_hidden1])
weights2_init = initializer([n_hidden1, n_hidden2])
weights1 = tf.Variable(weights1_init, dtype=tf.float32, name="weights1")
weights2 = tf.Variable(weights2_init, dtype=tf.float32, name="weights2")
weights3 = tf.transpose(weights2, name="weights3") # tied weights
weights4 = tf.transpose(weights1, name="weights4") # tied weights
biases1 = tf.Variable(tf.zeros(n_hidden1), name="biases1")
biases2 = tf.Variable(tf.zeros(n_hidden2), name="biases2")
biases3 = tf.Variable(tf.zeros(n_hidden3), name="biases3")
biases4 = tf.Variable(tf.zeros(n_outputs), name="biases4")
hidden1 = activation(tf.matmul(X, weights1) + biases1)
hidden2 = activation(tf.matmul(hidden1, weights2) + biases2)
hidden3 = activation(tf.matmul(hidden2, weights3) + biases3)
outputs = tf.matmul(hidden3, weights4) + biases4
reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))
reg_loss = regularizer(weights1) + regularizer(weights2)
loss = reconstruction_loss + reg_loss
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_epochs = 5
batch_size = 150
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = mnist.train.num_examples // batch_size
        for iteration in range(n_batches):
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch})
        loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})
        print("\r{}".format(epoch), "Train MSE:", loss_train)
        saver.save(sess, "./my_model_tying_weights.ckpt")
0 Train MSE: 0.0150667
1 Train MSE: 0.0164884
2 Train MSE: 0.0173757
3 Train MSE: 0.0168781
4 Train MSE: 0.0155875
show_reconstructed_digits(X, outputs, "./my_model_tying_weights.ckpt")
INFO:tensorflow:Restoring parameters from ./my_model_tying_weights.ckpt
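Tying the weights this way halves the number of weight parameters, since the decoder just reuses transposed views of weights1 and weights2: this speeds up training and limits the risk of overfitting. Note that only weights1 and weights2 appear in the regularization term (weights3 and weights4 are transposes of the same variables), and that the biases are neither tied nor regularized.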
There are many ways to train one Autoencoder at a time. The first approach is to train each Autoencoder in its own graph, then to create the Stacked Autoencoder by simply initializing it with the weights and biases copied from these Autoencoders.
Let's create a function that will train one autoencoder and return the transformed training set (i.e., the output of the hidden layer) and the model parameters.
reset_graph()
from functools import partial
def train_autoencoder(X_train, n_neurons, n_epochs, batch_size,
                      learning_rate=0.01, l2_reg=0.0005, seed=42,
                      hidden_activation=tf.nn.elu,
                      output_activation=tf.nn.elu):
    graph = tf.Graph()
    with graph.as_default():
        tf.set_random_seed(seed)

        n_inputs = X_train.shape[1]

        X = tf.placeholder(tf.float32, shape=[None, n_inputs])

        my_dense_layer = partial(
            tf.layers.dense,
            kernel_initializer=tf.contrib.layers.variance_scaling_initializer(),
            kernel_regularizer=tf.contrib.layers.l2_regularizer(l2_reg))

        hidden = my_dense_layer(X, n_neurons, activation=hidden_activation, name="hidden")
        outputs = my_dense_layer(hidden, n_inputs, activation=output_activation, name="outputs")

        reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))

        reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        loss = tf.add_n([reconstruction_loss] + reg_losses)

        optimizer = tf.train.AdamOptimizer(learning_rate)
        training_op = optimizer.minimize(loss)

        init = tf.global_variables_initializer()

    with tf.Session(graph=graph) as sess:
        init.run()
        for epoch in range(n_epochs):
            n_batches = len(X_train) // batch_size
            for iteration in range(n_batches):
                print("\r{}%".format(100 * iteration // n_batches), end="")
                sys.stdout.flush()
                indices = rnd.permutation(len(X_train))[:batch_size]
                X_batch = X_train[indices]
                sess.run(training_op, feed_dict={X: X_batch})
            loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})
            print("\r{}".format(epoch), "Train MSE:", loss_train)
        params = dict([(var.name, var.eval()) for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)])
        hidden_val = hidden.eval(feed_dict={X: X_train})
        return hidden_val, params["hidden/kernel:0"], params["hidden/bias:0"], params["outputs/kernel:0"], params["outputs/bias:0"]
Now let's train two Autoencoders. The first one is trained on the training data, and the second is trained on the previous Autoencoder's hidden layer output:
hidden_output, W1, b1, W4, b4 = train_autoencoder(mnist.train.images, n_neurons=300, n_epochs=4, batch_size=150,
output_activation=None)
_, W2, b2, W3, b3 = train_autoencoder(hidden_output, n_neurons=150, n_epochs=4, batch_size=150)
0 Train MSE: 0.0185175
1 Train MSE: 0.0186825
2 Train MSE: 0.0184675
3 Train MSE: 0.0192315
0 Train MSE: 0.00423611
1 Train MSE: 0.00483268
2 Train MSE: 0.00466874
3 Train MSE: 0.0044039
Finally, we can create a Stacked Autoencoder by simply reusing the weights and biases from the Autoencoders we just trained:
reset_graph()
n_inputs = 28*28
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden1 = tf.nn.elu(tf.matmul(X, W1) + b1)
hidden2 = tf.nn.elu(tf.matmul(hidden1, W2) + b2)
hidden3 = tf.nn.elu(tf.matmul(hidden2, W3) + b3)
outputs = tf.matmul(hidden3, W4) + b4
show_reconstructed_digits(X, outputs)
Another approach is to use a single graph. To do this, we create the graph for the full Stacked Autoencoder, but then we also add operations to train each Autoencoder independently: phase 1 trains the bottom and top layer (i.e., the first Autoencoder) and phase 2 trains the two middle layers (i.e., the second Autoencoder).
reset_graph()
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 150 # codings
n_hidden3 = n_hidden1
n_outputs = n_inputs
learning_rate = 0.01
l2_reg = 0.0001
activation = tf.nn.elu
regularizer = tf.contrib.layers.l2_regularizer(l2_reg)
initializer = tf.contrib.layers.variance_scaling_initializer()
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
weights1_init = initializer([n_inputs, n_hidden1])
weights2_init = initializer([n_hidden1, n_hidden2])
weights3_init = initializer([n_hidden2, n_hidden3])
weights4_init = initializer([n_hidden3, n_outputs])
weights1 = tf.Variable(weights1_init, dtype=tf.float32, name="weights1")
weights2 = tf.Variable(weights2_init, dtype=tf.float32, name="weights2")
weights3 = tf.Variable(weights3_init, dtype=tf.float32, name="weights3")
weights4 = tf.Variable(weights4_init, dtype=tf.float32, name="weights4")
biases1 = tf.Variable(tf.zeros(n_hidden1), name="biases1")
biases2 = tf.Variable(tf.zeros(n_hidden2), name="biases2")
biases3 = tf.Variable(tf.zeros(n_hidden3), name="biases3")
biases4 = tf.Variable(tf.zeros(n_outputs), name="biases4")
hidden1 = activation(tf.matmul(X, weights1) + biases1)
hidden2 = activation(tf.matmul(hidden1, weights2) + biases2)
hidden3 = activation(tf.matmul(hidden2, weights3) + biases3)
outputs = tf.matmul(hidden3, weights4) + biases4
reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))
optimizer = tf.train.AdamOptimizer(learning_rate)
with tf.name_scope("phase1"):
    phase1_outputs = tf.matmul(hidden1, weights4) + biases4  # bypass hidden2 and hidden3
    phase1_reconstruction_loss = tf.reduce_mean(tf.square(phase1_outputs - X))
    phase1_reg_loss = regularizer(weights1) + regularizer(weights4)
    phase1_loss = phase1_reconstruction_loss + phase1_reg_loss
    phase1_training_op = optimizer.minimize(phase1_loss)
with tf.name_scope("phase2"):
    phase2_reconstruction_loss = tf.reduce_mean(tf.square(hidden3 - hidden1))
    phase2_reg_loss = regularizer(weights2) + regularizer(weights3)
    phase2_loss = phase2_reconstruction_loss + phase2_reg_loss
    train_vars = [weights2, biases2, weights3, biases3]
    phase2_training_op = optimizer.minimize(phase2_loss, var_list=train_vars)  # freeze hidden1
init = tf.global_variables_initializer()
saver = tf.train.Saver()
training_ops = [phase1_training_op, phase2_training_op]
reconstruction_losses = [phase1_reconstruction_loss, phase2_reconstruction_loss]
n_epochs = [4, 4]
batch_sizes = [150, 150]
with tf.Session() as sess:
    init.run()
    for phase in range(2):
        print("Training phase #{}".format(phase + 1))
        for epoch in range(n_epochs[phase]):
            n_batches = mnist.train.num_examples // batch_sizes[phase]
            for iteration in range(n_batches):
                print("\r{}%".format(100 * iteration // n_batches), end="")
                sys.stdout.flush()
                X_batch, y_batch = mnist.train.next_batch(batch_sizes[phase])
                sess.run(training_ops[phase], feed_dict={X: X_batch})
            loss_train = reconstruction_losses[phase].eval(feed_dict={X: X_batch})
            print("\r{}".format(epoch), "Train MSE:", loss_train)
    saver.save(sess, "./my_model_one_at_a_time.ckpt")
    loss_test = reconstruction_loss.eval(feed_dict={X: mnist.test.images})
    print("Test MSE:", loss_test)
Training phase #1
0 Train MSE: 0.00740679
1 Train MSE: 0.00782866
2 Train MSE: 0.00772802
3 Train MSE: 0.00740893
Training phase #2
0 Train MSE: 0.295499
1 Train MSE: 0.00594454
2 Train MSE: 0.00310264
3 Train MSE: 0.00249803
Test MSE: 0.00979144

Since hidden layer 1 is frozen during phase 2, its output for any given training instance is always the same, so we can compute it once for the whole training set and train phase 2 directly on this cache, which gives a nice performance boost:
training_ops = [phase1_training_op, phase2_training_op]
reconstruction_losses = [phase1_reconstruction_loss, phase2_reconstruction_loss]
n_epochs = [4, 4]
batch_sizes = [150, 150]
with tf.Session() as sess:
    init.run()
    for phase in range(2):
        print("Training phase #{}".format(phase + 1))
        if phase == 1:
            hidden1_cache = hidden1.eval(feed_dict={X: mnist.train.images})
        for epoch in range(n_epochs[phase]):
            n_batches = mnist.train.num_examples // batch_sizes[phase]
            for iteration in range(n_batches):
                print("\r{}%".format(100 * iteration // n_batches), end="")
                sys.stdout.flush()
                if phase == 1:
                    indices = rnd.permutation(mnist.train.num_examples)
                    hidden1_batch = hidden1_cache[indices[:batch_sizes[phase]]]
                    feed_dict = {hidden1: hidden1_batch}
                    sess.run(training_ops[phase], feed_dict=feed_dict)
                else:
                    X_batch, y_batch = mnist.train.next_batch(batch_sizes[phase])
                    feed_dict = {X: X_batch}
                    sess.run(training_ops[phase], feed_dict=feed_dict)
            loss_train = reconstruction_losses[phase].eval(feed_dict=feed_dict)
            print("\r{}".format(epoch), "Train MSE:", loss_train)
    saver.save(sess, "./my_model_cache_frozen.ckpt")
    loss_test = reconstruction_loss.eval(feed_dict={X: mnist.test.images})
    print("Test MSE:", loss_test)
Training phase #1
0 Train MSE: 0.00753817
1 Train MSE: 0.00775457
2 Train MSE: 0.00734359
3 Train MSE: 0.00783768
Training phase #2
0 Train MSE: 0.200137
1 Train MSE: 0.00520852
2 Train MSE: 0.00259211
3 Train MSE: 0.00210128
Test MSE: 0.0097786
n_test_digits = 2
X_test = mnist.test.images[:n_test_digits]
with tf.Session() as sess:
    saver.restore(sess, "./my_model_one_at_a_time.ckpt")  # not shown in the book
    outputs_val = outputs.eval(feed_dict={X: X_test})
def plot_image(image, shape=[28, 28]):
    plt.imshow(image.reshape(shape), cmap="Greys", interpolation="nearest")
    plt.axis("off")

for digit_index in range(n_test_digits):
    plt.subplot(n_test_digits, 2, digit_index * 2 + 1)
    plot_image(X_test[digit_index])
    plt.subplot(n_test_digits, 2, digit_index * 2 + 2)
    plot_image(outputs_val[digit_index])
INFO:tensorflow:Restoring parameters from ./my_model_one_at_a_time.ckpt

Let's also visualize the features learned by the first hidden layer, by plotting each neuron's input weights as a 28×28 image:
with tf.Session() as sess:
    saver.restore(sess, "./my_model_one_at_a_time.ckpt")  # not shown in the book
    weights1_val = weights1.eval()

for i in range(5):
    plt.subplot(1, 5, i + 1)
    plot_image(weights1_val.T[i])

save_fig("extracted_features_plot")  # not shown
plt.show()  # not shown
INFO:tensorflow:Restoring parameters from ./my_model_one_at_a_time.ckpt
Saving figure extracted_features_plot
Let's create a small neural network for MNIST classification:
reset_graph()
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 150
n_outputs = 10
learning_rate = 0.01
l2_reg = 0.0005
activation = tf.nn.elu
regularizer = tf.contrib.layers.l2_regularizer(l2_reg)
initializer = tf.contrib.layers.variance_scaling_initializer()
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
y = tf.placeholder(tf.int32, shape=[None])
weights1_init = initializer([n_inputs, n_hidden1])
weights2_init = initializer([n_hidden1, n_hidden2])
weights3_init = initializer([n_hidden2, n_outputs])
weights1 = tf.Variable(weights1_init, dtype=tf.float32, name="weights1")
weights2 = tf.Variable(weights2_init, dtype=tf.float32, name="weights2")
weights3 = tf.Variable(weights3_init, dtype=tf.float32, name="weights3")
biases1 = tf.Variable(tf.zeros(n_hidden1), name="biases1")
biases2 = tf.Variable(tf.zeros(n_hidden2), name="biases2")
biases3 = tf.Variable(tf.zeros(n_outputs), name="biases3")
hidden1 = activation(tf.matmul(X, weights1) + biases1)
hidden2 = activation(tf.matmul(hidden1, weights2) + biases2)
logits = tf.matmul(hidden2, weights3) + biases3
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
reg_loss = regularizer(weights1) + regularizer(weights2) + regularizer(weights3)
loss = cross_entropy + reg_loss
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()
pretrain_saver = tf.train.Saver([weights1, weights2, biases1, biases2])
saver = tf.train.Saver()
Regular training (without pretraining):
n_epochs = 4
batch_size = 150
n_labeled_instances = 20000
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = n_labeled_instances // batch_size
        for iteration in range(n_batches):
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()
            indices = rnd.permutation(n_labeled_instances)[:batch_size]
            X_batch, y_batch = mnist.train.images[indices], mnist.train.labels[indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        print("\r{}".format(epoch), "Train accuracy:", accuracy_val, end=" ")
        saver.save(sess, "./my_model_supervised.ckpt")
        accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels})
        print("Test accuracy:", accuracy_val)
0 Train accuracy: 0.973333 Test accuracy: 0.9334
1 Train accuracy: 0.98 Test accuracy: 0.936
2 Train accuracy: 0.973333 Test accuracy: 0.9382
3 Train accuracy: 0.986667 Test accuracy: 0.9494
Now let's reuse the first two layers of the autoencoder we pretrained:
n_epochs = 4
batch_size = 150
n_labeled_instances = 20000
#training_op = optimizer.minimize(loss, var_list=[weights3, biases3]) # Freeze layers 1 and 2 (optional)
with tf.Session() as sess:
    init.run()
    pretrain_saver.restore(sess, "./my_model_cache_frozen.ckpt")
    for epoch in range(n_epochs):
        n_batches = n_labeled_instances // batch_size
        for iteration in range(n_batches):
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()
            indices = rnd.permutation(n_labeled_instances)[:batch_size]
            X_batch, y_batch = mnist.train.images[indices], mnist.train.labels[indices]
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        print("\r{}".format(epoch), "Train accuracy:", accuracy_val, end="\t")
        saver.save(sess, "./my_model_supervised_pretrained.ckpt")
        accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels})
        print("Test accuracy:", accuracy_val)
INFO:tensorflow:Restoring parameters from ./my_model_cache_frozen.ckpt
0 Train accuracy: 0.94 Test accuracy: 0.9266
1 Train accuracy: 0.98 Test accuracy: 0.94
2 Train accuracy: 1.0 Test accuracy: 0.946
3 Train accuracy: 0.98 Test accuracy: 0.9401
Note: the book uses tf.contrib.layers.dropout() rather than tf.layers.dropout() (which did not exist when this chapter was written). It is now preferable to use tf.layers.dropout(), because anything in the contrib module may change or be deleted without notice. The tf.layers.dropout() function is almost identical to the tf.contrib.layers.dropout() function, except for a few minor differences. Most importantly:
* you must specify the dropout rate (rate) rather than the keep probability (keep_prob), where rate is simply equal to 1 - keep_prob,
* the is_training parameter was renamed to training.
Using Gaussian noise:
reset_graph()
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 150 # codings
n_hidden3 = n_hidden1
n_outputs = n_inputs
learning_rate = 0.01
noise_level = 1.0
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
X_noisy = X + noise_level * tf.random_normal(tf.shape(X))
hidden1 = tf.layers.dense(X_noisy, n_hidden1, activation=tf.nn.relu,
name="hidden1")
hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, # not shown in the book
name="hidden2") # not shown
hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, # not shown
name="hidden3") # not shown
outputs = tf.layers.dense(hidden3, n_outputs, name="outputs") # not shown
reconstruction_loss = tf.reduce_mean(tf.square(outputs - X)) # MSE
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(reconstruction_loss)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_epochs = 10
batch_size = 150
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = mnist.train.num_examples // batch_size
        for iteration in range(n_batches):
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch})
        loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})
        print("\r{}".format(epoch), "Train MSE:", loss_train)
        saver.save(sess, "./my_model_stacked_denoising_gaussian.ckpt")
0 Train MSE: 0.0440489
1 Train MSE: 0.0432517
2 Train MSE: 0.042057
3 Train MSE: 0.0409477
4 Train MSE: 0.0402107
5 Train MSE: 0.0388787
6 Train MSE: 0.0391096
7 Train MSE: 0.0421885
8 Train MSE: 0.0398648
9 Train MSE: 0.0408181
Using dropout:
reset_graph()
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 150 # codings
n_hidden3 = n_hidden1
n_outputs = n_inputs
learning_rate = 0.01
dropout_rate = 0.3
training = tf.placeholder_with_default(False, shape=(), name='training')
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
X_drop = tf.layers.dropout(X, dropout_rate, training=training)
hidden1 = tf.layers.dense(X_drop, n_hidden1, activation=tf.nn.relu,
name="hidden1")
hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, # not shown in the book
name="hidden2") # not shown
hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, # not shown
name="hidden3") # not shown
outputs = tf.layers.dense(hidden3, n_outputs, name="outputs") # not shown
reconstruction_loss = tf.reduce_mean(tf.square(outputs - X)) # MSE
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(reconstruction_loss)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_epochs = 10
batch_size = 150
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = mnist.train.num_examples // batch_size
        for iteration in range(n_batches):
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch, training: True})
        loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})
        print("\r{}".format(epoch), "Train MSE:", loss_train)
        saver.save(sess, "./my_model_stacked_denoising_dropout.ckpt")
0 Train MSE: 0.0296476
1 Train MSE: 0.0275545
2 Train MSE: 0.0250731
3 Train MSE: 0.0254317
4 Train MSE: 0.0249076
5 Train MSE: 0.0250501
6 Train MSE: 0.024483
7 Train MSE: 0.0251505
8 Train MSE: 0.0243836
9 Train MSE: 0.0242349
show_reconstructed_digits(X, outputs, "./my_model_stacked_denoising_dropout.ckpt")
INFO:tensorflow:Restoring parameters from ./my_model_stacked_denoising_dropout.ckpt

Now let's move on to sparse Autoencoders. First, let's compare the KL divergence and the MSE as penalties for deviating from a target sparsity of 0.1:
p = 0.1
q = np.linspace(0.001, 0.999, 500)
kl_div = p * np.log(p / q) + (1 - p) * np.log((1 - p) / (1 - q))
mse = (p - q)**2
plt.plot([p, p], [0, 0.3], "k:")
plt.text(0.05, 0.32, "Target\nsparsity", fontsize=14)
plt.plot(q, kl_div, "b-", label="KL divergence")
plt.plot(q, mse, "r--", label="MSE")
plt.legend(loc="upper left")
plt.xlabel("Actual sparsity")
plt.ylabel("Cost", rotation=0)
plt.axis([0, 1, 0, 0.95])
save_fig("sparsity_loss_plot")
Saving figure sparsity_loss_plot
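For reference, the KL divergence plotted above is the divergence between two Bernoulli distributions: one with mean $p$ (the target sparsity) and one with mean $q$ (the actual mean activation of a neuron over the batch). This is exactly what the code computes:

$$ D_{\mathrm{KL}}(p \,\|\, q) = p \log \frac{p}{q} + (1 - p) \log \frac{1 - p}{1 - q} $$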
reset_graph()
n_inputs = 28 * 28
n_hidden1 = 1000 # sparse codings
n_outputs = n_inputs
def kl_divergence(p, q):
    # Kullback-Leibler divergence
    return p * tf.log(p / q) + (1 - p) * tf.log((1 - p) / (1 - q))
learning_rate = 0.01
sparsity_target = 0.1
sparsity_weight = 0.2
X = tf.placeholder(tf.float32, shape=[None, n_inputs]) # not shown in the book
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.sigmoid) # not shown
outputs = tf.layers.dense(hidden1, n_outputs) # not shown
hidden1_mean = tf.reduce_mean(hidden1, axis=0) # batch mean
sparsity_loss = tf.reduce_sum(kl_divergence(sparsity_target, hidden1_mean))
reconstruction_loss = tf.reduce_mean(tf.square(outputs - X)) # MSE
loss = reconstruction_loss + sparsity_weight * sparsity_loss
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_epochs = 100
batch_size = 1000
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = mnist.train.num_examples // batch_size
        for iteration in range(n_batches):
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch})
        reconstruction_loss_val, sparsity_loss_val, loss_val = sess.run([reconstruction_loss, sparsity_loss, loss], feed_dict={X: X_batch})
        print("\r{}".format(epoch), "Train MSE:", reconstruction_loss_val, "\tSparsity loss:", sparsity_loss_val, "\tTotal loss:", loss_val)
        saver.save(sess, "./my_model_sparse.ckpt")
0 Train MSE: 0.134832   Sparsity loss: 0.421739   Total loss: 0.21918
1 Train MSE: 0.0587859   Sparsity loss: 0.0108979   Total loss: 0.0609655
2 Train MSE: 0.053738   Sparsity loss: 0.0201038   Total loss: 0.0577588
3 Train MSE: 0.0476169   Sparsity loss: 0.0399679   Total loss: 0.0556105
4 Train MSE: 0.0447499   Sparsity loss: 0.0116199   Total loss: 0.0470739
[... epochs 5 to 96 omitted ...]
97 Train MSE: 0.0134663   Sparsity loss: 0.102045   Total loss: 0.0338754
98 Train MSE: 0.013678   Sparsity loss: 0.0839055   Total loss: 0.0304591
99 Train MSE: 0.0245401   Sparsity loss: 0.335841   Total loss: 0.0917084
show_reconstructed_digits(X, outputs, "./my_model_sparse.ckpt")
INFO:tensorflow:Restoring parameters from ./my_model_sparse.ckpt
Note that the coding layer must output values from 0 to 1, which is why we use the sigmoid activation function:
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.sigmoid)
To speed up training, you can normalize the inputs between 0 and 1, and use the cross entropy instead of the MSE for the cost function:
logits = tf.layers.dense(hidden1, n_outputs)
outputs = tf.nn.sigmoid(logits)
xentropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=X, logits=logits)
reconstruction_loss = tf.reduce_mean(xentropy)
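Now let's build a variational autoencoder. The encoder outputs both a mean coding hidden3_mean and a standard deviation hidden3_sigma, and an actual coding is sampled by adding Gaussian noise scaled by hidden3_sigma to hidden3_mean. The latent loss in the code below is the KL divergence that pushes the distribution of the codings toward a standard Gaussian:

$$ \text{latent loss} = \frac{1}{2} \sum_i \left( \sigma_i^2 + \mu_i^2 - 1 - \log(\epsilon + \sigma_i^2) \right) $$

where $\mu_i$ and $\sigma_i$ are the components of hidden3_mean and hidden3_sigma, and $\epsilon$ is a small smoothing term to avoid computing $\log 0$.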
reset_graph()
from functools import partial
n_inputs = 28 * 28
n_hidden1 = 500
n_hidden2 = 500
n_hidden3 = 20 # codings
n_hidden4 = n_hidden2
n_hidden5 = n_hidden1
n_outputs = n_inputs
learning_rate = 0.001
initializer = tf.contrib.layers.variance_scaling_initializer()
my_dense_layer = partial(
tf.layers.dense,
activation=tf.nn.elu,
kernel_initializer=initializer)
X = tf.placeholder(tf.float32, [None, n_inputs])
hidden1 = my_dense_layer(X, n_hidden1)
hidden2 = my_dense_layer(hidden1, n_hidden2)
hidden3_mean = my_dense_layer(hidden2, n_hidden3, activation=None)
hidden3_sigma = my_dense_layer(hidden2, n_hidden3, activation=None)
noise = tf.random_normal(tf.shape(hidden3_sigma), dtype=tf.float32)
hidden3 = hidden3_mean + hidden3_sigma * noise
hidden4 = my_dense_layer(hidden3, n_hidden4)
hidden5 = my_dense_layer(hidden4, n_hidden5)
logits = my_dense_layer(hidden5, n_outputs, activation=None)
outputs = tf.sigmoid(logits)
xentropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=X, logits=logits)
reconstruction_loss = tf.reduce_sum(xentropy)
eps = 1e-10 # smoothing term to avoid computing log(0) which is NaN
latent_loss = 0.5 * tf.reduce_sum(
tf.square(hidden3_sigma) + tf.square(hidden3_mean)
- 1 - tf.log(eps + tf.square(hidden3_sigma)))
loss = reconstruction_loss + latent_loss
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_epochs = 50
batch_size = 150
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = mnist.train.num_examples // batch_size
        for iteration in range(n_batches):
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch})
        loss_val, reconstruction_loss_val, latent_loss_val = sess.run([loss, reconstruction_loss, latent_loss], feed_dict={X: X_batch})
        print("\r{}".format(epoch), "Train total loss:", loss_val, "\tReconstruction loss:", reconstruction_loss_val, "\tLatent loss:", latent_loss_val)
        saver.save(sess, "./my_model_variational.ckpt")
0 Train total loss: 32440.1   Reconstruction loss: 25031.5   Latent loss: 7408.61
1 Train total loss: 30017.4   Reconstruction loss: 23093.3   Latent loss: 6924.14
2 Train total loss: 23337.9   Reconstruction loss: 20221.0   Latent loss: 3116.88
3 Train total loss: 21724.7   Reconstruction loss: 18698.8   Latent loss: 3025.89
4 Train total loss: 28219.0   Reconstruction loss: 21493.3   Latent loss: 6725.66
[... epochs 5 to 47 omitted ...]
48 Train total loss: 20627.8   Reconstruction loss: 16485.7   Latent loss: 4142.07
49 Train total loss: 15147.4   Reconstruction loss: 11587.1   Latent loss: 3560.29
reset_graph()
from functools import partial
n_inputs = 28 * 28
n_hidden1 = 500
n_hidden2 = 500
n_hidden3 = 20 # codings
n_hidden4 = n_hidden2
n_hidden5 = n_hidden1
n_outputs = n_inputs
learning_rate = 0.001
initializer = tf.contrib.layers.variance_scaling_initializer()
my_dense_layer = partial(
tf.layers.dense,
activation=tf.nn.elu,
kernel_initializer=initializer)
X = tf.placeholder(tf.float32, [None, n_inputs])
hidden1 = my_dense_layer(X, n_hidden1)
hidden2 = my_dense_layer(hidden1, n_hidden2)
hidden3_mean = my_dense_layer(hidden2, n_hidden3, activation=None)
hidden3_gamma = my_dense_layer(hidden2, n_hidden3, activation=None)
noise = tf.random_normal(tf.shape(hidden3_gamma), dtype=tf.float32)
hidden3 = hidden3_mean + tf.exp(0.5 * hidden3_gamma) * noise
hidden4 = my_dense_layer(hidden3, n_hidden4)
hidden5 = my_dense_layer(hidden4, n_hidden5)
logits = my_dense_layer(hidden5, n_outputs, activation=None)
outputs = tf.sigmoid(logits)
xentropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=X, logits=logits)
reconstruction_loss = tf.reduce_sum(xentropy)
latent_loss = 0.5 * tf.reduce_sum(
tf.exp(hidden3_gamma) + tf.square(hidden3_mean) - 1 - hidden3_gamma)
loss = reconstruction_loss + latent_loss
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
Let's train the model and generate a few random digits:
import numpy as np
n_digits = 60
n_epochs = 50
batch_size = 150
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = mnist.train.num_examples // batch_size
        for iteration in range(n_batches):
            print("\r{}%".format(100 * iteration // n_batches), end="")  # not shown in the book
            sys.stdout.flush()  # not shown
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch})
        loss_val, reconstruction_loss_val, latent_loss_val = sess.run([loss, reconstruction_loss, latent_loss], feed_dict={X: X_batch})  # not shown
        print("\r{}".format(epoch), "Train total loss:", loss_val, "\tReconstruction loss:", reconstruction_loss_val, "\tLatent loss:", latent_loss_val)  # not shown
        saver.save(sess, "./my_model_variational.ckpt")  # not shown
    codings_rnd = np.random.normal(size=[n_digits, n_hidden3])
    outputs_val = outputs.eval(feed_dict={hidden3: codings_rnd})
0 Train total loss: 17792.6   Reconstruction loss: 14122.9   Latent loss: 3669.64
1 Train total loss: 17332.2   Reconstruction loss: 13560.0   Latent loss: 3772.24
2 Train total loss: 16350.7   Reconstruction loss: 12579.3   Latent loss: 3771.48
3 Train total loss: 16581.4   Reconstruction loss: 12810.6   Latent loss: 3770.78
4 Train total loss: 16223.9   Reconstruction loss: 12450.0   Latent loss: 3773.86
[... epochs 5 to 47 omitted ...]
48 Train total loss: 14479.2   Reconstruction loss: 10864.3   Latent loss: 3614.88
49 Train total loss: 14637.6   Reconstruction loss: 10926.0   Latent loss: 3711.55
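Since the latent loss pushed the codings toward a standard Gaussian during training, we can generate brand new digits simply by sampling random codings from a standard normal distribution (the np.random.normal() call above) and feeding them to the decoder.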
plt.figure(figsize=(8, 50))  # not shown in the book

for iteration in range(n_digits):
    plt.subplot(n_digits, 10, iteration + 1)
    plot_image(outputs_val[iteration])
n_rows = 6
n_cols = 10
plot_multiple_images(outputs_val.reshape(-1, 28, 28), n_rows, n_cols)
save_fig("generated_digits_plot")
plt.show()
Saving figure generated_digits_plot
Note that the latent loss is computed differently in this second variant:
latent_loss = 0.5 * tf.reduce_sum(
tf.exp(hidden3_gamma) + tf.square(hidden3_mean) - 1 - hidden3_gamma)
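Here hidden3_gamma represents $\gamma = \log(\sigma^2)$, which yields this equivalent but more numerically stable formula, with no need for a smoothing term:

$$ \text{latent loss} = \frac{1}{2} \sum_i \left( e^{\gamma_i} + \mu_i^2 - 1 - \gamma_i \right) $$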
Encode:
n_digits = 3
X_test, y_test = mnist.test.next_batch(batch_size)
codings = hidden3
with tf.Session() as sess:
    saver.restore(sess, "./my_model_variational.ckpt")
    codings_val = codings.eval(feed_dict={X: X_test})
INFO:tensorflow:Restoring parameters from ./my_model_variational.ckpt
Decode:
with tf.Session() as sess:
    saver.restore(sess, "./my_model_variational.ckpt")
    outputs_val = outputs.eval(feed_dict={codings: codings_val})
INFO:tensorflow:Restoring parameters from ./my_model_variational.ckpt
Let's plot the reconstructions:
fig = plt.figure(figsize=(8, 2.5 * n_digits))
for iteration in range(n_digits):
    plt.subplot(n_digits, 2, 1 + 2 * iteration)
    plot_image(X_test[iteration])
    plt.subplot(n_digits, 2, 2 + 2 * iteration)
    plot_image(outputs_val[iteration])
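Finally, let's interpolate linearly between pairs of random codings, computing $z = z_{\text{start}} + t \, (z_{\text{target}} - z_{\text{start}})$ for $t$ going from 0 to 1, and decode each intermediate coding to watch one generated digit morph into another: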
n_iterations = 3
n_digits = 6
codings_rnd = np.random.normal(size=[n_digits, n_hidden3])
with tf.Session() as sess:
    saver.restore(sess, "./my_model_variational.ckpt")
    target_codings = np.roll(codings_rnd, -1, axis=0)
    for iteration in range(n_iterations + 1):
        codings_interpolate = codings_rnd + (target_codings - codings_rnd) * iteration / n_iterations
        outputs_val = outputs.eval(feed_dict={codings: codings_interpolate})
        plt.figure(figsize=(11, 1.5 * n_iterations))
        for digit_index in range(n_digits):
            plt.subplot(1, n_digits, digit_index + 1)
            plot_image(outputs_val[digit_index])
        plt.show()
INFO:tensorflow:Restoring parameters from ./my_model_variational.ckpt
Coming soon...