In this post, we will introduce the most direct way of incorporating distribution object into a deep learning model with distribution lambda layer. This is the summary of lecture "Probabilistic Deep Learning with Tensorflow 2" from Imperial College London.
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import matplotlib.pyplot as plt
tfd = tfp.distributions
tfpl = tfp.layers
plt.rcParams['figure.figsize'] = (10, 6)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import RMSprop
print("Tensorflow Version: ", tf.__version__)
print("Tensorflow Probability Version: ", tfp.__version__)
Tensorflow Version: 2.5.0 Tensorflow Probability Version: 0.13.0
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
model = Sequential([
Dense(1, input_shape=(2, ))
])
model.compile(loss='mse', optimizer='rmsprop')
model.fit(X_train, y_train, epochs=10)
# y = <x, w> + b + eps
# eps ~ N(0, sigma^2)
# Likelihood
# theta = (w, b)
# theta^* = argmax_{theta} P(D | theta)
import tensorflow_probability as tfp
tfd = tfp.distributions
tfpl = tfp.layers
model = Sequential([
Dense(1, input_shape=(2, )),
# Final layer includes normal distribution
# Form is a sort of Lambda function, that can give the output as a mean
# in this case, output tensor is go through normal distribution
# And normal distriubiton is trainable
tfpl.DistributionLambda(
lambda t: tfd.Normal(loc=t, scale=1)
)
])
Negative Log-Likelihood Loss function
def nll(y_true, y_pred):
# Since y_pred is distribution object, we can call log_prob for sample data
return -y_pred.log_prob(y_true)
In this case, output will be tensor object, not distribution object.
model = Sequential([
Dense(1, input_shape=(2, )),
tfpl.DistributionLambda(
lambda t:tfd.Normal(loc=t, scale=1),
convert_to_tensor_fn=tfd.Distribution.sample
)
])
# tfd.Distribution.mean
# tfd.Distribution.mode
model = tf.keras.Sequential([
tf.keras.layers.Dense(8, input_shape=(12, )),
tfpl.DistributionLambda(lambda t: tfd.Bernoulli(logits=t))
])
model(np.ones((3, 12), dtype=np.float32)).sample(5)
<tf.Tensor: shape=(5, 3, 8), dtype=int32, numpy= array([[[0, 0, 0, 1, 1, 1, 1, 0], [0, 0, 0, 1, 1, 1, 1, 0], [0, 0, 0, 1, 1, 1, 1, 1]], [[0, 0, 0, 1, 0, 1, 1, 0], [0, 0, 0, 1, 0, 1, 1, 1], [1, 0, 0, 1, 0, 1, 1, 0]], [[0, 0, 0, 1, 0, 0, 1, 1], [1, 0, 0, 0, 1, 1, 0, 0], [0, 0, 0, 0, 0, 1, 1, 0]], [[1, 0, 0, 1, 1, 1, 0, 1], [0, 0, 0, 1, 1, 1, 0, 1], [0, 0, 0, 1, 1, 1, 1, 1]], [[1, 1, 0, 0, 1, 1, 0, 0], [1, 1, 0, 1, 1, 1, 1, 0], [0, 0, 0, 1, 1, 1, 1, 0]]], dtype=int32)>
Its input layer has 8 nodes and passes through the distributionLambda layer. So its event_shape has 8. And input data is entered as 3 batches. We generate 5 samples. As a result, the output tensor shape will be (5, 3, 8)
.
model.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense (Dense) (None, 8) 104 _________________________________________________________________ distribution_lambda (Distrib multiple 0 ================================================================= Total params: 104 Trainable params: 104 Non-trainable params: 0 _________________________________________________________________
DistributionLambda
layer¶Create a model whose first layer represents:
$$ y = \text{sigmoid}(x) = \frac{1}{1 + \exp(-x)}. $$# Create a sigmoid model, first deterministic, then probabilistic
model = Sequential([
Dense(units=1, input_shape=(1, ), activation='sigmoid',
kernel_initializer=tf.constant_initializer(1),
bias_initializer=tf.constant_initializer(0))
], name='deterministic_sigmoid_model')
model.summary()
Model: "deterministic_sigmoid_model" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense_4 (Dense) (None, 1) 2 ================================================================= Total params: 2 Trainable params: 2 Non-trainable params: 0 _________________________________________________________________
# Plot the deterministic model
X = np.linspace(-5, 5, 100)
plt.scatter(X, model.predict(X), alpha=0.4)
plt.plot(X, 1 / (1 + np.exp(-X)), color='r', alpha=0.8)
plt.title('Deterministic Sigmoid function')
plt.show()
# Create a constant input for this model
x = np.array([[0]])
x
array([[0]])
# Explore the feedforward object
y = model(x)
y
<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.5]], dtype=float32)>
# ... and its behavior under repeated calls
for _ in range(5):
print(model.predict(x))
[[0.5]] [[0.5]] [[0.5]] [[0.5]] [[0.5]]
model = Sequential([
Dense(units=1, input_shape=(1, ), activation='sigmoid',
kernel_initializer=tf.constant_initializer(1),
bias_initializer=tf.constant_initializer(0)),
tfpl.DistributionLambda(lambda t: tfd.Bernoulli(probs=t),
convert_to_tensor_fn=tfd.Distribution.sample)
], name='probabilistic_sigmoid_model')
model.summary()
Model: "probabilistic_sigmoid_model" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense_5 (Dense) (None, 1) 2 _________________________________________________________________ distribution_lambda_1 (Distr multiple 0 ================================================================= Total params: 2 Trainable params: 2 Non-trainable params: 0 _________________________________________________________________
# Plot the probabilistic model
X = np.linspace(-5, 5, 100)
plt.scatter(X, model.predict(X), alpha=0.4)
plt.plot(X, 1 / (1 + np.exp(-X)), color='r', alpha=0.8)
plt.title('Probabilistic Sigmoid function')
plt.show()
# Create a constant input for this model
x = np.array([[0]])
x
array([[0]])
# Explore the feedforward object
y = model(x)
y
<tfp.distributions.Bernoulli 'probabilistic_sigmoid_model_distribution_lambda_1_Bernoulli' batch_shape=[1, 1] event_shape=[] dtype=int32>
# ... and its behavior under repeated calls
for _ in range(5):
print(model.predict(x))
[[0]] [[0]] [[1]] [[0]] [[1]]
# Use the model to create 500 training points
X_train = np.linspace(-5, 5, 500)[:, np.newaxis]
y_train = model.predict(X_train)
# Plot the data and the mean of the distribution
plt.scatter(X_train, y_train, alpha=0.04, color='blue', label='samples')
plt.plot(X_train, model(X_train).mean().numpy().flatten(), color='red', alpha=0.8, label='mean')
plt.legend(loc='best')
plt.show()
model_untrained = Sequential([
Dense(units=1, input_shape=(1, ), activation='sigmoid',
kernel_initializer=tf.constant_initializer(2),
bias_initializer=tf.constant_initializer(2)),
tfpl.DistributionLambda(lambda t:tfd.Bernoulli(probs=t),
convert_to_tensor_fn=tfd.Distribution.sample)
], name='wrong_probabilistic_model')
model.summary()
Model: "probabilistic_sigmoid_model" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense_5 (Dense) (None, 1) 2 _________________________________________________________________ distribution_lambda_1 (Distr multiple 0 ================================================================= Total params: 2 Trainable params: 2 Non-trainable params: 0 _________________________________________________________________
# Define negative log likelihood,
def nll(y_true, y_pred):
return -y_pred.log_prob(y_true)
# Compile untrained model
model_untrained.compile(loss=nll, optimizer=RMSprop(learning_rate=1e-2))
# Train the model, record weights after each epoch
epochs = [0]
training_weights = [model_untrained.weights[0].numpy()[0, 0]]
training_bias = [model_untrained.weights[1].numpy()[0]]
for e in range(100):
model_untrained.fit(X_train, y_train, epochs=1, verbose=False)
epochs.append(e)
training_weights.append(model_untrained.weights[0].numpy()[0, 0])
training_bias.append(model_untrained.weights[1].numpy()[0])
# Plot the model weights as they train, converging to the correct values
plt.plot(epochs, training_weights, label='weight')
plt.plot(epochs, training_bias, label='bias')
plt.axhline(y=1, label='true_weight', color='k', linestyle=':')
plt.axhline(y=0, label='true_bias', color='k', linestyle='--')
plt.xlabel('Epoch')
plt.legend(loc='best')
plt.show()
This make sense that the training data is not perfect. It contains some errors itself. This kind of uncertainty is called epistemic uncertainty. We will cover it in later notebook.