import numpy as np
# Define the true weights and bias of the model
w_true = np.array([2, -3.4])
b_true = 4.2
# Construct a random generator, seeded for reproducibility
rng = np.random.default_rng(seed=0)
# Generate the inputs (from a standard normal distribution) and outputs (with some Gaussian noise)
number_examples = 1000
input_size = len(w_true)
X = rng.normal(0, 1, (number_examples, input_size))
y = np.matmul(X, w_true) + b_true + rng.normal(0, 0.01, number_examples)
# Define the parameters for the training
number_epochs = 3
batch_size = 10
learning_rate = 0.03
# Initialize the weights and bias to recover
w = rng.normal(0, 1, input_size)
b = 0
# Initialize an array for the mean loss over the minibatches of every epoch
epoch_loss = np.zeros(number_epochs)
# Loop over the epochs
for i in range(number_epochs):
# Generate random indices for all the examples
example_indices = np.arange(number_examples)
rng.shuffle(example_indices)
# Initialize a list for the mean loss over the examples of every minibatch
batch_loss = []
# Loop over the examples in minibatches
for j in np.arange(0, number_examples, batch_size):
# Get the indices of the examples for one minibatch
batch_indices = example_indices[j:min(j+batch_size, number_examples)]
# Get the inputs and outputs for the current minibatch
X_batch = X[batch_indices, :]
y_batch = y[batch_indices]
# Compute the predicted outputs
y_hat = np.matmul(X_batch, w) + b
# Compute the loss between the predicted and true outputs
l = 0.5*np.power(y_hat-y_batch, 2)
# Save the mean loss for the current minibatch
batch_loss.append(np.mean(l))
# Update the weights and bias using stochastic gradient descent (SGD)
w = w - learning_rate*np.mean(X_batch*(y_hat-y_batch)[:, None], axis=0)
b = b - learning_rate*np.mean(y_hat-y_batch, axis=0)
# Save the mean loss for the current epoch
epoch_loss[i] = np.mean(batch_loss)
# Print the progress
print(f'{i+1}/{number_epochs}: {epoch_loss[i]}')
# Print the predicted weights and bias
print('')
print(f'w = {w}')
print(f'b = {b}')
1/3: 2.8028630762446722 2/3: 0.005944852663986381 3/3: 6.104357869966443e-05 w = [ 1.99956731 -3.39973415] b = 4.198903523007357
import torch
# Define the true weights and bias of the model
w_true = torch.tensor([2, -3.4])
b_true = 4.2
# Generate inputs and outputs
number_examples = 1000
input_size = len(w_true)
X = torch.normal(0, 1, (number_examples, input_size))
y = torch.matmul(X, w_true) + b_true + torch.normal(0, 0.01, [number_examples])
# Define a function to read the dataset in random minibatches
def batch(X, y, batch_size):
# Generate random indices for all the examples
number_examples = X.shape[0]
example_indices = torch.randperm(number_examples)
# Loop over the examples in minibatches
for i in range(0, number_examples, batch_size):
# Get the indices of the examples for one minibatch
batch_indices = example_indices[i:min(i+batch_size, number_examples)]
# Return the input and output for the current minibatch and continue the iteration in the function
yield X[batch_indices], y[batch_indices]
# Define the parameters for the training
number_epochs = 3
batch_size = 10
learning_rate = 0.03
# Initialize the weights and bias to recover, requiring the gradients to be computed
w = torch.normal(0, 1, [input_size], requires_grad=True)
b = torch.zeros(1, requires_grad=True)
# Initialize an array for the mean loss over the minibatches of every epoch
epoch_loss = torch.zeros(number_epochs)
# Loop over the epochs
for i in range(number_epochs):
# Initialize a list for the mean loss over the examples of every minibatch
batch_loss = []
# Loop over the examples in minibatches
for X_batch, y_batch in batch(X, y, batch_size):
# Compute the predicted outputs
y_hat = torch.matmul(X_batch, w) + b
# Compute the loss between the predicted and true outputs
l = 0.5*(y_hat-y_batch)**2
# Compute the gradient on l wrt w and b
# (sum and not mean as the gradients will be divided by the batch size during SGD)
l.sum().backward()
# Save the mean loss for the current minibatch
batch_loss.append(l.mean())
# Temporarily sets all the requires_grad flags to false
with torch.no_grad():
# Update the weights and bias using SGD
# (use augmented assignments to avoid modifying existing variables)
w -= learning_rate*w.grad/len(l)
b -= learning_rate*b.grad/len(l)
# Set the gradients to zeros to avoid accumulating gradients
w.grad.zero_()
b.grad.zero_()
# Save the mean loss for the current epoch
epoch_loss[i] = sum(batch_loss)/len(batch_loss)
# Print the progress
print(f'{i+1}/{number_epochs}: {epoch_loss[i]}')
# Print the predicted weights and bias
print('')
print(f'w = {w}')
print(f'b = {b}')
1/3: 3.0891919136047363 2/3: 0.006769159343093634 3/3: 6.386495078913867e-05 w = tensor([ 1.9992, -3.3997], requires_grad=True) b = tensor([4.1992], requires_grad=True)
import torch
from torch.utils import data
from torch import nn
# Define the true weights and bias of the model
w_true = torch.tensor([2, -3.4])
b_true = 4.2
# Generate inputs and outputs
number_examples = 1000
input_size = len(w_true)
X = torch.normal(0, 1, (number_examples, input_size))
y = torch.matmul(X, w_true) + b_true + torch.normal(0, 0.01, [number_examples])
# Define a function to read the dataset in random minibatches by using data iterator
def batch(X, y, batch_size):
data_set = data.TensorDataset(*(X, y))
return data.DataLoader(data_set, batch_size, shuffle=True)
# Define the parameters for the training
number_epochs = 3
batch_size = 10
learning_rate = 0.03
# Define the model with a fully-connected layer
model = nn.Sequential(nn.Linear(input_size, 1))
# Initialize the parameters
model[0].weight.data.normal_(0, 0.01)
model[0].bias.data.fill_(0)
# Define the loss function (mean squared error, without the 0.5 factor)
loss = nn.MSELoss()
# Define the optimization algorithm (SGD)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# Initialize an array for the mean loss over the minibatches of every epoch
epoch_loss = torch.zeros(number_epochs)
# Loop over the epochs
for i in range(number_epochs):
# Initialize a list for the mean loss over the examples of every minibatch
batch_loss = []
# Loop over the examples in minibatches
for X_batch, y_batch in batch(X, y, batch_size):
# Compute the predicted outputs
y_hat = model(X_batch)
# Compute the loss between the predicted and true outputs
l = loss(y_hat, y_batch[:, None])
# Save the loss for the current minibatch
batch_loss.append(l)
# Set the gradients to zero
optimizer.zero_grad()
# Computes the gradient
l.backward()
# Performs a single parameter update
optimizer.step()
# Save the mean loss for the current epoch
epoch_loss[i] = sum(batch_loss)/len(batch_loss)
# Print the progress
print(f'{i+1}/{number_epochs}: {epoch_loss[i]}')
# Print the predicted weights and bias
print('')
print(f'w = {model[0].weight.data}')
print(f'b = {model[0].bias.data}')
1/3: 2.8517115116119385 2/3: 0.0001156603466370143 3/3: 0.00010338309220969677 w = tensor([[ 2.0001, -3.3987]]) b = tensor([4.1996])
import tensorflow as tf
# Define the true weights and bias of the model
w_true = tf.constant([2, -3.4], shape=[2, 1])
b_true = tf.constant(4.2)
# Generate inputs and outputs
number_examples = 1000
input_size = len(w_true)
tf.random.set_seed(0)
X = tf.random.normal([number_examples, input_size], 0, 1)
y = tf.matmul(X, w_true) + b_true + tf.random.normal([number_examples], 0, 0.01)
# Define the parameters for the training
number_epochs = 3
batch_size = 10
learning_rate = 0.03
# Define the model with a densely-connected NN layer with initialized parameters
model = tf.keras.Sequential([tf.keras.layers.Dense(1, \
kernel_initializer=tf.initializers.RandomNormal(mean=0, stddev=0.01), \
bias_initializer='zeros')])
# Configure the model for training with SGD optimizer and MSE loss
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), \
loss=tf.keras.losses.MeanSquaredError())
# Train the model given the batch size and number of epochs
model.fit(x=X, y=y, batch_size=batch_size, epochs=number_epochs, verbose=1)
# Print the predicted weights and bias
print('')
print(f'w = {model.get_weights()[0]}')
print(f'b = {model.get_weights()[1]}')
Epoch 1/3 100/100 [==============================] - 0s 469us/step - loss: 2.8948 Epoch 2/3 100/100 [==============================] - 0s 413us/step - loss: 1.1406e-04 Epoch 3/3 100/100 [==============================] - 0s 449us/step - loss: 1.0830e-04 w = [[ 2. ] [-3.4]] b = [4.199775]
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
# Get the Fashion-MNIST dataset, with train and test inputs and outputs
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
# Normalize the inputs
X_train = X_train/255
X_test = X_test/255
# Translate the outputs into labels
label_list = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
label_train = [label_list[i] for i in y_train]
label_test = [label_list[i] for i in y_test]
# Show a single example for the different classes
number_classes = len(label_list)
plt.figure(figsize=(18, 2))
for i in range(number_classes):
j = np.where(y_train==i)[0][0]
plt.subplot(1, number_classes, i+1)
plt.imshow(X_train[j, :, :], cmap='binary')
plt.title(label_list[i])
plt.xticks([])
plt.yticks([])
plt.show()
import matplotlib.pyplot as plt
import numpy as np
import random
import tensorflow as tf
# Get the train and test inputs and outputs
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
number_train = len(X_train)
number_test = len(X_test)
# Normalize and flatten the inputs
input_size = np.size(X_train[0])
X_train = np.reshape(X_train/255, (number_train, input_size))
X_test = np.reshape(X_test/255, (number_test, input_size))
# Derive one-hot versions of the train outputs
output_size = 10
Y_train = np.zeros((number_train, output_size))
Y_train[np.arange(number_train), y_train] = 1
# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1
# Initialize the weights and bias to recover
W = np.random.default_rng().normal(0, 0.01, size=(input_size, output_size))
b = np.zeros(output_size)
# Initialize lists for the mean train loss and accuracy over the minibatches for every epoch
train_loss = [[] for _ in range(number_epochs)]
train_accuracy = [[] for _ in range(number_epochs)]
# Initialize a list for the test accuracy overall for every epoch
test_accuracy = [None]*number_epochs
# Loop over the epochs
for i in range(number_epochs):
# Generate random indices for all the train examples
train_indices = np.arange(number_train)
random.shuffle(train_indices)
# Loop over the train examples in minibatches
for j in np.arange(0, number_train, batch_size):
# Get the indices of the train examples for one minibatch
batch_indices = train_indices[j:min(j+batch_size, number_train)]
# Get the train inputs and outputs for the minibatch
X = X_train[batch_indices, :]
y = y_train[batch_indices]
Y = Y_train[batch_indices]
# Compute the predicted outputs (logits)
O = np.matmul(X, W) + b
# Compute the softmax of the logits (indirectly to avoid numerical stability issues)
O = O-np.max(O, axis=1)[:, None]
O_exp = np.exp(O)
Y_hat = O_exp/np.sum(O_exp, axis=1)[:, None]
# Compute the mean cross-entropy loss for the minibatch and save it
l = np.mean(np.log(np.sum(O_exp, axis=1)-np.sum(Y*O, axis=1)))
train_loss[i].append(l)
# Compute the mean accuracy for the minibatch and save it
a = np.mean(np.argmax(Y_hat, axis=1)==y)
train_accuracy[i].append(a)
# Update the weights and bias using SGD
dl = Y_hat-Y
W = W-learning_rate*np.matmul(X.T, dl)/np.shape(X)[0]
b = b-learning_rate*np.mean(dl, axis=0)
# Derive the mean train loss and accuracy for the current epoch
train_loss[i] = np.mean(train_loss[i])
train_accuracy[i] = np.mean(train_accuracy[i])
# Compute the test outputs and derive the test accuracy for the current epoch
O = np.matmul(X_test, W) + b
O = O-np.max(O, axis=1)[:, None]
O_exp = np.exp(O)
Y_hat = O_exp/np.sum(O_exp, axis=1)[:, None]
test_accuracy[i] = np.mean(np.argmax(Y_hat, axis=1)==y_test)
# Print the progress
print(f'{i+1}/{number_epochs}: train_loss={train_loss[i]:.3f}; train_accuracy={train_accuracy[i]:.3f}; test_accuracy={test_accuracy[i]:.3f}')
# Show some predictions
number_examples = 10
O = np.matmul(X_test[:number_examples, :], W) + b
O = O-np.max(O, axis=1)[:, None]
O_exp = np.exp(O)
Y_hat = O_exp/np.sum(O_exp, axis=1)[:, None]
y_hat = np.argmax(Y_hat, axis=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
plt.subplot(1, number_examples, i+1)
plt.imshow(np.reshape(X_test[i, :], (28, 28))*255, cmap='binary')
plt.title(f'True: {label_list[y_test[i].item()]}\n Pred: {label_list[y_hat[i]]}')
plt.xticks([])
plt.yticks([])
plt.show()
1/10: train_loss=0.646; train_accuracy=0.751; test_accuracy=0.784 2/10: train_loss=0.441; train_accuracy=0.814; test_accuracy=0.799 3/10: train_loss=0.398; train_accuracy=0.826; test_accuracy=0.810 4/10: train_loss=0.375; train_accuracy=0.831; test_accuracy=0.811 5/10: train_loss=0.360; train_accuracy=0.836; test_accuracy=0.813 6/10: train_loss=0.348; train_accuracy=0.841; test_accuracy=0.821 7/10: train_loss=0.340; train_accuracy=0.843; test_accuracy=0.818 8/10: train_loss=0.334; train_accuracy=0.845; test_accuracy=0.829 9/10: train_loss=0.329; train_accuracy=0.846; test_accuracy=0.831 10/10: train_loss=0.324; train_accuracy=0.848; test_accuracy=0.834
import matplotlib.pyplot as plt
import torch
import torchvision
from torch.utils import data
# Get the dataset (transform the image data from PIL type to normalized 32-bit floating point tensors)
fmnist_train = torchvision.datasets.FashionMNIST(root='data', train=True, download=True,
transform=torchvision.transforms.ToTensor())
fmnist_test = torchvision.datasets.FashionMNIST(root='data', train=False, download=True,
transform=torchvision.transforms.ToTensor())
# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1
# Use data iterators to read a minibatch at each iteration, shuffling the examples for the train set and using 4 processes
train_iter = data.DataLoader(fmnist_train, batch_size, shuffle=True, num_workers=4)
test_iter = data.DataLoader(fmnist_test, batch_size, shuffle=False, num_workers=4)
# Initialize the parameters to recover, requiring the gradients to be computed
input_size = fmnist_train[0][0].nelement()
output_size = 10
W = torch.normal(0, 0.01, size=(input_size, output_size), requires_grad=True)
b = torch.zeros(output_size, requires_grad=True)
# Initialize lists for the mean train loss, train and test accuracy over the minibatches for every epoch
train_loss = [[] for _ in range(number_epochs)]
train_accuracy = [[] for _ in range(number_epochs)]
test_accuracy = [[] for _ in range(number_epochs)]
# Loop over the epochs
for i in range(number_epochs):
# Loop over the train examples in minibatches
for X, y in train_iter:
# Compute the logits, after flattening the images
O = torch.matmul(torch.reshape(X, (-1, input_size)), W) + b
# Compute the softmax of the logits
O_exp = torch.exp(O)
Y_hat = O_exp/torch.sum(O_exp, 1, keepdim=True)
# Compute the cross-entropy loss (use the indices of the true classes in y_batch
# to get the corresponding probabilities in y_batch, for all the examples)
l = -torch.log(Y_hat[range(Y_hat.shape[0]), y])
# Save the mean loss for the current minibatch
train_loss[i].append(torch.mean(l).item())
# Compute the mean accuracy for the current minibatch and save it
a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
train_accuracy[i].append(a)
# Compute the gradient on l with respect to W and b
# (sum and not mean as the gradients will be divided by the batch size during SGD)
torch.sum(l).backward()
# Disable gradient calculation for the following operations not to be differentiable
with torch.no_grad():
# Update the weights and bias using SGD
# (use augmented assignments to avoid modifying existing variables)
W -= learning_rate*W.grad/len(l)
b -= learning_rate*b.grad/len(l)
# Set the gradients to zeros to avoid accumulating gradients
W.grad.zero_()
b.grad.zero_()
# Derive the mean train loss and accuracy for the current epoch
train_loss[i] = sum(train_loss[i])/len(train_loss[i])
train_accuracy[i] = sum(train_accuracy[i])/len(train_accuracy[i])
# Compute the test outputs and derive the test accuracy for every epoch, in minibatches
with torch.no_grad():
for X, y in test_iter:
O = torch.matmul(torch.reshape(X, (-1, input_size)), W) + b
O_exp = torch.exp(O)
Y_hat = O_exp/torch.sum(O_exp, 1, keepdim=True)
a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
test_accuracy[i].append(a)
test_accuracy[i] = sum(test_accuracy[i])/len(test_accuracy[i])
# Print the progress
print(f'{i+1}/{number_epochs}: train_loss={train_loss[i]:.3f}; train_accuracy={train_accuracy[i]:.3f}; test_accuracy={test_accuracy[i]:.3f}')
# Show some predictions
for X, y in test_iter:
break
number_examples = 10
O = torch.matmul(torch.reshape(X[:number_examples], (-1, input_size)), W) + b
O_exp = torch.exp(O)
Y_hat = O_exp/torch.sum(O_exp, 1, keepdim=True)
y_hat = torch.argmax(Y_hat, dim=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
plt.subplot(1, number_examples, i+1)
plt.imshow(X[i][0], cmap='binary')
plt.title(f'True: {label_list[y[i].item()]}\n Pred: {label_list[y_hat[i]]}')
plt.xticks([])
plt.yticks([])
plt.show()
1/10: train_loss=0.785; train_accuracy=0.750; test_accuracy=0.791 2/10: train_loss=0.570; train_accuracy=0.813; test_accuracy=0.802 3/10: train_loss=0.524; train_accuracy=0.826; test_accuracy=0.820 4/10: train_loss=0.501; train_accuracy=0.831; test_accuracy=0.825 5/10: train_loss=0.486; train_accuracy=0.838; test_accuracy=0.823 6/10: train_loss=0.475; train_accuracy=0.839; test_accuracy=0.825 7/10: train_loss=0.466; train_accuracy=0.842; test_accuracy=0.832 8/10: train_loss=0.457; train_accuracy=0.845; test_accuracy=0.832 9/10: train_loss=0.451; train_accuracy=0.847; test_accuracy=0.825 10/10: train_loss=0.447; train_accuracy=0.848; test_accuracy=0.828
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils import data
import torchvision
# Get the dataset (transform the image data from PIL type to normalized 32-bit floating point tensors)
fmnist_train = torchvision.datasets.FashionMNIST(root='data', train=True, download=True,
transform=torchvision.transforms.ToTensor())
fmnist_test = torchvision.datasets.FashionMNIST(root='data', train=False, download=True,
transform=torchvision.transforms.ToTensor())
# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1
# Use data iterators to read a minibatch at each iteration, shuffling the examples for the train set and using 4 processes
train_iter = data.DataLoader(fmnist_train, batch_size, shuffle=True, num_workers=4)
test_iter = data.DataLoader(fmnist_test, batch_size, shuffle=False, num_workers=4)
# Define the model, with a flatten layer to reshape the inputs before the fully-connected layer
input_size = fmnist_train[0][0].nelement()
output_size = 10
model = nn.Sequential(nn.Flatten(), nn.Linear(input_size, output_size))
# Initialize the parameters by applying a function recursively to every submodule
def init(m):
if isinstance(m, nn.Linear):
nn.init.normal_(m.weight, std=0.01)
model.apply(init);
# Define the loss function (with no reduction applied to the output, no mean, no sum, none)
loss = nn.CrossEntropyLoss(reduction='none')
# Define the optimization algorithm
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# Initialize lists for the mean train loss, train and test accuracy over the minibatches for every epoch
train_loss = [[] for _ in range(number_epochs)]
train_accuracy = [[] for _ in range(number_epochs)]
test_accuracy = [[] for _ in range(number_epochs)]
# Loop over the epochs
for i in range(number_epochs):
# Loop over the train examples in minibatches
for X, y in train_iter:
# Compute the predicted outputs
Y_hat = model(X)
# Compute the loss
l = loss(Y_hat, y)
# Save the mean loss for the current minibatch
train_loss[i].append(torch.mean(l).item())
# Compute the mean accuracy for the current minibatch and save it
a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
train_accuracy[i].append(a)
# Set the gradients to zero
optimizer.zero_grad()
# Compute the gradient
l.mean().backward()
# Performs a single parameter update
optimizer.step()
# Derive the mean train loss and accuracy for the current epoch
train_loss[i] = sum(train_loss[i])/len(train_loss[i])
train_accuracy[i] = sum(train_accuracy[i])/len(train_accuracy[i])
# Compute the test outputs and derive the test accuracy for every epoch, in minibatches
with torch.no_grad():
for X, y in test_iter:
Y_hat = model(X)
a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
test_accuracy[i].append(a)
test_accuracy[i] = sum(test_accuracy[i])/len(test_accuracy[i])
# Print the progress
print(f'{i+1}/{number_epochs}: train_loss={train_loss[i]:.3f}; train_accuracy={train_accuracy[i]:.3f}; test_accuracy={test_accuracy[i]:.3f}')
# Show some predictions
for X, y in test_iter:
break
number_examples = 10
Y_hat = model(X[:number_examples])
y_hat = torch.argmax(Y_hat, dim=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
plt.subplot(1, number_examples, i+1)
plt.imshow(X[i][0], cmap='binary')
plt.title(f'True: {label_list[y[i].item()]}\n Pred: {label_list[y_hat[i]]}')
plt.xticks([])
plt.yticks([])
plt.show()
1/10: train_loss=0.783; train_accuracy=0.752; test_accuracy=0.793 2/10: train_loss=0.570; train_accuracy=0.813; test_accuracy=0.810 3/10: train_loss=0.525; train_accuracy=0.827; test_accuracy=0.814 4/10: train_loss=0.500; train_accuracy=0.834; test_accuracy=0.827 5/10: train_loss=0.486; train_accuracy=0.837; test_accuracy=0.822 6/10: train_loss=0.473; train_accuracy=0.840; test_accuracy=0.824 7/10: train_loss=0.465; train_accuracy=0.843; test_accuracy=0.832 8/10: train_loss=0.458; train_accuracy=0.844; test_accuracy=0.833 9/10: train_loss=0.453; train_accuracy=0.847; test_accuracy=0.829 10/10: train_loss=0.447; train_accuracy=0.848; test_accuracy=0.822
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
# Get the train and test inputs and outputs
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
X_train = X_train/255
X_test = X_test/255
input_size = X_train[0, :, :].shape
output_size = 10
# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1
# Define a model with flattened inputs and a densely-connected NN layer
model = tf.keras.Sequential([tf.keras.layers.Flatten(input_shape=input_size),
tf.keras.layers.Dense(output_size,
activation=None,
kernel_initializer=tf.initializers.RandomNormal(mean=0, stddev=0.01),
bias_initializer='zeros')])
# Configure the model with SGD optimizer, cross-entropy loss (with integers, not one-hot), and accuracy metrics
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), \
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), \
metrics=['accuracy'])
# Train the model
model.fit(x=X_train, y=y_train, batch_size=batch_size, epochs=number_epochs, verbose=1)
# Show some predictions
number_examples = 10
Y_hat = model.predict(X_test[:number_examples, :, :])
y_hat = np.argmax(Y_hat, axis=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
plt.subplot(1, number_examples, i+1)
plt.imshow(X_test[i, :, :], cmap='binary')
plt.title(f'True: {label_list[y_test[i]]}\n Pred: {label_list[y_hat[i]]}')
plt.xticks([])
plt.yticks([])
plt.show()
Epoch 1/10 235/235 [==============================] - 1s 765us/step - loss: 0.7853 - accuracy: 0.7506 Epoch 2/10 235/235 [==============================] - 0s 755us/step - loss: 0.5703 - accuracy: 0.8134 Epoch 3/10 235/235 [==============================] - 0s 737us/step - loss: 0.5254 - accuracy: 0.8254 Epoch 4/10 235/235 [==============================] - 0s 733us/step - loss: 0.5009 - accuracy: 0.8323 Epoch 5/10 235/235 [==============================] - 0s 747us/step - loss: 0.4847 - accuracy: 0.8368 Epoch 6/10 235/235 [==============================] - 0s 751us/step - loss: 0.4743 - accuracy: 0.8402 Epoch 7/10 235/235 [==============================] - 0s 788us/step - loss: 0.4651 - accuracy: 0.8421 Epoch 8/10 235/235 [==============================] - 0s 802us/step - loss: 0.4582 - accuracy: 0.8449 Epoch 9/10 235/235 [==============================] - 0s 785us/step - loss: 0.4523 - accuracy: 0.8462 Epoch 10/10 235/235 [==============================] - 0s 757us/step - loss: 0.4466 - accuracy: 0.8479
import matplotlib.pyplot as plt
import numpy as np
import random
import tensorflow as tf
# Get the train and test inputs and outputs
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
number_train = len(X_train)
number_test = len(X_test)
# Normalize and flatten the inputs
input_size = np.size(X_train[0])
X_train = np.reshape(X_train/255, (number_train, input_size))
X_test = np.reshape(X_test/255, (number_test, input_size))
# Derive one-hot versions of the train outputs
output_size = 10
Y_train = np.zeros((number_train, output_size))
Y_train[np.arange(number_train), y_train] = 1
# Initialize the weights and biases to recover
hidden_size = 256
W0 = np.random.default_rng().normal(0, 0.01, size=(input_size, hidden_size))
b0 = np.zeros(hidden_size)
W1 = np.random.default_rng().normal(0, 0.01, size=(hidden_size, output_size))
b1 = np.zeros(output_size)
# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1
# Initialize lists for the mean train loss and accuracy over the minibatches for every epoch
train_loss = [[] for _ in range(number_epochs)]
train_accuracy = [[] for _ in range(number_epochs)]
# Initialize a list for the test accuracy overall for every epoch
test_accuracy = [None]*number_epochs
# Loop over the epochs
for i in range(number_epochs):
# Generate random indices for all the train examples
train_indices = np.arange(number_train)
random.shuffle(train_indices)
# Loop over the train examples in minibatches
for j in np.arange(0, number_train, batch_size):
# Get the indices of the train examples for one minibatch
batch_indices = train_indices[j:min(j+batch_size, number_train)]
# Get the train inputs and outputs for the minibatch
X = X_train[batch_indices, :]
y = y_train[batch_indices]
Y = Y_train[batch_indices]
# Compute the outputs of the model (with ReLU)
H = np.matmul(X, W0) + b0
H[H<0] = 0
O = np.matmul(H, W1) + b1
# Compute the softmax of the logits (indirectly to avoid numerical stability issues)
O = O-np.max(O, axis=1)[:, None]
O_exp = np.exp(O)
Y_hat = O_exp/np.sum(O_exp, axis=1)[:, None]
# Compute the mean cross-entropy loss for the minibatch and save it
l = np.mean(np.log(np.sum(O_exp, axis=1)-np.sum(Y*O, axis=1)))
train_loss[i].append(l)
# Compute the mean accuracy for the minibatch and save it
a = np.mean(np.argmax(Y_hat, axis=1)==y)
train_accuracy[i].append(a)
# Compute the derivative of the loss wrt the output of the output layer
dl1 = Y_hat-Y
# Derive the derivative of the loss wrt the output of the hidden layer (using the chain rule)
dl0 = np.matmul(dl1, W1.T)
# Update the weights and biases of the output layer using SGD
W1 = W1-learning_rate*np.matmul(H.T, dl1)/np.shape(H)[0]
b1 = b1-learning_rate*np.mean(dl1, axis=0)
# Update the weights and biases of the hidden layer using SGD
W0 = W0-learning_rate*np.matmul(X.T, dl0)/np.shape(X)[0]
b0 = b0-learning_rate*np.mean(dl0, axis=0)
# Derive the mean train loss and accuracy for the current epoch
train_loss[i] = np.mean(train_loss[i])
train_accuracy[i] = np.mean(train_accuracy[i])
# Compute the test outputs and derive the test accuracy for the current epoch
H = np.matmul(X_test, W0) + b0
H[H<0] = 0
O = np.matmul(H, W1) + b1
O = O-np.max(O, axis=1)[:, None]
O_exp = np.exp(O)
Y_hat = O_exp/np.sum(O_exp, axis=1)[:, None]
test_accuracy[i] = np.mean(np.argmax(Y_hat, axis=1)==y_test)
# Print the progress
print(f'{i+1}/{number_epochs}: train_loss={train_loss[i]:.3f}; train_accuracy={train_accuracy[i]:.3f}; test_accuracy={test_accuracy[i]:.3f}')
# Show some predictions
number_examples = 10
H = np.matmul(X_test[:number_examples, :], W0) + b0
H[H<0] = 0
O = np.matmul(H, W1) + b1
O = O-np.max(O, axis=1)[:, None]
O_exp = np.exp(O)
Y_hat = O_exp/np.sum(O_exp, axis=1)[:, None]
y_hat = np.argmax(Y_hat, axis=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
plt.subplot(1, number_examples, i+1)
plt.imshow(np.reshape(X_test[i, :], (28, 28))*255, cmap='binary')
plt.title(f'True: {label_list[y_test[i].item()]}\n Pred: {label_list[y_hat[i]]}')
plt.xticks([])
plt.yticks([])
plt.show()
1/10: train_loss=0.928; train_accuracy=0.625; test_accuracy=0.748 2/10: train_loss=0.462; train_accuracy=0.790; test_accuracy=0.786 3/10: train_loss=0.389; train_accuracy=0.820; test_accuracy=0.814 4/10: train_loss=0.357; train_accuracy=0.830; test_accuracy=0.820 5/10: train_loss=0.338; train_accuracy=0.838; test_accuracy=0.822 6/10: train_loss=0.327; train_accuracy=0.842; test_accuracy=0.835 7/10: train_loss=0.317; train_accuracy=0.845; test_accuracy=0.827 8/10: train_loss=0.313; train_accuracy=0.847; test_accuracy=0.834 9/10: train_loss=0.305; train_accuracy=0.851; test_accuracy=0.821 10/10: train_loss=0.301; train_accuracy=0.852; test_accuracy=0.822
import matplotlib.pyplot as plt
import torch
import torchvision
from torch.utils import data
# Get the dataset (transform the image data from PIL type to normalized 32-bit floating point tensors)
fmnist_train = torchvision.datasets.FashionMNIST(root='data', train=True, download=True,
transform=torchvision.transforms.ToTensor())
fmnist_test = torchvision.datasets.FashionMNIST(root='data', train=False, download=True,
transform=torchvision.transforms.ToTensor())
# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1
# Use data iterators to read a minibatch at each iteration, shuffling the examples for the train set and using 4 processes
train_iter = data.DataLoader(fmnist_train, batch_size, shuffle=True, num_workers=4)
test_iter = data.DataLoader(fmnist_test, batch_size, shuffle=False, num_workers=4)
# Initialize the parameters to recover, requiring the gradients to be computed
input_size = fmnist_train[0][0].nelement()
output_size = 10
hidden_size = 256
W0 = torch.normal(0, 0.01, size=(input_size, hidden_size), requires_grad=True)
b0 = torch.zeros(hidden_size, requires_grad=True)
W1 = torch.normal(0, 0.01, size=(hidden_size, output_size), requires_grad=True)
b1 = torch.zeros(output_size, requires_grad=True)
# Initialize lists for the mean train loss, train and test accuracy over the minibatches for every epoch
train_loss = [[] for _ in range(number_epochs)]
train_accuracy = [[] for _ in range(number_epochs)]
test_accuracy = [[] for _ in range(number_epochs)]
# Loop over the epochs
for i in range(number_epochs):
# Loop over the train examples in minibatches
for X, y in train_iter:
# Compute the outputs of the model (with ReLU), after flattening the images
H = torch.matmul(torch.reshape(X, (-1, input_size)), W0) + b0
H[H<0] = 0
O = torch.matmul(H, W1) + b1
# Compute the softmax of the logits
O_exp = torch.exp(O)
Y_hat = O_exp/torch.sum(O_exp, 1, keepdim=True)
# Compute the cross-entropy loss (use the indices of the true classes in y_batch
# to get the corresponding probabilities in y_batch, for all the examples)
l = -torch.log(Y_hat[range(Y_hat.shape[0]), y])
# Save the mean loss for the current minibatch
train_loss[i].append(torch.mean(l).item())
# Compute the mean accuracy for the current minibatch and save it
a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
train_accuracy[i].append(a)
# Compute the gradient on l with respect to W and b
# (sum and not mean as the gradients will be divided by the batch size during SGD)
torch.sum(l).backward()
# Disable gradient calculation for the following operations not to be differentiable
with torch.no_grad():
# Update the weights and biases using SGD
# (use augmented assignments to avoid modifying existing variables)
W1 -= learning_rate*W1.grad/len(l)
b1 -= learning_rate*b1.grad/len(l)
W0 -= learning_rate*W0.grad/len(l)
b0 -= learning_rate*b0.grad/len(l)
# Set the gradients to zeros to avoid accumulating gradients
W1.grad.zero_()
b1.grad.zero_()
W0.grad.zero_()
b0.grad.zero_()
# Derive the mean train loss and accuracy for the current epoch
train_loss[i] = sum(train_loss[i])/len(train_loss[i])
train_accuracy[i] = sum(train_accuracy[i])/len(train_accuracy[i])
# Compute the test outputs and derive the test accuracy for every epoch, in minibatches
with torch.no_grad():
for X, y in test_iter:
H = torch.matmul(torch.reshape(X, (-1, input_size)), W0) + b0
H[H<0] = 0
O = torch.matmul(H, W1) + b1
O_exp = torch.exp(O)
Y_hat = O_exp/torch.sum(O_exp, 1, keepdim=True)
a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
test_accuracy[i].append(a)
test_accuracy[i] = sum(test_accuracy[i])/len(test_accuracy[i])
# Print the progress
print(f'{i+1}/{number_epochs}: train_loss={train_loss[i]:.3f}; train_accuracy={train_accuracy[i]:.3f}; test_accuracy={test_accuracy[i]:.3f}')
# Show some predictions
for X, y in test_iter:
break
number_examples = 10
H = torch.matmul(torch.reshape(X[:number_examples], (-1, input_size)), W0) + b0
H[H<0] = 0
O = torch.matmul(H, W1) + b1
O_exp = torch.exp(O)
Y_hat = O_exp/torch.sum(O_exp, 1, keepdim=True)
y_hat = torch.argmax(Y_hat, dim=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
plt.subplot(1, number_examples, i+1)
plt.imshow(X[i][0], cmap='binary')
plt.title(f'True: {label_list[y[i].item()]}\n Pred: {label_list[y_hat[i]]}')
plt.xticks([])
plt.yticks([])
plt.show()
1/10: train_loss=1.041; train_accuracy=0.646; test_accuracy=0.736 2/10: train_loss=0.605; train_accuracy=0.786; test_accuracy=0.790 3/10: train_loss=0.520; train_accuracy=0.818; test_accuracy=0.770 4/10: train_loss=0.482; train_accuracy=0.831; test_accuracy=0.822 5/10: train_loss=0.455; train_accuracy=0.841; test_accuracy=0.806 6/10: train_loss=0.432; train_accuracy=0.847; test_accuracy=0.834 7/10: train_loss=0.419; train_accuracy=0.851; test_accuracy=0.838 8/10: train_loss=0.403; train_accuracy=0.858; test_accuracy=0.839 9/10: train_loss=0.393; train_accuracy=0.861; test_accuracy=0.830 10/10: train_loss=0.383; train_accuracy=0.864; test_accuracy=0.849
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils import data
import torchvision
# Get the dataset (transform the image data from PIL type to normalized 32-bit floating point tensors)
fmnist_train = torchvision.datasets.FashionMNIST(root='data', train=True, download=True,
transform=torchvision.transforms.ToTensor())
fmnist_test = torchvision.datasets.FashionMNIST(root='data', train=False, download=True,
transform=torchvision.transforms.ToTensor())
# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1
# Use data iterators to read a minibatch at each iteration, shuffling the examples for the train set and using 4 processes
train_iter = data.DataLoader(fmnist_train, batch_size, shuffle=True, num_workers=4)
test_iter = data.DataLoader(fmnist_test, batch_size, shuffle=False, num_workers=4)
# Define the model, with a flatten layer to reshape the inputs, two fully-connected layer, and a ReLU in-between
input_size = fmnist_train[0][0].nelement()
hidden_size = 256
output_size = 10
model = nn.Sequential(nn.Flatten(),
nn.Linear(input_size, hidden_size),
nn.ReLU(),
nn.Linear(hidden_size, output_size))
# Initialize the parameters by applying a function recursively to every submodule
def init(m):
if isinstance(m, nn.Linear):
nn.init.normal_(m.weight, std=0.01)
model.apply(init);
# Define the loss function (with no reduction applied to the output, no mean, no sum, none)
loss = nn.CrossEntropyLoss(reduction='none')
# Define the optimization algorithm
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# Initialize lists for the mean train loss, train and test accuracy over the minibatches for every epoch
train_loss = [[] for _ in range(number_epochs)]
train_accuracy = [[] for _ in range(number_epochs)]
test_accuracy = [[] for _ in range(number_epochs)]
# Loop over the epochs
for i in range(number_epochs):
# Loop over the train examples in minibatches
for X, y in train_iter:
# Compute the predicted outputs
Y_hat = model(X)
# Compute the loss
l = loss(Y_hat, y)
# Save the mean loss for the current minibatch
train_loss[i].append(torch.mean(l).item())
# Compute the mean accuracy for the current minibatch and save it
a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
train_accuracy[i].append(a)
# Set the gradients to zero
optimizer.zero_grad()
# Compute the gradient
l.mean().backward()
# Performs a single parameter update
optimizer.step()
# Derive the mean train loss and accuracy for the current epoch
train_loss[i] = sum(train_loss[i])/len(train_loss[i])
train_accuracy[i] = sum(train_accuracy[i])/len(train_accuracy[i])
# Compute the test outputs and derive the test accuracy for every epoch, in minibatches
with torch.no_grad():
for X, y in test_iter:
Y_hat = model(X)
a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
test_accuracy[i].append(a)
test_accuracy[i] = sum(test_accuracy[i])/len(test_accuracy[i])
# Print the progress
print(f'{i+1}/{number_epochs}: train_loss={train_loss[i]:.3f}; train_accuracy={train_accuracy[i]:.3f}; test_accuracy={test_accuracy[i]:.3f}')
# Show some predictions
for X, y in test_iter:
break
number_examples = 10
Y_hat = model(X[:number_examples])
y_hat = torch.argmax(Y_hat, dim=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
plt.subplot(1, number_examples, i+1)
plt.imshow(X[i][0], cmap='binary')
plt.title(f'True: {label_list[y[i].item()]}\n Pred: {label_list[y_hat[i]]}')
plt.xticks([])
plt.yticks([])
plt.show()
1/10: train_loss=1.040; train_accuracy=0.645; test_accuracy=0.712 2/10: train_loss=0.598; train_accuracy=0.790; test_accuracy=0.799 3/10: train_loss=0.518; train_accuracy=0.819; test_accuracy=0.810 4/10: train_loss=0.478; train_accuracy=0.833; test_accuracy=0.825 5/10: train_loss=0.454; train_accuracy=0.839; test_accuracy=0.833 6/10: train_loss=0.431; train_accuracy=0.848; test_accuracy=0.837 7/10: train_loss=0.415; train_accuracy=0.854; test_accuracy=0.843 8/10: train_loss=0.405; train_accuracy=0.858; test_accuracy=0.849 9/10: train_loss=0.391; train_accuracy=0.862; test_accuracy=0.839 10/10: train_loss=0.382; train_accuracy=0.864; test_accuracy=0.853
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
# Get the train and test inputs and outputs
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
X_train = X_train/255
X_test = X_test/255
input_size = X_train[0, :, :].shape
hidden_size = 256
output_size = 10
# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1
# Define a model with flattened inputs, a densely-connected NN layer with a ReLU, and another one without activation
model = tf.keras.Sequential([tf.keras.layers.Flatten(input_shape=input_size),
tf.keras.layers.Dense(hidden_size,
activation='relu',
kernel_initializer=tf.initializers.RandomNormal(mean=0, stddev=0.01),
bias_initializer='zeros'),
tf.keras.layers.Dense(output_size,
activation=None,
kernel_initializer=tf.initializers.RandomNormal(mean=0, stddev=0.01),
bias_initializer='zeros')])
# Configure the model with SGD optimizer, cross-entropy loss (with integers, not one-hot), and accuracy metrics
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), \
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), \
metrics=['accuracy'])
# Train the model
model.fit(x=X_train, y=y_train, batch_size=batch_size, epochs=number_epochs, verbose=1)
# Show some predictions
number_examples = 10
Y_hat = model.predict(X_test[:number_examples, :, :])
y_hat = np.argmax(Y_hat, axis=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
plt.subplot(1, number_examples, i+1)
plt.imshow(X_test[i, :, :], cmap='binary')
plt.title(f'True: {label_list[y_test[i]]}\n Pred: {label_list[y_hat[i]]}')
plt.xticks([])
plt.yticks([])
plt.show()
Epoch 1/10 235/235 [==============================] - 1s 2ms/step - loss: 1.0377 - accuracy: 0.6388 Epoch 2/10 235/235 [==============================] - 0s 2ms/step - loss: 0.5989 - accuracy: 0.7903 Epoch 3/10 235/235 [==============================] - 0s 2ms/step - loss: 0.5175 - accuracy: 0.8191 Epoch 4/10 235/235 [==============================] - 0s 2ms/step - loss: 0.4774 - accuracy: 0.8320 Epoch 5/10 235/235 [==============================] - 0s 2ms/step - loss: 0.4517 - accuracy: 0.8424 Epoch 6/10 235/235 [==============================] - 0s 2ms/step - loss: 0.4316 - accuracy: 0.8482 Epoch 7/10 235/235 [==============================] - 0s 2ms/step - loss: 0.4165 - accuracy: 0.8530 Epoch 8/10 235/235 [==============================] - 0s 2ms/step - loss: 0.4021 - accuracy: 0.8579 Epoch 9/10 235/235 [==============================] - 0s 2ms/step - loss: 0.3896 - accuracy: 0.8623 Epoch 10/10 235/235 [==============================] - 0s 2ms/step - loss: 0.3800 - accuracy: 0.8653