#!/usr/bin/env python
# coding: utf-8

# # Deep Learning

# ## 1. Linear Regression
# https://d2l.ai/chapter_linear-regression/

# ### 1.1. Linear regression from scratch in NumPy

# In[1]:

import numpy as np

# Define the true weights and bias of the model
w_true = np.array([2, -3.4])
b_true = 4.2

# Construct a random generator, seeded for reproducibility
rng = np.random.default_rng(seed=0)

# Generate the inputs (from a standard normal distribution) and outputs (with some Gaussian noise)
number_examples = 1000
input_size = len(w_true)
X = rng.normal(0, 1, (number_examples, input_size))
y = np.matmul(X, w_true) + b_true + rng.normal(0, 0.01, number_examples)

# Define the parameters for the training
number_epochs = 3
batch_size = 10
learning_rate = 0.03

# Initialize the weights and bias to recover
w = rng.normal(0, 1, input_size)
b = 0

# Initialize an array for the mean loss over the minibatches of every epoch
epoch_loss = np.zeros(number_epochs)

# Loop over the epochs
for i in range(number_epochs):

    # Generate random indices for all the examples
    example_indices = np.arange(number_examples)
    rng.shuffle(example_indices)

    # Initialize a list for the mean loss over the examples of every minibatch
    batch_loss = []

    # Loop over the examples in minibatches
    for j in np.arange(0, number_examples, batch_size):

        # Get the indices of the examples for one minibatch
        batch_indices = example_indices[j:min(j+batch_size, number_examples)]

        # Get the inputs and outputs for the current minibatch
        X_batch = X[batch_indices, :]
        y_batch = y[batch_indices]

        # Compute the predicted outputs
        y_hat = np.matmul(X_batch, w) + b

        # Compute the loss between the predicted and true outputs
        l = 0.5*np.power(y_hat-y_batch, 2)

        # Save the mean loss for the current minibatch
        batch_loss.append(np.mean(l))

        # Update the weights and bias using stochastic gradient descent (SGD)
        w = w - learning_rate*np.mean(X_batch*(y_hat-y_batch)[:, None], axis=0)
        b = b - learning_rate*np.mean(y_hat-y_batch, axis=0)

    # Save the mean loss for the current epoch
    epoch_loss[i] = np.mean(batch_loss)

    # Print the progress
    print(f'{i+1}/{number_epochs}: {epoch_loss[i]}')

# Print the predicted weights and bias
print('')
print(f'w = {w}')
print(f'b = {b}')
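# As a sanity check (a minimal sketch, not part of the original training loop), the same
# parameters can be estimated in closed form with ordinary least squares: append a column of
# ones to X so the bias is absorbed into the weights, then solve with np.linalg.lstsq.

# In[ ]:

X_augmented = np.concatenate([X, np.ones((number_examples, 1))], axis=1)
theta, _, _, _ = np.linalg.lstsq(X_augmented, y, rcond=None)
print(f'closed-form w = {theta[:-1]}, b = {theta[-1]}')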
# ### 1.2. Linear regression from scratch in PyTorch

# In[4]:

import torch

# Define the true weights and bias of the model
w_true = torch.tensor([2, -3.4])
b_true = 4.2

# Generate inputs and outputs
number_examples = 1000
input_size = len(w_true)
X = torch.normal(0, 1, (number_examples, input_size))
y = torch.matmul(X, w_true) + b_true + torch.normal(0, 0.01, [number_examples])

# Define a function to read the dataset in random minibatches
def batch(X, y, batch_size):

    # Generate random indices for all the examples
    number_examples = X.shape[0]
    example_indices = torch.randperm(number_examples)

    # Loop over the examples in minibatches
    for i in range(0, number_examples, batch_size):

        # Get the indices of the examples for one minibatch
        batch_indices = example_indices[i:min(i+batch_size, number_examples)]

        # Return the input and output for the current minibatch and continue the iteration in the function
        yield X[batch_indices], y[batch_indices]

# Define the parameters for the training
number_epochs = 3
batch_size = 10
learning_rate = 0.03

# Initialize the weights and bias to recover, requiring the gradients to be computed
w = torch.normal(0, 1, [input_size], requires_grad=True)
b = torch.zeros(1, requires_grad=True)

# Initialize an array for the mean loss over the minibatches of every epoch
epoch_loss = torch.zeros(number_epochs)

# Loop over the epochs
for i in range(number_epochs):

    # Initialize a list for the mean loss over the examples of every minibatch
    batch_loss = []

    # Loop over the examples in minibatches
    for X_batch, y_batch in batch(X, y, batch_size):

        # Compute the predicted outputs
        y_hat = torch.matmul(X_batch, w) + b

        # Compute the loss between the predicted and true outputs
        l = 0.5*(y_hat-y_batch)**2

        # Compute the gradient on l wrt w and b
        # (sum and not mean as the gradients will be divided by the batch size during SGD)
        l.sum().backward()

        # Save the mean loss for the current minibatch
        batch_loss.append(l.mean())

        # Temporarily sets all the requires_grad flags to false
        with torch.no_grad():

            # Update the weights and bias using SGD
            # (use augmented assignments to avoid modifying existing variables)
            w -= learning_rate*w.grad/len(l)
            b -= learning_rate*b.grad/len(l)

            # Set the gradients to zeros to avoid accumulating gradients
            w.grad.zero_()
            b.grad.zero_()

    # Save the mean loss for the current epoch
    epoch_loss[i] = sum(batch_loss)/len(batch_loss)

    # Print the progress
    print(f'{i+1}/{number_epochs}: {epoch_loss[i]}')

# Print the predicted weights and bias
print('')
print(f'w = {w}')
print(f'b = {b}')
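# A minimal sketch (assuming the tensors defined above) of evaluating the fit: under
# torch.no_grad(), compute the mean loss over the full dataset and compare the learned
# parameters with the true ones.

# In[ ]:

with torch.no_grad():
    full_loss = (0.5*(torch.matmul(X, w) + b - y)**2).mean()
    print(f'full-dataset loss = {full_loss.item():.6f}')
    print(f'max error in w = {(w - w_true).abs().max().item():.4f}')
    print(f'error in b = {(b - b_true).abs().item():.4f}')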
# ### 1.3. Linear regression using APIs in PyTorch

# In[1]:

import torch
from torch.utils import data
from torch import nn

# Define the true weights and bias of the model
w_true = torch.tensor([2, -3.4])
b_true = 4.2

# Generate inputs and outputs
number_examples = 1000
input_size = len(w_true)
X = torch.normal(0, 1, (number_examples, input_size))
y = torch.matmul(X, w_true) + b_true + torch.normal(0, 0.01, [number_examples])

# Define a function to read the dataset in random minibatches by using a data iterator
def batch(X, y, batch_size):
    data_set = data.TensorDataset(*(X, y))
    return data.DataLoader(data_set, batch_size, shuffle=True)

# Define the parameters for the training
number_epochs = 3
batch_size = 10
learning_rate = 0.03

# Define the model with a fully-connected layer
model = nn.Sequential(nn.Linear(input_size, 1))

# Initialize the parameters
model[0].weight.data.normal_(0, 0.01)
model[0].bias.data.fill_(0)

# Define the loss function (mean squared error, without the 0.5 factor)
loss = nn.MSELoss()

# Define the optimization algorithm (SGD)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Initialize an array for the mean loss over the minibatches of every epoch
epoch_loss = torch.zeros(number_epochs)

# Loop over the epochs
for i in range(number_epochs):

    # Initialize a list for the mean loss over the examples of every minibatch
    batch_loss = []

    # Loop over the examples in minibatches
    for X_batch, y_batch in batch(X, y, batch_size):

        # Compute the predicted outputs
        y_hat = model(X_batch)

        # Compute the loss between the predicted and true outputs
        l = loss(y_hat, y_batch[:, None])

        # Save the loss for the current minibatch
        batch_loss.append(l)

        # Set the gradients to zero
        optimizer.zero_grad()

        # Computes the gradient
        l.backward()

        # Performs a single parameter update
        optimizer.step()

    # Save the mean loss for the current epoch
    epoch_loss[i] = sum(batch_loss)/len(batch_loss)

    # Print the progress
    print(f'{i+1}/{number_epochs}: {epoch_loss[i]}')

# Print the predicted weights and bias
print('')
print(f'w = {model[0].weight.data}')
print(f'b = {model[0].bias.data}')


# ### 1.4. Linear regression using higher-level APIs in Keras

# In[45]:

import tensorflow as tf

# Define the true weights and bias of the model
w_true = tf.constant([2, -3.4], shape=[2, 1])
b_true = tf.constant(4.2)

# Generate inputs and outputs (the noise has the same shape as the targets so it does not broadcast)
number_examples = 1000
input_size = len(w_true)
tf.random.set_seed(0)
X = tf.random.normal([number_examples, input_size], 0, 1)
y = tf.matmul(X, w_true) + b_true + tf.random.normal([number_examples, 1], 0, 0.01)

# Define the parameters for the training
number_epochs = 3
batch_size = 10
learning_rate = 0.03

# Define the model with a densely-connected NN layer with initialized parameters
model = tf.keras.Sequential([tf.keras.layers.Dense(1,
                                                   kernel_initializer=tf.initializers.RandomNormal(mean=0, stddev=0.01),
                                                   bias_initializer='zeros')])

# Configure the model for training with SGD optimizer and MSE loss
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
              loss=tf.keras.losses.MeanSquaredError())

# Train the model given the batch size and number of epochs
model.fit(x=X, y=y, batch_size=batch_size, epochs=number_epochs, verbose=1)

# Print the predicted weights and bias
print('')
print(f'w = {model.get_weights()[0]}')
print(f'b = {model.get_weights()[1]}')
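# A minimal sketch (assuming the Keras model and data-generating settings above) of checking the
# fit on freshly generated data: draw new inputs, build the corresponding noiseless targets, and
# report the mean squared error with model.evaluate.

# In[ ]:

X_new = tf.random.normal([100, input_size], 0, 1)
y_new = tf.matmul(X_new, w_true) + b_true
print(model.evaluate(x=X_new, y=y_new, verbose=0))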
# ## 2. Softmax Regression
# https://d2l.ai/chapter_linear-classification/

# ### 2.1. Fashion-MNIST dataset

# In[2]:

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

# Get the Fashion-MNIST dataset, with train and test inputs and outputs
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

# Normalize the inputs
X_train = X_train/255
X_test = X_test/255

# Translate the outputs into labels
label_list = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
label_train = [label_list[i] for i in y_train]
label_test = [label_list[i] for i in y_test]

# Show a single example for the different classes
number_classes = len(label_list)
plt.figure(figsize=(18, 2))
for i in range(number_classes):
    j = np.where(y_train==i)[0][0]
    plt.subplot(1, number_classes, i+1)
    plt.imshow(X_train[j, :, :], cmap='binary')
    plt.title(label_list[i])
    plt.xticks([])
    plt.yticks([])
plt.show()
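# As a quick check (a minimal sketch, not part of the original walkthrough), the class
# distribution of the training set can be inspected with np.unique; Fashion-MNIST is balanced,
# so each of the 10 classes should appear 6000 times.

# In[ ]:

classes, counts = np.unique(y_train, return_counts=True)
for c, n in zip(classes, counts):
    print(f'{label_list[c]}: {n}')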
# ### 2.2. Softmax regression from scratch in NumPy

# In[1]:

import matplotlib.pyplot as plt
import numpy as np
import random
import tensorflow as tf

# Get the train and test inputs and outputs
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
number_train = len(X_train)
number_test = len(X_test)

# Normalize and flatten the inputs
input_size = np.size(X_train[0])
X_train = np.reshape(X_train/255, (number_train, input_size))
X_test = np.reshape(X_test/255, (number_test, input_size))

# Derive one-hot versions of the train outputs
output_size = 10
Y_train = np.zeros((number_train, output_size))
Y_train[np.arange(number_train), y_train] = 1

# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1

# Initialize the weights and bias to recover
W = np.random.default_rng().normal(0, 0.01, size=(input_size, output_size))
b = np.zeros(output_size)

# Initialize lists for the mean train loss and accuracy over the minibatches for every epoch
train_loss = [[] for _ in range(number_epochs)]
train_accuracy = [[] for _ in range(number_epochs)]

# Initialize a list for the overall test accuracy for every epoch
test_accuracy = [None]*number_epochs

# Loop over the epochs
for i in range(number_epochs):

    # Generate random indices for all the train examples
    train_indices = np.arange(number_train)
    random.shuffle(train_indices)

    # Loop over the train examples in minibatches
    for j in np.arange(0, number_train, batch_size):

        # Get the indices of the train examples for one minibatch
        batch_indices = train_indices[j:min(j+batch_size, number_train)]

        # Get the train inputs and outputs for the minibatch
        X = X_train[batch_indices, :]
        y = y_train[batch_indices]
        Y = Y_train[batch_indices]

        # Compute the predicted outputs (logits)
        O = np.matmul(X, W) + b

        # Compute the softmax of the logits (shifted by their maximum to avoid numerical stability issues)
        O = O-np.max(O, axis=1)[:, None]
        O_exp = np.exp(O)
        Y_hat = O_exp/np.sum(O_exp, axis=1)[:, None]

        # Compute the mean cross-entropy loss for the minibatch and save it
        # (log-sum-exp of the logits minus the logit of the true class)
        l = np.mean(np.log(np.sum(O_exp, axis=1))-np.sum(Y*O, axis=1))
        train_loss[i].append(l)

        # Compute the mean accuracy for the minibatch and save it
        a = np.mean(np.argmax(Y_hat, axis=1)==y)
        train_accuracy[i].append(a)

        # Update the weights and bias using SGD
        dl = Y_hat-Y
        W = W-learning_rate*np.matmul(X.T, dl)/np.shape(X)[0]
        b = b-learning_rate*np.mean(dl, axis=0)

    # Derive the mean train loss and accuracy for the current epoch
    train_loss[i] = np.mean(train_loss[i])
    train_accuracy[i] = np.mean(train_accuracy[i])

    # Compute the test outputs and derive the test accuracy for the current epoch
    O = np.matmul(X_test, W) + b
    O = O-np.max(O, axis=1)[:, None]
    O_exp = np.exp(O)
    Y_hat = O_exp/np.sum(O_exp, axis=1)[:, None]
    test_accuracy[i] = np.mean(np.argmax(Y_hat, axis=1)==y_test)

    # Print the progress
    print(f'{i+1}/{number_epochs}: train_loss={train_loss[i]:.3f}; train_accuracy={train_accuracy[i]:.3f}; test_accuracy={test_accuracy[i]:.3f}')

# Show some predictions
number_examples = 10
O = np.matmul(X_test[:number_examples, :], W) + b
O = O-np.max(O, axis=1)[:, None]
O_exp = np.exp(O)
Y_hat = O_exp/np.sum(O_exp, axis=1)[:, None]
y_hat = np.argmax(Y_hat, axis=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
    plt.subplot(1, number_examples, i+1)
    plt.imshow(np.reshape(X_test[i, :], (28, 28)), cmap='binary')
    plt.title(f'True: {label_list[y_test[i].item()]}\n Pred: {label_list[y_hat[i]]}')
    plt.xticks([])
    plt.yticks([])
plt.show()
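# A minimal sketch (hypothetical helpers, not used above) packaging the same log-sum-exp trick
# into reusable functions: shifting the logits by their row-wise maximum leaves the softmax and
# the cross-entropy unchanged but avoids overflow in np.exp.

# In[ ]:

def stable_softmax(O):
    # Shift each row by its maximum before exponentiating
    O_shifted = O - np.max(O, axis=1, keepdims=True)
    O_exp = np.exp(O_shifted)
    return O_exp/np.sum(O_exp, axis=1, keepdims=True)

def cross_entropy(O, y):
    # Mean of log-sum-exp of the logits minus the logit of the true class
    O_shifted = O - np.max(O, axis=1, keepdims=True)
    log_sum_exp = np.log(np.sum(np.exp(O_shifted), axis=1))
    return np.mean(log_sum_exp - O_shifted[np.arange(len(y)), y])

O_toy = np.array([[1000.0, 0.0], [0.0, 1.0]])
print(stable_softmax(O_toy))
print(cross_entropy(O_toy, np.array([0, 1])))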
# ### 2.3. Softmax regression from scratch in PyTorch

# In[2]:

import matplotlib.pyplot as plt
import torch
import torchvision
from torch.utils import data

# Get the dataset (transform the image data from PIL type to normalized 32-bit floating point tensors)
fmnist_train = torchvision.datasets.FashionMNIST(root='data', train=True, download=True,
                                                 transform=torchvision.transforms.ToTensor())
fmnist_test = torchvision.datasets.FashionMNIST(root='data', train=False, download=True,
                                                transform=torchvision.transforms.ToTensor())

# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1

# Use data iterators to read a minibatch at each iteration, shuffling the examples for the train set and using 4 processes
train_iter = data.DataLoader(fmnist_train, batch_size, shuffle=True, num_workers=4)
test_iter = data.DataLoader(fmnist_test, batch_size, shuffle=False, num_workers=4)

# Initialize the parameters to recover, requiring the gradients to be computed
input_size = fmnist_train[0][0].nelement()
output_size = 10
W = torch.normal(0, 0.01, size=(input_size, output_size), requires_grad=True)
b = torch.zeros(output_size, requires_grad=True)

# Initialize lists for the mean train loss, train and test accuracy over the minibatches for every epoch
train_loss = [[] for _ in range(number_epochs)]
train_accuracy = [[] for _ in range(number_epochs)]
test_accuracy = [[] for _ in range(number_epochs)]

# Loop over the epochs
for i in range(number_epochs):

    # Loop over the train examples in minibatches
    for X, y in train_iter:

        # Compute the logits, after flattening the images
        O = torch.matmul(torch.reshape(X, (-1, input_size)), W) + b

        # Compute the softmax of the logits
        O_exp = torch.exp(O)
        Y_hat = O_exp/torch.sum(O_exp, 1, keepdim=True)

        # Compute the cross-entropy loss (use the indices of the true classes in y
        # to get the corresponding predicted probabilities in Y_hat, for all the examples)
        l = -torch.log(Y_hat[range(Y_hat.shape[0]), y])

        # Save the mean loss for the current minibatch
        train_loss[i].append(torch.mean(l).item())

        # Compute the mean accuracy for the current minibatch and save it
        a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
        train_accuracy[i].append(a)

        # Compute the gradient on l with respect to W and b
        # (sum and not mean as the gradients will be divided by the batch size during SGD)
        torch.sum(l).backward()

        # Disable gradient calculation so that the following operations are not differentiated
        with torch.no_grad():

            # Update the weights and bias using SGD
            # (use augmented assignments to avoid modifying existing variables)
            W -= learning_rate*W.grad/len(l)
            b -= learning_rate*b.grad/len(l)

            # Set the gradients to zeros to avoid accumulating gradients
            W.grad.zero_()
            b.grad.zero_()

    # Derive the mean train loss and accuracy for the current epoch
    train_loss[i] = sum(train_loss[i])/len(train_loss[i])
    train_accuracy[i] = sum(train_accuracy[i])/len(train_accuracy[i])

    # Compute the test outputs and derive the test accuracy for every epoch, in minibatches
    with torch.no_grad():
        for X, y in test_iter:
            O = torch.matmul(torch.reshape(X, (-1, input_size)), W) + b
            O_exp = torch.exp(O)
            Y_hat = O_exp/torch.sum(O_exp, 1, keepdim=True)
            a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
            test_accuracy[i].append(a)
        test_accuracy[i] = sum(test_accuracy[i])/len(test_accuracy[i])

    # Print the progress
    print(f'{i+1}/{number_epochs}: train_loss={train_loss[i]:.3f}; train_accuracy={train_accuracy[i]:.3f}; test_accuracy={test_accuracy[i]:.3f}')

# Show some predictions
for X, y in test_iter:
    break
number_examples = 10
O = torch.matmul(torch.reshape(X[:number_examples], (-1, input_size)), W) + b
O_exp = torch.exp(O)
Y_hat = O_exp/torch.sum(O_exp, 1, keepdim=True)
y_hat = torch.argmax(Y_hat, dim=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
    plt.subplot(1, number_examples, i+1)
    plt.imshow(X[i][0], cmap='binary')
    plt.title(f'True: {label_list[y[i].item()]}\n Pred: {label_list[y_hat[i]]}')
    plt.xticks([])
    plt.yticks([])
plt.show()
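# A minimal sketch (assuming W, b and the test iterator above) cross-checking the manual
# softmax/cross-entropy against PyTorch's built-in torch.nn.functional.cross_entropy on one
# minibatch; the two values should agree up to floating-point error.

# In[ ]:

import torch.nn.functional as F

with torch.no_grad():
    for X, y in test_iter:
        O = torch.matmul(torch.reshape(X, (-1, input_size)), W) + b
        Y_hat = torch.exp(O)/torch.sum(torch.exp(O), 1, keepdim=True)
        manual = -torch.log(Y_hat[range(Y_hat.shape[0]), y]).mean()
        builtin = F.cross_entropy(O, y)
        print(f'manual={manual.item():.6f}, built-in={builtin.item():.6f}')
        break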
# ### 2.4. Softmax regression using APIs in PyTorch

# In[3]:

import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils import data
import torchvision

# Get the dataset (transform the image data from PIL type to normalized 32-bit floating point tensors)
fmnist_train = torchvision.datasets.FashionMNIST(root='data', train=True, download=True,
                                                 transform=torchvision.transforms.ToTensor())
fmnist_test = torchvision.datasets.FashionMNIST(root='data', train=False, download=True,
                                                transform=torchvision.transforms.ToTensor())

# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1

# Use data iterators to read a minibatch at each iteration, shuffling the examples for the train set and using 4 processes
train_iter = data.DataLoader(fmnist_train, batch_size, shuffle=True, num_workers=4)
test_iter = data.DataLoader(fmnist_test, batch_size, shuffle=False, num_workers=4)

# Define the model, with a flatten layer to reshape the inputs before the fully-connected layer
input_size = fmnist_train[0][0].nelement()
output_size = 10
model = nn.Sequential(nn.Flatten(), nn.Linear(input_size, output_size))

# Initialize the parameters by applying a function recursively to every submodule
def init(m):
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, std=0.01)
model.apply(init);

# Define the loss function (with no reduction applied to the output, no mean, no sum, none)
loss = nn.CrossEntropyLoss(reduction='none')

# Define the optimization algorithm
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Initialize lists for the mean train loss, train and test accuracy over the minibatches for every epoch
train_loss = [[] for _ in range(number_epochs)]
train_accuracy = [[] for _ in range(number_epochs)]
test_accuracy = [[] for _ in range(number_epochs)]

# Loop over the epochs
for i in range(number_epochs):

    # Loop over the train examples in minibatches
    for X, y in train_iter:

        # Compute the predicted outputs
        Y_hat = model(X)

        # Compute the loss
        l = loss(Y_hat, y)

        # Save the mean loss for the current minibatch
        train_loss[i].append(torch.mean(l).item())

        # Compute the mean accuracy for the current minibatch and save it
        a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
        train_accuracy[i].append(a)

        # Set the gradients to zero
        optimizer.zero_grad()

        # Compute the gradient
        l.mean().backward()

        # Performs a single parameter update
        optimizer.step()

    # Derive the mean train loss and accuracy for the current epoch
    train_loss[i] = sum(train_loss[i])/len(train_loss[i])
    train_accuracy[i] = sum(train_accuracy[i])/len(train_accuracy[i])

    # Compute the test outputs and derive the test accuracy for every epoch, in minibatches
    with torch.no_grad():
        for X, y in test_iter:
            Y_hat = model(X)
            a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
            test_accuracy[i].append(a)
        test_accuracy[i] = sum(test_accuracy[i])/len(test_accuracy[i])

    # Print the progress
    print(f'{i+1}/{number_epochs}: train_loss={train_loss[i]:.3f}; train_accuracy={train_accuracy[i]:.3f}; test_accuracy={test_accuracy[i]:.3f}')

# Show some predictions
for X, y in test_iter:
    break
number_examples = 10
Y_hat = model(X[:number_examples])
y_hat = torch.argmax(Y_hat, dim=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
    plt.subplot(1, number_examples, i+1)
    plt.imshow(X[i][0], cmap='binary')
    plt.title(f'True: {label_list[y[i].item()]}\n Pred: {label_list[y_hat[i]]}')
    plt.xticks([])
    plt.yticks([])
plt.show()


# ### 2.5. Softmax regression using higher-level APIs in Keras

# In[2]:

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

# Get the train and test inputs and outputs
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
X_train = X_train/255
X_test = X_test/255
input_size = X_train[0, :, :].shape
output_size = 10

# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1

# Define a model with flattened inputs and a densely-connected NN layer
model = tf.keras.Sequential([tf.keras.layers.Flatten(input_shape=input_size),
                             tf.keras.layers.Dense(output_size, activation=None,
                                                   kernel_initializer=tf.initializers.RandomNormal(mean=0, stddev=0.01),
                                                   bias_initializer='zeros')])

# Configure the model with SGD optimizer, cross-entropy loss (with integers, not one-hot), and accuracy metrics
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Train the model
model.fit(x=X_train, y=y_train, batch_size=batch_size, epochs=number_epochs, verbose=1)

# Show some predictions
number_examples = 10
Y_hat = model.predict(X_test[:number_examples, :, :])
y_hat = np.argmax(Y_hat, axis=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
    plt.subplot(1, number_examples, i+1)
    plt.imshow(X_test[i, :, :], cmap='binary')
    plt.title(f'True: {label_list[y_test[i]]}\n Pred: {label_list[y_hat[i]]}')
    plt.xticks([])
    plt.yticks([])
plt.show()
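# A minimal sketch (assuming the Keras model and test arrays above) of measuring the overall
# test loss and accuracy in one call with model.evaluate, mirroring the per-epoch test accuracy
# computed manually in the PyTorch versions.

# In[ ]:

test_loss, test_acc = model.evaluate(x=X_test, y=y_test, batch_size=batch_size, verbose=0)
print(f'test_loss={test_loss:.3f}; test_accuracy={test_acc:.3f}')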
# ## 3. Multilayer Perceptron
# https://d2l.ai/chapter_multilayer-perceptrons

# ### 3.1. MLP from scratch in NumPy

# In[5]:

import matplotlib.pyplot as plt
import numpy as np
import random
import tensorflow as tf

# Get the train and test inputs and outputs
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
number_train = len(X_train)
number_test = len(X_test)

# Normalize and flatten the inputs
input_size = np.size(X_train[0])
X_train = np.reshape(X_train/255, (number_train, input_size))
X_test = np.reshape(X_test/255, (number_test, input_size))

# Derive one-hot versions of the train outputs
output_size = 10
Y_train = np.zeros((number_train, output_size))
Y_train[np.arange(number_train), y_train] = 1

# Initialize the weights and biases to recover
hidden_size = 256
W0 = np.random.default_rng().normal(0, 0.01, size=(input_size, hidden_size))
b0 = np.zeros(hidden_size)
W1 = np.random.default_rng().normal(0, 0.01, size=(hidden_size, output_size))
b1 = np.zeros(output_size)

# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1

# Initialize lists for the mean train loss and accuracy over the minibatches for every epoch
train_loss = [[] for _ in range(number_epochs)]
train_accuracy = [[] for _ in range(number_epochs)]

# Initialize a list for the overall test accuracy for every epoch
test_accuracy = [None]*number_epochs

# Loop over the epochs
for i in range(number_epochs):

    # Generate random indices for all the train examples
    train_indices = np.arange(number_train)
    random.shuffle(train_indices)

    # Loop over the train examples in minibatches
    for j in np.arange(0, number_train, batch_size):

        # Get the indices of the train examples for one minibatch
        batch_indices = train_indices[j:min(j+batch_size, number_train)]

        # Get the train inputs and outputs for the minibatch
        X = X_train[batch_indices, :]
        y = y_train[batch_indices]
        Y = Y_train[batch_indices]

        # Compute the outputs of the model (with ReLU)
        H = np.matmul(X, W0) + b0
        H[H<0] = 0
        O = np.matmul(H, W1) + b1

        # Compute the softmax of the logits (shifted by their maximum to avoid numerical stability issues)
        O = O-np.max(O, axis=1)[:, None]
        O_exp = np.exp(O)
        Y_hat = O_exp/np.sum(O_exp, axis=1)[:, None]

        # Compute the mean cross-entropy loss for the minibatch and save it
        # (log-sum-exp of the logits minus the logit of the true class)
        l = np.mean(np.log(np.sum(O_exp, axis=1))-np.sum(Y*O, axis=1))
        train_loss[i].append(l)

        # Compute the mean accuracy for the minibatch and save it
        a = np.mean(np.argmax(Y_hat, axis=1)==y)
        train_accuracy[i].append(a)

        # Compute the derivative of the loss wrt the logits of the output layer
        dl1 = Y_hat-Y

        # Derive the derivative of the loss wrt the pre-activations of the hidden layer
        # (using the chain rule, with the ReLU derivative masking the non-positive pre-activations)
        dl0 = np.matmul(dl1, W1.T)*(H>0)

        # Update the weights and biases of the output layer using SGD
        W1 = W1-learning_rate*np.matmul(H.T, dl1)/np.shape(H)[0]
        b1 = b1-learning_rate*np.mean(dl1, axis=0)

        # Update the weights and biases of the hidden layer using SGD
        W0 = W0-learning_rate*np.matmul(X.T, dl0)/np.shape(X)[0]
        b0 = b0-learning_rate*np.mean(dl0, axis=0)

    # Derive the mean train loss and accuracy for the current epoch
    train_loss[i] = np.mean(train_loss[i])
    train_accuracy[i] = np.mean(train_accuracy[i])

    # Compute the test outputs and derive the test accuracy for the current epoch
    H = np.matmul(X_test, W0) + b0
    H[H<0] = 0
    O = np.matmul(H, W1) + b1
    O = O-np.max(O, axis=1)[:, None]
    O_exp = np.exp(O)
    Y_hat = O_exp/np.sum(O_exp, axis=1)[:, None]
    test_accuracy[i] = np.mean(np.argmax(Y_hat, axis=1)==y_test)

    # Print the progress
    print(f'{i+1}/{number_epochs}: train_loss={train_loss[i]:.3f}; train_accuracy={train_accuracy[i]:.3f}; test_accuracy={test_accuracy[i]:.3f}')
# Show some predictions
number_examples = 10
H = np.matmul(X_test[:number_examples, :], W0) + b0
H[H<0] = 0
O = np.matmul(H, W1) + b1
O = O-np.max(O, axis=1)[:, None]
O_exp = np.exp(O)
Y_hat = O_exp/np.sum(O_exp, axis=1)[:, None]
y_hat = np.argmax(Y_hat, axis=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
    plt.subplot(1, number_examples, i+1)
    plt.imshow(np.reshape(X_test[i, :], (28, 28)), cmap='binary')
    plt.title(f'True: {label_list[y_test[i].item()]}\n Pred: {label_list[y_hat[i]]}')
    plt.xticks([])
    plt.yticks([])
plt.show()
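# A minimal sketch (a toy example, separate from the training code above) of the ReLU derivative
# used in the hidden-layer backpropagation: gradients only flow through positions where the
# pre-activation is positive, which is what the (H>0) mask implements.

# In[ ]:

h_toy = np.array([-1.5, 0.0, 0.3, 2.0])
relu_toy = np.maximum(h_toy, 0)
relu_grad_toy = (h_toy > 0).astype(float)
print(relu_toy, relu_grad_toy)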
# ### 3.2. MLP from scratch in PyTorch

# In[3]:

import matplotlib.pyplot as plt
import torch
import torchvision
from torch.utils import data

# Get the dataset (transform the image data from PIL type to normalized 32-bit floating point tensors)
fmnist_train = torchvision.datasets.FashionMNIST(root='data', train=True, download=True,
                                                 transform=torchvision.transforms.ToTensor())
fmnist_test = torchvision.datasets.FashionMNIST(root='data', train=False, download=True,
                                                transform=torchvision.transforms.ToTensor())

# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1

# Use data iterators to read a minibatch at each iteration, shuffling the examples for the train set and using 4 processes
train_iter = data.DataLoader(fmnist_train, batch_size, shuffle=True, num_workers=4)
test_iter = data.DataLoader(fmnist_test, batch_size, shuffle=False, num_workers=4)

# Initialize the parameters to recover, requiring the gradients to be computed
input_size = fmnist_train[0][0].nelement()
output_size = 10
hidden_size = 256
W0 = torch.normal(0, 0.01, size=(input_size, hidden_size), requires_grad=True)
b0 = torch.zeros(hidden_size, requires_grad=True)
W1 = torch.normal(0, 0.01, size=(hidden_size, output_size), requires_grad=True)
b1 = torch.zeros(output_size, requires_grad=True)

# Initialize lists for the mean train loss, train and test accuracy over the minibatches for every epoch
train_loss = [[] for _ in range(number_epochs)]
train_accuracy = [[] for _ in range(number_epochs)]
test_accuracy = [[] for _ in range(number_epochs)]

# Loop over the epochs
for i in range(number_epochs):

    # Loop over the train examples in minibatches
    for X, y in train_iter:

        # Compute the outputs of the model (with ReLU), after flattening the images
        H = torch.matmul(torch.reshape(X, (-1, input_size)), W0) + b0
        H[H<0] = 0
        O = torch.matmul(H, W1) + b1

        # Compute the softmax of the logits
        O_exp = torch.exp(O)
        Y_hat = O_exp/torch.sum(O_exp, 1, keepdim=True)

        # Compute the cross-entropy loss (use the indices of the true classes in y
        # to get the corresponding predicted probabilities in Y_hat, for all the examples)
        l = -torch.log(Y_hat[range(Y_hat.shape[0]), y])

        # Save the mean loss for the current minibatch
        train_loss[i].append(torch.mean(l).item())

        # Compute the mean accuracy for the current minibatch and save it
        a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
        train_accuracy[i].append(a)

        # Compute the gradient on l with respect to the weights and biases
        # (sum and not mean as the gradients will be divided by the batch size during SGD)
        torch.sum(l).backward()

        # Disable gradient calculation so that the following operations are not differentiated
        with torch.no_grad():

            # Update the weights and biases using SGD
            # (use augmented assignments to avoid modifying existing variables)
            W1 -= learning_rate*W1.grad/len(l)
            b1 -= learning_rate*b1.grad/len(l)
            W0 -= learning_rate*W0.grad/len(l)
            b0 -= learning_rate*b0.grad/len(l)

            # Set the gradients to zeros to avoid accumulating gradients
            W1.grad.zero_()
            b1.grad.zero_()
            W0.grad.zero_()
            b0.grad.zero_()

    # Derive the mean train loss and accuracy for the current epoch
    train_loss[i] = sum(train_loss[i])/len(train_loss[i])
    train_accuracy[i] = sum(train_accuracy[i])/len(train_accuracy[i])

    # Compute the test outputs and derive the test accuracy for every epoch, in minibatches
    with torch.no_grad():
        for X, y in test_iter:
            H = torch.matmul(torch.reshape(X, (-1, input_size)), W0) + b0
            H[H<0] = 0
            O = torch.matmul(H, W1) + b1
            O_exp = torch.exp(O)
            Y_hat = O_exp/torch.sum(O_exp, 1, keepdim=True)
            a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
            test_accuracy[i].append(a)
        test_accuracy[i] = sum(test_accuracy[i])/len(test_accuracy[i])

    # Print the progress
    print(f'{i+1}/{number_epochs}: train_loss={train_loss[i]:.3f}; train_accuracy={train_accuracy[i]:.3f}; test_accuracy={test_accuracy[i]:.3f}')

# Show some predictions
for X, y in test_iter:
    break
number_examples = 10
H = torch.matmul(torch.reshape(X[:number_examples], (-1, input_size)), W0) + b0
H[H<0] = 0
O = torch.matmul(H, W1) + b1
O_exp = torch.exp(O)
Y_hat = O_exp/torch.sum(O_exp, 1, keepdim=True)
y_hat = torch.argmax(Y_hat, dim=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
    plt.subplot(1, number_examples, i+1)
    plt.imshow(X[i][0], cmap='binary')
    plt.title(f'True: {label_list[y[i].item()]}\n Pred: {label_list[y_hat[i]]}')
    plt.xticks([])
    plt.yticks([])
plt.show()
# ### 3.3. MLP using APIs in PyTorch

# In[1]:

import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils import data
import torchvision

# Get the dataset (transform the image data from PIL type to normalized 32-bit floating point tensors)
fmnist_train = torchvision.datasets.FashionMNIST(root='data', train=True, download=True,
                                                 transform=torchvision.transforms.ToTensor())
fmnist_test = torchvision.datasets.FashionMNIST(root='data', train=False, download=True,
                                                transform=torchvision.transforms.ToTensor())

# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1

# Use data iterators to read a minibatch at each iteration, shuffling the examples for the train set and using 4 processes
train_iter = data.DataLoader(fmnist_train, batch_size, shuffle=True, num_workers=4)
test_iter = data.DataLoader(fmnist_test, batch_size, shuffle=False, num_workers=4)

# Define the model, with a flatten layer to reshape the inputs, two fully-connected layers, and a ReLU in-between
input_size = fmnist_train[0][0].nelement()
hidden_size = 256
output_size = 10
model = nn.Sequential(nn.Flatten(), nn.Linear(input_size, hidden_size), nn.ReLU(), nn.Linear(hidden_size, output_size))

# Initialize the parameters by applying a function recursively to every submodule
def init(m):
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, std=0.01)
model.apply(init);

# Define the loss function (with no reduction applied to the output, no mean, no sum, none)
loss = nn.CrossEntropyLoss(reduction='none')

# Define the optimization algorithm
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Initialize lists for the mean train loss, train and test accuracy over the minibatches for every epoch
train_loss = [[] for _ in range(number_epochs)]
train_accuracy = [[] for _ in range(number_epochs)]
test_accuracy = [[] for _ in range(number_epochs)]

# Loop over the epochs
for i in range(number_epochs):

    # Loop over the train examples in minibatches
    for X, y in train_iter:

        # Compute the predicted outputs
        Y_hat = model(X)

        # Compute the loss
        l = loss(Y_hat, y)

        # Save the mean loss for the current minibatch
        train_loss[i].append(torch.mean(l).item())

        # Compute the mean accuracy for the current minibatch and save it
        a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
        train_accuracy[i].append(a)

        # Set the gradients to zero
        optimizer.zero_grad()

        # Compute the gradient
        l.mean().backward()

        # Performs a single parameter update
        optimizer.step()

    # Derive the mean train loss and accuracy for the current epoch
    train_loss[i] = sum(train_loss[i])/len(train_loss[i])
    train_accuracy[i] = sum(train_accuracy[i])/len(train_accuracy[i])

    # Compute the test outputs and derive the test accuracy for every epoch, in minibatches
    with torch.no_grad():
        for X, y in test_iter:
            Y_hat = model(X)
            a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
            test_accuracy[i].append(a)
        test_accuracy[i] = sum(test_accuracy[i])/len(test_accuracy[i])

    # Print the progress
    print(f'{i+1}/{number_epochs}: train_loss={train_loss[i]:.3f}; train_accuracy={train_accuracy[i]:.3f}; test_accuracy={test_accuracy[i]:.3f}')

# Show some predictions
for X, y in test_iter:
    break
number_examples = 10
Y_hat = model(X[:number_examples])
y_hat = torch.argmax(Y_hat, dim=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
    plt.subplot(1, number_examples, i+1)
    plt.imshow(X[i][0], cmap='binary')
    plt.title(f'True: {label_list[y[i].item()]}\n Pred: {label_list[y_hat[i]]}')
    plt.xticks([])
    plt.yticks([])
plt.show()
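# A minimal sketch (assuming the model defined above) of counting the trainable parameters of
# the MLP, which should match input_size*hidden_size + hidden_size + hidden_size*output_size + output_size.

# In[ ]:

number_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'number of trainable parameters: {number_parameters}')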
# ### 3.4. MLP using higher-level APIs in Keras

# In[4]:

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

# Get the train and test inputs and outputs
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
X_train = X_train/255
X_test = X_test/255
input_size = X_train[0, :, :].shape
hidden_size = 256
output_size = 10

# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1

# Define a model with flattened inputs, a densely-connected NN layer with a ReLU, and another one without activation
model = tf.keras.Sequential([tf.keras.layers.Flatten(input_shape=input_size),
                             tf.keras.layers.Dense(hidden_size, activation='relu',
                                                   kernel_initializer=tf.initializers.RandomNormal(mean=0, stddev=0.01),
                                                   bias_initializer='zeros'),
                             tf.keras.layers.Dense(output_size, activation=None,
                                                   kernel_initializer=tf.initializers.RandomNormal(mean=0, stddev=0.01),
                                                   bias_initializer='zeros')])

# Configure the model with SGD optimizer, cross-entropy loss (with integers, not one-hot), and accuracy metrics
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Train the model
model.fit(x=X_train, y=y_train, batch_size=batch_size, epochs=number_epochs, verbose=1)

# Show some predictions
number_examples = 10
Y_hat = model.predict(X_test[:number_examples, :, :])
y_hat = np.argmax(Y_hat, axis=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
    plt.subplot(1, number_examples, i+1)
    plt.imshow(X_test[i, :, :], cmap='binary')
    plt.title(f'True: {label_list[y_test[i]]}\n Pred: {label_list[y_hat[i]]}')
    plt.xticks([])
    plt.yticks([])
plt.show()


# In[ ]: