#!/usr/bin/env python
# coding: utf-8

# # Deep Learning

# ## 1. Linear Regression
# https://d2l.ai/chapter_linear-regression/

# ### 1.1. Linear regression from scratch in NumPy

# In[1]:

import numpy as np

# Define the true weights and bias of the model
w_true = np.array([2, -3.4])
b_true = 4.2

# Construct a random generator, seeded for reproducibility
rng = np.random.default_rng(seed=0)

# Generate the inputs (from a standard normal distribution) and outputs (with some Gaussian noise)
number_examples = 1000
input_size = len(w_true)
X = rng.normal(0, 1, (number_examples, input_size))
y = np.matmul(X, w_true) + b_true + rng.normal(0, 0.01, number_examples)

# Define the parameters for the training
number_epochs = 3
batch_size = 10
learning_rate = 0.03

# Initialize the weights and bias to recover
w = rng.normal(0, 1, input_size)
b = 0

# Initialize an array for the mean loss over the minibatches of every epoch
epoch_loss = np.zeros(number_epochs)

# Loop over the epochs
for i in range(number_epochs):

    # Generate random indices for all the examples
    example_indices = np.arange(number_examples)
    rng.shuffle(example_indices)

    # Initialize a list for the mean loss over the examples of every minibatch
    batch_loss = []

    # Loop over the examples in minibatches
    for j in np.arange(0, number_examples, batch_size):

        # Get the indices of the examples for one minibatch
        batch_indices = example_indices[j:min(j+batch_size, number_examples)]

        # Get the inputs and outputs for the current minibatch
        X_batch = X[batch_indices, :]
        y_batch = y[batch_indices]

        # Compute the predicted outputs
        y_hat = np.matmul(X_batch, w) + b

        # Compute the loss between the predicted and true outputs
        l = 0.5*np.power(y_hat-y_batch, 2)

        # Save the mean loss for the current minibatch
        batch_loss.append(np.mean(l))

        # Update the weights and bias using stochastic gradient descent (SGD)
        w = w - learning_rate*np.mean(X_batch*(y_hat-y_batch)[:, None], axis=0)
        b = b - learning_rate*np.mean(y_hat-y_batch, axis=0)

    # Save the mean loss for the current epoch
    epoch_loss[i] = np.mean(batch_loss)

    # Print the progress
    print(f'{i+1}/{number_epochs}: {epoch_loss[i]}')

# Print the predicted weights and bias
print('')
print(f'w = {w}')
print(f'b = {b}')
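# As a sanity check (a minimal sketch, not part of the original training loop), the same
# parameters can be estimated in closed form with ordinary least squares: append a column of
# ones to X so the bias is absorbed into the weights, then solve with np.linalg.lstsq.

# In[ ]:

X_augmented = np.concatenate([X, np.ones((number_examples, 1))], axis=1)
theta, _, _, _ = np.linalg.lstsq(X_augmented, y, rcond=None)
print(f'closed-form w = {theta[:-1]}, b = {theta[-1]}')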
# ### 1.2. Linear regression from scratch in PyTorch

# In[4]:

import torch

# Define the true weights and bias of the model
w_true = torch.tensor([2, -3.4])
b_true = 4.2

# Generate inputs and outputs
number_examples = 1000
input_size = len(w_true)
X = torch.normal(0, 1, (number_examples, input_size))
y = torch.matmul(X, w_true) + b_true + torch.normal(0, 0.01, [number_examples])

# Define a function to read the dataset in random minibatches
def batch(X, y, batch_size):

    # Generate random indices for all the examples
    number_examples = X.shape[0]
    example_indices = torch.randperm(number_examples)

    # Loop over the examples in minibatches
    for i in range(0, number_examples, batch_size):

        # Get the indices of the examples for one minibatch
        batch_indices = example_indices[i:min(i+batch_size, number_examples)]

        # Return the input and output for the current minibatch and continue the iteration in the function
        yield X[batch_indices], y[batch_indices]

# Define the parameters for the training
number_epochs = 3
batch_size = 10
learning_rate = 0.03

# Initialize the weights and bias to recover, requiring the gradients to be computed
w = torch.normal(0, 1, [input_size], requires_grad=True)
b = torch.zeros(1, requires_grad=True)

# Initialize an array for the mean loss over the minibatches of every epoch
epoch_loss = torch.zeros(number_epochs)

# Loop over the epochs
for i in range(number_epochs):

    # Initialize a list for the mean loss over the examples of every minibatch
    batch_loss = []

    # Loop over the examples in minibatches
    for X_batch, y_batch in batch(X, y, batch_size):

        # Compute the predicted outputs
        y_hat = torch.matmul(X_batch, w) + b

        # Compute the loss between the predicted and true outputs
        l = 0.5*(y_hat-y_batch)**2

        # Compute the gradient on l wrt w and b
        # (sum and not mean as the gradients will be divided by the batch size during SGD)
        l.sum().backward()

        # Save the mean loss for the current minibatch
        batch_loss.append(l.mean())

        # Temporarily sets all the requires_grad flags to false
        with torch.no_grad():

            # Update the weights and bias using SGD
            # (use augmented assignments to avoid modifying existing variables)
            w -= learning_rate*w.grad/len(l)
            b -= learning_rate*b.grad/len(l)

            # Set the gradients to zeros to avoid accumulating gradients
            w.grad.zero_()
            b.grad.zero_()

    # Save the mean loss for the current epoch
    epoch_loss[i] = sum(batch_loss)/len(batch_loss)

    # Print the progress
    print(f'{i+1}/{number_epochs}: {epoch_loss[i]}')

# Print the predicted weights and bias
print('')
print(f'w = {w}')
print(f'b = {b}')
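# A minimal sketch (assuming the tensors defined above) of evaluating the fit: under
# torch.no_grad(), compute the mean loss over the full dataset and compare the learned
# parameters with the true ones.

# In[ ]:

with torch.no_grad():
    full_loss = (0.5*(torch.matmul(X, w) + b - y)**2).mean()
    print(f'full-dataset loss = {full_loss.item():.6f}')
    print(f'max error in w = {(w - w_true).abs().max().item():.4f}')
    print(f'error in b = {(b - b_true).abs().item():.4f}')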
# ### 1.3. Linear regression using APIs in PyTorch

# In[1]:

import torch
from torch.utils import data
from torch import nn

# Define the true weights and bias of the model
w_true = torch.tensor([2, -3.4])
b_true = 4.2

# Generate inputs and outputs
number_examples = 1000
input_size = len(w_true)
X = torch.normal(0, 1, (number_examples, input_size))
y = torch.matmul(X, w_true) + b_true + torch.normal(0, 0.01, [number_examples])

# Define a function to read the dataset in random minibatches by using a data iterator
def batch(X, y, batch_size):
    data_set = data.TensorDataset(*(X, y))
    return data.DataLoader(data_set, batch_size, shuffle=True)

# Define the parameters for the training
number_epochs = 3
batch_size = 10
learning_rate = 0.03

# Define the model with a fully-connected layer
model = nn.Sequential(nn.Linear(input_size, 1))

# Initialize the parameters
model[0].weight.data.normal_(0, 0.01)
model[0].bias.data.fill_(0)

# Define the loss function (mean squared error, without the 0.5 factor)
loss = nn.MSELoss()

# Define the optimization algorithm (SGD)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Initialize an array for the mean loss over the minibatches of every epoch
epoch_loss = torch.zeros(number_epochs)

# Loop over the epochs
for i in range(number_epochs):

    # Initialize a list for the mean loss over the examples of every minibatch
    batch_loss = []

    # Loop over the examples in minibatches
    for X_batch, y_batch in batch(X, y, batch_size):

        # Compute the predicted outputs
        y_hat = model(X_batch)

        # Compute the loss between the predicted and true outputs
        l = loss(y_hat, y_batch[:, None])

        # Save the loss for the current minibatch
        batch_loss.append(l)

        # Set the gradients to zero
        optimizer.zero_grad()

        # Computes the gradient
        l.backward()

        # Performs a single parameter update
        optimizer.step()

    # Save the mean loss for the current epoch
    epoch_loss[i] = sum(batch_loss)/len(batch_loss)

    # Print the progress
    print(f'{i+1}/{number_epochs}: {epoch_loss[i]}')

# Print the predicted weights and bias
print('')
print(f'w = {model[0].weight.data}')
print(f'b = {model[0].bias.data}')


# ### 1.4. Linear regression using higher-level APIs in Keras

# In[45]:

import tensorflow as tf

# Define the true weights and bias of the model
w_true = tf.constant([2, -3.4], shape=[2, 1])
b_true = tf.constant(4.2)

# Generate inputs and outputs (the noise has the same shape as the targets so it does not broadcast)
number_examples = 1000
input_size = len(w_true)
tf.random.set_seed(0)
X = tf.random.normal([number_examples, input_size], 0, 1)
y = tf.matmul(X, w_true) + b_true + tf.random.normal([number_examples, 1], 0, 0.01)

# Define the parameters for the training
number_epochs = 3
batch_size = 10
learning_rate = 0.03

# Define the model with a densely-connected NN layer with initialized parameters
model = tf.keras.Sequential([tf.keras.layers.Dense(1,
                                                   kernel_initializer=tf.initializers.RandomNormal(mean=0, stddev=0.01),
                                                   bias_initializer='zeros')])

# Configure the model for training with SGD optimizer and MSE loss
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
              loss=tf.keras.losses.MeanSquaredError())

# Train the model given the batch size and number of epochs
model.fit(x=X, y=y, batch_size=batch_size, epochs=number_epochs, verbose=1)

# Print the predicted weights and bias
print('')
print(f'w = {model.get_weights()[0]}')
print(f'b = {model.get_weights()[1]}')
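# A minimal sketch (assuming the Keras model and data-generating settings above) of checking the
# fit on freshly generated data: draw new inputs, build the corresponding noiseless targets, and
# report the mean squared error with model.evaluate.

# In[ ]:

X_new = tf.random.normal([100, input_size], 0, 1)
y_new = tf.matmul(X_new, w_true) + b_true
print(model.evaluate(x=X_new, y=y_new, verbose=0))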
# ## 2. Softmax Regression
# https://d2l.ai/chapter_linear-classification/

# ### 2.1. Fashion-MNIST dataset

# In[2]:

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

# Get the Fashion-MNIST dataset, with train and test inputs and outputs
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

# Normalize the inputs
X_train = X_train/255
X_test = X_test/255

# Translate the outputs into labels
label_list = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
label_train = [label_list[i] for i in y_train]
label_test = [label_list[i] for i in y_test]

# Show a single example for the different classes
number_classes = len(label_list)
plt.figure(figsize=(18, 2))
for i in range(number_classes):
    j = np.where(y_train==i)[0][0]
    plt.subplot(1, number_classes, i+1)
    plt.imshow(X_train[j, :, :], cmap='binary')
    plt.title(label_list[i])
    plt.xticks([])
    plt.yticks([])
plt.show()
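# As a quick check (a minimal sketch, not part of the original walkthrough), the class
# distribution of the training set can be inspected with np.unique; Fashion-MNIST is balanced,
# so each of the 10 classes should appear 6000 times.

# In[ ]:

classes, counts = np.unique(y_train, return_counts=True)
for c, n in zip(classes, counts):
    print(f'{label_list[c]}: {n}')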
# ### 2.2. Softmax regression from scratch in NumPy

# In[1]:

import matplotlib.pyplot as plt
import numpy as np
import random
import tensorflow as tf

# Get the train and test inputs and outputs
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
number_train = len(X_train)
number_test = len(X_test)

# Normalize and flatten the inputs
input_size = np.size(X_train[0])
X_train = np.reshape(X_train/255, (number_train, input_size))
X_test = np.reshape(X_test/255, (number_test, input_size))

# Derive one-hot versions of the train outputs
output_size = 10
Y_train = np.zeros((number_train, output_size))
Y_train[np.arange(number_train), y_train] = 1

# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1

# Initialize the weights and bias to recover
W = np.random.default_rng().normal(0, 0.01, size=(input_size, output_size))
b = np.zeros(output_size)

# Initialize lists for the mean train loss and accuracy over the minibatches for every epoch
train_loss = [[] for _ in range(number_epochs)]
train_accuracy = [[] for _ in range(number_epochs)]

# Initialize a list for the overall test accuracy for every epoch
test_accuracy = [None]*number_epochs

# Loop over the epochs
for i in range(number_epochs):

    # Generate random indices for all the train examples
    train_indices = np.arange(number_train)
    random.shuffle(train_indices)

    # Loop over the train examples in minibatches
    for j in np.arange(0, number_train, batch_size):

        # Get the indices of the train examples for one minibatch
        batch_indices = train_indices[j:min(j+batch_size, number_train)]

        # Get the train inputs and outputs for the minibatch
        X = X_train[batch_indices, :]
        y = y_train[batch_indices]
        Y = Y_train[batch_indices]

        # Compute the predicted outputs (logits)
        O = np.matmul(X, W) + b

        # Compute the softmax of the logits (shifted by their maximum to avoid numerical stability issues)
        O = O-np.max(O, axis=1)[:, None]
        O_exp = np.exp(O)
        Y_hat = O_exp/np.sum(O_exp, axis=1)[:, None]

        # Compute the mean cross-entropy loss for the minibatch and save it
        # (log-sum-exp of the logits minus the logit of the true class)
        l = np.mean(np.log(np.sum(O_exp, axis=1))-np.sum(Y*O, axis=1))
        train_loss[i].append(l)

        # Compute the mean accuracy for the minibatch and save it
        a = np.mean(np.argmax(Y_hat, axis=1)==y)
        train_accuracy[i].append(a)

        # Update the weights and bias using SGD
        dl = Y_hat-Y
        W = W-learning_rate*np.matmul(X.T, dl)/np.shape(X)[0]
        b = b-learning_rate*np.mean(dl, axis=0)

    # Derive the mean train loss and accuracy for the current epoch
    train_loss[i] = np.mean(train_loss[i])
    train_accuracy[i] = np.mean(train_accuracy[i])

    # Compute the test outputs and derive the test accuracy for the current epoch
    O = np.matmul(X_test, W) + b
    O = O-np.max(O, axis=1)[:, None]
    O_exp = np.exp(O)
    Y_hat = O_exp/np.sum(O_exp, axis=1)[:, None]
    test_accuracy[i] = np.mean(np.argmax(Y_hat, axis=1)==y_test)

    # Print the progress
    print(f'{i+1}/{number_epochs}: train_loss={train_loss[i]:.3f}; train_accuracy={train_accuracy[i]:.3f}; test_accuracy={test_accuracy[i]:.3f}')

# Show some predictions
number_examples = 10
O = np.matmul(X_test[:number_examples, :], W) + b
O = O-np.max(O, axis=1)[:, None]
O_exp = np.exp(O)
Y_hat = O_exp/np.sum(O_exp, axis=1)[:, None]
y_hat = np.argmax(Y_hat, axis=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
    plt.subplot(1, number_examples, i+1)
    plt.imshow(np.reshape(X_test[i, :], (28, 28)), cmap='binary')
    plt.title(f'True: {label_list[y_test[i].item()]}\n Pred: {label_list[y_hat[i]]}')
    plt.xticks([])
    plt.yticks([])
plt.show()
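# A minimal sketch (hypothetical helpers, not used above) packaging the same log-sum-exp trick
# into reusable functions: shifting the logits by their row-wise maximum leaves the softmax and
# the cross-entropy unchanged but avoids overflow in np.exp.

# In[ ]:

def stable_softmax(O):
    # Shift each row by its maximum before exponentiating
    O_shifted = O - np.max(O, axis=1, keepdims=True)
    O_exp = np.exp(O_shifted)
    return O_exp/np.sum(O_exp, axis=1, keepdims=True)

def cross_entropy(O, y):
    # Mean of log-sum-exp of the logits minus the logit of the true class
    O_shifted = O - np.max(O, axis=1, keepdims=True)
    log_sum_exp = np.log(np.sum(np.exp(O_shifted), axis=1))
    return np.mean(log_sum_exp - O_shifted[np.arange(len(y)), y])

O_toy = np.array([[1000.0, 0.0], [0.0, 1.0]])
print(stable_softmax(O_toy))
print(cross_entropy(O_toy, np.array([0, 1])))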
# ### 2.3. Softmax regression from scratch in PyTorch

# In[2]:

import matplotlib.pyplot as plt
import torch
import torchvision
from torch.utils import data

# Get the dataset (transform the image data from PIL type to normalized 32-bit floating point tensors)
fmnist_train = torchvision.datasets.FashionMNIST(root='data', train=True, download=True,
                                                 transform=torchvision.transforms.ToTensor())
fmnist_test = torchvision.datasets.FashionMNIST(root='data', train=False, download=True,
                                                transform=torchvision.transforms.ToTensor())

# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1

# Use data iterators to read a minibatch at each iteration, shuffling the examples for the train set and using 4 processes
train_iter = data.DataLoader(fmnist_train, batch_size, shuffle=True, num_workers=4)
test_iter = data.DataLoader(fmnist_test, batch_size, shuffle=False, num_workers=4)

# Initialize the parameters to recover, requiring the gradients to be computed
input_size = fmnist_train[0][0].nelement()
output_size = 10
W = torch.normal(0, 0.01, size=(input_size, output_size), requires_grad=True)
b = torch.zeros(output_size, requires_grad=True)

# Initialize lists for the mean train loss, train and test accuracy over the minibatches for every epoch
train_loss = [[] for _ in range(number_epochs)]
train_accuracy = [[] for _ in range(number_epochs)]
test_accuracy = [[] for _ in range(number_epochs)]

# Loop over the epochs
for i in range(number_epochs):

    # Loop over the train examples in minibatches
    for X, y in train_iter:

        # Compute the logits, after flattening the images
        O = torch.matmul(torch.reshape(X, (-1, input_size)), W) + b

        # Compute the softmax of the logits
        O_exp = torch.exp(O)
        Y_hat = O_exp/torch.sum(O_exp, 1, keepdim=True)

        # Compute the cross-entropy loss (use the indices of the true classes in y
        # to get the corresponding predicted probabilities in Y_hat, for all the examples)
        l = -torch.log(Y_hat[range(Y_hat.shape[0]), y])

        # Save the mean loss for the current minibatch
        train_loss[i].append(torch.mean(l).item())

        # Compute the mean accuracy for the current minibatch and save it
        a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
        train_accuracy[i].append(a)

        # Compute the gradient on l with respect to W and b
        # (sum and not mean as the gradients will be divided by the batch size during SGD)
        torch.sum(l).backward()

        # Disable gradient calculation so that the following operations are not differentiated
        with torch.no_grad():

            # Update the weights and bias using SGD
            # (use augmented assignments to avoid modifying existing variables)
            W -= learning_rate*W.grad/len(l)
            b -= learning_rate*b.grad/len(l)

            # Set the gradients to zeros to avoid accumulating gradients
            W.grad.zero_()
            b.grad.zero_()

    # Derive the mean train loss and accuracy for the current epoch
    train_loss[i] = sum(train_loss[i])/len(train_loss[i])
    train_accuracy[i] = sum(train_accuracy[i])/len(train_accuracy[i])

    # Compute the test outputs and derive the test accuracy for every epoch, in minibatches
    with torch.no_grad():
        for X, y in test_iter:
            O = torch.matmul(torch.reshape(X, (-1, input_size)), W) + b
            O_exp = torch.exp(O)
            Y_hat = O_exp/torch.sum(O_exp, 1, keepdim=True)
            a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
            test_accuracy[i].append(a)
        test_accuracy[i] = sum(test_accuracy[i])/len(test_accuracy[i])

    # Print the progress
    print(f'{i+1}/{number_epochs}: train_loss={train_loss[i]:.3f}; train_accuracy={train_accuracy[i]:.3f}; test_accuracy={test_accuracy[i]:.3f}')

# Show some predictions
for X, y in test_iter:
    break
number_examples = 10
O = torch.matmul(torch.reshape(X[:number_examples], (-1, input_size)), W) + b
O_exp = torch.exp(O)
Y_hat = O_exp/torch.sum(O_exp, 1, keepdim=True)
y_hat = torch.argmax(Y_hat, dim=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
    plt.subplot(1, number_examples, i+1)
    plt.imshow(X[i][0], cmap='binary')
    plt.title(f'True: {label_list[y[i].item()]}\n Pred: {label_list[y_hat[i]]}')
    plt.xticks([])
    plt.yticks([])
plt.show()
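# A minimal sketch (assuming W, b and the test iterator above) cross-checking the manual
# softmax/cross-entropy against PyTorch's built-in torch.nn.functional.cross_entropy on one
# minibatch; the two values should agree up to floating-point error.

# In[ ]:

import torch.nn.functional as F

with torch.no_grad():
    for X, y in test_iter:
        O = torch.matmul(torch.reshape(X, (-1, input_size)), W) + b
        Y_hat = torch.exp(O)/torch.sum(torch.exp(O), 1, keepdim=True)
        manual = -torch.log(Y_hat[range(Y_hat.shape[0]), y]).mean()
        builtin = F.cross_entropy(O, y)
        print(f'manual={manual.item():.6f}, built-in={builtin.item():.6f}')
        break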
# ### 2.4. Softmax regression using APIs in PyTorch

# In[3]:

import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils import data
import torchvision

# Get the dataset (transform the image data from PIL type to normalized 32-bit floating point tensors)
fmnist_train = torchvision.datasets.FashionMNIST(root='data', train=True, download=True,
                                                 transform=torchvision.transforms.ToTensor())
fmnist_test = torchvision.datasets.FashionMNIST(root='data', train=False, download=True,
                                                transform=torchvision.transforms.ToTensor())

# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1

# Use data iterators to read a minibatch at each iteration, shuffling the examples for the train set and using 4 processes
train_iter = data.DataLoader(fmnist_train, batch_size, shuffle=True, num_workers=4)
test_iter = data.DataLoader(fmnist_test, batch_size, shuffle=False, num_workers=4)

# Define the model, with a flatten layer to reshape the inputs before the fully-connected layer
input_size = fmnist_train[0][0].nelement()
output_size = 10
model = nn.Sequential(nn.Flatten(), nn.Linear(input_size, output_size))

# Initialize the parameters by applying a function recursively to every submodule
def init(m):
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, std=0.01)
model.apply(init);

# Define the loss function (with no reduction applied to the output, no mean, no sum, none)
loss = nn.CrossEntropyLoss(reduction='none')

# Define the optimization algorithm
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Initialize lists for the mean train loss, train and test accuracy over the minibatches for every epoch
train_loss = [[] for _ in range(number_epochs)]
train_accuracy = [[] for _ in range(number_epochs)]
test_accuracy = [[] for _ in range(number_epochs)]

# Loop over the epochs
for i in range(number_epochs):

    # Loop over the train examples in minibatches
    for X, y in train_iter:

        # Compute the predicted outputs
        Y_hat = model(X)

        # Compute the loss
        l = loss(Y_hat, y)

        # Save the mean loss for the current minibatch
        train_loss[i].append(torch.mean(l).item())

        # Compute the mean accuracy for the current minibatch and save it
        a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
        train_accuracy[i].append(a)

        # Set the gradients to zero
        optimizer.zero_grad()

        # Compute the gradient
        l.mean().backward()

        # Performs a single parameter update
        optimizer.step()

    # Derive the mean train loss and accuracy for the current epoch
    train_loss[i] = sum(train_loss[i])/len(train_loss[i])
    train_accuracy[i] = sum(train_accuracy[i])/len(train_accuracy[i])

    # Compute the test outputs and derive the test accuracy for every epoch, in minibatches
    with torch.no_grad():
        for X, y in test_iter:
            Y_hat = model(X)
            a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
            test_accuracy[i].append(a)
        test_accuracy[i] = sum(test_accuracy[i])/len(test_accuracy[i])

    # Print the progress
    print(f'{i+1}/{number_epochs}: train_loss={train_loss[i]:.3f}; train_accuracy={train_accuracy[i]:.3f}; test_accuracy={test_accuracy[i]:.3f}')

# Show some predictions
for X, y in test_iter:
    break
number_examples = 10
Y_hat = model(X[:number_examples])
y_hat = torch.argmax(Y_hat, dim=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
    plt.subplot(1, number_examples, i+1)
    plt.imshow(X[i][0], cmap='binary')
    plt.title(f'True: {label_list[y[i].item()]}\n Pred: {label_list[y_hat[i]]}')
    plt.xticks([])
    plt.yticks([])
plt.show()


# ### 2.5. Softmax regression using higher-level APIs in Keras

# In[2]:

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

# Get the train and test inputs and outputs
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
X_train = X_train/255
X_test = X_test/255
input_size = X_train[0, :, :].shape
output_size = 10

# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1

# Define a model with flattened inputs and a densely-connected NN layer
model = tf.keras.Sequential([tf.keras.layers.Flatten(input_shape=input_size),
                             tf.keras.layers.Dense(output_size, activation=None,
                                                   kernel_initializer=tf.initializers.RandomNormal(mean=0, stddev=0.01),
                                                   bias_initializer='zeros')])

# Configure the model with SGD optimizer, cross-entropy loss (with integers, not one-hot), and accuracy metrics
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Train the model
model.fit(x=X_train, y=y_train, batch_size=batch_size, epochs=number_epochs, verbose=1)

# Show some predictions
number_examples = 10
Y_hat = model.predict(X_test[:number_examples, :, :])
y_hat = np.argmax(Y_hat, axis=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
    plt.subplot(1, number_examples, i+1)
    plt.imshow(X_test[i, :, :], cmap='binary')
    plt.title(f'True: {label_list[y_test[i]]}\n Pred: {label_list[y_hat[i]]}')
    plt.xticks([])
    plt.yticks([])
plt.show()
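# A minimal sketch (assuming the Keras model and test arrays above) of measuring the overall
# test loss and accuracy in one call with model.evaluate, mirroring the per-epoch test accuracy
# computed manually in the PyTorch versions.

# In[ ]:

test_loss, test_acc = model.evaluate(x=X_test, y=y_test, batch_size=batch_size, verbose=0)
print(f'test_loss={test_loss:.3f}; test_accuracy={test_acc:.3f}')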
# ## 3. Multilayer Perceptron
# https://d2l.ai/chapter_multilayer-perceptrons

# ### 3.1. MLP from scratch in NumPy

# In[5]:

import matplotlib.pyplot as plt
import numpy as np
import random
import tensorflow as tf

# Get the train and test inputs and outputs
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
number_train = len(X_train)
number_test = len(X_test)

# Normalize and flatten the inputs
input_size = np.size(X_train[0])
X_train = np.reshape(X_train/255, (number_train, input_size))
X_test = np.reshape(X_test/255, (number_test, input_size))

# Derive one-hot versions of the train outputs
output_size = 10
Y_train = np.zeros((number_train, output_size))
Y_train[np.arange(number_train), y_train] = 1

# Initialize the weights and biases to recover
hidden_size = 256
W0 = np.random.default_rng().normal(0, 0.01, size=(input_size, hidden_size))
b0 = np.zeros(hidden_size)
W1 = np.random.default_rng().normal(0, 0.01, size=(hidden_size, output_size))
b1 = np.zeros(output_size)

# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1

# Initialize lists for the mean train loss and accuracy over the minibatches for every epoch
train_loss = [[] for _ in range(number_epochs)]
train_accuracy = [[] for _ in range(number_epochs)]

# Initialize a list for the overall test accuracy for every epoch
test_accuracy = [None]*number_epochs

# Loop over the epochs
for i in range(number_epochs):

    # Generate random indices for all the train examples
    train_indices = np.arange(number_train)
    random.shuffle(train_indices)

    # Loop over the train examples in minibatches
    for j in np.arange(0, number_train, batch_size):

        # Get the indices of the train examples for one minibatch
        batch_indices = train_indices[j:min(j+batch_size, number_train)]

        # Get the train inputs and outputs for the minibatch
        X = X_train[batch_indices, :]
        y = y_train[batch_indices]
        Y = Y_train[batch_indices]

        # Compute the outputs of the model (with ReLU)
        H = np.matmul(X, W0) + b0
        H[H<0] = 0
        O = np.matmul(H, W1) + b1

        # Compute the softmax of the logits (shifted by their maximum to avoid numerical stability issues)
        O = O-np.max(O, axis=1)[:, None]
        O_exp = np.exp(O)
        Y_hat = O_exp/np.sum(O_exp, axis=1)[:, None]

        # Compute the mean cross-entropy loss for the minibatch and save it
        # (log-sum-exp of the logits minus the logit of the true class)
        l = np.mean(np.log(np.sum(O_exp, axis=1))-np.sum(Y*O, axis=1))
        train_loss[i].append(l)

        # Compute the mean accuracy for the minibatch and save it
        a = np.mean(np.argmax(Y_hat, axis=1)==y)
        train_accuracy[i].append(a)

        # Compute the derivative of the loss wrt the logits of the output layer
        dl1 = Y_hat-Y

        # Derive the derivative of the loss wrt the pre-activations of the hidden layer
        # (using the chain rule, with the ReLU derivative masking the non-positive pre-activations)
        dl0 = np.matmul(dl1, W1.T)*(H>0)

        # Update the weights and biases of the output layer using SGD
        W1 = W1-learning_rate*np.matmul(H.T, dl1)/np.shape(H)[0]
        b1 = b1-learning_rate*np.mean(dl1, axis=0)

        # Update the weights and biases of the hidden layer using SGD
        W0 = W0-learning_rate*np.matmul(X.T, dl0)/np.shape(X)[0]
        b0 = b0-learning_rate*np.mean(dl0, axis=0)

    # Derive the mean train loss and accuracy for the current epoch
    train_loss[i] = np.mean(train_loss[i])
    train_accuracy[i] = np.mean(train_accuracy[i])

    # Compute the test outputs and derive the test accuracy for the current epoch
    H = np.matmul(X_test, W0) + b0
    H[H<0] = 0
    O = np.matmul(H, W1) + b1
    O = O-np.max(O, axis=1)[:, None]
    O_exp = np.exp(O)
    Y_hat = O_exp/np.sum(O_exp, axis=1)[:, None]
    test_accuracy[i] = np.mean(np.argmax(Y_hat, axis=1)==y_test)

    # Print the progress
    print(f'{i+1}/{number_epochs}: train_loss={train_loss[i]:.3f}; train_accuracy={train_accuracy[i]:.3f}; test_accuracy={test_accuracy[i]:.3f}')
# Show some predictions
number_examples = 10
H = np.matmul(X_test[:number_examples, :], W0) + b0
H[H<0] = 0
O = np.matmul(H, W1) + b1
O = O-np.max(O, axis=1)[:, None]
O_exp = np.exp(O)
Y_hat = O_exp/np.sum(O_exp, axis=1)[:, None]
y_hat = np.argmax(Y_hat, axis=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
    plt.subplot(1, number_examples, i+1)
    plt.imshow(np.reshape(X_test[i, :], (28, 28)), cmap='binary')
    plt.title(f'True: {label_list[y_test[i].item()]}\n Pred: {label_list[y_hat[i]]}')
    plt.xticks([])
    plt.yticks([])
plt.show()
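# A minimal sketch (a toy example, separate from the training code above) of the ReLU derivative
# used in the hidden-layer backpropagation: gradients only flow through positions where the
# pre-activation is positive, which is what the (H>0) mask implements.

# In[ ]:

h_toy = np.array([-1.5, 0.0, 0.3, 2.0])
relu_toy = np.maximum(h_toy, 0)
relu_grad_toy = (h_toy > 0).astype(float)
print(relu_toy, relu_grad_toy)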
# ### 3.2. MLP from scratch in PyTorch

# In[3]:

import matplotlib.pyplot as plt
import torch
import torchvision
from torch.utils import data

# Get the dataset (transform the image data from PIL type to normalized 32-bit floating point tensors)
fmnist_train = torchvision.datasets.FashionMNIST(root='data', train=True, download=True,
                                                 transform=torchvision.transforms.ToTensor())
fmnist_test = torchvision.datasets.FashionMNIST(root='data', train=False, download=True,
                                                transform=torchvision.transforms.ToTensor())

# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1

# Use data iterators to read a minibatch at each iteration, shuffling the examples for the train set and using 4 processes
train_iter = data.DataLoader(fmnist_train, batch_size, shuffle=True, num_workers=4)
test_iter = data.DataLoader(fmnist_test, batch_size, shuffle=False, num_workers=4)

# Initialize the parameters to recover, requiring the gradients to be computed
input_size = fmnist_train[0][0].nelement()
output_size = 10
hidden_size = 256
W0 = torch.normal(0, 0.01, size=(input_size, hidden_size), requires_grad=True)
b0 = torch.zeros(hidden_size, requires_grad=True)
W1 = torch.normal(0, 0.01, size=(hidden_size, output_size), requires_grad=True)
b1 = torch.zeros(output_size, requires_grad=True)

# Initialize lists for the mean train loss, train and test accuracy over the minibatches for every epoch
train_loss = [[] for _ in range(number_epochs)]
train_accuracy = [[] for _ in range(number_epochs)]
test_accuracy = [[] for _ in range(number_epochs)]

# Loop over the epochs
for i in range(number_epochs):

    # Loop over the train examples in minibatches
    for X, y in train_iter:

        # Compute the outputs of the model (with ReLU), after flattening the images
        H = torch.matmul(torch.reshape(X, (-1, input_size)), W0) + b0
        H[H<0] = 0
        O = torch.matmul(H, W1) + b1

        # Compute the softmax of the logits
        O_exp = torch.exp(O)
        Y_hat = O_exp/torch.sum(O_exp, 1, keepdim=True)

        # Compute the cross-entropy loss (use the indices of the true classes in y
        # to get the corresponding predicted probabilities in Y_hat, for all the examples)
        l = -torch.log(Y_hat[range(Y_hat.shape[0]), y])

        # Save the mean loss for the current minibatch
        train_loss[i].append(torch.mean(l).item())

        # Compute the mean accuracy for the current minibatch and save it
        a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
        train_accuracy[i].append(a)

        # Compute the gradient on l with respect to the weights and biases
        # (sum and not mean as the gradients will be divided by the batch size during SGD)
        torch.sum(l).backward()

        # Disable gradient calculation so that the following operations are not differentiated
        with torch.no_grad():

            # Update the weights and biases using SGD
            # (use augmented assignments to avoid modifying existing variables)
            W1 -= learning_rate*W1.grad/len(l)
            b1 -= learning_rate*b1.grad/len(l)
            W0 -= learning_rate*W0.grad/len(l)
            b0 -= learning_rate*b0.grad/len(l)

            # Set the gradients to zeros to avoid accumulating gradients
            W1.grad.zero_()
            b1.grad.zero_()
            W0.grad.zero_()
            b0.grad.zero_()

    # Derive the mean train loss and accuracy for the current epoch
    train_loss[i] = sum(train_loss[i])/len(train_loss[i])
    train_accuracy[i] = sum(train_accuracy[i])/len(train_accuracy[i])

    # Compute the test outputs and derive the test accuracy for every epoch, in minibatches
    with torch.no_grad():
        for X, y in test_iter:
            H = torch.matmul(torch.reshape(X, (-1, input_size)), W0) + b0
            H[H<0] = 0
            O = torch.matmul(H, W1) + b1
            O_exp = torch.exp(O)
            Y_hat = O_exp/torch.sum(O_exp, 1, keepdim=True)
            a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
            test_accuracy[i].append(a)
        test_accuracy[i] = sum(test_accuracy[i])/len(test_accuracy[i])

    # Print the progress
    print(f'{i+1}/{number_epochs}: train_loss={train_loss[i]:.3f}; train_accuracy={train_accuracy[i]:.3f}; test_accuracy={test_accuracy[i]:.3f}')

# Show some predictions
for X, y in test_iter:
    break
number_examples = 10
H = torch.matmul(torch.reshape(X[:number_examples], (-1, input_size)), W0) + b0
H[H<0] = 0
O = torch.matmul(H, W1) + b1
O_exp = torch.exp(O)
Y_hat = O_exp/torch.sum(O_exp, 1, keepdim=True)
y_hat = torch.argmax(Y_hat, dim=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
    plt.subplot(1, number_examples, i+1)
    plt.imshow(X[i][0], cmap='binary')
    plt.title(f'True: {label_list[y[i].item()]}\n Pred: {label_list[y_hat[i]]}')
    plt.xticks([])
    plt.yticks([])
plt.show()
# ### 3.3. MLP using APIs in PyTorch

# In[1]:

import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils import data
import torchvision

# Get the dataset (transform the image data from PIL type to normalized 32-bit floating point tensors)
fmnist_train = torchvision.datasets.FashionMNIST(root='data', train=True, download=True,
                                                 transform=torchvision.transforms.ToTensor())
fmnist_test = torchvision.datasets.FashionMNIST(root='data', train=False, download=True,
                                                transform=torchvision.transforms.ToTensor())

# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1

# Use data iterators to read a minibatch at each iteration, shuffling the examples for the train set and using 4 processes
train_iter = data.DataLoader(fmnist_train, batch_size, shuffle=True, num_workers=4)
test_iter = data.DataLoader(fmnist_test, batch_size, shuffle=False, num_workers=4)

# Define the model, with a flatten layer to reshape the inputs, two fully-connected layers, and a ReLU in-between
input_size = fmnist_train[0][0].nelement()
hidden_size = 256
output_size = 10
model = nn.Sequential(nn.Flatten(), nn.Linear(input_size, hidden_size), nn.ReLU(), nn.Linear(hidden_size, output_size))

# Initialize the parameters by applying a function recursively to every submodule
def init(m):
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, std=0.01)
model.apply(init);

# Define the loss function (with no reduction applied to the output, no mean, no sum, none)
loss = nn.CrossEntropyLoss(reduction='none')

# Define the optimization algorithm
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Initialize lists for the mean train loss, train and test accuracy over the minibatches for every epoch
train_loss = [[] for _ in range(number_epochs)]
train_accuracy = [[] for _ in range(number_epochs)]
test_accuracy = [[] for _ in range(number_epochs)]

# Loop over the epochs
for i in range(number_epochs):

    # Loop over the train examples in minibatches
    for X, y in train_iter:

        # Compute the predicted outputs
        Y_hat = model(X)

        # Compute the loss
        l = loss(Y_hat, y)

        # Save the mean loss for the current minibatch
        train_loss[i].append(torch.mean(l).item())

        # Compute the mean accuracy for the current minibatch and save it
        a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
        train_accuracy[i].append(a)

        # Set the gradients to zero
        optimizer.zero_grad()

        # Compute the gradient
        l.mean().backward()

        # Performs a single parameter update
        optimizer.step()

    # Derive the mean train loss and accuracy for the current epoch
    train_loss[i] = sum(train_loss[i])/len(train_loss[i])
    train_accuracy[i] = sum(train_accuracy[i])/len(train_accuracy[i])

    # Compute the test outputs and derive the test accuracy for every epoch, in minibatches
    with torch.no_grad():
        for X, y in test_iter:
            Y_hat = model(X)
            a = torch.mean((torch.argmax(Y_hat, dim=1)==y)*1.0).item()
            test_accuracy[i].append(a)
        test_accuracy[i] = sum(test_accuracy[i])/len(test_accuracy[i])

    # Print the progress
    print(f'{i+1}/{number_epochs}: train_loss={train_loss[i]:.3f}; train_accuracy={train_accuracy[i]:.3f}; test_accuracy={test_accuracy[i]:.3f}')

# Show some predictions
for X, y in test_iter:
    break
number_examples = 10
Y_hat = model(X[:number_examples])
y_hat = torch.argmax(Y_hat, dim=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
    plt.subplot(1, number_examples, i+1)
    plt.imshow(X[i][0], cmap='binary')
    plt.title(f'True: {label_list[y[i].item()]}\n Pred: {label_list[y_hat[i]]}')
    plt.xticks([])
    plt.yticks([])
plt.show()
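# A minimal sketch (assuming the model defined above) of counting the trainable parameters of
# the MLP, which should match input_size*hidden_size + hidden_size + hidden_size*output_size + output_size.

# In[ ]:

number_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'number of trainable parameters: {number_parameters}')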
# ### 3.4. MLP using higher-level APIs in Keras

# In[4]:

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

# Get the train and test inputs and outputs
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
X_train = X_train/255
X_test = X_test/255
input_size = X_train[0, :, :].shape
hidden_size = 256
output_size = 10

# Define the parameters for the training
number_epochs = 10
batch_size = 256
learning_rate = 0.1

# Define a model with flattened inputs, a densely-connected NN layer with a ReLU, and another one without activation
model = tf.keras.Sequential([tf.keras.layers.Flatten(input_shape=input_size),
                             tf.keras.layers.Dense(hidden_size, activation='relu',
                                                   kernel_initializer=tf.initializers.RandomNormal(mean=0, stddev=0.01),
                                                   bias_initializer='zeros'),
                             tf.keras.layers.Dense(output_size, activation=None,
                                                   kernel_initializer=tf.initializers.RandomNormal(mean=0, stddev=0.01),
                                                   bias_initializer='zeros')])

# Configure the model with SGD optimizer, cross-entropy loss (with integers, not one-hot), and accuracy metrics
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Train the model
model.fit(x=X_train, y=y_train, batch_size=batch_size, epochs=number_epochs, verbose=1)

# Show some predictions
number_examples = 10
Y_hat = model.predict(X_test[:number_examples, :, :])
y_hat = np.argmax(Y_hat, axis=1)
label_list = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
plt.figure(figsize=(18, 2))
for i in range(number_examples):
    plt.subplot(1, number_examples, i+1)
    plt.imshow(X_test[i, :, :], cmap='binary')
    plt.title(f'True: {label_list[y_test[i]]}\n Pred: {label_list[y_hat[i]]}')
    plt.xticks([])
    plt.yticks([])
plt.show()


# In[ ]: