#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.datasets import load_digits
from sklearn.metrics import confusion_matrix

from seaborn import despine
import seaborn as sns
sns.set_style("ticks")
sns.set_context("talk")

from IPython.display import Image

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# # 0. Quick overview

# In this notebook you will learn how to combine multiple Perceptrons into an **artificial neural network** and how to use the **backpropagation** algorithm to compute the partial derivatives needed to train an artificial neural network with gradient descent.

# # 1. What if the data is not linearly separable?

# In the previous notebook ([0-Perceptron-Gradient-Descent.ipynb](./0-Perceptron-Gradient-Descent.ipynb)), we learned that we can use the Perceptron algorithm to build a binary classifier for linearly separable data (by learning a hyperplane).
# 
# Yet, not all data is linearly separable. Take a look at the figure below and try to find a straight line that separates the blue and red dots:

# In[2]:


np.random.seed(4128)

n_samples = 50

# generate random data
X1 = []
X2 = []
for sample in range(n_samples):
    # class 1
    if np.random.uniform(0, 1, 1) > 0.5:
        X1.append(np.array([np.random.normal(-1, 0.2, 1),
                            np.random.normal(1, 0.2, 1)]).reshape(1, -1))
    else:
        X1.append(np.array([np.random.normal(1, 0.2, 1),
                            np.random.normal(-1, 0.2, 1)]).reshape(1, -1))
    # class 2
    if np.random.uniform(0, 1, 1) > 0.5:
        X2.append(np.array([np.random.normal(-1, 0.2, 1),
                            np.random.normal(-1, 0.2, 1)]).reshape(1, -1))
    else:
        X2.append(np.array([np.random.normal(1, 0.2, 1),
                            np.random.normal(1, 0.2, 1)]).reshape(1, -1))
X = np.concatenate([np.concatenate(X1), np.concatenate(X2)])
y = np.zeros((X.shape[0], 1))
y[n_samples:] = 1
y = y.astype(int)

# plot the training data
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(X[(y == 0).ravel(), 0], X[(y == 0).ravel(), 1], c='b')
ax.scatter(X[(y == 1).ravel(), 0], X[(y == 1).ravel(), 1], c='r')
ax.set_xlabel(r'$x_1$')
ax.set_ylabel(r'$x_2$')
ax.set_xlim(-2, 2)
ax.set_ylim(-2, 2)
despine(ax=ax)
fig.tight_layout()

# save
fig.savefig('figures/Figure-1-0_Not-Linearly-Separable.png', dpi=600)


# Mh. I don't think it is possible to separate the blue and red points with a single straight line.
# 
# Yet, the data would be easily separable if we could use two decision boundaries (or one that is curved).
# 
# So how do we train a classifier for this problem?

# # 2. XOR with multiple Perceptrons

# Couldn't we just combine multiple Perceptrons to solve this classification problem?
# 
# We could train one Perceptron to distinguish the red point cloud in the lower left from all others and another Perceptron to distinguish the red point cloud in the top right from all others. A third Perceptron could then be trained on the predictions of the other two: if either of the first two Perceptrons predicts that a data point belongs to its target class (so if either predicts $y=1$), the third Perceptron also predicts $y=1$.
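# Before wiring up actual Perceptrons, here is a quick sanity check (a minimal sketch using the `X` and `y` generated above) that the red class can indeed be written as the logical OR of the two sub-problems just described:

# In[ ]:


# sanity check (sketch): the red class should equal the logical OR of the two
# sub-problems that Perceptron 1 and Perceptron 2 are supposed to solve
lower_left = np.logical_and(X[:, 0] < 0, X[:, 1] < 0)    # target of the first Perceptron
upper_right = np.logical_and(X[:, 0] > 0, X[:, 1] > 0)   # target of the second Perceptron
combined = np.logical_or(lower_left, upper_right).astype(int)
# this should print True for the random seed used above (in principle, a sampled
# point could land on the "wrong" side of an axis)
print(np.all(combined == y.ravel()))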
# In[3]:


# setup figure
fig, axs = plt.subplots(1, 3, figsize=(18, 6))

# Perceptron 1
axs[0].set_title('Perceptron 1:\n'+'y=1, if '+r'$x_1<0$'+' and '+r'$x_2<0$')
idx1 = np.logical_and(X[:, 0] < 0, X[:, 1] < 0)
axs[0].scatter(X[idx1, 0], X[idx1, 1], color='red')
axs[0].scatter(X[~idx1, 0], X[~idx1, 1], color='blue')

# Perceptron 2
axs[1].set_title('Perceptron 2:\n'+'y=1, if '+r'$x_1>0$'+' and '+r'$x_2>0$')
idx2 = np.logical_and(X[:, 0] > 0, X[:, 1] > 0)
axs[1].scatter(X[idx2, 0], X[idx2, 1], color='red')
axs[1].scatter(X[~idx2, 0], X[~idx2, 1], color='blue')

# Perceptron 3
axs[2].set_title('Perceptron 3:\n'+'y=1, if Perceptr. 1 or Perceptr. 2')
idx3 = np.logical_or(idx1, idx2)
axs[2].scatter(X[idx3, 0], X[idx3, 1], color='red')
axs[2].scatter(X[~idx3, 0], X[~idx3, 1], color='blue')

# label axes
for ax in axs:
    despine(ax=ax)
    ax.set_xlabel(r'$x_1$')
    ax.set_ylabel(r'$x_2$')
fig.tight_layout()

# save
fig.savefig('figures/Figure-1-1_Multi-Perceptron-Classification.png', dpi=600)


# This is a cool idea; let's see if we can implement it.
# 
# To make the computational setup clearer, I have also created a simple sketch of how the three Perceptrons would work together:

# In[4]:


Image(filename='materials/images/free-use/Multilayer-Perceptron.png')


# To set this up, we will use a Perceptron implementation similar to the one that we used in the [0-Perceptron-Gradient-Descent.ipynb](./0-Perceptron-Gradient-Descent.ipynb) notebook:

# In[5]:


# our activation function:
def sigmoid(x):
    """The sigmoid function."""
    return 1.0/(1.0 + np.exp(-x))


# In[6]:


# our loss:
class cross_entropy_loss:

    def __init__(self):
        self.name = 'cross-entropy'

    def loss(self, y, y_pred, zerotol=1e-10):
        """The binary cross-entropy loss.

        Args:
            y (array): labels for each instance (0 or 1)
            y_pred (array): predicted probability that each instance belongs to class 1
        """
        loss = -(y * np.log(y_pred + zerotol) + (1 - y) * np.log(1 - y_pred + zerotol))
        return loss

    def derivative_loss(self, y, y_pred):
        """The derivative of the cross-entropy loss w.r.t. the sigmoid activation.

        Args:
            y (array): labels for each instance (0 or 1)
            y_pred (array): predicted probability that each instance belongs to class 1
        """
        return y_pred - y


# In[7]:


# the Perceptron:
class Perceptron:

    def __init__(self, n_in, activation=sigmoid, loss=cross_entropy_loss, b=None):
        """A simple Perceptron implementation.

        Args:
            n_in (int): number of input features for each instance
            activation (function): activation function of the Perceptron;
                takes individual values or an array as input
            loss (class): loss function to use; the class is expected to include two
                functions: loss and derivative_loss (the derivative of the loss w.r.t.
                the output activation); see the cross_entropy_loss class above as an example
            b (float): bias term; if a value is specified, the bias term is fixed to this
                value. If not, the bias will be estimated during training.
        """
        self.n_in = n_in
        self.w = np.random.uniform(-1, 1, n_in)
        if b is None:
            self.b = np.random.uniform(-1, 1, 1)
            self.fit_b = True
        else:
            self.b = b
            self.fit_b = False
        self.activation = activation
        self.loss = loss().loss
        self.derivative_loss = loss().derivative_loss

    def predict(self, x):
        """Predict the probability that each instance of x (with shape
        n_instances x n_features) belongs to class 1.

        Args:
            x (ndarray): input data (n_instances x n_features)
        """
        self.Z = np.dot(x, self.w) + self.b
        self.A = self.activation(self.Z)
        return self.A

    def gradient_descent_step(self, x, y, learning_rate):
        """A single gradient descent step.

        Args:
            x (ndarray): input data (n_instances x n_features)
            y (array): label of each instance (0 or 1)
            learning_rate (float): learning rate of the gradient descent algorithm
        """
        # compute derivative of loss w.r.t. Z
        dZ = self.derivative_loss(y, self.predict(x))
        dW = np.dot(dZ, x)
        # subtract the average derivative from the weights
        # (note: we average over the number of instances in the batch)
        self.w -= learning_rate * 1.0/x.shape[0] * dW
        if self.fit_b:
            self.b -= learning_rate * (1.0/x.shape[0] * np.sum(dZ))

    def train(self, x, y, batch_size=8, learning_rate=1, n_steps=100):
        """Iteratively train the Perceptron.

        At each iteration, the algorithm draws a random sample from x (with
        batch_size samples) and uses this sample to perform a gradient descent step.

        Args:
            x (ndarray): input data (n_instances x n_features)
            y (array): label of each instance (0 or 1)
            batch_size (int): number of random samples drawn at each iteration
            learning_rate (float): learning rate of the gradient descent algorithm
            n_steps (int): number of iterations to perform during training
        """
        self.training_w = np.zeros((n_steps, self.n_in + 1))
        self.training_loss = np.zeros(n_steps)
        for s in tqdm(range(n_steps)):
            # draw a random batch (note: we index the x passed to train, not the global X)
            batch_idx = np.random.choice(x.shape[0], batch_size, replace=False)
            # compute and store mean loss
            self.training_loss[s] = np.mean(self.loss(y[batch_idx], self.predict(x[batch_idx])))
            # store current weights
            self.training_w[s, :self.n_in] = self.w
            self.training_w[s, -1] = self.b
            # perform gradient descent step
            self.gradient_descent_step(x[batch_idx], y[batch_idx], learning_rate)


# Let's train the first Perceptron to distinguish the red point cloud in the lower left from all others (so $y=1$, if $x_1 < 0$ and $x_2 < 0$):

# In[8]:


# create new labels
idx1 = np.logical_and(X[:, 0] < 0, X[:, 1] < 0)
y1 = idx1.astype(int)

# initialize instance of Perceptron
np.random.seed(213)
p1 = Perceptron(n_in=2)

# train
p1.train(X, y1)

# predict
p1_pred = p1.predict(X)

# compute predictive accuracy
acc_p1 = np.mean((p1_pred > 0.5) == y1)
print('Prediction accuracy: {}%'.format(acc_p1 * 100))


# Ok, that worked!
# 
# Now let's do the same for the second Perceptron, which is supposed to distinguish the red point cloud in the top right from all others (so $y=1$, if $x_1 > 0$ and $x_2 > 0$):

# In[9]:


# create new labels
idx2 = np.logical_and(X[:, 0] > 0, X[:, 1] > 0)
y2 = idx2.astype(int)

# initialize instance of Perceptron
np.random.seed(4543)
p2 = Perceptron(n_in=2)

# train
p2.train(X, y2)

# predict
p2_pred = p2.predict(X)

# compute predictive accuracy
acc_p2 = np.mean((p2_pred > 0.5) == y2)
print('Prediction accuracy: {}%'.format(acc_p2 * 100))


# Great! This also worked.
# 
# Now, let's focus on the final step: training a third Perceptron on the predictions of the first and second Perceptron.
# 
# To do this, we first standardize the predictions of the first two Perceptrons (to have a mean of 0 and a standard deviation of 1); this helps the third Perceptron learn (and mimics [batch normalization](https://en.wikipedia.org/wiki/Batch_normalization)):

# In[10]:


# standardize predictions
p1_pred = (p1_pred - np.mean(p1_pred, axis=0)) / np.std(p1_pred, axis=0)
p2_pred = (p2_pred - np.mean(p2_pred, axis=0)) / np.std(p2_pred, axis=0)


# Then, we combine the standardized predictions into a new dataset `X3`:

# In[11]:


X3 = np.concatenate([p1_pred.reshape(-1, 1), p2_pred.reshape(-1, 1)], axis=1)
print('Mean axis 0: {}'.format(X3.mean(0)))
print('Std axis 0: {}'.format(X3.std(0)))


# As we can see, the mean and standard deviation of the two feature columns are now close to 0 and 1.
# 
# Remember, the target (`y3`) of our third Perceptron equals 1 if the target of the first or second Perceptron is 1:

# In[12]:


y3 = np.logical_or(y1 == 1, y2 == 1).astype(int)


# We can now train our last Perceptron on this new dataset (`X3`, `y3`):

# In[13]:


# initialize instance of Perceptron
np.random.seed(1412)
p3 = Perceptron(n_in=2)

# train
p3.train(X3, y3, n_steps=20000, learning_rate=1)

# predict
p3_pred = p3.predict(X3)

# compute predictive accuracy
acc_p3 = np.mean((p3_pred > 0.5) == y3)
print('Prediction accuracy: {}%'.format(acc_p3 * 100))


# Yay, it worked!
# 
# Let's take a quick look at the decision boundaries that each of the Perceptrons has learned:

# In[14]:


# setup a meshgrid (each x1 and x2 coordinate for which we want to predict a probability)
xx1, xx2 = np.meshgrid(np.linspace(-2, 2, 100), np.linspace(-2, 2, 100))

# predict with Perceptron 1 & 2
zz1 = p1.predict(np.c_[xx1.ravel(), xx2.ravel()])
zz2 = p2.predict(np.c_[xx1.ravel(), xx2.ravel()])

# standardize
zz1_stand = (zz1 - np.mean(zz1, axis=0)) / np.std(zz1, axis=0)
zz2_stand = (zz2 - np.mean(zz2, axis=0)) / np.std(zz2, axis=0)

# predict with Perceptron 3, based on the predictions of Perceptrons 1 & 2
zz3 = p3.predict(np.c_[zz1_stand.ravel(), zz2_stand.ravel()])

# plot
fig, axs = plt.subplots(1, 3, figsize=(18, 7))
for i, (y_i, zz) in enumerate(zip([y1, y2, y3], [zz1, zz2, zz3])):  # use y_i to avoid overwriting the global y
    ax = axs[i]
    cs = ax.contourf(xx1, xx2, zz.reshape(xx1.shape))
    cbar = fig.colorbar(cs, ax=ax, shrink=0.9, orientation="horizontal")
    cbar.set_label('P(is red)')
    cbar.set_ticks([0, 0.25, 0.5, 0.75, 1])
    for j in range(y_i.shape[0]):
        if y_i[j] == 0:
            marker = 'bo'
        else:
            marker = 'ro'
        ax.plot(X[j][0], X[j][1], marker)
    ax.set_xlabel(r'$x_1$')
    ax.set_ylabel(r'$x_2$')
    ax.set_title('Decision boundary:\nPerceptron {}'.format(i + 1))
fig.tight_layout(w_pad=4)

# save
fig.savefig('figures/Figure-1-2_Multi-Perceptron-Decision-Boundaries.png', dpi=600)


# Super cool! By combining three individual Perceptrons, we have managed to learn a decision boundary for data that is not linearly separable!
# 
# **This idea of sequentially combining multiple simple, but non-linear, functions into a single classifier (or network) forms the basis of deep learning!**
# 
# Below, we will explore this idea further.

# # 3. Artificial neural networks

# A network of multiple Perceptrons (or neurons) that are sequentially combined to form a single predictive system is often called an "artificial neural network" (or ANN). Note that the individual *Perceptrons* of an artificial neural network are often also called *neurons* (in reference to a biological neural network).
# 
# A standard feedforward (or dense) neural network consists of the following elements (also see the figure below):
# 
# - An *input* layer, representing an instance of the input data
# 
# - One or multiple *hidden* layer(s), each containing a set of neurons
# 
# - An *output* layer, containing one output neuron for each target class in the dataset
# 
# In the illustration below, the input data is an image of a single handwritten digit. This image has 8x8 pixels; to serve it to our ANN, we simply flatten it into a single vector of 64 values. The input layer therefore has 64 input neurons (each representing one of the 64 input features).
# 
# As there are 10 unique handwritten digits (0 to 9), we further set the output layer of our ANN to contain 10 neurons (one for each digit). Each output neuron describes the probability that the image depicts a digit of its class.
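# As a quick check of these numbers, here is a minimal sketch that only inspects the shapes of the scikit-learn digits dataset (we will load and use it properly in section 7):

# In[ ]:


# sketch: shape bookkeeping for the digits example described above
digits = load_digits()
print(digits['images'][0].shape)              # a single 8 x 8 pixel image
print(digits['images'][0].reshape(-1).shape)  # flattened to a vector of 64 values -> 64 input neurons
print(np.unique(digits['target']).size)       # 10 digit classes -> 10 output neurons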
# In[15]:


Image(filename='materials/images/free-use/Artificial-Neural-Network.png')


# In contrast to our previous example (in which we separately trained three individual Perceptrons), an ANN uses linear algebra wizardry to perform all computations in one go.
# 
# Specifically, an ANN represents all neurons of a layer as a single matrix:

# In[16]:


Image(filename='materials/images/free-use/ANN-Forward-Pass.png')


# The input layer is represented by the data matrix $X$, containing one data sample (or instance) in each row and one column for each feature of the data (these features are often also called input neurons).
# 
# The hidden layer is represented by a weight matrix $W_1$: it contains one row for each feature of the data (or neuron of the previous layer) and one column for each of its own neurons.
# 
# Similarly, the output layer is also represented by a weight matrix $W_{out}$, containing one row for each neuron of the previous hidden layer and one column for each target class in the data (each representing one output neuron).
# 
# Importantly, hidden layers (as well as the output layer) also contain a vector of bias terms $b$ (with one value for each neuron of the layer).
# 
# Let's quickly settle on some notation that we will use throughout the rest of this notebook:
# 
# - $X$: the input data
# - $Z = X \cdot W + b$: the output of a layer (the input to the layer multiplied by the layer's weights, plus the bias terms)
# - $A = \phi(Z)$: the activation of a layer (the application of the layer's activation function $\phi$ to its output $Z$; e.g., the sigmoid function in our previous example)

# To get a better feel for how the individual computations work, let's quickly play it through with random weights:

# In[17]:


# we will use our previous X, containing 100 instances with two values per instance:
print(X.shape)


# Let's say our hidden layer has 10 neurons; the weight matrix would then be of shape (2, 10):

# In[18]:


np.random.seed(1342)

# weights and bias of hidden layer:
W_hidden = np.random.normal(size=(2, 10))
b_hidden = np.random.normal(size=(1, 10))  # one bias for each neuron


# We have two output classes (blue and red); we can model these two classes with a single output neuron that is activated through the sigmoid function (if its output is > 0.5, we predict $y = 1$). The output layer therefore has only 1 neuron, with one weight for each neuron of the previous hidden layer (resulting in a shape of (10, 1)):

# In[19]:


np.random.seed(473)

# weights and bias of output layer:
W_output = np.random.normal(size=(10, 1))
b_output = np.random.normal(size=(1, 1))  # one bias for each neuron


# OK, let's put this all together:

# In[20]:


Z_hidden = X.dot(W_hidden) + b_hidden
A_hidden = sigmoid(Z_hidden)  # here we use the sigmoid as our hidden layer activation function
A_hidden.shape


# As we can see, the output of our hidden layer contains 100 entries with 10 values each (each value representing the activation of one of our 10 hidden layer neurons).

# In[21]:


Z_output = A_hidden.dot(W_output) + b_output
A_output = sigmoid(Z_output)
A_output.shape


# Similarly, the output of our random ANN contains 100 entries with one value each, representing the probability that each entry (or instance) belongs to class 1 of our data.
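# The same two-layer computation can also be written as a loop over (weight, bias) pairs. The helper below is only a sketch (reusing `W_hidden`, `b_hidden`, `W_output`, and `b_output` from above), but it anticipates the `forward` method of the network class we implement in section 5:

# In[ ]:


def forward_pass(X, layers, activation=sigmoid):
    """Sketch: repeatedly apply Z = A_prev . W + b and A = phi(Z) for each (W, b) pair."""
    A = X
    for W, b in layers:
        Z = A.dot(W) + b
        A = activation(Z)
    return A

A_loop = forward_pass(X, [(W_hidden, b_hidden), (W_output, b_output)])
print(np.allclose(A_loop, A_output))  # should be True: same computation as the two steps above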
# In[22]:


fig, ax = plt.subplots(1, 1, figsize=(6, 6), dpi=50)
ax.hist(A_output, np.linspace(0, 1, 20))
ax.set_xlabel(r'$A_{out}$')
ax.set_ylabel('Frequency')
plt.axvline(0.5, color='r', ls='--')  # draw classification threshold
despine(ax=ax)


# As our weights were set randomly, the ANN wrongly "predicts" all instances to have class 0 (all probabilities are < 0.5).

# # 4. Backpropagation

# The goal of the [backpropagation](https://en.wikipedia.org/wiki/Backpropagation) algorithm is to enable the application of the gradient descent algorithm to an ANN, by computing the partial derivative of the loss w.r.t. each weight of each neuron of the ANN.
# 
# Each partial derivative indicates how much the loss function (at its current value) changes with a change in its respective weight. Conceptually, these partial derivatives indicate how much each weight contributed to the current value of the loss function (or the current "error"); they are therefore also often referred to as "errors".
# 
# To compute all partial derivatives, the backpropagation algorithm proceeds as follows (also see the figure below for a general overview):
# 
# 1. In a first step, the backpropagation algorithm looks at the prediction $A_{out}$ of the ANN (resulting from a forward pass) and computes the partial derivative of the loss function $L$ w.r.t. the prediction (we will refer to this partial derivative as $\frac{dL}{dA_{out}}$; it indicates how much the loss function changes with a change in the predictions $A_{out}$).
# 
# 
# 2. Subsequently, the backpropagation algorithm looks at the output $Z_{out}$ of the ANN's output layer and estimates how much the prediction $A_{out}$ changes with $Z_{out}$ (by computing the partial derivative $\frac{dA_{out}}{dZ_{out}}$). With this partial derivative, the algorithm can then also obtain the partial derivative of the loss function $L$ w.r.t. the output $Z_{out}$. To do this, it uses the [chain rule](https://en.wikipedia.org/wiki/Chain_rule) and simply multiplies the previously obtained partial derivative $\frac{dL}{dA_{out}}$ with the partial derivative $\frac{dA_{out}}{dZ_{out}}$: $\frac{dL}{dZ_{out}} = \frac{dA_{out}}{dZ_{out}} \times \frac{dL}{dA_{out}}$
# 
# Note that for the combination of a sigmoid (or softmax) activation function and the cross-entropy loss, this partial derivative reduces to $\frac{dL}{dZ_{out}} = A_{out} - y$; it is hence often referred to as the "error signal".
# 
# 
# 3. Then, the backpropagation algorithm goes back through every hidden layer of the ANN and estimates how much the output $Z_l$ of each layer $l$ has contributed to the current value of the loss function (by computing the partial derivative $\frac{dL}{dZ_l}$ for each layer). With the chain rule, this is simply: $\frac{dL}{dZ_{l-1}} = \frac{dA_{l-1}}{dZ_{l-1}} \times \frac{dZ_{l}}{dA_{l-1}} \times \frac{dL}{dZ_l}$
# 
# 
# 4. Lastly, the backpropagation algorithm computes the partial derivatives $\frac{dL}{dW_l}$ from the previously obtained partial derivatives $\frac{dL}{dZ_l}$: $\frac{dL}{dW_{l}} = \frac{dZ_l}{dW_l} \times \frac{dL}{dZ_l}$ (indicating how much the loss function $L$ changes w.r.t. the weights $W_l$ of layer $l$). In typical gradient descent fashion, we then use these partial derivatives to update our ANN weights: $W_l \leftarrow W_l - \alpha \times \frac{dL}{dW_{l}}$ (with $\alpha$ again indicating the learning rate).
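# To make these four steps concrete, here is a minimal numpy sketch of a single backward pass for the one-hidden-layer network with random weights from above (assuming sigmoid activations and the cross-entropy loss, and averaging the gradients over all 100 instances):

# In[ ]:


m = X.shape[0]  # number of instances

# steps 1 & 2: for a sigmoid (or softmax) output with cross-entropy loss,
# the error signal reduces to dL/dZ_out = A_out - y
dZ_out = A_output - y                                   # shape (100, 1)

# step 4 for the output layer: dL/dW_out = A_hidden^T . dZ_out, averaged over the batch
dW_out = A_hidden.T.dot(dZ_out) / m                     # shape (10, 1)
db_out = dZ_out.sum(axis=0, keepdims=True) / m          # shape (1, 1)

# step 3: propagate the error back to the hidden layer via the chain rule;
# for the sigmoid, dA/dZ = A * (1 - A)
dA_hidden = dZ_out.dot(W_output.T)                      # shape (100, 10)
dZ_hidden = dA_hidden * A_hidden * (1 - A_hidden)       # shape (100, 10)

# step 4 for the hidden layer
dW_hidden = X.T.dot(dZ_hidden) / m                      # shape (2, 10)
db_hidden = dZ_hidden.sum(axis=0, keepdims=True) / m    # shape (1, 10)

# a gradient descent update with learning rate alpha would then be, e.g.:
# W_output -= alpha * dW_out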
# For more details on the mechanics of the backpropagation algorithm, please see:
# - https://www.youtube.com/watch?v=Ilg3gGewQ5U&t=387s
# - https://www.youtube.com/watch?v=tIeHLnjs5U8&t=353s
# - https://www.youtube.com/watch?v=yXcQ4B-YSjQ&t=711s

# In[23]:


Image(filename='materials/images/free-use/Backpropagation.png')


# # 5. Our implementation

# OK, let's put all of this together into an implementation of an ANN:

# In[24]:


# the sigmoid activation function
class sigmoid():
    """The sigmoid function."""

    def __init__(self):
        self.name = 'sigmoid'

    def forward(self, x):
        """Forward pass with the sigmoid function."""
        return 1./(1. + np.exp(-x))

    def derivative(self, a):
        """The derivative phi'(a) of the sigmoid function, expressed in terms of its activation a."""
        return a * (1 - a)


# In[25]:


# the neural network
class NeuralNetwork:
    """A simple neural network implementation.

    Args:
        n_in (int): number of input neurons
        n_out (int): number of output neurons
        n_hidden (array of ints): number of neurons for each hidden layer;
            an array of [10, 10] would create two hidden layers with 10 neurons each
        activations (array of classes): activation for each hidden layer as well as the
            output layer; this should contain one activation function for each of the
            hidden layers (as specified by n_hidden) plus one for the output layer
            (the last entry of the array). If None, all activations are set to the
            sigmoid function. Importantly, each activation needs to be a class with
            two functions for the forward pass and the derivative w.r.t. its
            activations (see the sigmoid implementation above)
        seed (int): random seed to allow for reproducibility
    """

    def __init__(self, n_in, n_out, n_hidden=[10], activations=None, seed=123):
        # init
        self.n_in = n_in
        self.n_out = n_out
        self.n_hidden = np.append(np.array(n_hidden), n_out).astype(int)  # add output layer
        self.n_layers = self.n_hidden.size
        self.training_loss = []  # we will use this to collect the loss values during gradient descent
        self.seed = seed

        # initialize weights & biases
        self.parms = {}  # a dictionary containing all network parameters
        np.random.seed(self.seed)
        n_in = int(self.n_in)
        for layer, n_out in enumerate(self.n_hidden):
            self.parms["W{}".format(layer)] = self._initialize_weight([n_in, n_out])
            self.parms["b{}".format(layer)] = self._initialize_weight([1, n_out])
            # also create empty storage for neuron activations
            self.parms["A{}".format(layer)] = np.zeros((n_in, n_out))
            n_in = int(n_out)

        # set layer activations
        if activations is None:
            # if no activations are given, use sigmoid
            self.activations = [sigmoid()] * self.n_layers
        else:
            if len(activations) != self.n_layers:
                raise ValueError('/!\\ Number of activations does not match number of layers.')
            self.activations = [a() for a in activations]

    def _initialize_weight(self, size, bound=1):
        """Draw weights randomly from a uniform distribution between -bound and bound."""
        return np.random.uniform(-bound, bound, size=size)

    def forward(self, X):
        """Forward pass of the neural network, given a dataset X.

        Args:
            X (ndarray): input data with one instance per row and one column for each feature

        Returns:
            predicted probability (n_samples x n_output_neurons)
        """
        self.parms['X'] = X
        for layer in range(self.n_layers):
            Z = X.dot(self.parms['W{}'.format(layer)]) + self.parms['b{}'.format(layer)]
            A = self.activations[layer].forward(Z)
            X = A
            self.parms["A{}".format(layer)] = A
        self.y_pred = A
        return A

    def predict(self, X):
        """Predict classes for each instance of an input dataset X.

        Args:
            X (ndarray): input data with one instance per row and one column for each feature

        Returns:
            predicted class for each instance
        """
        y_pred = self.forward(X)
        if self.n_out == 1:
            return np.array(y_pred > 0.5).astype(int).ravel()
        else:
            return np.argmax(y_pred, axis=1).ravel()

    def loss(self, X, y, zerotol=1e-10):
        """Compute the mean cross-entropy loss over the samples of an input dataset (X)
        and a set of corresponding class labels (y).

        Args:
            X (ndarray): input data with one instance per row and one column for each feature
            y (ndarray): class label for each instance. Importantly, if n_out > 1, y needs
                to be one-hot encoded, such that it is of shape (n_samples x n_out)

        Returns:
            Average cross-entropy loss
        """
        if self.activations[-1].name not in ['sigmoid', 'softmax']:
            raise ValueError('loss function only valid for sigmoid or softmax output activations.')
        y_pred = self.forward(X)
        # binary cross-entropy loss
        if self.n_out == 1:
            loss = -y * np.log(y_pred + zerotol)
            loss -= (1 - y) * np.log(1 - y_pred + zerotol)
        # multi-class extension
        else:
            loss = -np.sum(y * np.log(y_pred + zerotol), axis=1)
        return 1.0/X.shape[0] * np.sum(loss, axis=0)

    def derivative_loss(self, x, y):
        """Derivative of the cross-entropy loss w.r.t. the output-layer activation."""
        if self.activations[-1].name not in ['sigmoid', 'softmax']:
            raise ValueError('derivative of loss only valid for sigmoid / softmax output activations.')
        y_pred = self.forward(x)
        return y_pred - y

    def backward(self, X, y, lr=0.1):
        """Compute the gradient descent backward pass, updating all network weights,
        given an input dataset (X) and a set of corresponding class labels (y).

        Args:
            X (ndarray): input data with one instance per row and one column for each feature
            y (ndarray): class label for each instance. Importantly, if n_out > 1, y needs
                to be one-hot encoded, such that it is of shape (n_samples x n_out)
            lr (float): learning rate for the weight update
        """
        y_pred = self.forward(X)
        self.training_loss.append(self.loss(X, y))
        dZ = self.derivative_loss(X, y)
        for layer in range(1, self.n_layers)[::-1]:  # iterate hidden layers backward
            # dL/dW and dL/db are computed as averages over the batch;
            # dL/dW is obtained by multiplying dZ with the input to the layer
            # (here, A from the next lower layer)
            dW = 1.0/dZ.shape[0] * self.parms["A{}".format(layer-1)].T.dot(dZ)
            db = 1.0/dZ.shape[0] * np.sum(dZ, axis=0, keepdims=True)
            # compute dZ for the next lower layer before updating this layer's weights
            dA = dZ.dot(self.parms["W{}".format(layer)].T)
            dZ_lower = dA * self.activations[layer-1].derivative(self.parms["A{}".format(layer-1)])
            # update this layer's parameters
            self.parms["W{}".format(layer)] -= lr * dW
            self.parms["b{}".format(layer)] -= lr * db
            dZ = dZ_lower
        # input layer
        dW = 1.0/dZ.shape[0] * X.T.dot(dZ)
        db = 1.0/dZ.shape[0] * np.sum(dZ, axis=0, keepdims=True)
        self.parms["W0"] -= lr * dW
        self.parms["b0"] -= lr * db

    def fit(self, X, y, lr=0.1, n_steps=10000, batch_size=32, verbose=False):
        """Fit the neural network to an input dataset.

        Args:
            X (ndarray): input data with one instance per row and one column for each feature
            y (ndarray): class label for each instance.
                Importantly, if n_out > 1, y needs to be one-hot encoded,
                such that it is of shape (n_samples x n_out)
            lr (float): learning rate for the weight update
            n_steps (int): number of gradient descent steps to perform
            batch_size (int): number of random samples drawn from the dataset at each
                gradient descent iteration
            verbose (bool): whether or not to print an update about the state of the training
        """
        # make sure y has the right shape
        if self.n_out == 1 and np.ndim(y) < 2:
            y = y.reshape(-1, 1)
        assert np.ndim(y) == 2, '/!\\ y needs to be 2-dimensional'

        # make sure data and model match
        if y.shape[1] > 1:
            if np.unique(y.argmax(axis=1)).size != self.n_out:
                raise ValueError('/!\\ n_out does not match number of labels in y (dim-1)')

        if verbose:
            print('Beginning training for {} batches ({} samples/batch):'.format(n_steps, batch_size))

        self.lr = lr
        iterator = range(n_steps)
        for step in (tqdm(iterator) if verbose else iterator):
            batch_i = np.random.choice(X.shape[0], batch_size, replace=False)
            self.forward(X[batch_i])
            self.backward(X[batch_i], y[batch_i], lr=self.lr)

    def plot_training_stats(self, X, y, train_idx, test_idx, target_names=None):
        """Plot the training loss as well as a confusion matrix for the training and test datasets.

        Args:
            X (ndarray): input data with one instance per row and one column for each feature
            y (array): class label for each instance
            train_idx (array): index indicating the position of the training instances in X and y
            test_idx (array): index indicating the position of the test instances in X and y
            target_names (array of strings): names of the target classes in y

        Returns:
            Matplotlib figure and axes
        """
        fig, axs = plt.subplots(1, 3, figsize=(20, 6))

        # plot training loss
        axs[0].set_title('Training loss')
        axs[0].set_ylabel('Loss')
        axs[0].set_xlabel('Training step')
        axs[0].plot(self.training_loss, color='k')
        despine(ax=axs[0])

        # plot confusion matrix for training and test data
        for i, (label, idx) in enumerate(zip(['Training', 'Test'], [train_idx, test_idx])):
            y_pred = self.predict(X[idx])
            acc = np.mean(y_pred == y[idx])
            axs[1+i].set_title('{} data\nMean Acc.: {}%'.format(label, np.round(acc*100, 2)))
            conf_mat = confusion_matrix(y[idx], y_pred, normalize='true')
            sns.heatmap(conf_mat, annot=True, ax=axs[1+i], vmin=0, vmax=1,
                        annot_kws={'fontsize': 100./self.n_out})  # this might be too large for few classes
            if target_names is not None:
                axs[1+i].set_xticklabels(target_names)
                axs[1+i].set_yticklabels(target_names)
        fig.tight_layout()

        return fig, axs


# # 6. What happens if we change the hidden layer activation function?

# Let's see if our implementation actually works and is able to distinguish between the red and blue points in our previous example.
# 
# While we are at it, let's also make this a bit more interesting by changing the activation function of the hidden layer.
# 
# For this, I have prepared three common activation functions (including their derivatives; see below):
# - the [rectified linear unit (or ReLU)](https://en.wikipedia.org/wiki/Rectifier_(neural_networks)),
# - the [hyperbolic tangent (or tanh)](https://mathworld.wolfram.com/HyperbolicTangent.html), and
# - the [softmax](https://en.wikipedia.org/wiki/Softmax_function).
# 
# Note that the softmax function is typically used as the output activation for classification problems with more than two mutually exclusive classes (in which each sample can only belong to one of the classes).
# In[26]:


# three common activation functions:

class relu():

    def __init__(self):
        self.name = 'relu'

    def forward(self, x):
        return np.maximum(x, 0)

    def derivative(self, a):
        d = np.zeros_like(a)
        d[a < 0] = 0
        d[a > 0] = 1
        return d


class tanh():

    def __init__(self):
        self.name = 'tanh'

    def forward(self, x):
        # equivalent to (exp(x) - exp(-x)) / (exp(x) + exp(-x)), but numerically stable
        return np.tanh(x)

    def derivative(self, a):
        return 1 - a**2


class softmax():

    def __init__(self):
        self.name = 'softmax'

    def forward(self, x):
        # subtracting the row-wise maximum keeps the exponentials from overflowing
        e_x = np.exp(x - x.max(axis=1, keepdims=True))
        return e_x / e_x.sum(axis=1, keepdims=True)

    def derivative(self, a):
        # note: this Jacobian-based form assumes a single sample; in this notebook the
        # softmax is only used as output activation, where dL/dZ = A - y is used instead
        SM = a.reshape((-1, 1))
        return (np.diagflat(a) - np.dot(SM, SM.T)).sum(axis=1)


# In this example, we will use one output neuron for each of the two classes of our example dataset.
# 
# We therefore set `n_out = 2` in our specification of the ANN.
# 
# By doing so, we also need to change the encoding of our target (y), so that it has one value for each of the two output neurons. This type of encoding is common in machine learning and is called one-hot encoding. It looks like this:
# 
# if y = 0: [1, 0], if y = 1: [0, 1]
# 
# Thereby, `y_onehot.argmax(axis=1)` recovers our original y.

# In[27]:


y_onehot = np.zeros((y.size, 2))
y_onehot[np.arange(y.size), y.ravel()] = 1
print('This is how one-hot encoding looks for class 0:')
print(y_onehot[0])
print('And this is how it looks for class 1:')
print(y_onehot[n_samples])
print('\nOverall, y now has dimension: {}x{}'.format(*y_onehot.shape))


# As the two classes (red and blue) are mutually exclusive, we further use the softmax activation function for the output layer:

# In[28]:


# set a random seed
seed = 159
np.random.seed(seed)

# setup figure
fig, axs = plt.subplots(2, 3, figsize=(15, 10), sharey='row', sharex='row')

# iterate activation functions
for i, act in enumerate([sigmoid, tanh, relu]):

    # plot activation function of hidden layer
    ax = axs[0, i]
    x = np.linspace(-5, 5, 100)
    ax.axhline(0, color='k', ls='--')
    ax.axvline(0, color='k', ls='--')
    ax.plot(x, act().forward(x), color='red', lw=4)
    ax.set_title('Hidden layer activation: \n{}'.format(act().name.capitalize()))
    ax.set_xlabel(r'$x$')
    despine(ax=ax)

    # make and fit neural network instance
    nn = NeuralNetwork(n_in=2, n_out=2, n_hidden=[3], activations=[act, softmax], seed=seed)
    nn.fit(X, y_onehot, verbose=False)

    # predict probabilities
    zz = nn.forward(np.c_[xx1.ravel(), xx2.ravel()])
    zz = zz[:, 1].reshape(xx1.shape)

    # plot decision boundary
    ax = axs[1, i]
    cs = ax.contourf(xx1, xx2, zz)
    for j in range(y.shape[0]):  # use j so we don't overwrite the activation index i
        if y[j] == 0:
            marker = 'bo'
        else:
            marker = 'ro'
        ax.plot(X[j][0], X[j][1], marker)
    ax.set_title('Learned decision boundary')
    ax.set_xlabel(r'$x_1$')
    ax.set_xlim(-2, 2)
    ax.set_ylim(-2, 2)

# some final plotting settings
axs[0, 0].set_ylabel('Activation '+r'$\phi(x)$')
axs[1, 0].set_ylabel(r'$x_2$')

# save figure
fig.tight_layout(h_pad=2, w_pad=4)
fig.savefig('figures/Figure-1-3_Neural-Network-Hidden-Activations.png', dpi=600)


# Cool! Our ANN correctly distinguishes between the two classes for all three hidden layer activation functions.
# 
# Interestingly, the learned decision boundary changes with the different hidden layer activation functions.

# # 7. Exercise: Classify handwritten digits

# For this exercise, we will utilize a dataset that contains small grey-scale images of handwritten digits.
# 
# This dataset is part of the scikit-learn library, and we can load it as follows:

# In[29]:


from sklearn.datasets import load_digits


# In[30]:


data = load_digits()


# `data` is a dictionary-like object with the following elements:

# In[31]:


data.keys()


# The digit label is encoded in the `target` entry:

# In[32]:


data['target']


# The 1797 images of the dataset are stored in `data`.
# 
# Importantly, each image is flattened to 64 values. To visualize a few samples, we need to reshape them to 8 x 8 pixels.

# In[33]:


data['data'].shape


# In[34]:


# setup figure
fig, axs = plt.subplots(2, 5, figsize=(12, 6))
axs = axs.ravel()

# plot 10 examples
for i in range(10):
    axs[i].imshow(data['data'][i].reshape(8, 8), cmap='gray')
    axs[i].set_title('This is a {}'.format(data['target'][i]))
    despine(ax=axs[i])
    axs[i].set_xticks([])
    axs[i].set_yticks([])
fig.tight_layout()

# save
fig.savefig('figures/Figure-1-4_Hand-Written-Digits.png', dpi=600)


# To classify the images with our neural network, we again need to create a one-hot encoding of the y-vector (this is necessary so that we can use 10 distinct output neurons):

# In[35]:


# create X & y data
X = data['data']
y = data['target']

# create a one-hot version of y
y_onehot = np.zeros((y.size, np.unique(y).size))
y_onehot[np.arange(y.size), y] = 1
assert np.all(np.argmax(y_onehot, axis=1) == y), '/!\\ Error in one-hot encoding'


# We will further separate the data into distinct training and test datasets:

# In[36]:


np.random.seed(1827)

# train / test split
train_idx = np.random.choice(X.shape[0], int(0.7 * X.shape[0]), replace=False)
test_idx = np.array([i for i in np.arange(X.shape[0]) if i not in train_idx])
assert np.all([i not in train_idx for i in test_idx]), '/!\\ Overlapping training and test datasets.'


# Now we are ready to set up and train our neural network on the training data!
# 
# Let's create a simple network with 1 hidden layer and 10 neurons in this layer:

# In[37]:


# set a random seed
seed = 2130
np.random.seed(seed)

# setup
nn = NeuralNetwork(n_in=X.shape[1],                 # the number of input neurons (or data features)
                   n_out=y_onehot.shape[1],         # the number of output neurons (or classes, here 10)
                   n_hidden=[10],                   # 10 hidden neurons in a single hidden layer
                   activations=[sigmoid, softmax],  # sigmoid activation for the hidden layer, softmax for the output layer
                   seed=seed)

# training:
nn.fit(X=X[train_idx], y=y_onehot[train_idx], lr=0.01, n_steps=20000, verbose=True)


# With our handy `plot_training_stats` function, we can get a quick overview of the training statistics as well as the network's predictive performance on the training and test data:

# In[38]:


nn.plot_training_stats(X=X, y=y, train_idx=train_idx, test_idx=test_idx, target_names=data['target_names'])


# Ok, not bad; almost $77$% prediction accuracy in the test dataset.
# 
# Can you beat this?
# 
# **Try adjusting the hidden layer activations, as well as the number of hidden layers and the number of neurons per hidden layer in the code above.**
# 
# You can increase the number of hidden layers by adding numbers to `n_hidden`: `n_hidden = [10, 10]` would create two hidden layers with 10 neurons each. Importantly, don't forget to add activation functions to `activations` if you increase the number of hidden layers!
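# As a starting point for the exercise, a deeper configuration could look like the sketch below. Note that the layer sizes, learning rate, and number of steps are untuned guesses, not recommended values:

# In[ ]:


# sketch: a network with two hidden layers (hyper-parameters are untuned guesses);
# relu or tanh hidden activations can be swapped in, but may need a smaller
# learning rate or rescaled inputs to train stably with this implementation
np.random.seed(seed)
nn_deep = NeuralNetwork(n_in=X.shape[1],
                        n_out=y_onehot.shape[1],
                        n_hidden=[32, 16],                        # two hidden layers
                        activations=[sigmoid, sigmoid, softmax],  # one activation per hidden layer, plus the output layer
                        seed=seed)
nn_deep.fit(X=X[train_idx], y=y_onehot[train_idx], lr=0.01, n_steps=20000, verbose=True)
nn_deep.plot_training_stats(X=X, y=y, train_idx=train_idx, test_idx=test_idx,
                            target_names=data['target_names'])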