#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.datasets import load_digits
from sklearn.metrics import confusion_matrix

from seaborn import despine
import seaborn as sns
sns.set_style("ticks")
sns.set_context("talk")

from IPython.display import Image

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# # 0. Quick overview

# In this notebook you will learn how to combine multiple Perceptrons into an **artificial neural network** and how to use the **backpropagation** algorithm to compute the partial derivatives needed to train an artificial neural network with gradient descent.

# # 1. What if the data is not linearly separable?

# In the previous notebook ([0-Perceptron-Gradient-Descent.ipynb](./0-Perceptron-Gradient-Descent.ipynb)), we learned that we can use the Perceptron algorithm to build a binary classifier for linearly separable data (by learning a hyperplane).
# 
# Yet, not all data is linearly separable. Take a look at the figure below and try to find a straight line that separates the blue and red dots:

# In[2]:


np.random.seed(4128)

n_samples = 50

# generate random data
X1 = []
X2 = []
for sample in range(n_samples):
    # class 1
    if np.random.uniform(0, 1, 1) > 0.5:
        X1.append(np.array([np.random.normal(-1, 0.2, 1),
                            np.random.normal(1, 0.2, 1)]).reshape(1, -1))
    else:
        X1.append(np.array([np.random.normal(1, 0.2, 1),
                            np.random.normal(-1, 0.2, 1)]).reshape(1, -1))
    # class 2
    if np.random.uniform(0, 1, 1) > 0.5:
        X2.append(np.array([np.random.normal(-1, 0.2, 1),
                            np.random.normal(-1, 0.2, 1)]).reshape(1, -1))
    else:
        X2.append(np.array([np.random.normal(1, 0.2, 1),
                            np.random.normal(1, 0.2, 1)]).reshape(1, -1))
X = np.concatenate([np.concatenate(X1), np.concatenate(X2)])
y = np.zeros((X.shape[0], 1))
y[n_samples:] = 1
y = y.astype(int)

# plot the training data
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(X[(y == 0).ravel(), 0], X[(y == 0).ravel(), 1], c='b')
ax.scatter(X[(y == 1).ravel(), 0], X[(y == 1).ravel(), 1], c='r')
ax.set_xlabel(r'$x_1$')
ax.set_ylabel(r'$x_2$')
ax.set_xlim(-2, 2)
ax.set_ylim(-2, 2)
despine(ax=ax)
fig.tight_layout()

# save
fig.savefig('figures/Figure-1-0_Not-Linearly-Separable.png', dpi=600)


# Mh. I don't think it is possible to separate the blue and red points with a single straight line.
# 
# Yet, the data would be easily separable if we could use two decision boundaries (or one that is curved).
# 
# So how do we train a classifier for this problem?

# # 2. XOR with multiple Perceptrons

# Couldn't we just combine multiple Perceptrons to solve this classification problem?
# 
# We could train one Perceptron to distinguish the red point cloud in the lower left from all others and another Perceptron to distinguish the red point cloud in the top right from all others. A third Perceptron could then be trained on the predictions of the other two: if either of the first two Perceptrons predicts that a data point belongs to its target class (so if either predicts $y=1$), the third Perceptron also predicts $y=1$.
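# Before wiring up actual Perceptrons, here is a quick sanity check (a minimal sketch using the `X` and `y` generated above) that the red class can indeed be written as the logical OR of the two sub-problems just described:

# In[ ]:


# sanity check (sketch): the red class should equal the logical OR of the two
# sub-problems that Perceptron 1 and Perceptron 2 are supposed to solve
lower_left = np.logical_and(X[:, 0] < 0, X[:, 1] < 0)    # target of the first Perceptron
upper_right = np.logical_and(X[:, 0] > 0, X[:, 1] > 0)   # target of the second Perceptron
combined = np.logical_or(lower_left, upper_right).astype(int)
# this should print True for the random seed used above (in principle, a sampled
# point could land on the "wrong" side of an axis)
print(np.all(combined == y.ravel()))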
# In[3]:


# setup figure
fig, axs = plt.subplots(1, 3, figsize=(18, 6))

# Perceptron 1
axs[0].set_title('Perceptron 1:\n'+'y=1, if '+r'$x_1<0$'+' and '+r'$x_2<0$')
idx1 = np.logical_and(X[:, 0] < 0, X[:, 1] < 0)
axs[0].scatter(X[idx1, 0], X[idx1, 1], color='red')
axs[0].scatter(X[~idx1, 0], X[~idx1, 1], color='blue')

# Perceptron 2
axs[1].set_title('Perceptron 2:\n'+'y=1, if '+r'$x_1>0$'+' and '+r'$x_2>0$')
idx2 = np.logical_and(X[:, 0] > 0, X[:, 1] > 0)
axs[1].scatter(X[idx2, 0], X[idx2, 1], color='red')
axs[1].scatter(X[~idx2, 0], X[~idx2, 1], color='blue')

# Perceptron 3
axs[2].set_title('Perceptron 3:\n'+'y=1, if Perceptr. 1 or Perceptr. 2')
idx3 = np.logical_or(idx1, idx2)
axs[2].scatter(X[idx3, 0], X[idx3, 1], color='red')
axs[2].scatter(X[~idx3, 0], X[~idx3, 1], color='blue')

# label axes
for ax in axs:
    despine(ax=ax)
    ax.set_xlabel(r'$x_1$')
    ax.set_ylabel(r'$x_2$')
fig.tight_layout()

# save
fig.savefig('figures/Figure-1-1_Multi-Perceptron-Classification.png', dpi=600)


# This is a cool idea; let's see if we can implement it.
# 
# To make the computational setup clearer, I have also created a simple sketch of how the three Perceptrons would work together:

# In[4]:


Image(filename='materials/images/free-use/Multilayer-Perceptron.png')


# To set this up, we will use a Perceptron implementation similar to the one that we used in the [0-Perceptron-Gradient-Descent.ipynb](./0-Perceptron-Gradient-Descent.ipynb) notebook:

# In[5]:


# our activation function:
def sigmoid(x):
    """The sigmoid function."""
    return 1.0/(1.0 + np.exp(-x))


# In[6]:


# our loss:
class cross_entropy_loss:

    def __init__(self):
        self.name = 'cross-entropy'

    def loss(self, y, y_pred, zerotol=1e-10):
        """The binary cross-entropy loss.

        Args:
            y (array): labels for each instance (0 or 1)
            y_pred (array): predicted probability that each instance belongs to class 1
        """
        loss = -(y * np.log(y_pred + zerotol) + (1 - y) * np.log(1 - y_pred + zerotol))
        return loss

    def derivative_loss(self, y, y_pred):
        """The derivative of the cross-entropy loss w.r.t. the sigmoid activation.

        Args:
            y (array): labels for each instance (0 or 1)
            y_pred (array): predicted probability that each instance belongs to class 1
        """
        return y_pred - y


# In[7]:


# the Perceptron:
class Perceptron:

    def __init__(self, n_in, activation=sigmoid, loss=cross_entropy_loss, b=None):
        """A simple Perceptron implementation.

        Args:
            n_in (int): number of input features for each instance
            activation (function): activation function of the Perceptron;
                takes individual values or an array as input
            loss (class): loss function to use; the class is expected to include two
                functions: loss and derivative_loss (the derivative of the loss w.r.t.
                the output activation); see the cross_entropy_loss class above as an example
            b (float): bias term; if a value is specified, the bias term is fixed to this
                value. If not, the bias will be estimated during training.
        """
        self.n_in = n_in
        self.w = np.random.uniform(-1, 1, n_in)
        if b is None:
            self.b = np.random.uniform(-1, 1, 1)
            self.fit_b = True
        else:
            self.b = b
            self.fit_b = False
        self.activation = activation
        self.loss = loss().loss
        self.derivative_loss = loss().derivative_loss

    def predict(self, x):
        """Predict the probability that each instance of x (with shape
        n_instances x n_features) belongs to class 1.

        Args:
            x (ndarray): input data (n_instances x n_features)
        """
        self.Z = np.dot(x, self.w) + self.b
        self.A = self.activation(self.Z)
        return self.A

    def gradient_descent_step(self, x, y, learning_rate):
        """A single gradient descent step.

        Args:
            x (ndarray): input data (n_instances x n_features)
            y (array): label of each instance (0 or 1)
            learning_rate (float): learning rate of the gradient descent algorithm
        """
        # compute derivative of loss w.r.t. Z
        dZ = self.derivative_loss(y, self.predict(x))
        dW = np.dot(dZ, x)
        # subtract the average derivative from the weights
        # (note: we average over the number of instances in the batch)
        self.w -= learning_rate * 1.0/x.shape[0] * dW
        if self.fit_b:
            self.b -= learning_rate * (1.0/x.shape[0] * np.sum(dZ))

    def train(self, x, y, batch_size=8, learning_rate=1, n_steps=100):
        """Iteratively train the Perceptron.

        At each iteration, the algorithm draws a random sample from x (with
        batch_size samples) and uses this sample to perform a gradient descent step.

        Args:
            x (ndarray): input data (n_instances x n_features)
            y (array): label of each instance (0 or 1)
            batch_size (int): number of random samples drawn at each iteration
            learning_rate (float): learning rate of the gradient descent algorithm
            n_steps (int): number of iterations to perform during training
        """
        self.training_w = np.zeros((n_steps, self.n_in + 1))
        self.training_loss = np.zeros(n_steps)
        for s in tqdm(range(n_steps)):
            # draw a random batch (note: we index the x passed to train, not the global X)
            batch_idx = np.random.choice(x.shape[0], batch_size, replace=False)
            # compute and store mean loss
            self.training_loss[s] = np.mean(self.loss(y[batch_idx], self.predict(x[batch_idx])))
            # store current weights
            self.training_w[s, :self.n_in] = self.w
            self.training_w[s, -1] = self.b
            # perform gradient descent step
            self.gradient_descent_step(x[batch_idx], y[batch_idx], learning_rate)


# Let's train the first Perceptron to distinguish the red point cloud in the lower left from all others (so $y=1$, if $x_1 < 0$ and $x_2 < 0$):

# In[8]:


# create new labels
idx1 = np.logical_and(X[:, 0] < 0, X[:, 1] < 0)
y1 = idx1.astype(int)

# initialize instance of Perceptron
np.random.seed(213)
p1 = Perceptron(n_in=2)

# train
p1.train(X, y1)

# predict
p1_pred = p1.predict(X)

# compute predictive accuracy
acc_p1 = np.mean((p1_pred > 0.5) == y1)
print('Prediction accuracy: {}%'.format(acc_p1 * 100))


# Ok, that worked!
# 
# Now let's do the same for the second Perceptron, which is supposed to distinguish the red point cloud in the top right from all others (so $y=1$, if $x_1 > 0$ and $x_2 > 0$):

# In[9]:


# create new labels
idx2 = np.logical_and(X[:, 0] > 0, X[:, 1] > 0)
y2 = idx2.astype(int)

# initialize instance of Perceptron
np.random.seed(4543)
p2 = Perceptron(n_in=2)

# train
p2.train(X, y2)

# predict
p2_pred = p2.predict(X)

# compute predictive accuracy
acc_p2 = np.mean((p2_pred > 0.5) == y2)
print('Prediction accuracy: {}%'.format(acc_p2 * 100))


# Great! This also worked.
# 
# Now, let's focus on the final step: training a third Perceptron on the predictions of the first and second Perceptron.
# 
# To do this, we first standardize the predictions of the first two Perceptrons (to have a mean of 0 and a standard deviation of 1); this helps the third Perceptron learn (and mimics [batch normalization](https://en.wikipedia.org/wiki/Batch_normalization)):

# In[10]:


# standardize predictions
p1_pred = (p1_pred - np.mean(p1_pred, axis=0)) / np.std(p1_pred, axis=0)
p2_pred = (p2_pred - np.mean(p2_pred, axis=0)) / np.std(p2_pred, axis=0)


# Then, we combine the standardized predictions into a new dataset `X3`:

# In[11]:


X3 = np.concatenate([p1_pred.reshape(-1, 1), p2_pred.reshape(-1, 1)], axis=1)
print('Mean axis 0: {}'.format(X3.mean(0)))
print('Std axis 0: {}'.format(X3.std(0)))


# As we can see, the mean and standard deviation of the two feature columns are now close to 0 and 1.
# 
# Remember, the target (`y3`) of our third Perceptron equals 1 if the target of the first or second Perceptron is 1:

# In[12]:


y3 = np.logical_or(y1 == 1, y2 == 1).astype(int)


# We can now train our last Perceptron on this new dataset (`X3`, `y3`):

# In[13]:


# initialize instance of Perceptron
np.random.seed(1412)
p3 = Perceptron(n_in=2)

# train
p3.train(X3, y3, n_steps=20000, learning_rate=1)

# predict
p3_pred = p3.predict(X3)

# compute predictive accuracy
acc_p3 = np.mean((p3_pred > 0.5) == y3)
print('Prediction accuracy: {}%'.format(acc_p3 * 100))


# Yay, it worked!
# 
# Let's take a quick look at the decision boundaries that each of the Perceptrons has learned:

# In[14]:


# setup a meshgrid (each x1 and x2 coordinate for which we want to predict a probability)
xx1, xx2 = np.meshgrid(np.linspace(-2, 2, 100), np.linspace(-2, 2, 100))

# predict with Perceptron 1 & 2
zz1 = p1.predict(np.c_[xx1.ravel(), xx2.ravel()])
zz2 = p2.predict(np.c_[xx1.ravel(), xx2.ravel()])

# standardize
zz1_stand = (zz1 - np.mean(zz1, axis=0)) / np.std(zz1, axis=0)
zz2_stand = (zz2 - np.mean(zz2, axis=0)) / np.std(zz2, axis=0)

# predict with Perceptron 3, based on the predictions of Perceptrons 1 & 2
zz3 = p3.predict(np.c_[zz1_stand.ravel(), zz2_stand.ravel()])

# plot
fig, axs = plt.subplots(1, 3, figsize=(18, 7))
for i, (y_i, zz) in enumerate(zip([y1, y2, y3], [zz1, zz2, zz3])):  # use y_i to avoid overwriting the global y
    ax = axs[i]
    cs = ax.contourf(xx1, xx2, zz.reshape(xx1.shape))
    cbar = fig.colorbar(cs, ax=ax, shrink=0.9, orientation="horizontal")
    cbar.set_label('P(is red)')
    cbar.set_ticks([0, 0.25, 0.5, 0.75, 1])
    for j in range(y_i.shape[0]):
        if y_i[j] == 0:
            marker = 'bo'
        else:
            marker = 'ro'
        ax.plot(X[j][0], X[j][1], marker)
    ax.set_xlabel(r'$x_1$')
    ax.set_ylabel(r'$x_2$')
    ax.set_title('Decision boundary:\nPerceptron {}'.format(i + 1))
fig.tight_layout(w_pad=4)

# save
fig.savefig('figures/Figure-1-2_Multi-Perceptron-Decision-Boundaries.png', dpi=600)


# Super cool! By combining three individual Perceptrons, we have managed to learn a decision boundary for data that is not linearly separable!
# 
# **This idea of sequentially combining multiple simple, but non-linear, functions into a single classifier (or network) forms the basis of deep learning!**
# 
# Below, we will explore this idea further.

# # 3. Artificial neural networks

# A network of multiple Perceptrons (or neurons) that are sequentially combined to form a single predictive system is often called an "artificial neural network" (or ANN). Note that the individual *Perceptrons* of an artificial neural network are often also called *neurons* (in reference to a biological neural network).
# 
# A standard feedforward (or dense) neural network consists of the following elements (also see the figure below):
# 
# - An *input* layer, representing an instance of the input data
# 
# - One or multiple *hidden* layer(s), each containing a set of neurons
# 
# - An *output* layer, containing one output neuron for each target class in the dataset
# 
# In the illustration below, the input data is an image of a single handwritten digit. This image has 8x8 pixels; to serve it to our ANN, we simply flatten it into a single vector of 64 values. The input layer therefore has 64 input neurons (each representing one of the 64 input features).
# 
# As there are 10 unique handwritten digits (0 to 9), we further set the output layer of our ANN to contain 10 neurons (one for each digit). Each output neuron describes the probability that the image depicts a digit of its class.
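# As a quick check of these numbers, here is a minimal sketch that only inspects the shapes of the scikit-learn digits dataset (we will load and use it properly in section 7):

# In[ ]:


# sketch: shape bookkeeping for the digits example described above
digits = load_digits()
print(digits['images'][0].shape)              # a single 8 x 8 pixel image
print(digits['images'][0].reshape(-1).shape)  # flattened to a vector of 64 values -> 64 input neurons
print(np.unique(digits['target']).size)       # 10 digit classes -> 10 output neurons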
# In[15]:


Image(filename='materials/images/free-use/Artificial-Neural-Network.png')


# In contrast to our previous example (in which we separately trained three individual Perceptrons), an ANN uses linear algebra wizardry to perform all computations in one go.
# 
# Specifically, an ANN represents all neurons of a layer as a single matrix:

# In[16]:


Image(filename='materials/images/free-use/ANN-Forward-Pass.png')


# The input layer is represented by the data matrix $X$, containing one data sample (or instance) in each row and one column for each feature of the data (these features are often also called input neurons).
# 
# The hidden layer is represented by a weight matrix $W_1$: it contains one row for each feature of the data (or neuron of the previous layer) and one column for each of its own neurons.
# 
# Similarly, the output layer is also represented by a weight matrix $W_{out}$, containing one row for each neuron of the previous hidden layer and one column for each target class in the data (each representing one output neuron).
# 
# Importantly, hidden layers (as well as the output layer) also contain a vector of bias terms $b$ (with one value for each neuron of the layer).
# 
# Let's quickly settle on some notation that we will use throughout the rest of this notebook:
# 
# - $X$: the input data
# - $Z = X \cdot W + b$: the output of a layer (the input to the layer multiplied by the layer's weights, plus the bias terms)
# - $A = \phi(Z)$: the activation of a layer (the application of the layer's activation function $\phi$ to its output $Z$; e.g., the sigmoid function in our previous example)

# To get a better feel for how the individual computations work, let's quickly play it through with random weights:

# In[17]:


# we will use our previous X, containing 100 instances with two values per instance:
print(X.shape)


# Let's say our hidden layer has 10 neurons; the weight matrix would then be of shape (2, 10):

# In[18]:


np.random.seed(1342)

# weights and bias of hidden layer:
W_hidden = np.random.normal(size=(2, 10))
b_hidden = np.random.normal(size=(1, 10))  # one bias for each neuron


# We have two output classes (blue and red); we can model these two classes with a single output neuron that is activated through the sigmoid function (if its output is > 0.5, we predict $y = 1$). The output layer therefore has only 1 neuron, with one weight for each neuron of the previous hidden layer (resulting in a shape of (10, 1)):

# In[19]:


np.random.seed(473)

# weights and bias of output layer:
W_output = np.random.normal(size=(10, 1))
b_output = np.random.normal(size=(1, 1))  # one bias for each neuron


# OK, let's put this all together:

# In[20]:


Z_hidden = X.dot(W_hidden) + b_hidden
A_hidden = sigmoid(Z_hidden)  # here we use the sigmoid as our hidden layer activation function
A_hidden.shape


# As we can see, the output of our hidden layer contains 100 entries with 10 values each (each value representing the activation of one of our 10 hidden layer neurons).

# In[21]:


Z_output = A_hidden.dot(W_output) + b_output
A_output = sigmoid(Z_output)
A_output.shape


# Similarly, the output of our random ANN contains 100 entries with one value each, representing the probability that each entry (or instance) belongs to class 1 of our data.
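# The same two-layer computation can also be written as a loop over (weight, bias) pairs. The helper below is only a sketch (reusing `W_hidden`, `b_hidden`, `W_output`, and `b_output` from above), but it anticipates the `forward` method of the network class we implement in section 5:

# In[ ]:


def forward_pass(X, layers, activation=sigmoid):
    """Sketch: repeatedly apply Z = A_prev . W + b and A = phi(Z) for each (W, b) pair."""
    A = X
    for W, b in layers:
        Z = A.dot(W) + b
        A = activation(Z)
    return A

A_loop = forward_pass(X, [(W_hidden, b_hidden), (W_output, b_output)])
print(np.allclose(A_loop, A_output))  # should be True: same computation as the two steps above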
# In[22]:


fig, ax = plt.subplots(1, 1, figsize=(6, 6), dpi=50)
ax.hist(A_output, np.linspace(0, 1, 20))
ax.set_xlabel(r'$A_{out}$')
ax.set_ylabel('Frequency')
plt.axvline(0.5, color='r', ls='--')  # draw classification threshold
despine(ax=ax)


# As our weights were set randomly, the ANN wrongly "predicts" all instances to have class 0 (all probabilities are < 0.5).

# # 4. Backpropagation

# The goal of the [backpropagation](https://en.wikipedia.org/wiki/Backpropagation) algorithm is to enable the application of the gradient descent algorithm to an ANN, by computing the partial derivative of the loss w.r.t. each weight of each neuron of the ANN.
# 
# Each partial derivative indicates how much the loss function (at its current value) changes with a change in its respective weight. Conceptually, these partial derivatives indicate how much each weight contributed to the current value of the loss function (or the current "error"); they are therefore also often referred to as "errors".
# 
# To compute all partial derivatives, the backpropagation algorithm proceeds as follows (also see the figure below for a general overview):
# 
# 1. In a first step, the backpropagation algorithm looks at the prediction $A_{out}$ of the ANN (resulting from a forward pass) and computes the partial derivative of the loss function $L$ w.r.t. the prediction (we will refer to this partial derivative as $\frac{dL}{dA_{out}}$; it indicates how much the loss function changes with a change in the predictions $A_{out}$).
# 
# 
# 2. Subsequently, the backpropagation algorithm looks at the output $Z_{out}$ of the ANN's output layer and estimates how much the prediction $A_{out}$ changes with $Z_{out}$ (by computing the partial derivative $\frac{dA_{out}}{dZ_{out}}$). With this partial derivative, the algorithm can then also obtain the partial derivative of the loss function $L$ w.r.t. the output $Z_{out}$. To do this, it uses the [chain rule](https://en.wikipedia.org/wiki/Chain_rule) and simply multiplies the previously obtained partial derivative $\frac{dL}{dA_{out}}$ with the partial derivative $\frac{dA_{out}}{dZ_{out}}$: $\frac{dL}{dZ_{out}} = \frac{dA_{out}}{dZ_{out}} \times \frac{dL}{dA_{out}}$
# 
# Note that for the combination of a sigmoid (or softmax) activation function and the cross-entropy loss, this partial derivative reduces to $\frac{dL}{dZ_{out}} = A_{out} - y$; it is hence often referred to as the "error signal".
# 
# 
# 3. Then, the backpropagation algorithm goes back through every hidden layer of the ANN and estimates how much the output $Z_l$ of each layer $l$ has contributed to the current value of the loss function (by computing the partial derivative $\frac{dL}{dZ_l}$ for each layer). With the chain rule, this is simply: $\frac{dL}{dZ_{l-1}} = \frac{dA_{l-1}}{dZ_{l-1}} \times \frac{dZ_{l}}{dA_{l-1}} \times \frac{dL}{dZ_l}$
# 
# 
# 4. Lastly, the backpropagation algorithm computes the partial derivatives $\frac{dL}{dW_l}$ from the previously obtained partial derivatives $\frac{dL}{dZ_l}$: $\frac{dL}{dW_{l}} = \frac{dZ_l}{dW_l} \times \frac{dL}{dZ_l}$ (indicating how much the loss function $L$ changes w.r.t. the weights $W_l$ of layer $l$). In typical gradient descent fashion, we then use these partial derivatives to update our ANN weights: $W_l \leftarrow W_l - \alpha \times \frac{dL}{dW_{l}}$ (with $\alpha$ again indicating the learning rate).
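# To make these four steps concrete, here is a minimal numpy sketch of a single backward pass for the one-hidden-layer network with random weights from above (assuming sigmoid activations and the cross-entropy loss, and averaging the gradients over all 100 instances):

# In[ ]:


m = X.shape[0]  # number of instances

# steps 1 & 2: for a sigmoid (or softmax) output with cross-entropy loss,
# the error signal reduces to dL/dZ_out = A_out - y
dZ_out = A_output - y                                   # shape (100, 1)

# step 4 for the output layer: dL/dW_out = A_hidden^T . dZ_out, averaged over the batch
dW_out = A_hidden.T.dot(dZ_out) / m                     # shape (10, 1)
db_out = dZ_out.sum(axis=0, keepdims=True) / m          # shape (1, 1)

# step 3: propagate the error back to the hidden layer via the chain rule;
# for the sigmoid, dA/dZ = A * (1 - A)
dA_hidden = dZ_out.dot(W_output.T)                      # shape (100, 10)
dZ_hidden = dA_hidden * A_hidden * (1 - A_hidden)       # shape (100, 10)

# step 4 for the hidden layer
dW_hidden = X.T.dot(dZ_hidden) / m                      # shape (2, 10)
db_hidden = dZ_hidden.sum(axis=0, keepdims=True) / m    # shape (1, 10)

# a gradient descent update with learning rate alpha would then be, e.g.:
# W_output -= alpha * dW_out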
# For more details on the mechanics of the backpropagation algorithm, please see:
# - https://www.youtube.com/watch?v=Ilg3gGewQ5U&t=387s
# - https://www.youtube.com/watch?v=tIeHLnjs5U8&t=353s
# - https://www.youtube.com/watch?v=yXcQ4B-YSjQ&t=711s

# In[23]:


Image(filename='materials/images/free-use/Backpropagation.png')


# # 5. Our implementation

# OK, let's put all of this together into an implementation of an ANN:

# In[24]:


# the sigmoid activation function
class sigmoid():
    """The sigmoid function."""

    def __init__(self):
        self.name = 'sigmoid'

    def forward(self, x):
        """Forward pass with the sigmoid function."""
        return 1./(1. + np.exp(-x))

    def derivative(self, a):
        """The derivative phi'(a) of the sigmoid function, expressed in terms of its activation a."""
        return a * (1 - a)


# In[25]:


# the neural network
class NeuralNetwork:
    """A simple neural network implementation.

    Args:
        n_in (int): number of input neurons
        n_out (int): number of output neurons
        n_hidden (array of ints): number of neurons for each hidden layer;
            an array of [10, 10] would create two hidden layers with 10 neurons each
        activations (array of classes): activation for each hidden layer as well as the
            output layer; this should contain one activation function for each of the
            hidden layers (as specified by n_hidden) plus one for the output layer
            (the last entry of the array). If None, all activations are set to the
            sigmoid function. Importantly, each activation needs to be a class with
            two functions for the forward pass and the derivative w.r.t. its
            activations (see the sigmoid implementation above)
        seed (int): random seed to allow for reproducibility
    """

    def __init__(self, n_in, n_out, n_hidden=[10], activations=None, seed=123):
        # init
        self.n_in = n_in
        self.n_out = n_out
        self.n_hidden = np.append(np.array(n_hidden), n_out).astype(int)  # add output layer
        self.n_layers = self.n_hidden.size
        self.training_loss = []  # we will use this to collect the loss values during gradient descent
        self.seed = seed

        # initialize weights & biases
        self.parms = {}  # a dictionary containing all network parameters
        np.random.seed(self.seed)
        n_in = int(self.n_in)
        for layer, n_out in enumerate(self.n_hidden):
            self.parms["W{}".format(layer)] = self._initialize_weight([n_in, n_out])
            self.parms["b{}".format(layer)] = self._initialize_weight([1, n_out])
            # also create empty storage for neuron activations
            self.parms["A{}".format(layer)] = np.zeros((n_in, n_out))
            n_in = int(n_out)

        # set layer activations
        if activations is None:
            # if no activations are given, use sigmoid
            self.activations = [sigmoid()] * self.n_layers
        else:
            if len(activations) != self.n_layers:
                raise ValueError('/!\\ Number of activations does not match number of layers.')
            self.activations = [a() for a in activations]

    def _initialize_weight(self, size, bound=1):
        """Draw weights randomly from a uniform distribution between -bound and bound."""
        return np.random.uniform(-bound, bound, size=size)

    def forward(self, X):
        """Forward pass of the neural network, given a dataset X.

        Args:
            X (ndarray): input data with one instance per row and one column for each feature

        Returns:
            predicted probability (n_samples x n_output_neurons)
        """
        self.parms['X'] = X
        for layer in range(self.n_layers):
            Z = X.dot(self.parms['W{}'.format(layer)]) + self.parms['b{}'.format(layer)]
            A = self.activations[layer].forward(Z)
            X = A
            self.parms["A{}".format(layer)] = A
        self.y_pred = A
        return A

    def predict(self, X):
        """Predict classes for each instance of an input dataset X.

        Args:
            X (ndarray): input data with one instance per row and one column for each feature

        Returns:
            predicted class for each instance
        """
        y_pred = self.forward(X)
        if self.n_out == 1:
            return np.array(y_pred > 0.5).astype(int).ravel()
        else:
            return np.argmax(y_pred, axis=1).ravel()

    def loss(self, X, y, zerotol=1e-10):
        """Compute the mean cross-entropy loss over the samples of an input dataset (X)
        and a set of corresponding class labels (y).

        Args:
            X (ndarray): input data with one instance per row and one column for each feature
            y (ndarray): class label for each instance. Importantly, if n_out > 1, y needs
                to be one-hot encoded, such that it is of shape (n_samples x n_out)

        Returns:
            Average cross-entropy loss
        """
        if self.activations[-1].name not in ['sigmoid', 'softmax']:
            raise ValueError('loss function only valid for sigmoid or softmax output activations.')
        y_pred = self.forward(X)
        # binary cross-entropy loss
        if self.n_out == 1:
            loss = -y * np.log(y_pred + zerotol)
            loss -= (1 - y) * np.log(1 - y_pred + zerotol)
        # multi-class extension
        else:
            loss = -np.sum(y * np.log(y_pred + zerotol), axis=1)
        return 1.0/X.shape[0] * np.sum(loss, axis=0)

    def derivative_loss(self, x, y):
        """Derivative of the cross-entropy loss w.r.t. the output-layer activation."""
        if self.activations[-1].name not in ['sigmoid', 'softmax']:
            raise ValueError('derivative of loss only valid for sigmoid / softmax output activations.')
        y_pred = self.forward(x)
        return y_pred - y

    def backward(self, X, y, lr=0.1):
        """Compute the gradient descent backward pass, updating all network weights,
        given an input dataset (X) and a set of corresponding class labels (y).

        Args:
            X (ndarray): input data with one instance per row and one column for each feature
            y (ndarray): class label for each instance. Importantly, if n_out > 1, y needs
                to be one-hot encoded, such that it is of shape (n_samples x n_out)
            lr (float): learning rate for the weight update
        """
        y_pred = self.forward(X)
        self.training_loss.append(self.loss(X, y))
        dZ = self.derivative_loss(X, y)
        for layer in range(1, self.n_layers)[::-1]:  # iterate hidden layers backward
            # dL/dW and dL/db are computed as averages over the batch;
            # dL/dW is obtained by multiplying dZ with the input to the layer
            # (here, A from the next lower layer)
            dW = 1.0/dZ.shape[0] * self.parms["A{}".format(layer-1)].T.dot(dZ)
            db = 1.0/dZ.shape[0] * np.sum(dZ, axis=0, keepdims=True)
            # compute dZ for the next lower layer before updating this layer's weights
            dA = dZ.dot(self.parms["W{}".format(layer)].T)
            dZ_lower = dA * self.activations[layer-1].derivative(self.parms["A{}".format(layer-1)])
            # update this layer's parameters
            self.parms["W{}".format(layer)] -= lr * dW
            self.parms["b{}".format(layer)] -= lr * db
            dZ = dZ_lower
        # input layer
        dW = 1.0/dZ.shape[0] * X.T.dot(dZ)
        db = 1.0/dZ.shape[0] * np.sum(dZ, axis=0, keepdims=True)
        self.parms["W0"] -= lr * dW
        self.parms["b0"] -= lr * db

    def fit(self, X, y, lr=0.1, n_steps=10000, batch_size=32, verbose=False):
        """Fit the neural network to an input dataset.

        Args:
            X (ndarray): input data with one instance per row and one column for each feature
            y (ndarray): class label for each instance.
                Importantly, if n_out > 1, y needs to be one-hot encoded,
                such that it is of shape (n_samples x n_out)
            lr (float): learning rate for the weight update
            n_steps (int): number of gradient descent steps to perform
            batch_size (int): number of random samples drawn from the dataset at each
                gradient descent iteration
            verbose (bool): whether or not to print an update about the state of the training
        """
        # make sure y has the right shape
        if self.n_out == 1 and np.ndim(y) < 2:
            y = y.reshape(-1, 1)
        assert np.ndim(y) == 2, '/!\\ y needs to be 2-dimensional'

        # make sure data and model match
        if y.shape[1] > 1:
            if np.unique(y.argmax(axis=1)).size != self.n_out:
                raise ValueError('/!\\ n_out does not match number of labels in y (dim-1)')

        if verbose:
            print('Beginning training for {} batches ({} samples/batch):'.format(n_steps, batch_size))

        self.lr = lr
        iterator = range(n_steps)
        for step in (tqdm(iterator) if verbose else iterator):
            batch_i = np.random.choice(X.shape[0], batch_size, replace=False)
            self.forward(X[batch_i])
            self.backward(X[batch_i], y[batch_i], lr=self.lr)

    def plot_training_stats(self, X, y, train_idx, test_idx, target_names=None):
        """Plot the training loss as well as a confusion matrix for the training and test datasets.

        Args:
            X (ndarray): input data with one instance per row and one column for each feature
            y (array): class label for each instance
            train_idx (array): index indicating the position of the training instances in X and y
            test_idx (array): index indicating the position of the test instances in X and y
            target_names (array of strings): names of the target classes in y

        Returns:
            Matplotlib figure and axes
        """
        fig, axs = plt.subplots(1, 3, figsize=(20, 6))

        # plot training loss
        axs[0].set_title('Training loss')
        axs[0].set_ylabel('Loss')
        axs[0].set_xlabel('Training step')
        axs[0].plot(self.training_loss, color='k')
        despine(ax=axs[0])

        # plot confusion matrix for training and test data
        for i, (label, idx) in enumerate(zip(['Training', 'Test'], [train_idx, test_idx])):
            y_pred = self.predict(X[idx])
            acc = np.mean(y_pred == y[idx])
            axs[1+i].set_title('{} data\nMean Acc.: {}%'.format(label, np.round(acc*100, 2)))
            conf_mat = confusion_matrix(y[idx], y_pred, normalize='true')
            sns.heatmap(conf_mat, annot=True, ax=axs[1+i], vmin=0, vmax=1,
                        annot_kws={'fontsize': 100./self.n_out})  # this might be too large for few classes
            if target_names is not None:
                axs[1+i].set_xticklabels(target_names)
                axs[1+i].set_yticklabels(target_names)
        fig.tight_layout()

        return fig, axs


# # 6. What happens if we change the hidden layer activation function?

# Let's see if our implementation actually works and is able to distinguish between the red and blue points in our previous example.
# 
# While we are at it, let's also make this a bit more interesting by changing the activation function of the hidden layer.
# 
# For this, I have prepared three common activation functions (including their derivatives; see below):
# - the [rectified linear unit (or ReLU)](https://en.wikipedia.org/wiki/Rectifier_(neural_networks)),
# - the [hyperbolic tangent (or tanh)](https://mathworld.wolfram.com/HyperbolicTangent.html), and
# - the [softmax](https://en.wikipedia.org/wiki/Softmax_function).
# 
# Note that the softmax function is typically used as the output activation for classification problems with more than two mutually exclusive classes (in which each sample can only belong to one of the classes).
# In[26]:


# three common activation functions:

class relu():

    def __init__(self):
        self.name = 'relu'

    def forward(self, x):
        return np.maximum(x, 0)

    def derivative(self, a):
        d = np.zeros_like(a)
        d[a < 0] = 0
        d[a > 0] = 1
        return d


class tanh():

    def __init__(self):
        self.name = 'tanh'

    def forward(self, x):
        # equivalent to (exp(x) - exp(-x)) / (exp(x) + exp(-x)), but numerically stable
        return np.tanh(x)

    def derivative(self, a):
        return 1 - a**2


class softmax():

    def __init__(self):
        self.name = 'softmax'

    def forward(self, x):
        # subtracting the row-wise maximum keeps the exponentials from overflowing
        e_x = np.exp(x - x.max(axis=1, keepdims=True))
        return e_x / e_x.sum(axis=1, keepdims=True)

    def derivative(self, a):
        # note: this Jacobian-based form assumes a single sample; in this notebook the
        # softmax is only used as output activation, where dL/dZ = A - y is used instead
        SM = a.reshape((-1, 1))
        return (np.diagflat(a) - np.dot(SM, SM.T)).sum(axis=1)


# In this example, we will use one output neuron for each of the two classes of our example dataset.
# 
# We therefore set `n_out = 2` in our specification of the ANN.
# 
# By doing so, we also need to change the encoding of our target (y), so that it has one value for each of the two output neurons. This type of encoding is common in machine learning and is called one-hot encoding. It looks like this:
# 
# if y = 0: [1, 0], if y = 1: [0, 1]
# 
# Thereby, `y_onehot.argmax(axis=1)` recovers our original y.

# In[27]:


y_onehot = np.zeros((y.size, 2))
y_onehot[np.arange(y.size), y.ravel()] = 1
print('This is how one-hot encoding looks for class 0:')
print(y_onehot[0])
print('And this is how it looks for class 1:')
print(y_onehot[n_samples])
print('\nOverall, y now has dimension: {}x{}'.format(*y_onehot.shape))


# As the two classes (red and blue) are mutually exclusive, we further use the softmax activation function for the output layer:

# In[28]:


# set a random seed
seed = 159
np.random.seed(seed)

# setup figure
fig, axs = plt.subplots(2, 3, figsize=(15, 10), sharey='row', sharex='row')

# iterate activation functions
for i, act in enumerate([sigmoid, tanh, relu]):

    # plot activation function of hidden layer
    ax = axs[0, i]
    x = np.linspace(-5, 5, 100)
    ax.axhline(0, color='k', ls='--')
    ax.axvline(0, color='k', ls='--')
    ax.plot(x, act().forward(x), color='red', lw=4)
    ax.set_title('Hidden layer activation: \n{}'.format(act().name.capitalize()))
    ax.set_xlabel(r'$x$')
    despine(ax=ax)

    # make and fit neural network instance
    nn = NeuralNetwork(n_in=2, n_out=2, n_hidden=[3], activations=[act, softmax], seed=seed)
    nn.fit(X, y_onehot, verbose=False)

    # predict probabilities
    zz = nn.forward(np.c_[xx1.ravel(), xx2.ravel()])
    zz = zz[:, 1].reshape(xx1.shape)

    # plot decision boundary
    ax = axs[1, i]
    cs = ax.contourf(xx1, xx2, zz)
    for j in range(y.shape[0]):  # use j so we don't overwrite the activation index i
        if y[j] == 0:
            marker = 'bo'
        else:
            marker = 'ro'
        ax.plot(X[j][0], X[j][1], marker)
    ax.set_title('Learned decision boundary')
    ax.set_xlabel(r'$x_1$')
    ax.set_xlim(-2, 2)
    ax.set_ylim(-2, 2)

# some final plotting settings
axs[0, 0].set_ylabel('Activation '+r'$\phi(x)$')
axs[1, 0].set_ylabel(r'$x_2$')

# save figure
fig.tight_layout(h_pad=2, w_pad=4)
fig.savefig('figures/Figure-1-3_Neural-Network-Hidden-Activations.png', dpi=600)


# Cool! Our ANN correctly distinguishes between the two classes for all three hidden layer activation functions.
# 
# Interestingly, the learned decision boundary changes with the different hidden layer activation functions.

# # 7. Exercise: Classify handwritten digits

# For this exercise, we will utilize a dataset that contains small grey-scale images of handwritten digits.
# 
# This dataset is part of the scikit-learn library, and we can load it as follows:

# In[29]:


from sklearn.datasets import load_digits


# In[30]:


data = load_digits()


# `data` is a dictionary-like object with the following elements:

# In[31]:


data.keys()


# The digit label is encoded in the `target` entry:

# In[32]:


data['target']


# The 1797 images of the dataset are stored in `data`.
# 
# Importantly, each image is flattened to 64 values. To visualize a few samples, we need to reshape them to 8 x 8 pixels.

# In[33]:


data['data'].shape


# In[34]:


# setup figure
fig, axs = plt.subplots(2, 5, figsize=(12, 6))
axs = axs.ravel()

# plot 10 examples
for i in range(10):
    axs[i].imshow(data['data'][i].reshape(8, 8), cmap='gray')
    axs[i].set_title('This is a {}'.format(data['target'][i]))
    despine(ax=axs[i])
    axs[i].set_xticks([])
    axs[i].set_yticks([])
fig.tight_layout()

# save
fig.savefig('figures/Figure-1-4_Hand-Written-Digits.png', dpi=600)


# To classify the images with our neural network, we again need to create a one-hot encoding of the y-vector (this is necessary so that we can use 10 distinct output neurons):

# In[35]:


# create X & y data
X = data['data']
y = data['target']

# create a one-hot version of y
y_onehot = np.zeros((y.size, np.unique(y).size))
y_onehot[np.arange(y.size), y] = 1
assert np.all(np.argmax(y_onehot, axis=1) == y), '/!\\ Error in one-hot encoding'


# We will further separate the data into distinct training and test datasets:

# In[36]:


np.random.seed(1827)

# train / test split
train_idx = np.random.choice(X.shape[0], int(0.7 * X.shape[0]), replace=False)
test_idx = np.array([i for i in np.arange(X.shape[0]) if i not in train_idx])
assert np.all([i not in train_idx for i in test_idx]), '/!\\ Overlapping training and test datasets.'


# Now we are ready to set up and train our neural network on the training data!
# 
# Let's create a simple network with 1 hidden layer and 10 neurons in this layer:

# In[37]:


# set a random seed
seed = 2130
np.random.seed(seed)

# setup
nn = NeuralNetwork(n_in=X.shape[1],                 # the number of input neurons (or data features)
                   n_out=y_onehot.shape[1],         # the number of output neurons (or classes, here 10)
                   n_hidden=[10],                   # 10 hidden neurons in a single hidden layer
                   activations=[sigmoid, softmax],  # sigmoid activation for the hidden layer, softmax for the output layer
                   seed=seed)

# training:
nn.fit(X=X[train_idx], y=y_onehot[train_idx], lr=0.01, n_steps=20000, verbose=True)


# With our handy `plot_training_stats` function, we can get a quick overview of the training statistics as well as the network's predictive performance on the training and test data:

# In[38]:


nn.plot_training_stats(X=X, y=y, train_idx=train_idx, test_idx=test_idx, target_names=data['target_names'])


# Ok, not bad; almost $77$% prediction accuracy in the test dataset.
# 
# Can you beat this?
# 
# **Try adjusting the hidden layer activations, as well as the number of hidden layers and the number of neurons per hidden layer in the code above.**
# 
# You can increase the number of hidden layers by adding numbers to `n_hidden`: `n_hidden = [10, 10]` would create two hidden layers with 10 neurons each. Importantly, don't forget to add activation functions to `activations` if you increase the number of hidden layers!
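# As a starting point for the exercise, a deeper configuration could look like the sketch below. Note that the layer sizes, learning rate, and number of steps are untuned guesses, not recommended values:

# In[ ]:


# sketch: a network with two hidden layers (hyper-parameters are untuned guesses);
# relu or tanh hidden activations can be swapped in, but may need a smaller
# learning rate or rescaled inputs to train stably with this implementation
np.random.seed(seed)
nn_deep = NeuralNetwork(n_in=X.shape[1],
                        n_out=y_onehot.shape[1],
                        n_hidden=[32, 16],                        # two hidden layers
                        activations=[sigmoid, sigmoid, softmax],  # one activation per hidden layer, plus the output layer
                        seed=seed)
nn_deep.fit(X=X[train_idx], y=y_onehot[train_idx], lr=0.01, n_steps=20000, verbose=True)
nn_deep.plot_training_stats(X=X, y=y, train_idx=train_idx, test_idx=test_idx,
                            target_names=data['target_names'])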