#!/usr/bin/env python
# coding: utf-8

# # `numpy` Neural Networks: Multi-Layer Perceptron

# Now that we've done one layer successfully, let's try more! We begin by adding one hidden layer to our network, and then generalize to include any number of hidden layers.

# In[1]:

import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['figure.facecolor'] = 'white'

# The "hard" dataset that we couldn't predict with logistic regression:

# In[2]:

def rescale(X):
    return (X - X.mean(axis=0)) / X.std(axis=0)

n_samples = 1000

X0 = np.random.normal(loc=[0,0], scale=[2,0.5], size=(int(n_samples/2), 2))
X11 = np.random.normal(loc=[0,3.5], scale=[0.5,1], size=(int(n_samples/4), 2))
X12 = np.random.normal(loc=[0,-3.5], scale=[0.5,1], size=(int(n_samples/4), 2))
X1 = np.vstack([X11, X12])
X = np.vstack([X0, X1])
# X = rescale(X)

y0 = np.zeros(shape=(int(n_samples/2), 1))
y1 = np.ones(shape=(int(n_samples/2), 1))
yhat = np.vstack([y0, y1])

# In[3]:

X0 = X[(yhat==0).reshape(-1)]
X1 = X[(yhat==1).reshape(-1)]
plt.scatter(*X0.T, label='0', alpha=0.4);
plt.scatter(*X1.T, label='1', alpha=0.4)
plt.legend();

# Sigmoid activation and cross-entropy loss, along with their derivatives:

# In[4]:

def sig(z):
    return 1 / (1 + np.exp(-z))

def dsig_dz(z):
    return sig(z) * (1 - sig(z))

def J(y, yhat):
    eps = 1e-8
    return -(yhat*np.log(y+eps) + (1-yhat)*np.log(1-y+eps))

def dJ_dy(y, yhat):
    eps = 1e-8
    return (1-yhat)/(1-y+eps) - yhat/(y+eps)

# For our hidden layer we will use a new type of unit (neuron w/ activation function): the rectified linear unit.

# $$\mathrm{ReLU}(z) = \begin{cases}
# z & z\ge0\\
# 0 & z<0
# \end{cases} $$
#
# This activation is nice because it is fast to compute, and it doesn't saturate, by which we mean the derivative doesn't go to zero asymptotically, so terms that depend on the derivative don't die out:
#
# $$\frac{\mathrm{d}\,\mathrm{ReLU}}{\mathrm{d}z} = \begin{cases}
# 1 & z>0\\
# 0 & z\le0
# \end{cases} $$

# In[5]:

def relu(z):
    return np.where(z>0, z, 0)

def drelu_dz(z):
    return np.where(z>0, 1, 0)

# In[6]:

z = np.linspace(-5,5)
plt.plot(z, relu(z), label='ReLU$(z)$');
plt.plot(z, drelu_dz(z), label='$d$ ReLU / $dz$');
plt.legend();
plt.title('The ReLU function');

# # 1 Hidden Layer
#
# To start off, let's consider the case of having a single hidden ReLU layer with 2 hidden units.

# In[7]:

n_input = 2
n_hidden = 2
n_output = 1

# We now have two sets of weights, $w^1$ and $w^2$. We must now consider the weights as matrices connecting the units of consecutive layers. Define them to have the same number of rows as units in the previous layer, and the same number of columns as units in the next layer. So $w^1 \in \Re^{(2,2)}$ and $w^2 \in \Re^{(2,1)}$. We will also explicitly consider bias vectors $b^1$ and $b^2$ which have the size of the next layer (one bias per activation unit).
#
# In the previous notebook, we dealt with bias by setting $x_0 = 1$ and then adding an additional weight, but I now find it clearer to just add the bias separately. In any case the effect is the same. The actions of the layers are then:
#
# \begin{align*}
# z^1 &= x^0 w^{1} + b^{1T} & x^1 &= \mathrm{ReLU}(z^1) \\
# z^2 &= x^1 w^{2} + b^{2T} & y = x^2 &= \sigma(z^2)
# \end{align*}
#
# where the bias vectors are transposed since $x$ and $z$ are rows. Here we denote the sigmoid function as $\sigma$.
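# Before creating real weights, here is a quick throwaway shape check of the equations above (my own addition; the `_demo` arrays are zero-filled placeholders, not the weights we will actually use):

# In[ ]:

# placeholder arrays, just to confirm the dimensions line up as claimed
x0_demo = np.zeros((1, n_input))
w1_demo = np.zeros((n_input, n_hidden));  b1_demo = np.zeros((n_hidden, 1))
w2_demo = np.zeros((n_hidden, n_output)); b2_demo = np.zeros((n_output, 1))

z1_demo = np.dot(x0_demo, w1_demo) + b1_demo.T    # (1, n_hidden)
x1_demo = relu(z1_demo)                           # (1, n_hidden)
z2_demo = np.dot(x1_demo, w2_demo) + b2_demo.T    # (1, n_output)
print(z1_demo.shape, x1_demo.shape, sig(z2_demo).shape)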
# Let's initialize some random nonzero weights to demonstrate as we go along:

# In[8]:

w1 = np.random.normal(0,0.1, size=(n_input, n_hidden))
w2 = np.random.normal(0,0.1, size=(n_hidden, n_output))
b1 = np.random.normal(0,0.1, size=(n_hidden, 1))
b2 = np.random.normal(0,0.1, size=(n_output, 1))

# The forward pass is simply the calculation performed above:

# In[9]:

def forward1(x0, w1, b1, w2, b2):
    x1 = relu(np.dot(x0, w1) + b1.T)   # output of hidden layer
    return sig(np.dot(x1, w2) + b2.T)  # output of output layer

# We can check the prediction on the first sample:

# In[10]:

y = forward1(X[0], w1, b1, w2, b2)
y

# For the backward pass, we follow the approach detailed in [this video](https://www.youtube.com/watch?v=gl3lfL-g5mA) and define a quantity that will become useful later on: $\delta^\ell$, the partial derivative of the cost function with respect to $z^\ell$:
#
# $$ \delta^\ell \equiv \frac{\partial J}{\partial z^\ell} $$
#
# We can begin explicit calculation with the last delta:
#
# \begin{align*}
# \delta^2 &= \frac{\partial J}{\partial z^2}\\
# &= \frac{\partial J}{\partial y}\frac{\partial y}{\partial z^2}\\
# &= \frac{\partial J}{\partial y} \frac{\partial \sigma}{\partial z^2}\\
# \end{align*}
#
# Moving on to $\delta^{1}$, we write out the vector and matrix indices explicitly for initial clarity:
#
# \begin{align*}
# \delta^1_i &= \frac{\partial J}{\partial z^1_i}\\
# &= \sum_j \frac{\partial J}{\partial z^2_j}\frac{\partial z^2_j}{\partial z^1_i}\\
# &= \sum_{j,k} \frac{\partial J}{\partial z^2_j}\frac{\partial z^2_j}{\partial x^1_k}\frac{\partial x^1_k}{\partial z^1_i}
# \end{align*}
#
# Now,
#
# $$ z^2_j = \sum_m x^1_m w^2_{mj} + b^2_j, $$
# so
#
# $$\frac{\partial z^2_j}{\partial x^1_k} = w^2_{kj}.$$
#
# Furthermore,
# $$ x^1_k = \mathrm{ReLU}(z^1_k)\ \rightarrow\ \frac{\partial x^1_k}{\partial z^1_i} = \mathrm{ReLU}^\prime(z^1_k)\delta_{ki}$$
# where $\delta_{ki}$ is the Kronecker delta (1 if $k=i$, 0 otherwise). This has the effect of forcing $k=i$ in the whole equation, and we are left with
# $$\delta^1_i = \sum_j \delta^2_j w^2_{ij}\, \mathrm{ReLU}^\prime(z^1_i),$$
#
# or, flipping back to matrix notation,
#
# $$\delta^1 = w^2 \delta^{2} \odot \mathrm{ReLU}^\prime(z^1)^T$$
# where $\odot$ is element-wise multiplication (we treat the $\delta^\ell$ as column vectors, which is why $\mathrm{ReLU}^\prime(z^1)$, a row like $z^1$, appears transposed).
#
# We see that $\delta^2$ appeared in the definition of $\delta^1$. It's easy to convince yourself that this generalizes to
#
# $$\delta^\ell = w^{\ell+1} \delta^{\ell+1} \odot \mathrm{ReLU}^\prime(z^\ell)^T,$$
#
# and the form of the equation makes it easy to substitute other activation functions in place of $\mathrm{ReLU}$ as well.
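# As an optional sanity check (my own addition, not part of the original derivation), we can compare the analytic $\delta^2 = \frac{\partial J}{\partial y}\frac{\partial \sigma}{\partial z^2}$ against a central-difference estimate on the first sample; the `_chk` variables below are throwaway copies used only for this check:

# In[ ]:

# forward quantities for one sample
z1_chk = np.dot(X[0], w1) + b1.T
x1_chk = relu(z1_chk)
z2_chk = np.dot(x1_chk, w2) + b2.T
y_chk = sig(z2_chk)

# analytic delta^2
delta2_chk = dJ_dy(y_chk, yhat[0]) * dsig_dz(z2_chk)

# numerical delta^2: perturb z^2 and difference the cost
h = 1e-6
delta2_num = (J(sig(z2_chk + h), yhat[0]) - J(sig(z2_chk - h), yhat[0])) / (2*h)

print(delta2_chk, delta2_num)  # the two should agree to several decimal places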
#
# Furthermore, the update rules for training, $w^\ell\ \rightarrow\ w^\ell-\alpha\, \partial J/\partial w^\ell$ and $b^\ell\ \rightarrow\ b^\ell-\alpha\, \partial J/\partial b^\ell$, can also be written in terms of $\delta$:
# \begin{align*}
# \frac{\partial J}{\partial w^\ell_{ij}} &= \sum_k \frac{\partial J}{\partial z^\ell_k} \frac{\partial z^\ell_k}{\partial w^\ell_{ij}}\\
# &= \sum_k \delta^\ell_k x^{\ell-1}_i \delta_{kj}\\
# &= \delta^\ell_j x^{\ell-1}_i
# \end{align*}
# so
# $$ \frac{\partial J}{\partial w^\ell} = (\delta^\ell x^{\ell-1})^T $$
# and similarly
# $$\frac{\partial J}{\partial b^\ell} = \frac{\partial J}{\partial z^\ell} \frac{\partial z^\ell}{\partial b^\ell} = \delta^{\ell}$$
# Armed with these convenient definitions we can implement our backward pass to update our weights, and our training function, which looks much the same as in the previous notebook:

# In[11]:

def backward1(x0, w1, b1, w2, b2, y, yhat, alpha):
    # quantities
    z1 = np.dot(x0, w1) + b1.T
    x1 = relu(z1)
    z2 = np.dot(x1, w2) + b2.T
    #y = sig(z2)

    delta2 = dJ_dy(y, yhat) * dsig_dz(z2)
    delta1 = np.matmul(w2, delta2) * drelu_dz(z1).T

    w2 -= alpha * np.multiply(delta2, x1).T
    w1 -= alpha * np.multiply(delta1, x0).T
    b2 -= alpha * delta2
    b1 -= alpha * delta1

    return w1, b1, w2, b2

# In[12]:

alpha = 0.1
y = forward1(X[0], w1, b1, w2, b2)
w1, b1, w2, b2 = backward1(X[0], w1, b1, w2, b2, y, yhat[0], alpha)
print(y)
print(J(y, yhat[0]))

# In[13]:

def train1(X, yhat, n_hidden, alpha, n_epoch):
    n_samples = X.shape[0]
    n_input = X.shape[1]
    n_output = 1

    # keep track of performance during training
    costs = np.zeros(shape=(n_epoch,1))

    # random nonzero initialization
    w1 = np.random.normal(0, 1, size=(n_input, n_hidden))
    w2 = np.random.normal(0, 1, size=(n_hidden, n_output))
    b1 = np.random.normal(0, 1, size=(n_hidden, 1))
    b2 = np.random.normal(0, 1, size=(n_output, 1))

    for epoch in range(n_epoch):
        for i in range(n_samples):
            x0 = X[i,:]; yh = yhat[i]
            y = forward1(x0, w1, b1, w2, b2)                            # prediction for one sample
            w1, b1, w2, b2 = backward1(x0, w1, b1, w2, b2, y, yh, alpha)  # take step

        # ### Some niceness to see our progress

        # Calculate total cost after epoch
        predictions = forward1(X, w1, b1, w2, b2)      # predictions for entire set
        costs[epoch] = np.mean(J(predictions, yhat))   # mean cost per sample

        # report progress
        if ((epoch % 10) == 0) or (epoch == (n_epoch - 1)):
            #print(predictions.round())
            accuracy = np.mean(predictions.round() == yhat)  # current accuracy on entire set
            print('Training accuracy after epoch {}: {:.4%}'.format(epoch, accuracy))

    return w1, b1, w2, b2, costs

# Let's give it a try!

# In[14]:

n_epoch = 200
n_hidden = 2
alpha = 0.001

w1, b1, w2, b2, costs = train1(X, yhat, n_hidden, alpha, n_epoch)

# In[15]:

plt.plot(costs);
plt.title('Mean cost per sample after each epoch');

# In[16]:

x1 = np.linspace(-8,8, 250)
x2 = np.linspace(-10,10, 250)

fun_map = np.empty((x1.size, x2.size))
for n,i in enumerate(x1):
    for m,j in enumerate(x2):
        fun_map[m,n] = forward1([i,-j], w1, b1, w2, b2)

# In[17]:

X0 = X[(yhat==0).reshape(-1)]
X1 = X[(yhat==1).reshape(-1)]

plt.figure(figsize=(10,5))
plt.imshow(fun_map, extent=[x1.min(), x1.max(), x2.min(), x2.max()], vmin=0, vmax=1, aspect='auto')
plt.colorbar()
plt.contour(x1, -x2, fun_map, levels=[0.5], colors=['r'], linewidths=2)
plt.scatter(*X0.T, label='0', alpha=0.4);
plt.scatter(*X1.T, label='1', alpha=0.4)
plt.legend();

# Looks pretty good! We added the decision boundary in red so that we can more easily visualize how it changes as we make changes to our network.
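# Before generalizing to more layers, here is an optional numerical check (my own addition) of the update rule $\partial J/\partial w^2 = (\delta^2 x^1)^T$ derived above, applied to the weights we just trained. The `_chk` names are throwaway copies so the trained weights are left untouched:

# In[ ]:

x0_chk = X[0]
w2_chk = w2.copy()

# analytic gradient, exactly as used in backward1
z1_chk = np.dot(x0_chk, w1) + b1.T
x1_chk = relu(z1_chk)
z2_chk = np.dot(x1_chk, w2_chk) + b2.T
y_chk = sig(z2_chk)
delta2_chk = dJ_dy(y_chk, yhat[0]) * dsig_dz(z2_chk)
grad_analytic = np.multiply(delta2_chk, x1_chk).T

# numerical gradient via central differences on each entry of w^2
h = 1e-6
grad_numeric = np.zeros_like(w2_chk)
for i in range(w2_chk.shape[0]):
    w_plus = w2_chk.copy();  w_plus[i, 0] += h
    w_minus = w2_chk.copy(); w_minus[i, 0] -= h
    Jp = J(forward1(x0_chk, w1, b1, w_plus, b2), yhat[0])
    Jm = J(forward1(x0_chk, w1, b1, w_minus, b2), yhat[0])
    grad_numeric[i, 0] = ((Jp - Jm) / (2*h)).item()

print(grad_analytic.ravel())
print(grad_numeric.ravel())
print(np.allclose(grad_analytic, grad_numeric, atol=1e-6))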
# # $n$ Hidden Layers
#
# Our functions are starting to take a lot of parameters, which is often a sign that you should be generalizing. Since all our definitions are recursive anyway, we can actually handle any number of layers with ease.
#
# Instead of specifying `n_hidden`, we will move to specifying the number of units per layer (including the input and output layers), using a tuple which we'll call `shape`.
#
# We will store quantities that are matrix-valued but whose dimensions differ from layer to layer in a dictionary with the layer number $\ell$ as the key.

# In[18]:

def init_model(shape):
    w = {}
    b = {}
    for layer in range(1, len(shape)):  # no weights for input layer
        w[layer] = np.random.normal(0,0.1, size=(shape[layer-1], shape[layer]))  # dim: from_units, to_units
        b[layer] = np.random.normal(0,0.1, size=(shape[layer], 1))               # dim: to_units, 1
    return w,b

# In[19]:

def forwardn(x0, w, b):
    n_layers = len(w)

    x_prev = x0
    for l in range(1, n_layers):
        x_l = relu(np.dot(x_prev, w[l]) + b[l].T)  # output of a hidden layer
        x_prev = x_l

    return sig(np.dot(x_prev, w[n_layers]) + b[n_layers].T)  # output of output layer

# Let's test these out with a bunch of layers!

# In[20]:

t = (2,3,6,8,1,)

# In[21]:

w,b = init_model(t)

# In[22]:

print(w)

# In[23]:

y = forwardn(X[:10], w, b)
y

# In[24]:

def backwardn(x0, w, b, y, yhat, alpha):
    n_layers = len(w)

    z = {}
    x = {0: x0}
    # x and z values for calculating derivatives
    for l in range(1, n_layers+1):
        z[l] = np.dot(x[l-1], w[l]) + b[l].T
        x[l] = relu(z[l]) if l < n_layers else sig(z[l])  # the output layer is sigmoid, not ReLU

    delta = {}
    # deltas: start with the last layer and move backward
    for l in range(n_layers, 0, -1):
        if l == n_layers:
            # base case
            delta[l] = dJ_dy(y, yhat)*dsig_dz(z[n_layers])
        else:
            # recursive case
            delta[l] = np.dot(w[l+1], delta[l+1]) * drelu_dz(z[l]).T

    # update weights and biases only after all deltas are computed,
    # so that every delta is based on the original (pre-update) weights
    for l in range(1, n_layers+1):
        w[l] -= alpha * np.multiply(delta[l], x[l-1]).T
        b[l] -= alpha * delta[l]

    return w, b

# Does this work for a simple case? If we update a bunch of times using one sample, we should see the prediction move towards $\hat y$:

# In[25]:

for i in range(1000):
    w,b = backwardn(X[0], w, b, y[0], 0, 0.1)
    y[0] = forwardn(X[0], w, b)
    if i%100 == 0:
        print(y[0],'-->',yhat[0])

# Awesome!
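# As another quick consistency check (again my own addition), the general `forwardn` should reproduce `forward1` exactly when handed the single-hidden-layer weights we trained earlier, packed into the dictionary layout:

# In[ ]:

w_1layer = {1: w1, 2: w2}
b_1layer = {1: b1, 2: b2}
print(np.allclose(forward1(X[:5], w1, b1, w2, b2),
                  forwardn(X[:5], w_1layer, b_1layer)))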
# Confident that it works, we now implement the training function, which runs this over all the samples:

# In[26]:

def trainn(X, yhat, shape, alpha, n_epoch):
    n_samples = X.shape[0]
    n_input = X.shape[1]

    # keep track of performance during training
    costs = np.zeros(shape=(n_epoch,1))

    # random nonzero initialization
    w,b = init_model(shape)

    for epoch in range(n_epoch):
        for i in range(n_samples):
            x0 = X[i,:]; yh = yhat[i]
            y = forwardn(x0, w, b)                    # prediction for one sample
            w, b = backwardn(x0, w, b, y, yh, alpha)  # take step

        # ### Some niceness to see our progress

        # Calculate total cost after epoch
        predictions = forwardn(X, w, b)               # predictions for entire set
        costs[epoch] = np.mean(J(predictions, yhat))  # mean cost per sample

        # report progress
        if ((epoch % 10) == 0) or (epoch == (n_epoch - 1)):
            accuracy = np.mean(predictions.round() == yhat)  # current accuracy on entire set
            print('Training accuracy after epoch {}: {:.4%}'.format(epoch, accuracy))

    return w, b, costs

# In[27]:

n_epoch = 200
shape = (2,5,3,1)
alpha = 0.001

w, b, costs = trainn(X, yhat, shape, alpha, n_epoch)

# In[28]:

plt.plot(costs);
plt.title('Mean cost per sample after each epoch');

# In[29]:

x1 = np.linspace(-8,8, 250)
x2 = np.linspace(-10,10, 250)

fun_map = np.empty((x1.size, x2.size))
for n,i in enumerate(x1):
    for m,j in enumerate(x2):
        fun_map[m,n] = forwardn([i,-j], w, b)

# In[30]:

X0 = X[(yhat==0).reshape(-1)]
X1 = X[(yhat==1).reshape(-1)]

plt.figure(figsize=(10,5))
plt.imshow(fun_map, extent=[x1.min(), x1.max(), x2.min(), x2.max()], vmin=0, vmax=1, aspect='auto')
plt.colorbar()
plt.scatter(*X0.T, label='0', alpha=0.4);
plt.scatter(*X1.T, label='1', alpha=0.4)
plt.contour(x1, -x2, fun_map, levels=[0.5], colors=['r'], linewidths=2)
plt.legend();

# Even more flexibility than before!
#
# Of course unlimited flexibility has its own pitfalls, so in the next notebook we'll look into some ways of getting that under control.
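# As a closing aside (my own addition, not part of the original notebook): since `forwardn` already accepts a whole matrix of samples, the double loop used to build `fun_map` can be replaced by a single vectorized call over a mesh grid. A minimal sketch that reproduces the same decision map:

# In[ ]:

xx1, xx2 = np.meshgrid(np.linspace(-8, 8, 250), np.linspace(-10, 10, 250))
grid = np.column_stack([xx1.ravel(), xx2.ravel()])     # (250*250, 2) sample points
fun_map_vec = forwardn(grid, w, b).reshape(xx1.shape)  # one forward pass for the whole grid

plt.figure(figsize=(10,5))
plt.imshow(fun_map_vec, origin='lower',
           extent=[xx1.min(), xx1.max(), xx2.min(), xx2.max()],
           vmin=0, vmax=1, aspect='auto')
plt.colorbar()
plt.contour(xx1, xx2, fun_map_vec, levels=[0.5], colors=['r'], linewidths=2);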