#!/usr/bin/env python
# coding: utf-8

# # `numpy` Neural Networks: Multi-Layer Perceptron

# Now that we've done one layer successfully, let's try more! We begin by adding one hidden layer to our network, and then generalize to include any number of hidden layers.

# In[1]:

import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['figure.facecolor'] = 'white'

# The "hard" dataset that we couldn't predict with logistic regression:

# In[2]:

def rescale(X):
    return (X - X.mean(axis=0)) / X.std(axis=0)

n_samples = 1000

X0 = np.random.normal(loc=[0,0], scale=[2,0.5], size=(int(n_samples/2), 2))
X11 = np.random.normal(loc=[0,3.5], scale=[0.5,1], size=(int(n_samples/4), 2))
X12 = np.random.normal(loc=[0,-3.5], scale=[0.5,1], size=(int(n_samples/4), 2))
X1 = np.vstack([X11, X12])
X = np.vstack([X0, X1])
# X = rescale(X)

y0 = np.zeros(shape=(int(n_samples/2), 1))
y1 = np.ones(shape=(int(n_samples/2), 1))
yhat = np.vstack([y0, y1])

# In[3]:

X0 = X[(yhat==0).reshape(-1)]
X1 = X[(yhat==1).reshape(-1)]
plt.scatter(*X0.T, label='0', alpha=0.4);
plt.scatter(*X1.T, label='1', alpha=0.4)
plt.legend();

# Sigmoid activation and cross-entropy loss, along with their derivatives:

# In[4]:

def sig(z):
    return 1 / (1 + np.exp(-z))

def dsig_dz(z):
    return sig(z) * (1 - sig(z))

def J(y, yhat):
    eps = 1e-8
    return -(yhat*np.log(y+eps) + (1-yhat)*np.log(1-y+eps))

def dJ_dy(y, yhat):
    eps = 1e-8
    return (1-yhat)/(1-y+eps) - yhat/(y+eps)

# For our hidden layer we will use a new type of unit (neuron w/ activation function): the rectified linear unit.

# $$\mathrm{ReLU}(z) = \begin{cases}
# z & z\ge0\\
# 0 & z<0
# \end{cases} $$
#
# This activation is nice because it is fast to compute, and it doesn't saturate, by which we mean the derivative doesn't go to zero asymptotically, so terms that depend on the derivative don't die out:
#
# $$\frac{\mathrm{d}\,\mathrm{ReLU}}{\mathrm{d}z} = \begin{cases}
# 1 & z>0\\
# 0 & z\le0
# \end{cases} $$

# In[5]:

def relu(z):
    return np.where(z>0, z, 0)

def drelu_dz(z):
    return np.where(z>0, 1, 0)

# In[6]:

z = np.linspace(-5,5)
plt.plot(z, relu(z), label='ReLU$(z)$');
plt.plot(z, drelu_dz(z), label='$d$ ReLU / $dz$');
plt.legend();
plt.title('The ReLU function');

# # 1 Hidden Layer
#
# To start off, let's consider the case of having a single hidden ReLU layer with 2 hidden units.

# In[7]:

n_input = 2
n_hidden = 2
n_output = 1

# We now have two sets of weights, $w^1$ and $w^2$. We must now consider the weights as matrices connecting the units of consecutive layers. Define them to have the same number of rows as units in the previous layer, and the same number of columns as units in the next layer. So $w^1 \in \Re^{(2,2)}$ and $w^2 \in \Re^{(2,1)}$. We will also explicitly consider bias vectors $b^1$ and $b^2$ which have the size of the next layer (one bias per activation unit).
#
# In the previous notebook, we dealt with bias by setting $x_0 = 1$ and then adding an additional weight, but I now find it clearer to just add the bias separately. In any case the effect is the same. The actions of the layers are then:
#
# \begin{align*}
# z^1 &= x^0 w^{1} + b^{1T} & x^1 &= \mathrm{ReLU}(z^1) \\
# z^2 &= x^1 w^{2} + b^{2T} & y = x^2 &= \sigma(z^2)
# \end{align*}
#
# where the bias vectors are transposed since $x$ and $z$ are rows. Here we denote the sigmoid function as $\sigma$.
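# Before creating real weights, here is a quick throwaway shape check of the equations above (my own addition; the `_demo` arrays are zero-filled placeholders, not the weights we will actually use):

# In[ ]:

# placeholder arrays, just to confirm the dimensions line up as claimed
x0_demo = np.zeros((1, n_input))
w1_demo = np.zeros((n_input, n_hidden));  b1_demo = np.zeros((n_hidden, 1))
w2_demo = np.zeros((n_hidden, n_output)); b2_demo = np.zeros((n_output, 1))

z1_demo = np.dot(x0_demo, w1_demo) + b1_demo.T    # (1, n_hidden)
x1_demo = relu(z1_demo)                           # (1, n_hidden)
z2_demo = np.dot(x1_demo, w2_demo) + b2_demo.T    # (1, n_output)
print(z1_demo.shape, x1_demo.shape, sig(z2_demo).shape)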
# Let's initialize some random nonzero weights to demonstrate as we go along:

# In[8]:

w1 = np.random.normal(0,0.1, size=(n_input, n_hidden))
w2 = np.random.normal(0,0.1, size=(n_hidden, n_output))
b1 = np.random.normal(0,0.1, size=(n_hidden, 1))
b2 = np.random.normal(0,0.1, size=(n_output, 1))

# The forward pass is simply the calculation performed above:

# In[9]:

def forward1(x0, w1, b1, w2, b2):
    x1 = relu(np.dot(x0, w1) + b1.T)   # output of hidden layer
    return sig(np.dot(x1, w2) + b2.T)  # output of output layer

# We can check the prediction on the first sample:

# In[10]:

y = forward1(X[0], w1, b1, w2, b2)
y

# For the backward pass, we follow the approach detailed in [this video](https://www.youtube.com/watch?v=gl3lfL-g5mA) and define a quantity that will become useful later on: $\delta^\ell$, the partial derivative of the cost function with respect to $z^\ell$:
#
# $$ \delta^\ell \equiv \frac{\partial J}{\partial z^\ell} $$
#
# We can begin explicit calculation with the last delta:
#
# \begin{align*}
# \delta^2 &= \frac{\partial J}{\partial z^2}\\
# &= \frac{\partial J}{\partial y}\frac{\partial y}{\partial z^2}\\
# &= \frac{\partial J}{\partial y} \frac{\partial \sigma}{\partial z^2}\\
# \end{align*}
#
# Moving on to $\delta^{1}$, we write out the vector and matrix indices explicitly for initial clarity:
#
# \begin{align*}
# \delta^1_i &= \frac{\partial J}{\partial z^1_i}\\
# &= \sum_j \frac{\partial J}{\partial z^2_j}\frac{\partial z^2_j}{\partial z^1_i}\\
# &= \sum_{j,k} \frac{\partial J}{\partial z^2_j}\frac{\partial z^2_j}{\partial x^1_k}\frac{\partial x^1_k}{\partial z^1_i}
# \end{align*}
#
# Now,
#
# $$ z^2_j = \sum_m x^1_m w^2_{mj} + b^2_j, $$
# so
#
# $$\frac{\partial z^2_j}{\partial x^1_k} = w^2_{kj}.$$
#
# Furthermore,
# $$ x^1_k = \mathrm{ReLU}(z^1_k)\ \rightarrow\ \frac{\partial x^1_k}{\partial z^1_i} = \mathrm{ReLU}^\prime(z^1_k)\delta_{ki}$$
# where $\delta_{ki}$ is the Kronecker delta (1 if $k=i$, 0 otherwise). This has the effect of forcing $k=i$ in the whole equation, and we are left with
# $$\delta^1_i = \sum_j \delta^2_j w^2_{ij}\, \mathrm{ReLU}^\prime(z^1_i),$$
#
# or, flipping back to matrix notation,
#
# $$\delta^1 = w^2 \delta^{2} \odot \mathrm{ReLU}^\prime(z^1)^T$$
# where $\odot$ is element-wise multiplication (we treat the $\delta^\ell$ as column vectors, which is why $\mathrm{ReLU}^\prime(z^1)$, a row like $z^1$, appears transposed).
#
# We see that $\delta^2$ appeared in the definition of $\delta^1$. It's easy to convince yourself that this generalizes to
#
# $$\delta^\ell = w^{\ell+1} \delta^{\ell+1} \odot \mathrm{ReLU}^\prime(z^\ell)^T,$$
#
# and the form of the equation makes it easy to substitute other activation functions in place of $\mathrm{ReLU}$ as well.
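# As an optional sanity check (my own addition, not part of the original derivation), we can compare the analytic $\delta^2 = \frac{\partial J}{\partial y}\frac{\partial \sigma}{\partial z^2}$ against a central-difference estimate on the first sample; the `_chk` variables below are throwaway copies used only for this check:

# In[ ]:

# forward quantities for one sample
z1_chk = np.dot(X[0], w1) + b1.T
x1_chk = relu(z1_chk)
z2_chk = np.dot(x1_chk, w2) + b2.T
y_chk = sig(z2_chk)

# analytic delta^2
delta2_chk = dJ_dy(y_chk, yhat[0]) * dsig_dz(z2_chk)

# numerical delta^2: perturb z^2 and difference the cost
h = 1e-6
delta2_num = (J(sig(z2_chk + h), yhat[0]) - J(sig(z2_chk - h), yhat[0])) / (2*h)

print(delta2_chk, delta2_num)  # the two should agree to several decimal places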
#
# Furthermore, the update rules for training, $w^\ell\ \rightarrow\ w^\ell-\alpha\, \partial J/\partial w^\ell$ and $b^\ell\ \rightarrow\ b^\ell-\alpha\, \partial J/\partial b^\ell$, can also be written in terms of $\delta$:
# \begin{align*}
# \frac{\partial J}{\partial w^\ell_{ij}} &= \sum_k \frac{\partial J}{\partial z^\ell_k} \frac{\partial z^\ell_k}{\partial w^\ell_{ij}}\\
# &= \sum_k \delta^\ell_k x^{\ell-1}_i \delta_{kj}\\
# &= \delta^\ell_j x^{\ell-1}_i
# \end{align*}
# so
# $$ \frac{\partial J}{\partial w^\ell} = (\delta^\ell x^{\ell-1})^T $$
# and similarly
# $$\frac{\partial J}{\partial b^\ell} = \frac{\partial J}{\partial z^\ell} \frac{\partial z^\ell}{\partial b^\ell} = \delta^{\ell}$$
# Armed with these convenient definitions we can implement our backward pass to update our weights, and our training function, which looks much the same as in the previous notebook:

# In[11]:

def backward1(x0, w1, b1, w2, b2, y, yhat, alpha):
    # quantities
    z1 = np.dot(x0, w1) + b1.T
    x1 = relu(z1)
    z2 = np.dot(x1, w2) + b2.T
    #y = sig(z2)

    delta2 = dJ_dy(y, yhat) * dsig_dz(z2)
    delta1 = np.matmul(w2, delta2) * drelu_dz(z1).T

    w2 -= alpha * np.multiply(delta2, x1).T
    w1 -= alpha * np.multiply(delta1, x0).T
    b2 -= alpha * delta2
    b1 -= alpha * delta1

    return w1, b1, w2, b2

# In[12]:

alpha = 0.1
y = forward1(X[0], w1, b1, w2, b2)
w1, b1, w2, b2 = backward1(X[0], w1, b1, w2, b2, y, yhat[0], alpha)
print(y)
print(J(y, yhat[0]))

# In[13]:

def train1(X, yhat, n_hidden, alpha, n_epoch):
    n_samples = X.shape[0]
    n_input = X.shape[1]
    n_output = 1

    # keep track of performance during training
    costs = np.zeros(shape=(n_epoch,1))

    # random nonzero initialization
    w1 = np.random.normal(0, 1, size=(n_input, n_hidden))
    w2 = np.random.normal(0, 1, size=(n_hidden, n_output))
    b1 = np.random.normal(0, 1, size=(n_hidden, 1))
    b2 = np.random.normal(0, 1, size=(n_output, 1))

    for epoch in range(n_epoch):
        for i in range(n_samples):
            x0 = X[i,:]; yh = yhat[i]
            y = forward1(x0, w1, b1, w2, b2)                            # prediction for one sample
            w1, b1, w2, b2 = backward1(x0, w1, b1, w2, b2, y, yh, alpha)  # take step

        # ### Some niceness to see our progress

        # Calculate total cost after epoch
        predictions = forward1(X, w1, b1, w2, b2)      # predictions for entire set
        costs[epoch] = np.mean(J(predictions, yhat))   # mean cost per sample

        # report progress
        if ((epoch % 10) == 0) or (epoch == (n_epoch - 1)):
            #print(predictions.round())
            accuracy = np.mean(predictions.round() == yhat)  # current accuracy on entire set
            print('Training accuracy after epoch {}: {:.4%}'.format(epoch, accuracy))

    return w1, b1, w2, b2, costs

# Let's give it a try!

# In[14]:

n_epoch = 200
n_hidden = 2
alpha = 0.001

w1, b1, w2, b2, costs = train1(X, yhat, n_hidden, alpha, n_epoch)

# In[15]:

plt.plot(costs);
plt.title('Mean cost per sample after each epoch');

# In[16]:

x1 = np.linspace(-8,8, 250)
x2 = np.linspace(-10,10, 250)

fun_map = np.empty((x1.size, x2.size))
for n,i in enumerate(x1):
    for m,j in enumerate(x2):
        fun_map[m,n] = forward1([i,-j], w1, b1, w2, b2)

# In[17]:

X0 = X[(yhat==0).reshape(-1)]
X1 = X[(yhat==1).reshape(-1)]

plt.figure(figsize=(10,5))
plt.imshow(fun_map, extent=[x1.min(), x1.max(), x2.min(), x2.max()], vmin=0, vmax=1, aspect='auto')
plt.colorbar()
plt.contour(x1, -x2, fun_map, levels=[0.5], colors=['r'], linewidths=2)
plt.scatter(*X0.T, label='0', alpha=0.4);
plt.scatter(*X1.T, label='1', alpha=0.4)
plt.legend();

# Looks pretty good! We added the decision boundary in red so that we can more easily visualize how it changes as we make changes to our network.
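# Before generalizing to more layers, here is an optional numerical check (my own addition) of the update rule $\partial J/\partial w^2 = (\delta^2 x^1)^T$ derived above, applied to the weights we just trained. The `_chk` names are throwaway copies so the trained weights are left untouched:

# In[ ]:

x0_chk = X[0]
w2_chk = w2.copy()

# analytic gradient, exactly as used in backward1
z1_chk = np.dot(x0_chk, w1) + b1.T
x1_chk = relu(z1_chk)
z2_chk = np.dot(x1_chk, w2_chk) + b2.T
y_chk = sig(z2_chk)
delta2_chk = dJ_dy(y_chk, yhat[0]) * dsig_dz(z2_chk)
grad_analytic = np.multiply(delta2_chk, x1_chk).T

# numerical gradient via central differences on each entry of w^2
h = 1e-6
grad_numeric = np.zeros_like(w2_chk)
for i in range(w2_chk.shape[0]):
    w_plus = w2_chk.copy();  w_plus[i, 0] += h
    w_minus = w2_chk.copy(); w_minus[i, 0] -= h
    Jp = J(forward1(x0_chk, w1, b1, w_plus, b2), yhat[0])
    Jm = J(forward1(x0_chk, w1, b1, w_minus, b2), yhat[0])
    grad_numeric[i, 0] = ((Jp - Jm) / (2*h)).item()

print(grad_analytic.ravel())
print(grad_numeric.ravel())
print(np.allclose(grad_analytic, grad_numeric, atol=1e-6))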
# # $n$ Hidden Layers
#
# Our functions are starting to take a lot of parameters, which is often a sign that you should be generalizing. Since all our definitions are recursive anyway, we can actually handle any number of layers with ease.
#
# Instead of specifying `n_hidden`, we will move to specifying the number of units per layer (including the input and output layers), using a tuple which we'll call `shape`.
#
# We will store quantities that are matrix-valued but whose dimensions differ from layer to layer in a dictionary with the layer number $\ell$ as the key.

# In[18]:

def init_model(shape):
    w = {}
    b = {}
    for layer in range(1, len(shape)):  # no weights for input layer
        w[layer] = np.random.normal(0,0.1, size=(shape[layer-1], shape[layer]))  # dim: from_units, to_units
        b[layer] = np.random.normal(0,0.1, size=(shape[layer], 1))               # dim: to_units, 1
    return w,b

# In[19]:

def forwardn(x0, w, b):
    n_layers = len(w)

    x_prev = x0
    for l in range(1, n_layers):
        x_l = relu(np.dot(x_prev, w[l]) + b[l].T)  # output of a hidden layer
        x_prev = x_l

    return sig(np.dot(x_prev, w[n_layers]) + b[n_layers].T)  # output of output layer

# Let's test these out with a bunch of layers!

# In[20]:

t = (2,3,6,8,1,)

# In[21]:

w,b = init_model(t)

# In[22]:

print(w)

# In[23]:

y = forwardn(X[:10], w, b)
y

# In[24]:

def backwardn(x0, w, b, y, yhat, alpha):
    n_layers = len(w)

    z = {}
    x = {0: x0}
    # x and z values for calculating derivatives
    for l in range(1, n_layers+1):
        z[l] = np.dot(x[l-1], w[l]) + b[l].T
        x[l] = relu(z[l]) if l < n_layers else sig(z[l])  # the output layer is sigmoid, not ReLU

    delta = {}
    # deltas: start with the last layer and move backward
    for l in range(n_layers, 0, -1):
        if l == n_layers:
            # base case
            delta[l] = dJ_dy(y, yhat)*dsig_dz(z[n_layers])
        else:
            # recursive case
            delta[l] = np.dot(w[l+1], delta[l+1]) * drelu_dz(z[l]).T

    # update weights and biases only after all deltas are computed,
    # so that every delta is based on the original (pre-update) weights
    for l in range(1, n_layers+1):
        w[l] -= alpha * np.multiply(delta[l], x[l-1]).T
        b[l] -= alpha * delta[l]

    return w, b

# Does this work for a simple case? If we update a bunch of times using one sample, we should see the prediction move towards $\hat y$:

# In[25]:

for i in range(1000):
    w,b = backwardn(X[0], w, b, y[0], 0, 0.1)
    y[0] = forwardn(X[0], w, b)
    if i%100 == 0:
        print(y[0],'-->',yhat[0])

# Awesome!
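# As another quick consistency check (again my own addition), the general `forwardn` should reproduce `forward1` exactly when handed the single-hidden-layer weights we trained earlier, packed into the dictionary layout:

# In[ ]:

w_1layer = {1: w1, 2: w2}
b_1layer = {1: b1, 2: b2}
print(np.allclose(forward1(X[:5], w1, b1, w2, b2),
                  forwardn(X[:5], w_1layer, b_1layer)))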
# Confident that it works, we now implement the training function, which runs this over all the samples:

# In[26]:

def trainn(X, yhat, shape, alpha, n_epoch):
    n_samples = X.shape[0]
    n_input = X.shape[1]

    # keep track of performance during training
    costs = np.zeros(shape=(n_epoch,1))

    # random nonzero initialization
    w,b = init_model(shape)

    for epoch in range(n_epoch):
        for i in range(n_samples):
            x0 = X[i,:]; yh = yhat[i]
            y = forwardn(x0, w, b)                    # prediction for one sample
            w, b = backwardn(x0, w, b, y, yh, alpha)  # take step

        # ### Some niceness to see our progress

        # Calculate total cost after epoch
        predictions = forwardn(X, w, b)               # predictions for entire set
        costs[epoch] = np.mean(J(predictions, yhat))  # mean cost per sample

        # report progress
        if ((epoch % 10) == 0) or (epoch == (n_epoch - 1)):
            accuracy = np.mean(predictions.round() == yhat)  # current accuracy on entire set
            print('Training accuracy after epoch {}: {:.4%}'.format(epoch, accuracy))

    return w, b, costs

# In[27]:

n_epoch = 200
shape = (2,5,3,1)
alpha = 0.001

w, b, costs = trainn(X, yhat, shape, alpha, n_epoch)

# In[28]:

plt.plot(costs);
plt.title('Mean cost per sample after each epoch');

# In[29]:

x1 = np.linspace(-8,8, 250)
x2 = np.linspace(-10,10, 250)

fun_map = np.empty((x1.size, x2.size))
for n,i in enumerate(x1):
    for m,j in enumerate(x2):
        fun_map[m,n] = forwardn([i,-j], w, b)

# In[30]:

X0 = X[(yhat==0).reshape(-1)]
X1 = X[(yhat==1).reshape(-1)]

plt.figure(figsize=(10,5))
plt.imshow(fun_map, extent=[x1.min(), x1.max(), x2.min(), x2.max()], vmin=0, vmax=1, aspect='auto')
plt.colorbar()
plt.scatter(*X0.T, label='0', alpha=0.4);
plt.scatter(*X1.T, label='1', alpha=0.4)
plt.contour(x1, -x2, fun_map, levels=[0.5], colors=['r'], linewidths=2)
plt.legend();

# Even more flexibility than before!
#
# Of course unlimited flexibility has its own pitfalls, so in the next notebook we'll look into some ways of getting that under control.
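# As a closing aside (my own addition, not part of the original notebook): since `forwardn` already accepts a whole matrix of samples, the double loop used to build `fun_map` can be replaced by a single vectorized call over a mesh grid. A minimal sketch that reproduces the same decision map:

# In[ ]:

xx1, xx2 = np.meshgrid(np.linspace(-8, 8, 250), np.linspace(-10, 10, 250))
grid = np.column_stack([xx1.ravel(), xx2.ravel()])     # (250*250, 2) sample points
fun_map_vec = forwardn(grid, w, b).reshape(xx1.shape)  # one forward pass for the whole grid

plt.figure(figsize=(10,5))
plt.imshow(fun_map_vec, origin='lower',
           extent=[xx1.min(), xx1.max(), xx2.min(), xx2.max()],
           vmin=0, vmax=1, aspect='auto')
plt.colorbar()
plt.contour(xx1, xx2, fun_map_vec, levels=[0.5], colors=['r'], linewidths=2);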