#!/usr/bin/env python
# coding: utf-8

# ## Approximation using Multi-Layer Perceptrons
# 
# This code benchmarks the approximation of functions using a Multi-Layer Perceptron (MLP) with 2 layers. An MLP with $q$ neurons is defined as
# $$
# \forall x \in \mathbb{R}^d, \quad
# f_q(x) \triangleq \sum_{k=1}^q c_k \phi( \langle x,a_k \rangle + b_k )
# $$
# where the parameters are
# $(a_k,b_k,c_k) \in \mathbb{R}^{d} \times \mathbb{R} \times \mathbb{R}$, so that the total number of parameters is $q (d+2)$. Here $\phi : \mathbb{R} \to \mathbb{R}$ is a sigmoid function with bounded range, assumed to be $[0,1]$ (the code below uses $\tanh$, which is an affine rescaling of such a sigmoid).
# 
# The theorem of Barron ensures that, for the class of smooth functions $f$ with
# $$
# \|f\|_B \triangleq \int_{\mathbb{R}^d} \|\omega\| \, |\hat f(\omega)| \, \text{d} \omega < +\infty
# $$
# and a probability distribution $\mu$ supported on a ball of radius $R$, there exists an MLP $f_q$ with $q$ neurons such that
# $$
# \int (f(x)-f_q(x))^2 \, \text{d} \mu(x) = O\left( \frac{\|f\|_B^2 R^2}{q} \right).
# $$
# 
# The goal of this tour is to illustrate this theorem.

# In[1]:

import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim

# In[2]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# # Approximation using gradient descent

# Create synthetic data in dimension $d$.

# In[4]:

d = 1
d = 3
d = 2  # the last assignment selects the test case

if d==1:
    n = 256  # number of samples
    x = np.linspace(-1,1,n)
    y = np.sin(6*np.pi*np.abs(x)**1.5) + np.abs(x)**2
    x = x[:,None]
    y = y[:,None]
    plt.clf()
    plt.plot(x, y);
if d==2:
    n0 = 50
    n = n0**2
    t = np.linspace(-1,1,n0)
    s = .4  # width of the Gaussian
    y = np.exp( - (t[:,None]**2 + t[None,:]**2)/(2*s**2) )
    y = y.flatten()
    [u,v] = np.meshgrid(t,t)
    x = np.concatenate([u.flatten()[:,None], v.flatten()[:,None]], axis=1)
    plt.imshow( np.reshape(y, [n0,n0]) )
if d>2:
    # random points in a cube
    n = 1000
    x = 2*np.random.rand(n,d)-1
    s = .4  # width of the Gaussian
    y = np.exp( - np.sum(x**2, axis=1)/(2*s**2) )
    y = y.flatten()
    plt.plot( np.sum(x, axis=1), y, '.' )

# Convert into Torch tensors $X$ of size $(n,d)$ and $Y$ of size $(n,1)$.

# In[5]:

X = torch.Tensor(x).to(device)
Y = torch.Tensor(y[:,None]).to(device)

# Define an MLP with $q$ hidden neurons.

# In[6]:

def create_mlp(q):
    model = nn.Sequential(
        nn.Linear(d, q),
        nn.Tanh(),  # alternatives: nn.Sigmoid(), nn.ReLU()
        nn.Linear(q, 1),
    )
    if torch.cuda.is_available():
        model.cuda()
    return model

# Initialize the weights.

# In[7]:

def my_init(m):
    nn.init.normal_(m[0].bias, 0, 1)
    nn.init.normal_(m[0].weight, 0, 1)
    nn.init.normal_(m[2].bias, 0, 0.001)    # almost 0 for the global output bias
    nn.init.normal_(m[2].weight, 0, 0.0001) # small outer weights

# In[8]:

q = 10  # number of neurons
model = create_mlp(q)
my_init(model)

# Define the $\ell^2$ loss function.

# In[9]:

loss_func = torch.nn.MSELoss()
loss = loss_func(model(X), Y)
print( loss.item() )

# Implement gradient descent "by hand".

# In[13]:

my_init(model)
tau = .01  # step size
niter = 1000
L = np.zeros((niter,1))
for i in range(niter):
    loss = loss_func(model(X), Y)
    L[i] = loss.item()
    model.zero_grad()
    loss.backward()
    with torch.no_grad():
        for theta in model.parameters():
            theta -= tau * theta.grad
plt.plot(np.log(L));

# Same using PyTorch optimizers.

# In[14]:

my_init(model)
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)  # overrides the SGD optimizer above
model.train()
niter = 5000
L = []
for it in range(niter):
    loss = loss_func(model(X), Y)
    model.zero_grad()  # reset the gradient
    loss.backward()
    L.append(loss.item())
    optimizer.step()
plt.plot(np.log(L));
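# As a quick sanity check, one can verify that the `nn.Sequential` model computes the formula $f_q(x) = \sum_{k=1}^q c_k \phi( \langle x,a_k \rangle + b_k )$ from the introduction (with $\phi=\tanh$ and an extra global output bias), by re-evaluating it "by hand" from the layer parameters. This is only a minimal sketch; the names `A`, `b`, `c`, `b_out` and `f_manual` below are local illustrative names for the layer parameters and the recomputed output.

# In[ ]:

# recompute the MLP output directly from its parameters
with torch.no_grad():
    A = model[0].weight      # (q,d) matrix whose rows are the directions a_k
    b = model[0].bias        # (q,)  inner biases b_k
    c = model[2].weight      # (1,q) outer weights c_k
    b_out = model[2].bias    # (1,)  global output bias
    f_manual = torch.tanh(X @ A.T + b) @ c.T + b_out
    # should be (close to) 0, since this is the same computation as model(X)
    print( torch.max(torch.abs(f_manual - model(X))).item() )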
# Quasi-Newton optimization using L-BFGS.

# In[15]:

my_init(model)
optimizer = optim.LBFGS(model.parameters())
niter = 40
L = []
for it in range(niter):
    def closure():
        # the closure may be evaluated several times per L-BFGS step
        optimizer.zero_grad()  # reset the gradient
        loss = loss_func(model(X), Y)
        loss.backward()
        L.append(loss.item())
        return loss
    optimizer.step(closure)
plt.plot(np.log(L));

# Display the distribution of the weights.

# In[20]:

def torch2np(x):
    return x.detach().cpu().numpy()

plt.subplot(2,2,1)
plt.plot(torch2np(model[0].bias))
plt.subplot(2,2,2)
plt.plot( np.std(torch2np(model[0].weight), axis=1) )
plt.subplot(2,2,3)
plt.plot(torch2np(model[2].weight).flatten())
print( 'Output bias: ' + str(torch2np(model[2].bias.data)) )

# Display the fitted function.

# In[21]:

y1 = model(X).detach().cpu().numpy()
if d==1:
    plt.plot(x, y,  'b')
    plt.plot(x, y1, 'r')
if d==2:
    y1 = np.reshape(y1, [n0,n0])
    plt.imshow( y1 )
if d>2:
    plt.plot( np.sum(x, axis=1), y,  'b.' )
    plt.plot( np.sum(x, axis=1), y1, 'r.' )

# # Greedy neuron-by-neuron training
# 
# In order to illustrate Barron's theorem, we train the network in a greedy fashion, neuron by neuron: each new neuron is fitted to the residual left by the previous ones.

# In[22]:

import time
import progressbar

R = Y  # residual to fit
qmax = 500  # maximum number of neurons
niter = 500  # iterations of the optimizer for each neuron
L = [loss_func(R*0, R).item()]  # initial loss of the zero predictor (no neuron yet)
for iq in progressbar.progressbar(range(qmax)):
    # create an MLP with a single neuron
    model = create_mlp(1)
    my_init(model)
    # be sure to start from 0
    model[2].bias.data   = 0*model[2].bias.data
    model[2].weight.data = 0*model[2].weight.data
    # optimizer = optim.LBFGS(model.parameters())
    # optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)
    model.train()
    l = []
    for it in range(niter):
        if 0:
            # alternative update using a closure (disabled)
            def closure():
                optimizer.zero_grad()  # reset the gradient
                loss = loss_func(model(X), R)
                loss.backward()
                l.append(loss.item())
                return loss
            optimizer.step(closure)
        else:
            loss = loss_func(model(X), R)
            model.zero_grad()  # reset the gradient
            loss.backward()
            l.append(loss.item())
            optimizer.step()
    # record the error and update the residual
    L.append( loss_func(model(X), R).item() )
    R = R - model(X).detach()

# Display the decay of the error with the number of neurons in log-log scale, and compare the root-mean-square error $\sqrt{L_q}$ with the theoretical rate $O(1/\sqrt{q})$ predicted by Barron's theorem (i.e. $L_q = O(1/q)$).

# In[24]:

qlist = np.arange(qmax+1)
plt.loglog(qlist, np.sqrt(L), 'b.-')
plt.loglog(qlist[1:], np.sqrt(L[0])/np.sqrt(qlist[1:]), 'k--');
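# As a rough complementary check, one can estimate the empirical decay rate by a least-squares fit of $\log \sqrt{L_q}$ against $\log q$: Barron's theorem suggests a slope of about $-1/2$ for the root-mean-square error. This is only a sketch, since the fitted slope also depends on how well each greedy step is optimized; `logq`, `logE`, `alpha` and `beta` are local illustrative names.

# In[ ]:

# fit sqrt(L_q) ~ C * q^alpha in log-log coordinates
logq = np.log(qlist[1:])
logE = np.log(np.sqrt(np.array(L[1:])))
alpha, beta = np.polyfit(logq, logE, 1)  # slope alpha, intercept beta
print('fitted slope: %.3f (Barron rate: -0.5)' % alpha)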