import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
%matplotlib inline
np.random.seed(1)
N = 100
x = np.random.rand(N,1)*5
# The true (noise-free) function
y = 2.3 + 5.1*x
# Get some noisy observations
y_obs = y + 2*np.random.randn(N,1)
plt.scatter(x,y_obs,label='Observations')
plt.plot(x,y,c='r',label='True function')
plt.legend()
plt.show()
We are trying to minimise $\sum_i \xi_i^2$.
\begin{align} \mathcal{L} & = \frac{1}{N}\sum_{i=1}^N (y_i-f(x_i,w,b))^2 \\ \frac{\partial\mathcal{L}}{\partial w} & = -\frac{1}{N}\sum_{i=1}^N 2(y_i-f(x_i,w,b))\frac{\partial f(x_i,w,b)}{\partial w} \\ & = -\frac{1}{N}\sum_{i=1}^N 2\xi_i\frac{\partial f(x_i,w,b)}{\partial w} \end{align}where $\xi_i$ is the error term $y_i-f(x_i,w,b)$ and $$ \frac{\partial f(x_i,w,b)}{\partial w} = x_i $$
A similar expression can be derived for $\frac{\partial\mathcal{L}}{\partial b}$ (exercise).
Finally, the weights can be updated as $w_{new} = w_{current} - \gamma \frac{\partial\mathcal{L}}{\partial w}$, where $\gamma$ is a learning rate between 0 and 1.
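As a quick sanity check before iterating (an addition, not part of the derivation), an ordinary least-squares fit gives reference values that gradient descent should approach; np.polyfit is one convenient way to get them.
# Reference least-squares fit (illustrative sanity check only)
w_ref, b_ref = np.polyfit(x.ravel(), y_obs.ravel(), 1)  # returns slope, intercept
w_ref, b_ref  # should be close to the true 5.1 and 2.3, up to noise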
# Helper functions
def f(w, b):
    # linear model prediction for the global inputs x
    return w*x + b

def loss_function(e):
    # mean squared error for an error vector e
    return np.sum(np.square(e))/N

def dL_dw(e, w, b):
    # partial derivative of the loss with respect to w
    return -2*np.sum(e*df_dw(w, b))/N

def df_dw(w, b):
    # partial derivative of f with respect to w
    return x

def dL_db(e, w, b):
    # partial derivative of the loss with respect to b
    return -2*np.sum(e*df_db(w, b))/N

def df_db(w, b):
    # partial derivative of f with respect to b
    return np.ones(x.shape)
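These analytic gradients can be validated numerically. The sketch below (an illustrative addition, not in the original notebook) compares dL_dw against a central finite-difference estimate of the loss derivative.
# Numerical check of dL_dw via central differences (illustrative)
def numerical_dL_dw(w, b, h=1e-6):
    e_plus = y_obs - f(w + h, b)
    e_minus = y_obs - f(w - h, b)
    return (loss_function(e_plus) - loss_function(e_minus))/(2*h)

w0, b0 = 1.0, 1.0
e0 = y_obs - f(w0, b0)
dL_dw(e0, w0, b0), numerical_dL_dw(w0, b0)  # the two values should agree closely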
# The Actual Gradient Descent
def gradient_descent(iter=100, gamma=0.1):
    # get starting conditions
    w = 10*np.random.randn()
    b = 10*np.random.randn()
    params = []
    loss = np.zeros((iter,1))
    for i in range(iter):
        params.append([w,b])
        # Really important to use y_obs and not y (we do not have access to the true y)
        e = y_obs - f(w,b)
        loss[i] = loss_function(e)
        # update parameters (both gradients computed from the same e, then applied together)
        w_new = w - gamma*dL_dw(e,w,b)
        b_new = b - gamma*dL_db(e,w,b)
        w = w_new
        b = b_new
    return params, loss
params, loss = gradient_descent()
# Re-run the same loop at the top level so that e, w and b remain in scope for inspection
iter = 100
gamma = 0.1
w = 10*np.random.randn()
b = 10*np.random.randn()
params = []
loss = np.zeros((iter,1))
for i in range(iter):
    params.append([w,b])
    e = y_obs - f(w,b)  # again, y_obs and not y
    loss[i] = loss_function(e)
    # update parameters
    w_new = w - gamma*dL_dw(e,w,b)
    b_new = b - gamma*dL_db(e,w,b)
    w = w_new
    b = b_new
# At convergence the gradient is essentially zero:
dL_dw(e,w,b)
0.0078296405377948283
plt.plot(loss)
params = np.array(params)
plt.plot(params[:,0],params[:,1])
plt.title('Gradient descent')
plt.xlabel('w')
plt.ylabel('b')
plt.show()
params[-1]
array([ 4.98099133, 2.75130442])
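The final estimates are close to the true $w = 5.1$ and $b = 2.3$ of the generating function. To visualise the fit, here is a short illustrative addition reusing the plotting code from above:
# Plot the gradient-descent fit against the data (illustrative)
w_fit, b_fit = params[-1]
plt.scatter(x, y_obs, label='Observations')
plt.plot(x, y, c='r', label='True function')
plt.plot(x, w_fit*x + b_fit, c='g', label='Gradient descent fit')
plt.legend()
plt.show()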
We are trying to minimise $\sum_i \xi_i^2$. This time $f = Xw$, where $w$ is $D\times 1$ and $X$ is $N\times D$.
\begin{align} \mathcal{L} & = \frac{1}{N} (y-Xw)^T(y-Xw) \\ \frac{\partial\mathcal{L}}{\partial w} & = -\frac{1}{N} 2\left(\frac{\partial f(X,w)}{\partial w}\right)^T(y-Xw) \\ & = -\frac{2}{N} \left(\frac{\partial f(X,w)}{\partial w}\right)^T\xi \end{align}where $\xi$ is the error vector $y-Xw$ and $$ \frac{\partial f(X,w)}{\partial w} = X $$
Finally, the weights can be updated as $w_{new} = w_{current} - \gamma \frac{\partial\mathcal{L}}{\partial w}$, where $\gamma$ is a learning rate between 0 and 1.
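For reference, this quadratic loss also has a closed-form minimiser (the normal equations), which gradient descent should converge towards:
$$ w^* = (X^TX)^{-1}X^Ty $$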
N = 1000
D = 5
X = 5*np.random.randn(N,D)
w = np.random.randn(D,1)
y = X.dot(w)
y_obs = y + np.random.randn(N,1)
w
array([[-0.19670626],
       [-0.74645194],
       [ 0.93774813],
       [-2.62540124],
       [ 0.74616483]])
X.shape
(1000, 5)
w.shape
(5, 1)
(X*w.T).shape  # note: elementwise broadcast, not the matrix product X.dot(w)
(1000, 5)
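The closed-form solution above can be computed directly with np.linalg.lstsq as a reference point (a sanity check added here; it should agree with the sklearn fit further below):
# Normal-equations solution via least squares (illustrative reference)
w_lstsq = np.linalg.lstsq(X, y_obs, rcond=None)[0]
w_lstsq  # close to the true w above, up to observation noise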
# Helper functions
def f(w):
    # linear model prediction
    return X.dot(w)

def loss_function(e):
    # mean squared error
    return e.T.dot(e)/N

def dL_dw(e, w):
    # gradient of the loss with respect to w
    return -2*X.T.dot(e)/N
def gradient_descent(iter=100, gamma=1e-3):
    # get starting conditions
    w = np.random.randn(D,1)
    params = []
    loss = np.zeros((iter,1))
    for i in range(iter):
        params.append(w)
        e = y_obs - f(w)  # again, y_obs and not y
        loss[i] = loss_function(e)
        # update parameters
        w = w - gamma*dL_dw(e, w)
    return params, loss
params, loss = gradient_descent()
plt.plot(loss)
params[-1]
array([[-0.18613581],
       [-0.74929568],
       [ 0.95245495],
       [-2.602146  ],
       [ 0.73246543]])
model = LinearRegression(fit_intercept=False)
model.fit(X,y)
model.coef_.T
array([[-0.19670626],
       [-0.74645194],
       [ 0.93774813],
       [-2.62540124],
       [ 0.74616483]])
# compare parameters side by side
np.hstack([params[-1],model.coef_.T])
array([[-0.18613581, -0.19670626],
       [-0.74929568, -0.74645194],
       [ 0.95245495,  0.93774813],
       [-2.602146  , -2.62540124],
       [ 0.73246543,  0.74616483]])
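So far each update used the full data set. The variant below is mini-batch stochastic gradient descent: each update estimates the gradient from a random batch of batch_size points, and the learning rate is decayed by a fixed factor after every epoch.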
def dL_dw(X, e, w):
    # gradient of the loss for an arbitrary (mini-)batch X
    return -2*X.T.dot(e)/len(X)

def gradient_descent(gamma=1e-3, n_epochs=100, batch_size=20, decay=0.9):
    epoch_run = int(len(X)/batch_size)  # mini-batch updates per epoch
    # get starting conditions
    w = np.random.randn(D,1)
    params = []
    loss = np.zeros((n_epochs,1))
    for i in range(n_epochs):
        params.append(w)
        for j in range(epoch_run):
            # sample a mini-batch without replacement
            idx = np.random.choice(len(X), batch_size, replace=False)
            e = y_obs[idx] - X[idx].dot(w)  # again, y_obs and not y
            # update parameters
            w = w - gamma*dL_dw(X[idx], e, w)
        loss[i] = e.T.dot(e)/len(e)  # loss on the final mini-batch of the epoch
        gamma = gamma*decay  # decay the learning rate
    return params, loss
params, loss = gradient_descent()
plt.plot(loss)
np.hstack([params[-1],model.coef_.T])
array([[-0.18387464, -0.19670626],
       [-0.74895223, -0.74645194],
       [ 0.9476572 ,  0.93774813],
       [-2.62055252, -2.62540124],
       [ 0.75009766,  0.74616483]])
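Even with noisy mini-batch gradient estimates and only 100 epochs, the parameters again land within a few hundredths of the closed-form coefficients.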