#!/usr/bin/env python
# coding: utf-8

# # Loss landscape of a linear regression model
#
# We'll assume $\beta = 0$ to be able to visualize the loss as a function of the slope $\alpha$ and the noise standard deviation $\sigma$.
#
# $y \sim \mathcal{N}(\alpha x, \sigma)$
#
# ## Setting up the environment

# In[2]:


import os

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

cwd = os.getcwd()
if cwd.endswith('notebook'):
    os.chdir('..')
    cwd = os.getcwd()


# In[3]:


sns.set(palette='colorblind', font_scale=1.3)
palette = sns.color_palette()


# In[4]:


seed = 444
np.random.seed(seed);
torch.manual_seed(seed);


# ## Normal distribution
#
# If $z$ is drawn from a normal distribution with mean $\mu$ and standard deviation $\sigma$, then the probability density of $z$ follows the relationship:
#
# $z \sim \mathcal{N}(\mu, \sigma) \iff P(z \mid \mu, \sigma) = \frac{1}{\sigma\sqrt{2\pi}} e^{-\frac{1}{2}(\frac{z - \mu}{\sigma})^2}$
#
# $P(z \mid \mu, \sigma)$ is the probability density function (PDF) of the normal distribution.

# In[35]:


def plot_normal_distributions():
    f, ax = plt.subplots(1, 1, figsize=(12, 6))

    normal_fn = lambda z, m, s: (1/(s * np.sqrt(2*np.pi))) * np.exp(-0.5*((z - m) / s)**2)

    x = np.linspace(-7.5, 10, 300)

    ax.plot(x, normal_fn(x, 0, 1), label=r'$\mu = 0$, $\sigma = 1$', color=palette[0])
    ax.plot(x, normal_fn(x, 0, 2), label=r'$\mu = 0$, $\sigma = 2$', color=palette[1])
    ax.plot(x, normal_fn(x, 5, 0.9), label=r'$\mu = 5$, $\sigma = .9$', color=palette[2])

    ax.set_xlabel('$z$');
    ax.set_ylabel(r'$P(z \mid \mu, \sigma)$');
    ax.set_title('Probability density function (PDF) of the normal distribution');
    ax.legend(loc='upper left');

    return f, ax


# In[36]:


f, _ = plot_normal_distributions()


# In[37]:


#f.savefig(
#    '/Users/srom/workspace/romainstrock.com/assets/img/uncertainty/normal.png',
#    bbox_inches='tight',
#    dpi=120,
#)


# ## Softplus

# In[5]:


f, ax = plt.subplots(1, 1, figsize=(10, 5))
x_r = np.linspace(-10, 10)
y_r = np.log(1 + np.exp(x_r))
ax.plot(x_r, y_r);
ax.set_xlabel('x');
ax.set_ylabel('y')
ax.set_title(r'softplus(x) = $\ln(1 + e^x)$');


# In[6]:


f.savefig(
    '/Users/srom/workspace/romainstrock.com/assets/img/uncertainty/softplus.png',
    bbox_inches='tight',
    dpi=80,
)


# ## Generate dataset

# In[7]:


def generate_samples(α, β, σ, min_x=-1, max_x=1, n_samples=500):
    x = np.linspace(min_x, max_x, n_samples)[:, np.newaxis]
    y = α * x + β
    dist = torch.distributions.Normal(torch.from_numpy(y), σ)
    y_sample = dist.sample().detach().numpy()
    return x, y, y_sample


# In[8]:


def plot_line(x, y, y_sample):
    f, ax = plt.subplots(1, 1, figsize=(12, 6))
    ax.plot(x.flatten(), y.flatten(), '-', color=palette[0], linewidth=3)
    ax.scatter(x.flatten(), y_sample.flatten(), color=palette[0], alpha=0.8)
    ax.set_xlabel('input x')
    ax.set_ylabel('output y')
    return ax


# In[9]:


α_actual = 2.6
β_actual = 0.0
σ_actual = 0.7

x, y, y_sample = generate_samples(α_actual, β_actual, σ_actual)


# In[10]:


plot_line(x, y, y_sample);


# ## Model

# In[11]:


initial_α = 1.8
initial_s = 1.5


# In[12]:


class LinearNormal(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.α = torch.nn.Parameter(torch.tensor(initial_α))
        self.s = torch.nn.Parameter(torch.tensor(initial_s))

    @property
    def sigma(self):
        return F.softplus(self.s)  # ensure σ > 0

    def forward(self, x):
        m = self.α * x
        σ = self.sigma
        return torch.distributions.Normal(m, σ)
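# A minimal sketch (added for illustration, not part of the original notebook): it checks what `LinearNormal.forward` returns, namely a `torch.distributions.Normal` whose mean is $\alpha x$ and whose scale is $\mathrm{softplus}(s)$, so $\sigma$ stays strictly positive. The throwaway names `_m` and `_d` are placeholders.

# In[ ]:


_m = LinearNormal()
_d = _m(torch.tensor([0.0, 1.0]))
print(_d.mean)    # means are α·x, i.e. [0.0, initial_α] = [0.0, 1.8]
print(_d.stddev)  # scale is softplus(initial_s) = softplus(1.5) ≈ 1.70, shared by every input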
# In[13]:


def compute_loss(model, x, y):
    out_dist = model(x)
    neg_log_likelihood = -out_dist.log_prob(y)
    return torch.mean(neg_log_likelihood)


def predict(model, x):
    model.eval()
    out_dist = model(x)
    return out_dist.mean, out_dist.stddev, out_dist


# In[14]:


def plot_results(x, y, y_pred):
    f, ax = plt.subplots(1, 1, figsize=(12, 6))
    ax.plot(x.flatten(), y.flatten(), '-', color=palette[0], linewidth=2, label='Actual line')
    ax.plot(x.flatten(), y_pred.flatten(), '-', color=palette[1], linewidth=2, label='Predicted line')
    ax.set_xlabel('input x')
    ax.set_ylabel('output y')
    ax.legend()
    return ax


# In[15]:


model = LinearNormal()


# In[16]:


y_pred, _, y_dist = predict(model, torch.from_numpy(x))
y_pred_sample = y_dist.sample()


# In[17]:


ax = plot_results(
    x.flatten(),
    α_actual * x.flatten(),
    y_pred.detach().numpy(),
)
ax.set_title('Prediction just after initialization');


# In[18]:


def compute_train_test_split(x, y, test_size):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size)
    return (
        torch.from_numpy(x_train),
        torch.from_numpy(x_test),
        torch.from_numpy(y_train),
        torch.from_numpy(y_test),
    )


# In[19]:


x_train, x_test, y_train, y_test = compute_train_test_split(x, y_sample, test_size=0.2)


# ## Loss function
#
# The **probability density function** (PDF) of the normal distribution $\mathcal{N}(\mu, \sigma)$, also called the **likelihood function** when viewed as a function of the parameters $\mu$ and $\sigma$, is defined as:
#
# $P(z \mid \mu, \sigma) = \frac{1}{\sigma\sqrt{2\pi}} e^{-\frac{1}{2}(\frac{z - \mu}{\sigma})^2}$
#
# It is customary to work with the logarithm of the likelihood function for numerical stability reasons. It is also typical in machine learning to minimize rather than maximize objectives, so we'll work with the **negative log likelihood**:
#
# $-\ln{P(z \mid \mu, \sigma)} = \ln{(\sigma\sqrt{2\pi})} + \frac{1}{2\sigma^2}(z - \mu)^2$
#
# In the context of linear regression, we have $\mu = \alpha x + \beta$ and we want to minimize the sum of the negative log likelihood evaluated at every point $(x, y)$ in our dataset of observations $\mathcal{D} \subset \mathbb{R}^2$:
#
# $\text{Loss}\,(\alpha, \beta, \sigma, \mathcal{D}) = \sum\limits_{(x, y) \in \mathcal{D}} -\ln{P(y \mid \alpha x + \beta, \sigma)} = \sum\limits_{(x, y) \in \mathcal{D}} \left[ \ln{(\sigma\sqrt{2\pi})} + \frac{1}{2\sigma^2}\{y - (\alpha x + \beta)\}^2 \right]$
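# A quick numerical sanity check (a sketch added for illustration, not part of the original notebook): the closed-form negative log likelihood above should agree with `-torch.distributions.Normal(...).log_prob(...)` for arbitrary values of $z$, $\mu$ and $\sigma$. The `_chk` variables are arbitrary example values.

# In[ ]:


z_chk, μ_chk, σ_chk = 1.3, 0.4, 0.7
closed_form = np.log(σ_chk * np.sqrt(2 * np.pi)) + (z_chk - μ_chk)**2 / (2 * σ_chk**2)
torch_nll = -torch.distributions.Normal(μ_chk, σ_chk).log_prob(torch.tensor(z_chk))
print(closed_form, float(torch_nll))  # both ≈ 1.3886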
# In[20]:


def loss_fn(x, y, alpha, sigma):
    m = torch.from_numpy(alpha * x)
    σ = torch.from_numpy(sigma)
    observations = torch.from_numpy(y)
    dist = torch.distributions.Normal(m, σ)
    neg_log_likelihood = -dist.log_prob(observations)
    return float(torch.mean(neg_log_likelihood).detach().numpy())


def fitness_fn(x, y, alpha, sigma):
    output = np.zeros((len(alpha),))
    for i in range(len(alpha)):
        output[i] = loss_fn(x, y, alpha[i], sigma[i])
    return output


# Create a grid of 300 by 300 = 90,000 input points
a = np.linspace(1.5, 3.5, 300)
b = np.log(1 + np.exp(np.linspace(-1.0, 2.0, 300)))  # σ values: softplus of the raw parameter s
A, B = np.meshgrid(a, b)
X_grid = np.array([[u, v] for u, v in zip(np.ravel(A), np.ravel(B))])

# Evaluate the loss at each grid point
alpha = X_grid[:,:1]
s = X_grid[:,1:]
zs = fitness_fn(x, y_sample, alpha, s)
Z = zs.reshape(A.shape)


# In[21]:


fig = plt.figure(figsize=(16, 6))
gs = mpl.gridspec.GridSpec(1, 2, figure=fig)
ax1 = fig.add_subplot(gs[0,0], projection='3d')
ax2 = fig.add_subplot(gs[0,1])

# Plot surface
ax1.plot_surface(A, B, Z, cmap='coolwarm', linewidth=0)

# Plot filled contours
cs = ax2.contourf(A, B, Z, cmap='coolwarm', levels=80)

# Colour map
cbar = plt.colorbar(cs, ax=ax2)
cbar.ax.set_ylabel('Loss', rotation=360)
cbar.ax.get_yaxis().labelpad = 25

# Surface labels
ax1.get_xaxis().labelpad = 10
ax1.get_yaxis().labelpad = 10
ax1.get_zaxis().labelpad = 10
ax1.set_xlabel(r'$\alpha$');
ax1.set_ylabel(r'$\sigma$');
ax1.set_zlabel('Loss');
ax1.grid(False);
ax1.view_init(20, 25);
ax1.set_facecolor('white');

# Contour labels
ax2.set_xlabel(r'$\alpha$');
ax2.set_ylabel(r'$\sigma$');
ax2.grid(False);


# ## Visualizing gradient descent

# In[22]:


def train_one_step(model, optimizer, x_batch, y_batch):
    model.train()
    optimizer.zero_grad()
    loss = compute_loss(model, x_batch, y_batch)
    loss.backward()
    optimizer.step()
    return loss


# In[23]:


optimizer = torch.optim.SGD(model.parameters(), lr=1.0)


# In[24]:


fig = plt.figure(figsize=(8, 6))
gs = mpl.gridspec.GridSpec(1, 1, figure=fig)
ax2 = fig.add_subplot(gs[0,0])

# Plot filled contours
cs = ax2.contourf(A, B, Z, cmap='coolwarm', levels=80)

# Colour map
cbar = plt.colorbar(cs, ax=ax2)
cbar.ax.set_ylabel('Loss', rotation=360)
cbar.ax.get_yaxis().labelpad = 25

# Contour labels
ax2.set_xlabel(r'$\alpha$');
ax2.set_ylabel(r'$\sigma$');
ax2.grid(False);

# Softplus: map the raw parameter s to σ for plotting
fn = lambda x: np.log(1 + np.exp(x))

# Initial value
xx = [initial_α]
yy = [fn(initial_s)]

# Gradient steps
for _ in range(5):
    loss = train_one_step(model, optimizer, x_train, y_train)
    xx.append(float(model.α.detach().numpy()))
    yy.append(fn(float(model.s.detach().numpy())))

ax2.plot(xx, yy, 'o-', color='white', markersize=10);
ax2.set_title('5 steps of gradient descent');
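# A possible follow-up (a sketch added for illustration, not part of the original notebook): keep taking gradient steps with the same optimizer and compare the recovered parameters to the values used to generate the data, $\alpha = 2.6$ and $\sigma = 0.7$. The number of extra steps below is arbitrary and may need tuning for this learning rate.

# In[ ]:


for _ in range(200):
    train_one_step(model, optimizer, x_train, y_train)

print('α ≈', float(model.α.detach()))      # should move towards α_actual = 2.6
print('σ ≈', float(model.sigma.detach()))  # softplus(s), should move towards σ_actual = 0.7
print('test loss ≈', float(compute_loss(model, x_test, y_test).detach()))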