Notebook

Introduction to Machine Learning¶

CSE204-2018 Lab session #03

Jérémie DECOCK

Objectives¶

Introduction to parametric models
Implement a linear regressor
Approximate the optimal parameters using a gradient descent algorithm
Linear regression with Scikit Learn
Implement a polynomial regressor

Imports and tool functions¶

In [ ]:

%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
import sklearn.linear_model
import sklearn.pipeline
import sklearn.preprocessing

In [ ]:

def gen_1d_linear_regression_samples(n_samples = 20):

    x = np.random.uniform(low=-10., high=10., size=n_samples)
    y = 2. * x + 3. + np.random.normal(scale=2., size=x.shape)

    df = pd.DataFrame(np.array([x, y]).T, columns=['x', 'y'])

    df = sklearn.utils.shuffle(df).reset_index(drop=True)
    
    return df

In [ ]:

def gen_1d_polynomial_regression_samples(n_samples = 15):

    x = np.random.uniform(low=0., high=10., size=n_samples)
    #x = np.random.uniform(low=0., high=1., size=n_samples)

    y = 3. - 2. * x + x ** 2 - x ** 3 + np.random.normal(scale=10., size=x.shape)
    #y = np.cos(1.5 * np.pi * x) + np.random.normal(scale=0.1, size=x.shape)

    df = pd.DataFrame(np.array([x, y]).T, columns=['x', 'y'])

    df = sklearn.utils.shuffle(df).reset_index(drop=True)

    return df

In [ ]:

def plot_1d_regression_samples(dataframe, model=None):
    fig, ax = plt.subplots(figsize=(8, 8))
    
    df = dataframe  # make an alias
    
    ERROR_MSG1 = "The `dataframe` parameter should be a Pandas DataFrame having the following columns: ['x', 'y']"
    assert df.columns.values.tolist() == ['x', 'y'], ERROR_MSG1
    
    if model is not None:
        
        # Compute the model's prediction
        
        x_pred = np.linspace(df.x.min(), df.x.max(), 100).reshape(-1, 1)
        y_pred = model.predict(x_pred)
        
        df_pred = pd.DataFrame(np.array([x_pred.flatten(), y_pred.flatten()]).T, columns=['x', 'y'])
        
        df_pred.plot(x='x', y='y', style='r--', ax=ax)

    # Plot also the training points
    
    df.plot.scatter(x='x', y='y', ax=ax)
    
    delta_y = df.y.max() - df.y.min()
    
    plt.ylim((df.y.min() - 0.15 * delta_y,
              df.y.max() + 0.15 * delta_y))

In [ ]:

def plot_ex2(X, y, theta_0=None, theta_1=None):
    df = pd.DataFrame(np.array([X, y]).T, columns=['x', 'y'])

    ax = df.plot.scatter(x="x", y="y")

    if theta_0 is not None and theta_1 is not None:
        x = np.array([1, 9])
        y = theta_0 + theta_1 * x

        ax.plot(x, y, "--r")

In [ ]:

def plot_ex4(X, y, theta_1=None, theta_2=None):
    df = pd.DataFrame(np.array([X, y]).T, columns=['x', 'y'])

    ax = df.plot.scatter(x="x", y="y")

    if theta_1 is not None and theta_2 is not None:
        x = np.linspace(0, 6, 50)
        y = theta_1 * x + theta_2 * x**2

        ax.plot(x, y, "--r")

In [ ]:

import matplotlib.colors as colors
from mpl_toolkits.mplot3d import axes3d

def plot_contour_2d_solution_space(func,
                                   fig=None,
                                   ax=None,
                                   show=True,
                                   theta_min=-np.ones(2),
                                   theta_max=np.ones(2),
                                   theta_star=None,
                                   theta_visited=None,
                                   title=""):
    """Plot points visited during the execution of an optimization algorithm."""
    if (fig is None) or (ax is None):                # TODO
        fig, ax = plt.subplots(figsize=(12, 8))

    if theta_visited is not None:
        theta_min = np.amin(np.hstack([theta_min.reshape([-1, 1]), theta_visited]), axis=1)
        theta_max = np.amax(np.hstack([theta_max.reshape([-1, 1]), theta_visited]), axis=1)

    x1_space = np.linspace(theta_min[0], theta_max[0], 200)
    x2_space = np.linspace(theta_min[1], theta_max[1], 200)

    x1_mesh, x2_mesh = np.meshgrid(x1_space, x2_space)

    zz = func(np.array([x1_mesh.ravel(), x2_mesh.ravel()])).reshape(x1_mesh.shape)

    ############################

    if theta_star is not None:
        min_value = func(theta_star)
    else:
        min_value = zz.min()
        
    max_value = zz.max()

    levels = np.logspace(0.1, 3., 5)          # TODO

    im = ax.pcolormesh(x1_mesh, x2_mesh, zz,
                       vmin=0.1,              # TODO
                       vmax=max_value,
                       norm=colors.LogNorm(), # TODO
                       shading='gouraud',
                       cmap='gnuplot2') # 'jet' # 'gnuplot2'

    plt.colorbar(im, ax=ax)

    cs = plt.contour(x1_mesh, x2_mesh, zz, levels,
                     linewidths=(2, 2, 2, 2, 3),
                     linestyles=('dotted', '-.', 'dashed', 'solid', 'solid'),
                     alpha=0.5,
                     colors='white')
    ax.clabel(cs, inline=False, fontsize=12)

    ############################

    if theta_visited is not None:
        ax.plot(theta_visited[0],
                theta_visited[1],
                '-og',
                alpha=0.5,
                label="$visited$")

    ############################

    if theta_star is not None:
        sc = ax.scatter(theta_star[0],
                   theta_star[1],
                   c='red',
                   label=r"$\theta^*$")
        sc.set_zorder(10)        # put this point above every thing else

    ############################

    ax.set_title(title, fontsize=16)

    ax.set_xlabel(r"$\theta_0$", fontsize=16)
    ax.set_ylabel(r"$\theta_1$", fontsize=16)

    ax.legend(fontsize=16)

    if show:
        plt.show()

    return fig, ax


def plot_2d_solution_space(func,
                           fig=None,
                           ax=None,
                           show=True,
                           theta_min=-np.ones(2),
                           theta_max=np.ones(2),
                           theta_star=None,
                           theta_visited=None,
                           angle_view=None,
                           title=""):
    """Plot points visited during the execution of an optimization algorithm."""
    if fig is None or ax is None:                # TODO
        fig = plt.figure(figsize=(12, 8))
        ax = axes3d.Axes3D(fig)

    if angle_view is not None:
        ax.view_init(angle_view[0], angle_view[1])

    x1_space = np.linspace(theta_min[0], theta_max[0], 100)
    x2_space = np.linspace(theta_min[1], theta_max[1], 100)

    x1_mesh, x2_mesh = np.meshgrid(x1_space, x2_space)

    zz = func(np.array([x1_mesh.ravel(), x2_mesh.ravel()])).reshape(x1_mesh.shape)   # TODO

    ############################

    surf = ax.plot_surface(x1_mesh,
                           x2_mesh,
                           zz,
                           cmap='gnuplot2', # 'jet' # 'gnuplot2'
                           norm=colors.LogNorm(),   # TODO
                           rstride=1,
                           cstride=1,
                           #color='b',
                           shade=False)

    ax.set_zlabel(r"$E(\theta)$")

    fig.colorbar(surf, shrink=0.5, aspect=5)

    ############################

    if theta_star is not None:
        ax.scatter(theta_star[0],
                   theta_star[1],
                   func(theta_star),
                   #s=50,          # TODO
                   c='red',
                   alpha=1,
                   label=r"$\theta^*$")

    ax.set_title(title, fontsize=16)

    ax.set_xlabel(r"$\theta_0$", fontsize=16)
    ax.set_ylabel(r"$\theta_1$", fontsize=16)

    #ax.legend(fontsize=16)

    if show:
        plt.show()

    return fig, ax

Introduction¶

In the previous lab session, you have used a non-parametric models (k Nearest Neighbors) to solve classification and regression problems. Today you will learn to solve regression problems using parametric models (the application of parametric models to classification problems will be the subject of the next session): you will use a parametric function $f_{\boldsymbol{\theta}}: \boldsymbol{x} \mapsto y$ to infer the link existing between input vectors $\boldsymbol{x} \in \mathbb{R}^p$ and output values $y \in \mathbb{R}$ in a learning set $\mathcal{D} = \{(y^{(i)}, \boldsymbol{x^{(i)}})\}_{1 \leq i \leq n}$ of $n$ examples.

The hypothesis space $\mathcal{H}$ of $f_{\boldsymbol{\theta}}$ is a priori chosen so that the model fits reasonably well the data in $\mathcal{D}$. For instance, $\mathcal{H}$ can be the space of linear functions if data seems to be distributed along a line in $\mathcal{D}$. The space of polynomial function of degree $d>1$ may be a good choice otherwise.

The parameter $\boldsymbol{\theta}^* = \begin{pmatrix} \theta_0^* & \dots & \theta_p^* \end{pmatrix}^T$ is then searched to obtain the best fit between $f_{\boldsymbol{\theta}}$ and $\mathcal{D}$. This is an optimization problem.

For instance, assuming you have chosen the space of linear functions to make a model that describes data you have in $\mathcal{D}$. Your model is then $y = \theta_0 + \theta_1 x$ and the regression problem consists in finding the best parameters (or estimators) $\theta_0$ and $\theta_1$ for it.

Note: there are some differences in notations with the lecture slides: parameters are noted $w$ in lectures but they are noted $\theta$ here.

Linear regression: an analytic definition of the optimal parameters¶

We have a learning set $\mathcal{D} = \{(y^{(i)}, \boldsymbol{x^{(i)}})\}_{1 \leq i \leq n}$.

We assume:

Errors (difference between actual labels $y$ and predicted labels $f_{\theta}(\boldsymbol{x})$) are gaussian random values centered on 0: $y = f_{\boldsymbol{\theta}}(\boldsymbol{x}) + \epsilon$ with $\epsilon \sim \mathcal{N}(0, \sigma^2)$.
Data is modeled with a linear function: $f_{\boldsymbol{\theta}}(\boldsymbol{x}) = \theta_0 + \sum_{j=1}^p \theta_j \boldsymbol{x}_j$.

Observations $\boldsymbol{x} \in \mathbb{R}^p$ can be defined as $p$ random values $X_1, X_2, \dots, X_p$
Labels $y$ are then realization of a random value $Y$ so that:

$$Y \sim \mathcal{N}(\underbrace{f(\boldsymbol{x} | \boldsymbol{\theta})}_{\mu}, \sigma^2)$$

We want to find the estimator $\boldsymbol{\theta}^* = \begin{pmatrix} \theta_0^* & \dots & \theta_p^* \end{pmatrix}^T$ that gives the best fit between $f_{\boldsymbol{\theta}}$ and $\mathcal{D}$ (optimization problem).

Finding the best $\boldsymbol{\theta}^*$ is a maximum likelihood problem : $\boldsymbol{\theta}^* \leftarrow \arg\max_{\boldsymbol{\theta}} \mathbb{P}(\mathcal{D}|\boldsymbol{\theta})$. Here, this is equivalent to apply the method of least squares or to minimize the Mean Square Error (MSE). Using the matrix notation, we define the linear regression problem as:

$$\boldsymbol{\theta}^* \leftarrow \arg\min_{\boldsymbol{\theta}} E(\boldsymbol{\theta}) \quad \text{with} \quad E(\boldsymbol{\theta}) = (\boldsymbol{y} - \boldsymbol{X} \boldsymbol{\theta})^2$$

and with

$$ \boldsymbol{X} = \begin{pmatrix} 1 & x_1^{(1)} & \dots & x_p^{(1)} \\ \vdots & \vdots & \dots & \vdots \\ 1 & x_1^{(n)} & \dots & x_p^{(n)} \end{pmatrix} \quad \quad \boldsymbol{y} = \begin{pmatrix} y^{(1)} \\ \vdots \\ y^{(n)} \end{pmatrix} \quad \quad \boldsymbol{\theta} = \begin{pmatrix} \theta_0 \\ \vdots \\ \theta_p \end{pmatrix} $$

$E(\boldsymbol{\theta})$ is a quadratic form (convex function) thus it has a unique global minimum $\boldsymbol{\theta^*}$ where $\nabla_{\boldsymbol{\theta^*}} E(\boldsymbol{\theta^*}) = \boldsymbol{0}$

Exercise 1¶

On a sheet of paper:

Compute the analytic formulation of the gradient $\nabla_{\boldsymbol{\theta}} E(\boldsymbol{\theta})$ of the Mean Square Error $E(\boldsymbol{\theta}) = (\boldsymbol{y} - \boldsymbol{X} \boldsymbol{\theta})^2$
Compute the analytic formulation of the optimal parameter $\boldsymbol{\theta^*}$

Correction¶

In the simple case where $x \in \mathbb{R}$ and a linear model is used for the model, its two parameters $\theta^*_0$ (the intercept) and $\theta^*_1$ (the solpe) can be obtained as follow:

\begin{align} E(\boldsymbol{\theta}) = E(\theta_0, \theta_1) & = \sum_{i=1}^{n} [ y^{(i)} - \underbrace{(\theta_0 + \theta_1 x^{(i)})}_{f\left(x^{(i)}\right)} ]^2 \\ & = \sum_{i=1}^{n} \left[ y^{(i)} - \theta_0 - \theta_1 x^{(i)} \right]^2 \end{align}

$$ \nabla_{\boldsymbol{\theta}} E(\boldsymbol{\theta}) = \begin{pmatrix} \frac{\partial E}{\partial \theta_0}(\boldsymbol{\theta}) \\ \frac{\partial E}{\partial \theta_1}(\boldsymbol{\theta}) \end{pmatrix} $$

with

\begin{align} \frac{\partial E}{\partial \theta_0}(\boldsymbol{\theta}) &= \sum_{i=1}^{n} \left[ \frac{\partial \left( y^{(i)} - \theta_0 - \theta_1 x^{(i)} \right)}{\partial \theta_0} 2 \left( y^{(i)} - \theta_0 - \theta_1 x^{(i)} \right) \right] \\ &= \sum_{i=1}^{n} \left[ -2 \left( y^{(i)} - \theta_0 - \theta_1 x^{(i)} \right) \right] \\ &= -2 \sum_{i=1}^{n} \left( y^{(i)} - \theta_0 - \theta_1 x^{(i)} \right) \end{align}

and

\begin{align} \frac{\partial E}{\partial \theta_1}(\boldsymbol{\theta}) &= \sum_{i=1}^{n} \left[ \frac{\partial \left( y^{(i)} - \theta_0 - \theta_1 x^{(i)} \right)}{\partial \theta_1} 2 \left( y^{(i)} - \theta_0 - \theta_1 x^{(i)} \right) \right] \\ &= \sum_{i=1}^{n} \left[ - x^{(i)} 2 \left( y^{(i)} - \theta_0 - \theta_1 x^{(i)} \right) \right] \\ &= -2 \sum_{i=1}^{n} \left[ x^{(i)} \left( y^{(i)} - \theta_0 - \theta_1 x^{(i)} \right) \right] \end{align}

Recall: $(f \circ g)' = g' (f' \circ g)$.

Thus $\theta^*_0$ and $\theta^*_1$ are given by:

\begin{align} \sum_{i=1}^{n} \left[ x^{(i)} \left( y^{(i)} - \theta^*_0 - \theta^*_1 x^{(i)} \right) \right] &= 0 \\ \Leftrightarrow \theta^*_1 &= \frac{\sum_{i=1}^{n} \left( x^{(i)} - \overline{x} \right) \left( y^{(i)} - \overline{y} \right)}{\sum_{i=1}^{n} \left( x^{(i)} - \overline{x} \right)^2} \end{align}

and

\begin{align} \sum_{i=1}^{n} \left( y^{(i)} - \theta^*_0 - \theta^*_1 x^{(i)} \right) &= 0 \\ \Leftrightarrow \theta^*_0 &= \overline{y} - \theta^*_1 \overline{x} \end{align}

with

$$\overline{x} = \frac1n \sum_{i=1}^{n} x^{(i)}$$

and

$$\overline{y} = \frac1n \sum_{i=1}^{n} y^{(i)}$$

In the above definition, there are only two parameters to optimize: $\theta_0$ and $\theta_1$. When the number of parameters is greater, it's usually more convenient to write the problem using matrix formalism. Formulae that follow are an alternative writing of $\nabla_{\boldsymbol{\theta}} E(\boldsymbol{\theta})$ and $\boldsymbol{\theta^*}$ (i.e. a more general definition than the previous ones).

Recall about matrices:

$$ (\boldsymbol{A} + \boldsymbol{B})^T = \boldsymbol{A}^T + \boldsymbol{B}^T $$$$ (\boldsymbol{A} \boldsymbol{B})^T = \boldsymbol{B}^T \boldsymbol{A}^T $$\begin{align} \text{Associativity:} & \quad (\boldsymbol{A} \boldsymbol{B}) \boldsymbol{C} = \boldsymbol{A} (\boldsymbol{B} \boldsymbol{C}) = \boldsymbol{A} \boldsymbol{B} \boldsymbol{C} \\ \text{Non-commutativity:} & \quad \boldsymbol{A} \boldsymbol{B} \neq \boldsymbol{B} \boldsymbol{A} \\ \text{Distributivity:} & \quad \boldsymbol{A} (\boldsymbol{B} + \boldsymbol{C}) = \boldsymbol{A} \boldsymbol{B} + \boldsymbol{A} \boldsymbol{C} \\ & \quad (\boldsymbol{B} + \boldsymbol{C}) \boldsymbol{A} = \boldsymbol{B} \boldsymbol{A} + \boldsymbol{C} \boldsymbol{A} \end{align}

Recall about matrix differentiation:

\begin{align} d(\boldsymbol{A}) & = 0 \quad \quad \quad \text{if } \boldsymbol{A} \text{ is not function of } \boldsymbol{X} \\ d(a\boldsymbol{X}) & = a d \boldsymbol{X} \quad \quad \text{if } a \text{ is not function of } \boldsymbol{X} \\ d(\boldsymbol{X} +\boldsymbol{Y}) & = d \boldsymbol{X} + d \boldsymbol{Y} \\ d(\boldsymbol{XY}) & = (d \boldsymbol{X}) \boldsymbol{Y} + \boldsymbol{X} (d \boldsymbol{Y}) \\ d(\boldsymbol{X}^T) & = (d \boldsymbol{X})^T \\ \end{align}

To begin with, lets develop $E(\boldsymbol{\theta})$:

\begin{align} E(\boldsymbol{\theta}) & = \left( \boldsymbol{y} - \boldsymbol{X} \boldsymbol{\theta} \right)^T \left( \boldsymbol{y} - \boldsymbol{X} \boldsymbol{\theta} \right) \\ & = \left( \boldsymbol{y}^T - (\boldsymbol{X} \boldsymbol{\theta})^T \right) \left( \boldsymbol{y} - \boldsymbol{X} \boldsymbol{\theta} \right) \\ & = \left( \boldsymbol{y}^T - (\boldsymbol{\theta}^T \boldsymbol{X}^T) \right) \left( \boldsymbol{y} - \boldsymbol{X} \boldsymbol{\theta} \right) \\ & = \boldsymbol{y}^T \boldsymbol{y} - \boldsymbol{y}^T \boldsymbol{X} \boldsymbol{\theta} - (\boldsymbol{\theta}^T \boldsymbol{X}^T) \boldsymbol{y} + (\boldsymbol{\theta}^T \boldsymbol{X}^T) (\boldsymbol{X} \boldsymbol{\theta}) \\ & = \boldsymbol{y}^T \boldsymbol{y} - \underbrace{\boldsymbol{y}^T \boldsymbol{X} \boldsymbol{\theta}}_{\text{scalar}} - \underbrace{\boldsymbol{\theta}^T \boldsymbol{X}^T \boldsymbol{y}}_{\text{scalar}} + \boldsymbol{\theta}^T \boldsymbol{X}^T \boldsymbol{X} \boldsymbol{\theta} \\ \end{align}

Since $\boldsymbol{y}^T \boldsymbol{X} \boldsymbol{\theta}$ is a $1 \times 1$ matrix, or a scalar, and its transpose $(\boldsymbol{y}^T \boldsymbol{X} \boldsymbol{\theta})^T = (\boldsymbol{X} \boldsymbol{\theta})^T \boldsymbol{y} = \boldsymbol{\theta}^T \boldsymbol{X}^T \boldsymbol{y}$ is the same scalar, then $- \boldsymbol{y}^T \boldsymbol{X} \boldsymbol{\theta} - \boldsymbol{\theta}^T \boldsymbol{X}^T \boldsymbol{y} = -2 \boldsymbol{y}^T \boldsymbol{X} \boldsymbol{\theta} = -2 \boldsymbol{\theta}^T \boldsymbol{X}^T \boldsymbol{y}$.

Thus we have:

\begin{align} E(\boldsymbol{\theta}) & = \boldsymbol{y}^T \boldsymbol{y} \underbrace{-2 \boldsymbol{\theta}^T \boldsymbol{X}^T \boldsymbol{y}} + \boldsymbol{\theta}^T \boldsymbol{X}^T \boldsymbol{X} \boldsymbol{\theta} \\ \end{align}

Then we can write $\nabla_{\boldsymbol{\theta}} E(\boldsymbol{\theta})$:

\begin{align} \nabla_{\boldsymbol{\theta}} E(\boldsymbol{\theta}) & = \nabla_{\boldsymbol{\theta}} \left[ \left( \boldsymbol{y} - \boldsymbol{X} \boldsymbol{\theta} \right)^T \left( \boldsymbol{y} - \boldsymbol{X} \boldsymbol{\theta} \right) \right] \\ & = \nabla_{\boldsymbol{\theta}} \left[ \boldsymbol{y}^T \boldsymbol{y} -2 \boldsymbol{\theta}^T \boldsymbol{X}^T \boldsymbol{y} + \boldsymbol{\theta}^T \boldsymbol{X}^T \boldsymbol{X} \boldsymbol{\theta} \right] \\ & = \nabla_{\boldsymbol{\theta}} \left[ \boldsymbol{y}^T \boldsymbol{y} \right] + \nabla_{\boldsymbol{\theta}} \left[-2 \boldsymbol{\theta}^T \boldsymbol{X}^T \boldsymbol{y} \right] + \underbrace{\nabla_{\boldsymbol{\theta}} \left[ \boldsymbol{\theta}^T \boldsymbol{X}^T \boldsymbol{X} \boldsymbol{\theta} \right]}_{\text{see recall below}} \\ & = -2 \boldsymbol{X}^T \boldsymbol{y} + 2 \boldsymbol{X}^T \boldsymbol{X} \boldsymbol{\theta} \\ & = -2 \boldsymbol{X}^T (\boldsymbol{y} - \boldsymbol{X\theta}) \end{align}

Recall:

$$ \frac{\partial \boldsymbol{\theta}^T \boldsymbol{A} \boldsymbol{\theta}}{\partial \boldsymbol{\theta}} = 2 \boldsymbol{A} \boldsymbol{\theta} \quad \quad \text{if } \boldsymbol{A} \text{ is not a function of } \boldsymbol{\theta} \text{ and if } \boldsymbol{A} \text{ is a symmetric matrix} $$

Then we can write the formula of the optimal parameter $\boldsymbol{\theta^*}$ (the one that minimize $E(\boldsymbol{\theta})$ i.e. the one such that $\nabla_{\boldsymbol{\theta}} E(\boldsymbol{\theta}) = 0$):

\begin{align} \nabla_{\boldsymbol{\theta^*}} E(\boldsymbol{\theta^*}) & = 0 \\ \Leftrightarrow -2\boldsymbol{X}^T (\boldsymbol{y} - \boldsymbol{X\theta^*}) & = 0\\ \Leftrightarrow -\boldsymbol{X}^T (\boldsymbol{y} - \boldsymbol{X\theta^*}) & = 0\\ \Leftrightarrow - \boldsymbol{X^T y} + \boldsymbol{X^T X\theta^*} & = 0\\ \Leftrightarrow \boldsymbol{X^T X\theta^*} & = \boldsymbol{X^T y} \\ \Leftrightarrow \boldsymbol{\theta^*} & = (\boldsymbol{X^T X})^{-1} \boldsymbol{X}^T \boldsymbol{y} \end{align}

Example¶

Exercise 2¶

Question 1¶

Use the previous equations to compute by hand (i.e. on a sheet of paper) the optimal parameters $\theta_0$ and $\theta_1$ of the model $y = \theta_0 + \theta_1 x$ to best fit the following dataset (of four observations):

$$\mathcal{D} = \left\{ \begin{pmatrix} 2 \\ 1 \end{pmatrix}, \begin{pmatrix} 5 \\ 2 \end{pmatrix}, \begin{pmatrix} 7 \\ 3 \end{pmatrix}, \begin{pmatrix} 8 \\ 3 \end{pmatrix} \right\}$$

In [ ]:

X = [2, 5, 7, 8]
y = [1, 2, 3, 3]

plot_ex2(X, y)

Question 2¶

Check graphically that the model you obtained fits well with the data using the following cell (uncomment and complete the first two lines and uncomment the last one).

In [ ]:

X = [2, 5, 7, 8]
y = [1, 2, 3, 3]

#theta_0 =                          # <- TO UNCOMMENT AND TO COMPLETE (intercept)
#theta_1 =                          # <- TO UNCOMMENT AND TO COMPLETE (slope)

#plot_ex2(X, y, theta_0, theta_1)   # <- TO UNCOMMENT

Question 3¶

Plot the MSE $E(\theta)$ with the following cells. What is plotted ? What is the input space and the output space ?

What can you say about these plots ?

In [ ]:

X = np.array([[1, 1, 1, 1], [2, 5, 7, 8]]).T
y = np.array([1, 2, 3, 3]).reshape(-1, 1)

In [ ]:

class MSE:
    def __init__(self, X, y):
        self.X = np.copy(X)
        self.y = np.copy(y)
        
    def __call__(self, theta):
        return 1./2. * ((np.tile(self.y, theta.shape[1]) - np.dot(self.X, theta))**2).sum(axis=0)
    
mse = MSE(X, y)

In [ ]:

plot_contour_2d_solution_space(mse,
                               theta_min=np.array([-5, -1]),
                               theta_max=np.array([5, 1]),
                               theta_star=np.array([[2./7.], [5./14.]]));

In [ ]:

plot_2d_solution_space(mse,
                       theta_min=np.array([-5, -1]),
                       theta_max=np.array([5, 1]));

Correction¶

Question 1¶

Using matrices, the problem is defined as follow:

$$ \boldsymbol{y} = \begin{pmatrix} 1 \\ 2 \\ 3 \\ 3 \end{pmatrix} \quad \boldsymbol{X} = \begin{pmatrix} 1 & 2 \\ 1 & 5 \\ 1 & 7 \\ 1 & 8 \end{pmatrix} \quad \boldsymbol{\theta} = \begin{pmatrix} \theta_0 \\ \theta_1 \end{pmatrix} $$

The optimal parameters $\boldsymbol{\theta}^*$ according to the least squares is:

$$\boldsymbol{\theta}^* = (\boldsymbol{X}^T\boldsymbol{X})^{-1} \boldsymbol{X}^T\boldsymbol{y}$$

Thus we have:

$$ \boldsymbol{X}^T\boldsymbol{X} = \begin{pmatrix} 1 & 1 & 1 & 1 \\ 2 & 5 & 7 & 8 \end{pmatrix} \begin{pmatrix} 1 & 2 \\ 1 & 5 \\ 1 & 7 \\ 1 & 8 \end{pmatrix} = \begin{pmatrix} 4 & 22 \\ 22 & 142 \end{pmatrix} $$$$ \det (\boldsymbol{X}^T\boldsymbol{X}) = \det \begin{pmatrix} 4 & 22 \\ 22 & 142 \end{pmatrix} = 4 \times 142 - 22 \times 22 = 84 $$$$ (\boldsymbol{X}^T\boldsymbol{X})^{-1} = \frac{1}{84} \begin{pmatrix} 142 & -22 \\ -22 & 4 \end{pmatrix} $$$$ \boldsymbol{X}^T\boldsymbol{y} = \begin{pmatrix} 1 & 1 & 1 & 1 \\ 2 & 5 & 7 & 8 \end{pmatrix} \begin{pmatrix} 1 \\ 2 \\ 3 \\ 3 \end{pmatrix} = \begin{pmatrix} 9 \\ 57 \end{pmatrix} $$

Then the normal equation is:

\begin{align} \begin{pmatrix} \theta_0^* \\ \theta_1^* \end{pmatrix} & = \begin{pmatrix} 4 & 22 \\ 22 & 142 \end{pmatrix}^{-1} \begin{pmatrix} 9 \\ 57 \end{pmatrix} \\ & = \frac{1}{84} \begin{pmatrix} 142 & -22 \\ -22 & 4 \end{pmatrix} \begin{pmatrix} 9 \\ 57 \end{pmatrix} \\ & = \frac{1}{84} \begin{pmatrix} 24 \\ 30 \end{pmatrix} \\ & = \begin{pmatrix} 2/7 \\ 5/14 \end{pmatrix} \end{align}

The equation of the model is:

$$y = \frac{2}{7} + \frac{5}{14} x$$

Question 2¶

In [ ]:

X = [2, 5, 7, 8]
y = [1, 2, 3, 3]

theta_0 = 2./7.
theta_1 = 5./14.

plot_ex2(X, y, theta_0, theta_1)

Question 3¶

These plots show the error with respect to parameters $\theta_0$ and $\theta_1$ (the solution space). The color bar on the right gives the error $E(\boldsymbol{\theta})$ associated to each color ($\boldsymbol{\theta}$ vectors in the yellow area have high error whereas $\boldsymbol{\theta}$ vectors in the black area have low error).

This plot confirms again that the smallest error is obtained for $\begin{pmatrix} \theta_0 \\ \theta_1 \end{pmatrix} = \begin{pmatrix} 2/7 \\ 5/14 \end{pmatrix} \approx \begin{pmatrix} 0.29 \\ 0.36 \end{pmatrix}$.

Linear regression: an approximated solution using a gradient descent method¶

When $(X^TX)^{-1}$ cannot be easily computed (e.g. no analytical solution or $\mathcal{D}$ contains a lot of examples or the dimension of the solution space $\mathcal{X}$ is too large), an approximated solution can be computed using a gradient descent method.

$\nabla_{\theta}E(\hat{\theta})$ gives the direction of the largest slope at the point $\hat{\theta}$. Thus, if we explore iteratively the parameter's space by following the opposite direction of this gradient as described in the following definition, we should converge to the parameter $\theta^*$ that minimize the MSE i.e. the parameter $\theta^*$ such that $\nabla_{\theta^*}E(\hat{\theta^*}) = 0$.

Starting from a random point $\theta$, the gradient descent method proposes a new point $\theta \leftarrow \theta - \eta \nabla_{\theta}E(\theta)$ at each iteration until a stopping criterion has been reached: e.g. $||\nabla_{\theta}E(\theta)||_2^2 > \epsilon_{\delta}$ with $\epsilon_{\delta}$ a chosen minimal length for the gradient to continue iterations.

The learning rate $\eta \in \mathbb{R}_+^*$ is a parameter to tweak for the considered problem.

If $\eta$ is too large, the optimization may not converge toward 0.
If $\eta$ is too small, the optimization may require a lot of iterations to converge.

Exercise 3¶

Question 1¶

Implement a gradient descent method to solve exercise 2 with an approximated solution. Use the analytic formulation of $\nabla_{\theta}E(\theta)$ that has been computed in exercise 1.

You can use a very basic stopping criteria: the number of iterations (e.g. 10000). You can start with $\eta = 0.001$.

Question 2¶

Print or plot the value of $\theta$ and $E(\theta)$ obtained at each iteration. Check that $E(\theta)$ converges near to 0 and that $\theta$ converges near to the solution obtained in exercise 2.

Question 3¶

Print or plot the norm of the gradient. How do you interpret it ?

Question 4¶

Restart the optimization using a different learning rate $\eta$. What do you observe ?

Correction¶

Question 1¶

We saw in the first exercise that: \begin{align} \nabla_{\boldsymbol{\theta}} E(\boldsymbol{\theta}) & = \nabla_{\boldsymbol{\theta}} \left[ \left( \boldsymbol{y} - \boldsymbol{X} \boldsymbol{\theta} \right)^T \left( \boldsymbol{y} - \boldsymbol{X} \boldsymbol{\theta} \right) \right] \\ & = -\boldsymbol{X}^T (\boldsymbol{y} - \boldsymbol{X\theta}) \\ & = -\boldsymbol{X}^T \boldsymbol{y} + \boldsymbol{X}^T \boldsymbol{X\theta} \end{align}

Here we have: $$ \boldsymbol{y} = \begin{pmatrix} 1 \\ 2 \\ 3 \\ 3 \end{pmatrix} \quad \boldsymbol{X} = \begin{pmatrix} 1 & 2 \\ 1 & 5 \\ 1 & 7 \\ 1 & 8 \end{pmatrix} \quad \boldsymbol{\theta} = \begin{pmatrix} \theta_0 \\ \theta_1 \end{pmatrix} $$

As we saw in exercise 2: $$ \boldsymbol{X}^T\boldsymbol{X} = \begin{pmatrix} 1 & 1 & 1 & 1 \\ 2 & 5 & 7 & 8 \end{pmatrix} \begin{pmatrix} 1 & 2 \\ 1 & 5 \\ 1 & 7 \\ 1 & 8 \end{pmatrix} = \begin{pmatrix} 4 & 22 \\ 22 & 142 \end{pmatrix} $$

$$ \boldsymbol{X}^T\boldsymbol{y} = \begin{pmatrix} 1 & 1 & 1 & 1 \\ 2 & 5 & 7 & 8 \end{pmatrix} \begin{pmatrix} 1 \\ 2 \\ 3 \\ 3 \end{pmatrix} = \begin{pmatrix} 9 \\ 57 \end{pmatrix} $$

Thus the gradient of the error $E$ is:

\begin{align} \nabla_{\boldsymbol{\theta}} E(\boldsymbol{\theta}) & = -\begin{pmatrix} 9 \\ 57 \end{pmatrix} + \begin{pmatrix} 4 & 22 \\ 22 & 142 \end{pmatrix} \begin{pmatrix} \theta_0 \\ \theta_1 \end{pmatrix} \\ & = \begin{pmatrix} -9 + 4 ~ \theta_0 + 22 ~ \theta_1 \\ -57 + 22 ~ \theta_0 + 142 ~ \theta_1 \end{pmatrix} \end{align}

The gradient descent algorithm can be implemented as follow:

In [ ]:

def gradient_descent(gradient, eta=0.001, max_iteration=10000, initial_theta=None):

    if initial_theta is None:
        # The initial solution is selected randomly
        theta = np.random.normal(loc=0, scale=10, size=[2, 1])
    else:
        theta = initial_theta

    grad_list = []      # Keep the gradient of all iterations
    theta_list = []     # Keep the solution of all iterations

    for i in range(max_iteration):
        grad = gradient(theta)
        theta = theta - eta * grad     # Update the current solution
        
        grad_list.append([grad[0][0], grad[1][0]])      # Keep the gradient
        theta_list.append([theta[0][0], theta[1][0]])   # Keep the solution
    
    return grad_list, theta_list

In [ ]:

Xy = np.array([[-9], [-57]])
XX = np.array([[4, 22], [22, 142]])

def grad_theta(theta):
    return Xy + np.dot(XX, theta)

grad_list, theta_list = gradient_descent(grad_theta)

df_grad = pd.DataFrame(grad_list, columns=["grad1", "grad2"])
df_theta = pd.DataFrame(theta_list, columns=["theta1", "theta2"])

df_theta.tail()

Question 2¶

In [ ]:

theta_min = df_theta.min().values
theta_max = df_theta.max().values

plot_contour_2d_solution_space(mse,
                               theta_min=theta_min - 2.,
                               theta_max=theta_max + 2.,
                               theta_visited=df_theta.values.T);

Green points are computed solution at each iteration.

Question 3¶

In [ ]:

df_grad['norm'] = np.sqrt(df_grad['grad1']**2 + df_grad['grad2']**2)
ax = df_grad.norm.plot(loglog=True, figsize=(16, 8))

ax.set_title(r"Evolution of $||\nabla_{\theta} E(\theta)||_2$", fontsize=16)
ax.set_xlabel("Iteration number", fontsize=16)
ax.set_ylabel(r"Norm of $\nabla_{\theta} E(\theta)$", fontsize=16);

Question 4¶

With $\eta = 0.000001$¶

If $\eta$ is too small, it converges very slowly toward the optimum.

In [ ]:

grad_list, theta_list = gradient_descent(grad_theta, eta=0.000001, initial_theta=np.array([[7],[2]]))

df_grad = pd.DataFrame(grad_list, columns=["grad1", "grad2"])
df_theta = pd.DataFrame(theta_list, columns=["theta1", "theta2"])

theta_min = df_theta.min().values
theta_max = df_theta.max().values

plot_contour_2d_solution_space(mse,
                               theta_min=theta_min - 2.,
                               theta_max=theta_max + 2.,
                               theta_visited=df_theta.values.T);

In [ ]:

df_grad['norm'] = np.sqrt(df_grad['grad1']**2 + df_grad['grad2']**2)
ax = df_grad.norm.plot(loglog=True, figsize=(16, 8))

ax.set_title(r"Evolution of $||\nabla_{\theta} E(\theta)||_2$", fontsize=16)
ax.set_xlabel("Iteration number", fontsize=16)
ax.set_ylabel(r"Norm of $\nabla_{\theta} E(\theta)$", fontsize=16);

With $\eta = 0.0135$¶

With a larger value, the algorithm oscillates but eventually converges.

In [ ]:

grad_list, theta_list = gradient_descent(grad_theta, eta=0.0135, initial_theta=np.array([[7],[2]]))

df_grad = pd.DataFrame(grad_list, columns=["grad1", "grad2"])
df_theta = pd.DataFrame(theta_list, columns=["theta1", "theta2"])

theta_min = df_theta.min().values
theta_max = df_theta.max().values

plot_contour_2d_solution_space(mse,
                               theta_min=theta_min - 2.,
                               theta_max=theta_max + 2.,
                               theta_visited=df_theta.values.T);

In [ ]:

df_grad['norm'] = np.sqrt(df_grad['grad1']**2 + df_grad['grad2']**2)
ax = df_grad.norm.plot(loglog=True, figsize=(16, 8))

ax.set_title(r"Evolution of $||\nabla_{\theta} E(\theta)||_2$", fontsize=16)
ax.set_xlabel("Iteration number", fontsize=16)
ax.set_ylabel(r"Norm of $\nabla_{\theta} E(\theta)$", fontsize=16);

With $\eta = 0.02$¶

If $\eta$ is too large, the algorithm diverges and makes an overflow error (i.e. solutions value become too large to be contained in a float variable).

In [ ]:

grad_list, theta_list = gradient_descent(grad_theta, eta=0.02, max_iteration=5, initial_theta=np.array([[7],[2]]))

df_grad = pd.DataFrame(grad_list, columns=["grad1", "grad2"])
df_theta = pd.DataFrame(theta_list, columns=["theta1", "theta2"])

theta_min = df_theta.min().values
theta_max = df_theta.max().values

plot_contour_2d_solution_space(mse,
                               theta_min=theta_min - 2.,
                               theta_max=theta_max + 2.,
                               theta_visited=df_theta.values.T);

In [ ]:

df_grad['norm'] = np.sqrt(df_grad['grad1']**2 + df_grad['grad2']**2)
ax = df_grad.norm.plot(loglog=True, figsize=(16, 8))

ax.set_title(r"Evolution of $||\nabla_{\theta} E(\theta)||_2$", fontsize=16)
ax.set_xlabel("Iteration number", fontsize=16)
ax.set_ylabel(r"Norm of $\nabla_{\theta} E(\theta)$", fontsize=16);

Linear regression with Scikit Learn¶

Let's play with the Scikit Learn implementation of linear regression. The official documentation is there: https://scikit-learn.org/stable/modules/linear_model.html

Use the gen_1d_linear_regression_samples() function (defined above) to generate a dataset and plot_1d_regression_samples() to plot it.

In [ ]:

df = gen_1d_linear_regression_samples()

plot_1d_regression_samples(df)

Once the dataset is ready, let's make the regressor and train it with the following code:

In [ ]:

model = sklearn.linear_model.LinearRegression()

model.fit(df[['x']], df[['y']])

The following cell plots the learned model (the red dashed line) and the dataset $\mathcal{D}$ (blue points).

In [ ]:

plot_1d_regression_samples(df, model=model)

Exercise 4¶

Question 1¶

What are the optimal parameters $\theta_1$ (intercept) and $\theta_2$ obtained ? (use model.coef_ and model.intercept_ attributes)

In [ ]:

Question 2¶

Write the mathematical definition of your model.

...

Question 3¶

Use the model.predict() function to guess the class of the following points:

$$x_{p1} = -2, \quad x_{p2} = 2, \quad x_{p3} = 6$$

In [ ]:

Correction¶

Question 1¶

In [ ]:

print("Model slope:    ", model.coef_[0])
print("Model intercept:", model.intercept_)

Question 2¶

The gen_1d_linear_regression_samples() returns randomly noised data, thus form one call to another the dataset $\mathcal{D}$ changes. The slope and intercept of your model slightly changes too but it should be close to this:

$y \approx 2 x + 3$

In [ ]:

df = gen_1d_linear_regression_samples()

def f(X):
    return 2 * X + 3

ax = df.plot.scatter(x='x', y='y', figsize=(16, 6))

X = np.array([-10, 10])
ax.plot(X, f(X));

Question 3¶

In [ ]:

model.predict(np.array([[-2], [2], [6]]))

Actual values are:

In [ ]:

f(-2), f(2), f(6)

Polynomial regression and basis expansion¶

It is a common practice to use linear models trained on nonlinear functions of the data in machine learning. This approach maintains the generally fast performance of linear methods, while allowing them to fit a much wider range of data.

For instance, a linear model can be extended by making polynomial features from the coefficients. Linear model in exercises 1 and 2 looks like this (one-dimensional data):

$$f_{\theta}(x) = \theta_0 + \theta_1 x$$

If we want to fit a quadratic curve to the data instead of a line, we can combine the features in second-order polynomials, so that the model looks like this:

$$f_{\theta}(x) = \theta_0 + \theta_1 x + \theta_2 x^2$$

This is still a linear model: to illustrate this, imagine creating a new variable

$$z = [x, x^2]$$

With this re-labeling of the data, our problem can be written

$$f_{\theta}(x) = \theta_0 + \theta_1 z_1 + \theta_2 z_2$$

The resulting polynomial regression is in the same class of linear models we'd considered above (i.e. the model is linear in $\theta$) and can be solved by the same techniques. Thus the linear model has the flexibility to fit a much broader range of data.

Exercise 5¶

Question 1¶

Use the previous equations to compute by hand (i.e. on a sheet of paper) the optimal parameters $\theta_1$ and $\theta_2$ of the model $y = \theta_1 x + \theta_1 x^2$ to best fit the following dataset (of four examples):

$$\mathcal{D} = \left\{ \begin{pmatrix} 1 \\ 1.8 \end{pmatrix}, \begin{pmatrix} 2 \\ 2.7 \end{pmatrix}, \begin{pmatrix} 3 \\ 3.4 \end{pmatrix}, \begin{pmatrix} 4 \\ 3.8 \end{pmatrix}, \begin{pmatrix} 5 \\ 3.9 \end{pmatrix} \right\}$$

In [ ]:

X = [1, 2, 3, 4, 5]
y = [1.8, 2.7, 3.4, 3.8, 3.9]

plot_ex4(X, y)

Question 2¶

Check graphically that the model you obtained fits well with the data using the following cell (complete the first two lines).

In [ ]:

X = [1, 2, 3, 4, 5]
y = [1.8, 2.7, 3.4, 3.8, 3.9]

#theta_1 =                          # <- TO UNCOMMENT AND TO COMPLETE
#theta_2 =                          # <- TO UNCOMMENT AND TO COMPLETE

#plot_ex4(X, y, theta_1, theta_2)   # <- TO UNCOMMENT

Correction¶

Question 1¶

Using matrices, the problem is defined as follow:

$$ \boldsymbol{y} = \begin{pmatrix} 1.8 \\ 2.7 \\ 3.4 \\ 3.8 \\ 3.9 \end{pmatrix} \quad \boldsymbol{X} = \begin{pmatrix} 1 & 1 \\ 2 & 4 \\ 3 & 9 \\ 4 & 16 \\ 5 & 25 \end{pmatrix} \quad \boldsymbol{\theta} = \begin{pmatrix} \theta_1 \\ \theta_2 \end{pmatrix} \quad \boldsymbol{\epsilon} = \begin{pmatrix} \epsilon_1 \\ \epsilon_2 \\ \epsilon_3 \\ \epsilon_4 \\ \epsilon_5 \end{pmatrix} $$

The optimal parameters $\boldsymbol{\theta}^*$ according to the least squares is:

$$\boldsymbol{\theta}^* = (\boldsymbol{X}^T\boldsymbol{X})^{-1} \boldsymbol{X}^T\boldsymbol{y}$$

Thus we have:

$$ \boldsymbol{X}^T\boldsymbol{X} = \begin{pmatrix} 1 & 2 & 3 & 4 & 5 \\ 1 & 4 & 9 & 16 & 25 \end{pmatrix} \begin{pmatrix} 1 & 1 \\ 2 & 4 \\ 3 & 9 \\ 4 & 16 \\ 5 & 25 \end{pmatrix} = \begin{pmatrix} 55 & 225 \\ 225 & 979 \end{pmatrix} $$$$ \det (\boldsymbol{X}^T\boldsymbol{X}) = \det \begin{pmatrix} 55 & 225 \\ 225 & 979 \end{pmatrix} = 55 \times 979 - 225 \times 225 = 3220 $$$$ \boldsymbol{X}^T\boldsymbol{y} = \begin{pmatrix} 1 & 2 & 3 & 4 & 5 \\ 1 & 4 & 9 & 16 & 25 \end{pmatrix} \begin{pmatrix} 1.8 \\ 2.7 \\ 3.4 \\ 3.8 \\ 3.9 \end{pmatrix} = \begin{pmatrix} 52.1 \\ 201.5 \end{pmatrix} $$

Then the normal equation is:

\begin{align} \begin{pmatrix} \theta_1^* \\ \theta_2^* \end{pmatrix} & = \begin{pmatrix} 55 & 225 \\ 225 & 979 \end{pmatrix}^{-1} \begin{pmatrix} 52.1 \\ 201.5 \end{pmatrix} \\ & = \frac{1}{3220} \begin{pmatrix} 979 & -225 \\ -225 & 55 \end{pmatrix} \begin{pmatrix} 52.1 \\ 201.5 \end{pmatrix} \\ & \approx \frac{1}{3220} \begin{pmatrix} 5668.4 \\ -640 \end{pmatrix} \\ & \approx \begin{pmatrix} 1.76 \\ -0.2 \end{pmatrix} \end{align}

The equation of the model is:

$$y = 1.76 x - 0.2 x^2$$

We can check with Numpy:

In [ ]:

X = np.array([[1, 2, 3, 4, 5],[1, 4, 9, 16, 25]]).T
y = np.array([1.8, 2.7, 3.4, 3.8, 3.9])

In [ ]:

XX = np.dot(X.T, X)
XX

In [ ]:

Xy = np.dot(X.T, y)
Xy

In [ ]:

invXX = np.linalg.inv(XX)
invXX

In [ ]:

np.dot(invXX, Xy)

Question 2¶

In [ ]:

X = [1, 2, 3, 4, 5]
y = [1.8, 2.7, 3.4, 3.8, 3.9]

theta_1 = 1.76
theta_2 = -0.2

plot_ex4(X, y, theta_1, theta_2)

Polynomial regression with Scikit Learn¶

Let's play with the Scikit Learn implementation of polynomial regression. The official documentation is there: https://scikit-learn.org/stable/modules/linear_model.html#polynomial-regression-extending-linear-models-with-basis-functions

First we make the dataset, plot it, make the regressor and train it with the following code:

In [ ]:

df = gen_1d_polynomial_regression_samples(n_samples=20)

plot_1d_regression_samples(df)

polynomial_features = sklearn.preprocessing.PolynomialFeatures(degree=3)  # Try with degree = 1, 4 and 15
linear_regression = sklearn.linear_model.LinearRegression(fit_intercept=False)

model = sklearn.pipeline.Pipeline([("polynomial_features", polynomial_features),
                                   ("linear_regression", linear_regression)])

model.fit(df[['x']], df[['y']])

In sklearn.preprocessing.PolynomialFeatures(), degree is the degree of the polynomal function.

The following cell plots the learned model (the red dashed line) and the dataset $\mathcal{D}$ (blue points).

In [ ]:

plot_1d_regression_samples(df, model=model)

Exercise 6¶

Question 1¶

What are the optimal parameters $\theta_0, \theta_1, \theta_2, \theta_3$ obtained ? (use the linear_regression.coef_[0][0] attribute for the intercept and linear_regression.coef_[0][1:] for the others coefficients)

In [ ]:

Question 2¶

Write the mathematical definition of your model.

...

Question 3¶

Use the model.predict() function to guess the class of the following points:

$$x_{p1} = 1, \quad x_{p2} = 2, \quad x_{p3} = 6$$

In [ ]:

Question 4¶

In sklearn.preprocessing.PolynomialFeatures(), change the value of degree and describe what happen on the plot (use e.g. 1 and 15). What is the name of the observed phenomenons ?

...

Correction¶

Question 1¶

In [ ]:

print("theta_3:            ", linear_regression.coef_[0][3])
print("theta_2:            ", linear_regression.coef_[0][2])
print("theta_1:            ", linear_regression.coef_[0][1])
print("theta_0 (intercept):", linear_regression.coef_[0][0])

Question 2¶

The gen_1d_polynomial_regression_samples() returns randomly noised data, thus form one call to another the dataset $\mathcal{D}$ changes. The coefficients of your model changes too but it should be "graphically" equivalent to the following one on the interval $[-10 ; 10]$:

$y = \theta_3 x^3 + \theta_2 x^2 + \theta_1 x + \theta_0 \approx -x^3 + x^2 -2 x + 3$

In [ ]:

df = gen_1d_polynomial_regression_samples(n_samples=20)

ax = df.plot.scatter(x='x', y='y', figsize=(14, 6))

x = np.linspace(0., 10., 50)

y_pred = np.full(shape=x.shape, fill_value=linear_regression.coef_[0][0])
y_pred += linear_regression.coef_[0][1] * x
y_pred += linear_regression.coef_[0][2] * x**2
y_pred += linear_regression.coef_[0][3] * x**3

y = np.full(shape=x.shape, fill_value=3.)
y += -2. * x
y += x**2
y += -x**3

df_model = pd.DataFrame(np.array([x, y, y_pred]).T, columns=['x', 'y', 'y_pred'])

df_model.plot(x='x', y='y', style=':r', label='actual y', ax=ax)
df_model.plot(x='x', y='y_pred', label='y predicted', ax=ax);

Question 3¶

In [ ]:

model.predict(np.array([[1], [2], [6]]))

Question 4¶

With a degree of 1¶

In [ ]:

df = gen_1d_polynomial_regression_samples(n_samples=20)

polynomial_features = sklearn.preprocessing.PolynomialFeatures(degree=1)  # Try with degree = 1, 4 and 15
linear_regression = sklearn.linear_model.LinearRegression(fit_intercept=False)

model = sklearn.pipeline.Pipeline([("polynomial_features", polynomial_features),
                                   ("linear_regression", linear_regression)])

model.fit(df[['x']], df[['y']])

plot_1d_regression_samples(df, model=model)

The model is not complex enough to fit well data. This is an example of under fitting.

With a degree of 15¶

In [ ]:

df = gen_1d_polynomial_regression_samples(n_samples=20)

polynomial_features = sklearn.preprocessing.PolynomialFeatures(degree=15)  # Try with degree = 1, 4 and 15
linear_regression = sklearn.linear_model.LinearRegression(fit_intercept=False)

model = sklearn.pipeline.Pipeline([("polynomial_features", polynomial_features),
                                   ("linear_regression", linear_regression)])

model.fit(df[['x']], df[['y']])

plot_1d_regression_samples(df, model=model)

The model is too complex and learn the noise contained in $\mathcal{D}$. As a result, the model will have poor performance on new data (i.e. it is not capable to generalize well). This is an example of over fitting.

CO2 Emission Forecast (bonus)¶

In this exercise, you will forecast 5 years of future CO2 emission from power generation using natural gas.

This exercise use a dataset taken from https://www.kaggle.com/berhag/co2-emission-forecast-with-python-seasonal-arima.

This public dataset contain monthly carbon dioxide emissions from electricity generation. The dataset includes CO2 emissions starting January 1973 to July 2016.

In [ ]:

#URL = "natural_gas_co2_emissions_for_electric_power_sector.csv"
URL = "https://raw.githubusercontent.com/jeremiedecock/polytechnique-cse204-2018/master/natural_gas_co2_emissions_for_electric_power_sector.csv"

df = pd.read_csv(URL,
                 parse_dates=[0]) #, index_col=0) #, squeeze=True)
df.head()

In [ ]:

df.plot(x='date', y='co2_emissions', figsize=(15,10), title='Natural Gas Electric Power Sector CO2 Emissions');

Exercise 7 (bonus)¶

Implement a model to make predictions on this dataset. Use polynomial basis functions plus two sinusoids to handle the seasonality of this time series: $\sin\left(\frac{2 \pi}{12} x\right)$ and $\cos\left(\frac{2 \pi}{12} x \right)$. This signal contains a periodic component of 12 time steps (with one time step equals to one month).

We use both sine and cosine to avoid unaligned phases with the time series. Eventually we could use only $\sin\left(\frac{2 \pi}{12} x + \phi\right)$ or $\cos\left(\frac{2 \pi}{12} x + \phi\right)$ as long as $\phi$ is properly set (peaks of the chosen sinusoid should be aligned with peaks in the dataset): $\phi = \pi / 2$ in the first case and $\phi = 0$ in the second one if $x^{(0)} = 0$.

What are the limitations of this model?

Correction¶

Basis expansions with Scikit Learn¶

The sine basis is not implemented in sklearn.preprocessing.PolynomialFeatures but Scikit Learn can be easily extended to handle any basis. See for instance: http://madrury.github.io/jekyll/update/statistics/2017/08/04/basis-expansions.html.

Here we will use a simpler method: we simply compute the $Z$ matrix as follow and apply a linear regression on it.

$$Z = \begin{pmatrix} 1 & x^{(1)} & x^{(1)2} & x^{(1)3} & \sin\left(\frac{2 \pi}{12} x^{(1)} \right) & \cos\left(\frac{2 \pi}{12} x^{(1)} \right) \\ 1 & x^{(2)} & x^{(2)2} & x^{(2)3} & \sin\left(\frac{2 \pi}{12} x^{(2)} \right) & \cos\left(\frac{2 \pi}{12} x^{(2)} \right) \\ \vdots & \vdots & \vdots & \vdots & \vdots & \vdots \\ 1 & x^{(n)} & x^{(n)2} & x^{(n)3} & \sin\left(\frac{2 \pi}{12} x^{(n)} \right) & \cos\left(\frac{2 \pi}{12} x^{(n)} \right) \end{pmatrix}$$

Reminder:

$x^{(n)}$ is the value of $x$ for the $n^{\text{th}}$ observation in $\mathcal{D}$
$x^{(n)2}$ is the squared value of $x$ for the $n^{\text{th}}$ observation in $\mathcal{D}$
...

First we make the $Z$ matrix of features:

In [ ]:

X = df.index            # Here X is the time step index

y = df.co2_emissions

z_intercept = np.ones(shape=X.shape)
z_1 = X.copy()
z_2 = X**2
z_3 = X**3
z_sin = np.sin(2. * np.pi * X / 12.)
z_cos = np.cos(2. * np.pi * X / 12.)

Z = np.array([z_intercept, z_1, z_2, z_3, z_sin, z_cos]).T

df_Z = pd.DataFrame(Z, columns=['intercept', 'z', 'z2', 'z3', 'sin', 'cos'])
df_Z.tail()

With the following plot, we check that the seasonality of the time series is correctly aligned with our sine and cosine bases:

In [ ]:

ax = df_Z.loc[:,['sin', 'cos']].plot(figsize=(18,8))
ax.plot(y);

Then we make and fit the model:

In [ ]:

model = sklearn.linear_model.LinearRegression()

model.fit(df_Z[['intercept', 'z', 'z2', 'z3', 'sin', 'cos']], y)

The following cell plots the learned model and the dataset $\mathcal{D}$ (blue points):

In [ ]:

fig, ax = plt.subplots(figsize=(18, 8))

# Compute the model's prediction

y_pred = model.predict(df_Z)
ax.plot(y_pred, label="Precicted y");

# Plot also the training points

ax.plot(y, label="Actual y")
ax.legend()
plt.show()

print("Coefs:", model.coef_)

The we use the model to (roughly) forecast the CO2 emission for the 5 next years:

In [ ]:

X_forecast = np.arange(12 * 5) + df.index[-1]            # Here X is the time step index

z_intercept = np.ones(shape=X_forecast.shape)
z_1 = X_forecast.copy()
z_2 = X_forecast**2
z_3 = X_forecast**3
z_sin = np.sin(2. * np.pi * X_forecast / 12.)
z_cos = np.cos(2. * np.pi * X_forecast / 12.)

Z_forecast = np.array([z_intercept, z_1, z_2, z_3, z_sin, z_cos]).T

df_Z_forecast = pd.DataFrame(Z_forecast, columns=['intercept', 'z', 'z2', 'z3', 'sin', 'cos'])

# Compute the model's prediction

y_forecast = model.predict(df_Z_forecast)

# Plot also the training points

fig, ax = plt.subplots(figsize=(18, 8))

ax.plot(X_forecast, y_forecast, label="y forecast")
ax.plot(X, y, label="y")

ax.legend();

The following cell gather all steps to quickly test other basis:

In [ ]:

#X = df.index + 1       # Here X is the time step index
X = df.index            # Here X is the time step index

y = df.co2_emissions

z_intercept = np.ones(shape=X.shape)
z_1 = X.copy()
z_2 = X**2
z_3 = X**3
z_4 = X**4
z_5 = X**5
z_sin = np.sin(2. * np.pi / 12. * X)                  # Bad results when no other sinusoid complete this one (when index starts at 0)
#z_sin = np.sin(2. * np.pi / 12. * X + np.pi / 2.)    # Ok, equiv np.cos(2. * np.pi * X / 12.)
z_cos = np.cos(2. * np.pi / 12. * X)                  # Ok
#z_sin2 = np.sin(2. * np.pi / (12. * 9) * X)          # Ok but almost useless
#z_cos2 = np.cos(2. * np.pi / (12. * 9) * X)          # Ok but almost useless

Z = np.array([z_intercept,
              z_1,
              z_2,
              z_3,
              z_4,
              z_5,
              z_sin,
              z_cos,
              #z_sin2,
              #z_cos2
             ]).T

df_Z = pd.DataFrame(Z, columns=['intercept',
                                'z',
                                'z2',
                                'z3',
                                'z4',
                                'z5',
                                'sin',
                                'cos',
                                #'sin2',
                                #'cos2'
                               ])

# Make and fit the model

model = sklearn.linear_model.LinearRegression(fit_intercept=False)
model.fit(df_Z, y)

# Compute the model's prediction

y_pred = model.predict(df_Z)

# Plot also the training points

fig, ax = plt.subplots(figsize=(18, 8))
ax.plot(y_pred)
ax.plot(y)
plt.show()

print("Coefs:", model.coef_)

The use of sine and cosine basis improved the fit compared to a basic polynomial regression but we can clearly see the limitation of out model on this dataset as the amplitude of periodic part of the CO2 time series is not constant in time.

To improve the fit (especially on the "peaks"), we should let the sine and cosine coefficients vary with respect to time (i.e. with respect to $x$). For this, could use autoregressive models (like SARIMA where a linear combination of passed observations is fitted using linear regression methods) or non-linear regression methods (like Neural Networks).

See for instance Box, G. E., Jenkins, G. M. (1976). Time series analysis: forecasting and control. John Wiley & Sons if you want more information on autoregressive models.

Non-linear regression models will be introduced in the $6^{\text{th}}$ lab session with Neural Networks.