#!/usr/bin/env python
# coding: utf-8
# jupyter nbconvert --to slides --reveal-prefix "http://lab.hakim.se/reveal-js/"
# In[3]:
import pods
import mlai
from matplotlib import pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
# # Introduction to Machine Learning and Data Science
#
# ### Data Science in Africa Summer School
#
# ### Makerere University, Kampala, Uganda
#
# ### Neil D. Lawrence
#
# ### 27th June 2016
# # Text
#
#
#
# @Rogers:book11
#
# # Another Text
#
#
# 
#
# @Bishop:book06
#
# # What is Machine Learning?
#
#
# **data + model = prediction**
#
# - **data** : observations, could be actively or passively
# acquired (meta-data).
#
# - **model** : assumptions, based on previous experience (other data!
# transfer learning etc), or beliefs about the regularities of
# the universe. Inductive bias.
#
# - **prediction** : an action to be taken or a categorization or a
# quality score.
#
# # Fitting Data
#
# - **data**
# In[3]:
import numpy as np
# Create some data
x = np.array([1, 3])
y = np.array([3, 1])
# - **model**
# $$y=mx + c$$
# # Model Fitting
#
# $$m = \frac{y_2- y_1}{x_2-x_1}$$
# $$ c = y_1 - m x_1 $$
# In[4]:
xvals = np.linspace(0, 5, 2)
m = (y[1]-y[0])/(x[1]-x[0])
c = y[0]-m*x[0]
yvals = m*xvals+c
# In[5]:
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
xvals = np.linspace(0, 5, 2)
m = (y[1]-y[0])/(x[1]-x[0])
c = y[0]-m*x[0]
yvals = m*xvals+c
ylim = np.array([0, 5])
xlim = np.array([0, 5])
f, ax = plt.subplots(1, 1, figsize=(5, 5))
a = ax.plot(xvals, yvals, '-', linewidth=3)
ax.set_xlim(xlim)
ax.set_ylim(ylim)
plt.xlabel('$x$', fontsize=30)
plt.ylabel('$y$', fontsize=30)
plt.text(4, 4, '$y=mx+c$', horizontalalignment='center', verticalalignment='bottom', fontsize=30)
plt.savefig('diagrams/straight_line1.svg')
ctext = ax.text(0.15, c+0.15, '$c$', horizontalalignment='center', verticalalignment='bottom', fontsize=20)
xl = np.array([1.5, 2.5])
yl = xl*m + c
mhand = ax.plot([xl[0], xl[1]], [yl.min(), yl.min()], color=[0, 0, 0])
mhand2 = ax.plot([xl.min(), xl.min()], [yl[0], yl[1]], color=[0, 0, 0])
mtext = ax.text(xl.mean(), yl.min()-0.2, '$m$', horizontalalignment='center', verticalalignment='bottom', fontsize=20)
plt.savefig('diagrams/straight_line2.svg')
a2 = ax.plot(x, y, '.', markersize=20, linewidth=3, color=[1, 0, 0])
plt.savefig('diagrams/straight_line3.svg')
xs = 2
ys = m*xs + c + 0.3
ast = ax.plot(xs, ys, '.', markersize=20, linewidth=3, color=[0, 1, 0])
plt.savefig('diagrams/straight_line4.svg')
m = (y[1]-ys)/(x[1]-xs)
c = ys-m*xs
yvals = m*xvals+c
for i in a:
    i.set_visible(False)
for i in mhand:
    i.set_visible(False)
for i in mhand2:
    i.set_visible(False)
mtext.set_visible(False)
ctext.set_visible(False)
a3 = ax.plot(xvals, yvals, '-', linewidth=2, color=[0, 0, 1])
for i in ast:
    i.set_color([1, 0, 0])
plt.savefig('diagrams/straight_line5.svg')
m = (ys-y[0])/(xs-x[0])
c = y[0]-m*x[0]
yvals = m*xvals+c
for i in a3:
    i.set_visible(False)
a4 = ax.plot(xvals, yvals, '-', linewidth=2, color=[0, 0, 1])
for i in ast:
    i.set_color([1, 0, 0])
plt.savefig('diagrams/straight_line6.svg')
for i in a:
    i.set_visible(True)
for i in a3:
    i.set_visible(True)
plt.savefig('diagrams/straight_line7.svg')
# In[6]:
import pods
pods.notebook.display_plots('straight_line{plot}.svg',
                            directory='./diagrams', plot=(1, 7))
# # $y = mx + c$
#
# point 1: $x = 1$, $y=3$ $$3 = m + c$$
# point 2: $x = 3$, $y=1$ $$1 = 3m + c$$
# point 3: $x = 2$, $y=2.5$ $$2.5 = 2m + c$$
#
#
#
#
# # $y = mx + c + \epsilon$
#
# point 1: $x = 1$, $y=3$
# $$3 = m + c + \epsilon_1$$
#
# point 2: $x = 3$, $y=1$
# $$1 = 3m + c + \epsilon_2$$
#
# point 3: $x = 2$, $y=2.5$
# $$2.5 = 2m + c + \epsilon_3$$
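#
# The three points above give an overdetermined system. As a minimal sketch
# (an illustrative addition, not in the original slides, using only numpy),
# the system can be solved in the least squares sense with `np.linalg.lstsq`:
# In[ ]:
# Illustrative sketch: least squares solution of the overdetermined system.
A = np.array([[1., 1.],
              [3., 1.],
              [2., 1.]])       # columns: x_i and the constant term
b = np.array([3., 1., 2.5])    # the observed y_i
(m_ls, c_ls), residuals, rank, sv = np.linalg.lstsq(A, b, rcond=None)
print('m =', m_ls, 'c =', c_ls)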
# ### Regression Examples
#
# - Predict a real value, $y_i$ given some inputs
# $x_i$.
#
# - Predict quality of meat given spectral measurements (Tecator data).
#
# - Radiocarbon dating, the C14 calibration curve: predict age given
# quantity of C14 isotope.
#
# - Predict quality of different Go or Backgammon moves given expert
# rated training data.
#
# ### Olympic 100m Data
#
# - Gold medal times for Olympic 100 m runners since 1896.
#
# 
# Image from Wikimedia Commons
# ### Olympic 100m Data
#
# In[3]:
data = pods.datasets.olympic_100m_men()
f, ax = plt.subplots(figsize=(7,7))
ax.plot(data['X'], data['Y'], 'ro', markersize=10)
# ### Olympic Marathon Data
#
#
# 
# Image from Wikimedia Commons
#
# ### Olympic Marathon Data
#
# - Gold medal times for Olympic Marathon since 1896.
#
# - Marathons before 1924 didn’t have a standardised distance.
#
# - Present results using pace per km.
#
# - In 1904 the Marathon was badly organised, leading to very slow times.
# ### Olympic Marathon Data
#
# In[4]:
data = pods.datasets.olympic_marathon_men()
f, ax = plt.subplots(figsize=(7,7))
ax.plot(data['X'], data['Y'], 'ro',markersize=10)
# ### What is Machine Learning?
#
# $$ \text{data} + \text{model} = \text{prediction}$$
#
# - $\text{data}$ : observations, could be actively or passively
# acquired (meta-data).
#
# - $\text{model}$ : assumptions, based on previous experience (other data!
# transfer learning etc), or beliefs about the regularities of
# the universe. Inductive bias.
#
# - $\text{prediction}$ : an action to be taken or a categorization or a
# quality score.
#
# ### Regression: Linear Relationship
#
# $$y_i = m x_i + c$$
#
# - $y_i$ : winning time/pace.
#
# - $x_i$ : year of Olympics.
#
# - $m$ : rate of improvement over time.
#
# - $c$ : winning time at year 0.
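#
# As a quick check of this linear model (a sketch, not in the original
# slides), `np.polyfit` with degree 1 recovers $m$ and $c$ for the marathon
# data loaded above:
# In[ ]:
# Illustrative sketch: straight-line least squares fit to the marathon data.
data = pods.datasets.olympic_marathon_men()
m_fit, c_fit = np.polyfit(data['X'].flatten(), data['Y'].flatten(), 1)
print('rate of improvement m =', m_fit)
print('extrapolated pace at year 0 c =', c_fit)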
# # Overdetermined System
#
# 
# # $y = mx + c$
#
# point 1: $x = 1$, $y=3$ $$3 = m + c$$
# point 2: $x = 3$, $y=1$ $$1 = 3m + c$$
# point 3: $x = 2$, $y=2.5$ $$2.5 = 2m + c$$
#
# # $y = mx + c + \epsilon$
#
# point 1: $x = 1$, $y=3$
# $$3 = m + c + \epsilon_1$$
#
# point 2: $x = 3$, $y=1$
# $$1 = 3m + c + \epsilon_2$$
#
# point 3: $x = 2$, $y=2.5$
# $$2.5 = 2m + c + \epsilon_3$$
# ### The Gaussian Density
# - Perhaps the most common probability density.
# \begin{align*}
# p(y| \mu, \sigma^2) & = \frac{1}{\sqrt{2\pi\sigma^2}}\exp\left(-\frac{(y - \mu)^2}{2\sigma^2}\right)\\
# & \buildrel\triangle\over = \mathcal{N}(y|\mu, \sigma^2)
# \end{align*}
# - The Gaussian density.
#
# In[5]:
import numpy as np
h = np.linspace(0, 2.5, 1000)
sigma2 = 0.0225
mu = 1.7
p = 1./np.sqrt(2*np.pi*sigma2)*np.exp(-(h-mu)**2/(2*sigma2))  # divide by 2*variance, not 2*variance**2
f2, ax2 = plt.subplots(figsize=(7, 3.5))
ax2.plot(h, p, 'b-', linewidth=3)
ylim = (0, 3)
ax2.vlines(mu, ylim[0], ylim[1], colors='r', linewidth=3)
ax2.set_ylim(ylim)
ax2.set_xlim(1.4, 2.0)
ax2.set_xlabel('$h/m$', fontsize=20)
ax2.set_ylabel(r'$p(h|\mu, \sigma^2)$', fontsize=20)
f2.savefig('./diagrams/gaussian_of_height.svg')
# ### Gaussian Density
# 
# The Gaussian PDF with $\mu=1.7$ and variance $\sigma^2=
# 0.0225$. Mean shown as red line. It could represent the heights of a population of
# students.
# ### Gaussian Density
# $$
# \mathcal{N}(y|\mu, \sigma^2) = \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(y-\mu)^2}{2\sigma^2}\right)
# $$
# $\sigma^2$ is the variance of the density and $\mu$ is the mean.
#
# ### Laplace's Idea
#
# ### A Probabilistic Process
#
# - Set the mean of the Gaussian to be a function.
# $$p\left(y_i|x_i\right)=\frac{1}{\sqrt{2\pi\sigma^2}}\exp \left(-\frac{\left(y_i-f\left(x_i\right)\right)^{2}}{2\sigma^2}\right).$$
#
# - This gives us a ‘noisy function’.
#
# - This is known as a stochastic process.
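#
# A minimal sketch of such a stochastic process (an illustrative addition,
# not from the slides): draw targets from a Gaussian whose mean is a linear
# function of the input.
# In[ ]:
# Sample a 'noisy function': mean is f(x) = m*x + c, noise is Gaussian.
m_true, c_true, sigma2_true = -1., 4., 0.05
x_samp = np.linspace(0, 4, 20)
y_samp = m_true*x_samp + c_true + np.random.normal(0., np.sqrt(sigma2_true), x_samp.shape)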
# ### Height as a Function of Weight
#
# - The standard Gaussian is parameterized by its mean and variance.
#
# - Make the mean a linear function of an *input*.
#
# - This leads to a regression model.
# \begin{align*}
# y_i=&f\left(x_i\right)+\epsilon_i,\\
# \epsilon_i \sim &\mathcal{N}(0, \sigma^2).
# \end{align*}
#
# - Assume $y_i$ is height and $x_i$ is weight.
# ### Data Point Likelihood
#
# - Likelihood of an individual data point
# $$p\left(y_i|x_i,m,c\right)=\frac{1}{\sqrt{2\pi \sigma^2}}\exp \left(-\frac{\left(y_i-mx_i-c\right)^{2}}{2\sigma^2}\right).$$
#
# - Parameters are gradient, $m$, offset, $c$ of the function and noise
# variance $\sigma^2$.
# ### Data Set Likelihood
#
# - If the noise, $\epsilon_i$, is sampled independently for each
# data point,
#
# - then each data point is independent (given $m$ and $c$).
#
# - For independent variables:
# $$p(\mathbf{y}) = \prod_{i=1}^n p(y_i)$$
# $$p(\mathbf{y}|\mathbf{x}, m, c) = \prod_{i=1}^n p(y_i|x_i, m, c)$$
#
# ### For Gaussian
#
# - i.i.d. assumption
#
# $$p(\mathbf{y}|\mathbf{x}, m, c) = \prod_{i=1}^n \frac{1}{\sqrt{2\pi \sigma^2}}\exp \left(-\frac{\left(y_i-mx_i-c\right)^{2}}{2\sigma^2}\right).$$
# $$p(\mathbf{y}|\mathbf{x}, m, c) = \frac{1}{\left(2\pi \sigma^2\right)^{\frac{n}{2}}}\exp \left(-\frac{\sum_{i=1}^n\left(y_i-mx_i-c\right)^{2}}{2\sigma^2}\right).$$
# ### Log Likelihood Function
#
# - Normally work with the log likelihood:
# $$L(m,c,\sigma^{2})=-\frac{n}{2}\log 2\pi -\frac{n}{2}\log \sigma^2 -\sum _{i=1}^{n}\frac{\left(y_i-mx_i-c\right)^{2}}{2\sigma^2}.$$
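#
# The log likelihood is straightforward to evaluate directly. A minimal
# sketch (an illustrative addition, assuming the three points used earlier):
# In[ ]:
# Evaluate L(m, c, sigma2) for candidate parameter values.
def log_likelihood(m, c, sigma2, x, y):
    n = y.size
    return (-0.5*n*np.log(2*np.pi) - 0.5*n*np.log(sigma2)
            - ((y - m*x - c)**2).sum()/(2*sigma2))

x_pts = np.array([1., 3., 2.])
y_pts = np.array([3., 1., 2.5])
print(log_likelihood(-1., 4., 0.1, x_pts, y_pts))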
# ### Error Function
#
# - The negative log likelihood is our error function; up to scaling and
# terms that don't depend on $m$ and $c$ it is
# $$E(m,c)=\sum _{i=1}^{n}\left(y_i-mx_i-c\right)^{2}.$$
#
# - Learning proceeds by minimizing this error function for the data
# set provided.
# ### Connection: Sum of Squares Error
#
# - Ignoring terms which don’t depend on $m$ and $c$ gives
# $$E(m, c) \propto \sum_{i=1}^n (y_i - f(x_i))^2$$
# where $f(x_i) = mx_i + c$.
#
# - This is known as the *sum of squares* error function.
#
# - It is commonly used and closely associated with the Gaussian likelihood.
# ## Reminder
#
# - Two functions involved:
# - Prediction function: $f(x_i)$
# - Error, or Objective function: $E(m, c)$
# - Error function depends on parameters through prediction function.
# ### Mathematical Interpretation
#
# - What is the mathematical interpretation?
#
# - There is a cost function.
#
# - It expresses mismatch between your prediction and reality.
# $$E(m, c)=\sum_{i=1}^n \left(y_i - mx_i -c\right)^2$$
#
# - This is known as the sum of squares error.
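#
# Minimizing $E(m, c)$ has a closed form. A sketch using the standard least
# squares formulas (an illustrative addition, reusing the three points from
# above):
# In[ ]:
# Closed-form minimizers of the sum of squares error E(m, c).
x_pts = np.array([1., 3., 2.])
y_pts = np.array([3., 1., 2.5])
xbar, ybar = x_pts.mean(), y_pts.mean()
m_star = ((x_pts - xbar)*(y_pts - ybar)).sum()/((x_pts - xbar)**2).sum()
c_star = ybar - m_star*xbar
print('m* =', m_star, 'c* =', c_star)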
#
# ### Nonlinear Regression
#
# - Problem with linear regression: $\mathbf{x}$ may not be linearly
# related to $\mathbf{y}$.
#
# - Potential solution: create a feature space: define
# $\phi(\mathbf{x})$ where $\phi(\cdot)$ is a
# nonlinear function of $\mathbf{x}$.
#
# - Model for target is a linear combination of these nonlinear
# functions
# $$f(\mathbf{x}) = \sum_{j=1}^k w_j \phi_j(\mathbf{x})$$
# In[5]:
f, ax = plt.subplots(figsize=(7, 7))
loc = [[0, 1.4],
       [0, -0.7],
       [0.75, -0.2]]
text = [r'$\phi(x) = 1$',
        r'$\phi(x) = x$',
        r'$\phi(x) = x^2$']
mlai.plot_basis(mlai.polynomial, x_min=-1.3, x_max=1.3, fig=f, ax=ax, loc=loc, text=text)
# ### Quadratic Basis
#
#
#
# In[25]:
pods.notebook.display_plots('polynomial_basis{num_basis}.svg', directory='./diagrams', num_basis=(1,3))
# ### Functions Derived from Quadratic Basis
#
# $$f(x) = {\color{\redColor}w_0} + {\color{\magentaColor}w_1x} + {\color{\blueColor}w_2 x^2}$$
#
#
# In[7]:
pods.notebook.display_plots('polynomial_function{func_num}.svg', directory='./diagrams', func_num=(1,3))
# In[5]:
f, ax = plt.subplots(figsize=(7, 7))
loc = [[-1.25, -0.4],
       [0., 1.25],
       [1.25, -0.4]]
text = [r'$\phi_1(x) = e^{-(x + 1)^2}$',
        r'$\phi_2(x) = e^{-2x^2}$',
        r'$\phi_3(x) = e^{-2(x-1)^2}$']
mlai.plot_basis(mlai.radial, x_min=-2, x_max=2, fig=f, ax=ax, loc=loc, text=text)
# ### Radial Basis Functions
#
# In[8]:
pods.notebook.display_plots('radial_basis{num_basis}.svg', directory='./diagrams', num_basis=(1,3))
# ### Functions Derived from Radial Basis
#
# $$f(x) = {\color{\redColor}w_1 e^{-2(x+1)^2}} + {\color{\magentaColor}w_2e^{-2x^2}} + {\color{\blueColor}w_3 e^{-2(x-1)^2}}$$
# In[10]:
pods.notebook.display_plots('radial_function{func_num}.svg', directory='./diagrams', func_num=(1,3))
# ### Basis Function Models
#
# - The *prediction function* is now defined as
# $$f(\mathbf{x}_i) = \sum_{j=1}^m w_j \phi_{i, j}$$
# where $\phi_{i, j} = \phi_j(\mathbf{x}_i)$.
#
# ## Vector Notation
#
# - Write in vector notation,
# $$f(\mathbf{x}_i) = \mathbf{w}^\top \boldsymbol{\phi}_i$$
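#
# Stacking $\boldsymbol{\phi}_i^\top$ row by row gives a design matrix
# $\boldsymbol{\Phi}$. A hand-rolled polynomial version (an illustrative
# sketch, independent of the `mlai` helpers used elsewhere):
# In[ ]:
# Build a polynomial design matrix: row i holds phi_j(x_i) = x_i**j.
def polynomial_design(x, num_basis):
    return np.hstack([x**j for j in range(num_basis)])

x_demo = np.linspace(-1, 1, 5)[:, None]
Phi_demo = polynomial_design(x_demo, 3)   # columns: 1, x, x**2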
# ### Log Likelihood for Basis Function Model
#
# - The likelihood of a single data point is
# $$p\left(y_i|x_i\right)=\frac{1}{\sqrt{2\pi\sigma^2}}\exp
# \left(-\frac{\left(y_i-\mathbf{w}^{\top}\boldsymbol{\phi}_i\right)^{2}}{2\sigma^2}\right).$$
# ### Log Likelihood for Basis Function Model
#
# - Leading to a log likelihood for the data set of
# $$L(\mathbf{w},\sigma^2)= -\frac{n}{2}\log \sigma^2
# -\frac{n}{2}\log 2\pi -\frac{\sum
# _{i=1}^{n}\left(y_i-\mathbf{w}^{\top}\boldsymbol{\phi}_i\right)^{2}}{2\sigma^2}.$$
#
#
# ### Objective Function
#
# - And a corresponding *objective function* of the form
# $$E(\mathbf{w},\sigma^2)= \frac{n}{2}\log
# \sigma^2 + \frac{\sum
# _{i=1}^{n}\left(y_i-\mathbf{w}^{\top}\boldsymbol{\phi}_i\right)^{2}}{2\sigma^2}.$$
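#
# Minimizing this objective gives the normal equations for $\mathbf{w}$, and
# $\sigma^2$ as the mean squared residual. A minimal sketch (reusing the
# hand-rolled `polynomial_design` from above on synthetic data; not the
# `mlai` fit used below):
# In[ ]:
# Fit w by the normal equations, then estimate sigma^2 from the residuals.
x_syn = np.linspace(0, 1, 10)[:, None]
y_syn = np.sin(2*np.pi*x_syn) + np.random.normal(0., 0.1, x_syn.shape)
Phi = polynomial_design(x_syn, 4)
w = np.linalg.solve(np.dot(Phi.T, Phi), np.dot(Phi.T, y_syn))
sigma2_hat = ((y_syn - np.dot(Phi, w))**2).mean()
print('w =', w.flatten(), 'sigma2 =', sigma2_hat)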
# ### Polynomial Fits to Olympic Data
# In[8]:
basis = mlai.polynomial
data = pods.datasets.olympic_marathon_men()
f, ax = plt.subplots(1, 2, figsize=(10,5))
x = data['X']
y = data['Y']
data_limits = [1892, 2020]
max_basis = 7
ll = np.array([np.nan]*(max_basis))
sum_squares = np.array([np.nan]*(max_basis))
for num_basis in range(1, max_basis+1):
    model = mlai.LM(x, y, basis, num_basis=num_basis, data_limits=data_limits)
    model.fit()
    sum_squares[num_basis-1] = model.objective()
    ll[num_basis-1] = model.log_likelihood()
    mlai.plot_marathon_fit(model=model, data_limits=data_limits,
                           objective=sum_squares, objective_ylim=[0, 8],
                           fig=f, ax=ax)
# In[11]:
pods.notebook.display_plots('olympic_LM_polynomial{num_basis}.svg', directory='./diagrams', num_basis=(1,7))
# ### Polynomial Fits to Olympic Data
# In[14]:
import pods
import numpy as np
import scipy as sp
import mlai
from matplotlib import pyplot as plt
max_basis = 7
basis = mlai.polynomial
data = pods.datasets.olympic_marathon_men()
x = data['X']
y = data['Y']
data_limits = [1892, 2020]
num_data = x.shape[0]
# In[2]:
f, ax = plt.subplots(1, 2, figsize=(10,5))
ll = np.array([np.nan]*(max_basis))
sum_squares = np.array([np.nan]*(max_basis))
for num_basis in range(1, max_basis+1):
    model = mlai.LM(x, y, basis, num_basis=num_basis, data_limits=data_limits)
    model.fit()
    sum_squares[num_basis-1] = model.objective()/num_data
    ll[num_basis-1] = model.log_likelihood()
    mlai.plot_marathon_fit(model=model, data_limits=data_limits,
                           objective=np.sqrt(sum_squares), objective_ylim=[0, 0.3],
                           title='Root Mean Square Training Error',
                           fig=f, ax=ax)
# In[15]:
pods.notebook.display_plots('olympic_LM_polynomial{num_basis}.svg',
                            directory='./diagrams', num_basis=(1, max_basis))
# ### Overfitting
# - As we increase the number of basis functions we obtain a better 'fit' to the data.
# - How will the model perform on previously unseen data?
# - Let's consider predicting the future.
# In[17]:
f, ax = plt.subplots(1, 2, figsize=(10,5))
val_start = 20
x = data['X'][:val_start, :]
x_val = data['X'][val_start:, :]
y = data['Y'][:val_start, :]
y_val = data['Y'][val_start:, :]
num_val_data = x_val.shape[0]
ll = np.array([np.nan]*(max_basis))
ss = np.array([np.nan]*(max_basis))
ss_val = np.array([np.nan]*(max_basis))
for num_basis in range(1, max_basis+1):
    model = mlai.LM(x, y, basis, num_basis=num_basis, data_limits=data_limits)
    model.fit()
    ss[num_basis-1] = model.objective()
    f_val, _ = model.predict(x_val)
    ss_val[num_basis-1] = ((y_val-f_val)**2).mean()
    ll[num_basis-1] = model.log_likelihood()
    mlai.plot_marathon_fit(model=model, data_limits=data_limits,
                           objective=np.sqrt(ss_val), objective_ylim=[0, 0.6],
                           fig=f, ax=ax, prefix='olympic_val',
                           title="Hold Out Validation",
                           x_val=x_val, y_val=y_val)
# In[26]:
pods.notebook.display_plots('olympic_val_LM_polynomial{num_basis}.svg',
                            directory='./diagrams', num_basis=(1, max_basis))
# ### Extrapolation
#
# - Here we are making predictions beyond the region where the model has seen data.
# - This is known as *extrapolation*.
# - Extrapolation is predicting into the future here, but could be:
# - Predicting back to the unseen past (pre 1892)
# - Spatial prediction (e.g. Cholera rates outside Manchester given rates inside Manchester).
# ### Alan Turing
# - He was a formidable Marathon runner.
# - In 1946 he ran a time of 2 hours 46 minutes.
# - What is the probability he would have won an Olympics if one had been held in 1946?
# 
# *Alan Turing, in 1946 he was only 11 minutes slower than the winner of the 1948 games. Would he have won a hypothetical games held in 1946? Source: [Alan Turing Internet Scrapbook](http://www.turing.org.uk/scrapbook/run.html).*
#
# ### Interpolation
# - Predicting the winning time for a hypothetical 1946 Olympics is *interpolation*.
# - This is because we have times from 1936 and 1948.
# - If we want a model for *interpolation* how can we test it?
# - One trick is to sample the validation set from throughout the data set.
# In[19]:
f, ax = plt.subplots(1, 2, figsize=(10,5))
val_start = 20
perm = np.random.permutation(data['X'].shape[0])
x = data['X'][perm[:val_start], :]
x_val = data['X'][perm[val_start:], :]
y = data['Y'][perm[:val_start], :]
y_val = data['Y'][perm[val_start:], :]
num_val_data = x_val.shape[0]
ll = np.array([np.nan]*(max_basis))
ss = np.array([np.nan]*(max_basis))
ss_val = np.array([np.nan]*(max_basis))
for num_basis in range(1, max_basis+1):
    model = mlai.LM(x, y, basis, num_basis=num_basis, data_limits=data_limits)
    model.fit()
    ss[num_basis-1] = model.objective()
    f_val, _ = model.predict(x_val)
    ss_val[num_basis-1] = ((y_val-f_val)**2).mean()
    ll[num_basis-1] = model.log_likelihood()
    mlai.plot_marathon_fit(model=model, data_limits=data_limits,
                           objective=np.sqrt(ss_val), objective_ylim=[0.1, 0.6],
                           fig=f, ax=ax, prefix='olympic_val_inter',
                           title="Hold Out Validation",
                           x_val=x_val, y_val=y_val)
# In[27]:
pods.notebook.display_plots('olympic_val_inter_LM_polynomial{num_basis}.svg',
                            directory='./diagrams', num_basis=(1, max_basis))
# ### Choice of Validation Set
#
# - The choice of validation set should reflect how you will use the model in practice.
# - For extrapolation into the future we tried validating with data from the future.
# - For interpolation we chose the validation set from throughout the data.
# - For different validation sets we could get different results.
# ### Leave One Out Error
# - Take training set and remove one point.
# - Train on the remaining data.
# - Compute the error on the point you removed (which wasn't in the training data).
# - Do this for each point in the training set in turn.
# - Average the resulting error.
# - This is the leave one out error.
# In[ ]:
f, ax = plt.subplots(1, 2, figsize=(10,5))
num_data = data['X'].shape[0]
num_parts = num_data
partitions = []
for part in range(num_parts):
    train_ind = list(range(part))
    train_ind.extend(range(part+1, num_data))
    val_ind = [part]
    partitions.append((train_ind, val_ind))
ll = np.array([np.nan]*(max_basis))
ss = np.array([np.nan]*(max_basis))
ss_val = np.array([np.nan]*(max_basis))
for num_basis in range(1, max_basis+1):
    ss_val_temp = 0.
    for part, (train_ind, val_ind) in enumerate(partitions):
        x = data['X'][train_ind, :]
        x_val = data['X'][val_ind, :]
        y = data['Y'][train_ind, :]
        y_val = data['Y'][val_ind, :]
        num_val_data = x_val.shape[0]
        model = mlai.LM(x, y, basis, num_basis=num_basis, data_limits=data_limits)
        model.fit()
        ss[num_basis-1] = model.objective()
        f_val, _ = model.predict(x_val)
        ss_val_temp += ((y_val-f_val)**2).mean()
        mlai.plot_marathon_fit(model=model, data_limits=data_limits,
                               objective=np.sqrt(ss_val), objective_ylim=[0.1, 0.6],
                               fig=f, ax=ax, prefix='olympic_loo' + str(part) + '_inter',
                               x_val=x_val, y_val=y_val)
    ss_val[num_basis-1] = ss_val_temp/(num_parts)
    ax[1].cla()
    mlai.plot_marathon_fit(model=model, data_limits=data_limits,
                           objective=np.sqrt(ss_val), objective_ylim=[0.1, 0.6],
                           fig=f, ax=ax, prefix='olympic_loo' + str(len(partitions)) + '_inter',
                           title="Leave One Out Validation",
                           x_val=x_val, y_val=y_val)
# In[28]:
pods.notebook.display_plots('olympic_loo{part}_inter_LM_polynomial{num_basis}.svg',
                            directory='./diagrams', num_basis=(1, max_basis), part=(0, len(partitions)))
# ### $k$ Fold Cross Validation
#
# - Leave one out error can be very time consuming.
# - Need to train your algorithm $n$ times.
# - An alternative: $k$ fold cross validation.
# In[ ]:
f, ax = plt.subplots(1, 2, figsize=(10,5))
num_data = data['X'].shape[0]
num_parts = 5
partitions = []
ind = list(np.random.permutation(num_data))
start = 0
for part in range(num_parts):
    end = round((float(num_data)/num_parts)*(part+1))
    train_ind = ind[:start]
    train_ind.extend(ind[end:])
    val_ind = ind[start:end]
    partitions.append((train_ind, val_ind))
    start = end
max_basis = 7
ll = np.array([np.nan]*(max_basis))
ss = np.array([np.nan]*(max_basis))
ss_val = np.array([np.nan]*(max_basis))
for num_basis in range(1, max_basis+1):
    ss_val_temp = 0.
    for part, (train_ind, val_ind) in enumerate(partitions):
        x = data['X'][train_ind, :]
        x_val = data['X'][val_ind, :]
        y = data['Y'][train_ind, :]
        y_val = data['Y'][val_ind, :]
        num_val_data = x_val.shape[0]
        model = mlai.LM(x, y, basis, num_basis=num_basis, data_limits=data_limits)
        model.fit()
        ss[num_basis-1] = model.objective()
        f_val, _ = model.predict(x_val)
        ss_val_temp += ((y_val-f_val)**2).mean()
        mlai.plot_marathon_fit(model=model, data_limits=data_limits,
                               objective=np.sqrt(ss_val), objective_ylim=[0.2, 0.6],
                               fig=f, ax=ax, prefix='olympic_' + str(num_parts) + 'cv' + str(part) + '_inter',
                               title='5-fold Cross Validation',
                               x_val=x_val, y_val=y_val)
    ss_val[num_basis-1] = ss_val_temp/(num_parts)
    ax[1].cla()
    mlai.plot_marathon_fit(model=model, data_limits=data_limits,
                           objective=np.sqrt(ss_val), objective_ylim=[0.2, 0.6],
                           fig=f, ax=ax, prefix='olympic_' + str(num_parts) + 'cv' + str(len(partitions)) + '_inter',
                           title='5-fold Cross Validation',
                           x_val=x_val, y_val=y_val)
# In[24]:
pods.notebook.display_plots('olympic_5cv{part}_inter_LM_polynomial{num_basis}.svg',
                            directory='./diagrams', num_basis=(1, max_basis), part=(0, 5))
# ### Reading
#
# - Sections 1.3–1.5 of @Rogers:book11.
#