Lab 3: Global Optimization with Gaussian Processes

Gaussian Process Summer School 2021

The goal of this lab session is to illustrate the concepts seen during the tutorial in Gaussian processes for Global optimization. We will focus on two aspects of Bayesian Optimization (BO): (1) the choice of the model (2) the choice of the acquisition function.

The technical material associated to the methods used in this lab can be found in the slides of the tutorial.

1. Getting started

In addition to GPy, this lab uses Emukit (https://emukit.github.io/), a Python package that is useful for solving, among other things, global optimization problems. Please be sure that it is correctly installed before starting.

Now, just as in the previous lab, specify to include plots in the notebook and to import relevant libraries.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
 
import GPy
import emukit
import numpy as np
from numpy.random import seed
seed(12345)

We will first define some utility functions for plotting. The next cell can be skimmed over as it contains no information relevent to learning about BO.

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from emukit.core import ParameterSpace
from emukit.bayesian_optimization.loops import BayesianOptimizationLoop


def plot_acquisition(emukit_bo_loop: BayesianOptimizationLoop, space: ParameterSpace,
                     label_x: str=None, label_y: str=None):
    """
    Plots the model and the acquisition function.
        if self.input_dim = 1: Plots data, mean and variance in one plot and the acquisition function in another plot
        if self.input_dim = 2: as before but it separates the mean and variance of the model in two different plots
    :param label_x: Graph's x-axis label, if None it is renamed to 'x' (1D) or 'X1' (2D)
    :param label_y: Graph's y-axis label, if None it is renamed to 'f(x)' (1D) or 'X2' (2D)
    """
    return _plot_acquisition(space.get_bounds(),
                            emukit_bo_loop.loop_state.X.shape[1],
                            emukit_bo_loop.model,
                            emukit_bo_loop.model.X,
                            emukit_bo_loop.model.Y,
                            emukit_bo_loop.candidate_point_calculator.acquisition,
                            emukit_bo_loop.get_next_points(None),
                            label_x,
                            label_y)

def _plot_acquisition(bounds, input_dim, model, Xdata, Ydata, acquisition_function, suggested_sample,
                     label_x=None, label_y=None, color_by_step=True):
    '''
    Plots of the model and the acquisition function in 1D and 2D examples.
    '''

    if input_dim ==1:
        # Plots for 1D input
        if not label_x:
            label_x = 'x'

        if not label_y:
            label_y = 'f(x)'

        x_grid = np.arange(bounds[0][0], bounds[0][1], 0.001)
        x_grid = x_grid.reshape(len(x_grid),1)
        acq = acquisition_function.evaluate(x_grid)
        acq_normalized = (acq - min(acq))/(max(acq) - min(acq))
        m, v = model.predict(x_grid)

        upper_conf_bound = m[:, 0] + 1.96 * np.sqrt(v)[:, 0]
        lower_conf_bound = m[:, 0] - 1.96 * np.sqrt(v)[:, 0]

        plt.fill_between(x_grid[:, 0], upper_conf_bound, lower_conf_bound, color='b', alpha=0.2)
        plt.plot(x_grid, lower_conf_bound, 'k-', alpha=0.2)
        plt.plot(x_grid, upper_conf_bound, 'k-', alpha=0.2)
        plt.plot(x_grid, m, 'k-', lw=1, alpha=0.6)

        plt.plot(Xdata, Ydata, 'r.', markersize=10)
        plt.axvline(x=suggested_sample[len(suggested_sample)-1], color='r')
        factor = max(upper_conf_bound) - min(lower_conf_bound)

        plt.plot(x_grid, 0.2*factor*acq_normalized - abs(min(lower_conf_bound)) - 0.25*factor, 'r-',
                 lw=2, label='Acquisition')
        plt.xlabel(label_x)
        plt.ylabel(label_y)
        plt.ylim(min(lower_conf_bound) - 0.25*factor,  max(upper_conf_bound) + 0.05*factor)
        plt.axvline(x=suggested_sample[len(suggested_sample)-1],color='r')
        plt.legend(loc='upper left')

    elif input_dim == 2:

        if not label_x:
            label_x = 'X1'

        if not label_y:
            label_y = 'X2'

        n = Xdata.shape[0]
        colors = np.linspace(0, 1, n)
        cmap = plt.cm.Reds
        norm = plt.Normalize(vmin=0, vmax=1)
        points_var_color = lambda X: plt.scatter(
            X[:,0], X[:,1], c=colors, label=u'Observations', cmap=cmap, norm=norm)
        points_one_color = lambda X: plt.plot(
            X[:,0], X[:,1], 'r.', markersize=10, label=u'Observations')
        X1 = np.linspace(bounds[0][0], bounds[0][1], 200)
        X2 = np.linspace(bounds[1][0], bounds[1][1], 200)
        x1, x2 = np.meshgrid(X1, X2)
        X = np.hstack((x1.reshape(200*200,1),x2.reshape(200*200,1)))
        acq = acquisition_function.evaluate(X)
        acq_normalized = (acq - min(acq))/(max(acq) - min(acq))
        acq_normalized = acq_normalized.reshape((200,200))
        m, v = model.predict(X)
        plt.figure(figsize=(15,5))
        plt.subplot(1, 3, 1)
        plt.contourf(X1, X2, m.reshape(200,200),100)
        plt.colorbar()
        if color_by_step:
            points_var_color(Xdata)
        else:
            points_one_color(Xdata)
        plt.ylabel(label_y)
        plt.title('Posterior mean')
        plt.axis((bounds[0][0],bounds[0][1],bounds[1][0],bounds[1][1]))
        ##
        plt.subplot(1, 3, 2)
        plt.contourf(X1, X2, np.sqrt(v.reshape(200,200)),100)
        plt.colorbar()
        if color_by_step:
            points_var_color(Xdata)
        else:
            points_one_color(Xdata)
        plt.xlabel(label_x)
        plt.ylabel(label_y)
        plt.title('Posterior sd.')
        plt.axis((bounds[0][0],bounds[0][1],bounds[1][0],bounds[1][1]))
        ##
        plt.subplot(1, 3, 3)
        plt.contourf(X1, X2, acq_normalized,100)
        plt.colorbar()
        plt.plot(suggested_sample[:,0],suggested_sample[:,1],'m.', markersize=10)
        plt.xlabel(label_x)
        plt.ylabel(label_y)
        plt.title('Acquisition function')
        plt.axis((bounds[0][0],bounds[0][1],bounds[1][0],bounds[1][1]))

    else:
        raise ValueError(f'Cannot plot inputs higher than 2D, {input_dim} given')

def plot_convergence(X, Y):
    '''
    Plots the convergence of standard Bayesian optimization algorithms.

    :param X: Locations of evaluated points
    :param Y: Results of evaluations

    '''
    n = X.shape[0]

    ## Distances between consecutive x's
    aux = (X[1:n,:] - X[0:n-1,:])**2
    distances = np.sqrt(aux.sum(axis=1))

    plt.figure(figsize=(10,5))
    plt.subplot(1, 2, 1)
    plt.plot(list(range(n-1)), distances, '-ro')
    plt.xlabel('Iteration')
    plt.ylabel('d(x[n], x[n-1])')
    plt.title('Distance between consecutive x\'s')

    # Best found value at each iteration
    best_Y = np.minimum.accumulate(Y)

    plt.subplot(1, 2, 2)
    plt.plot(list(range(n)), best_Y, '-o')
    plt.title('Value of the best selected sample')
    plt.xlabel('Iteration')
    plt.ylabel('Best y')

Remembering the basics

Before starting with the lab, remember that (BO) is an heuristic for global optimization of black-box functions. Let $f: {\mathcal X} \to R$ be a 'well behaved' continuous function defined on a compact subset ${\mathcal X} \subseteq R^d$. Our goal is to solve the global optimization problem of finding $$ x_{M} = \arg \min_{x \in {\mathcal X}} f(x). $$

We assume that $f$ is a black-box from which only perturbed evaluations of the type $y_i = f(x_i) + \epsilon_i$, with $\epsilon_i \sim\mathcal{N}(0,\sigma^2)$, are available. The goal is to find $x_M$ by minimizing the number of evaluations of $f$. To do this, we need to determine two crucial bits:

  1. A Gaussian process that will capture the our beliefs on $f$.

  2. An acquisition function that based on the model will be useful to determine where to collect new evaluations of f.

Remember that every time a new data point is collected the model is updated and the acquisition function optimized again.

Running example

We start with a one-dimensional example. Consider here the Forrester function

$$ f(x) =(6x-2)^2 \sin(12x-4) $$

defined on the interval $[0, 1]$.

The minimum of this function is located at $x_{min}=0.78$. We assume that the evaluations of $f$ to are perturbed by zero-mean Gaussian noise with standard deviation 0.25. The Forrester function is part of the benchmark of functions of Emukit. To create the true function, the perturbed version and the boundaries of the problem you need to run the following cell.

In [3]:
from emukit.test_functions import forrester_function

f_true, bounds = forrester_function()             # true function and parameter space
f_objective, _ = forrester_function(noise_standard_deviation=.25)        # noisy version, will be used as noisy objective

Let's plot the true $f$:

In [4]:
x_plot = np.linspace(bounds.parameters[0].min, bounds.parameters[0].max, 200)[:, None]
y_plot = f_true(x_plot)

plt.plot(x_plot, y_plot)
plt.xlabel(r"$x$")
plt.ylabel(r"$f(x)$")
plt.show()

f_objective is the function that we are going to optimize. You can define your own objective but it should be able to map any numpy array of dimension $n\times d$ (inputs) to a numpy array of dimension $n\times 1$ (outputs). For instance:

In [5]:
n = 8
x = np.random.rand(n).reshape(n,1)
x
Out[5]:
array([[0.92961609],
       [0.31637555],
       [0.18391881],
       [0.20456028],
       [0.56772503],
       [0.5955447 ],
       [0.96451452],
       [0.6531771 ]])
In [6]:
f_objective(x)
Out[6]:
array([[ 9.99379804],
       [ 0.30951655],
       [-0.53213703],
       [-0.92083089],
       [ 0.70757395],
       [ 0.04499155],
       [14.12247715],
       [-2.14113699]])

In Emukit the bounds of the problem are defined as a ParameterSpace object. For real valued parameters the upper and lower limits of the box in which the optimization will be performed shall be provided. In our example:

In [7]:
from emukit.core import ParameterSpace, ContinuousParameter

custom_bounds = ParameterSpace([
    ContinuousParameter('var_1', 0.0, 1.0)
])

To use BO to solve this problem, we need to create an Emukit object in which we need to specify the following elements:

  • The function to optimize.
  • The box constrains of the problem.
  • The model, that is fixed by default to be a GP with a SE kernel.
  • The acquisition function (and its parameters).

We create an SE kernel as we do in GPy

In [8]:
from emukit.model_wrappers.gpy_model_wrappers import GPyModelWrapper

X_init = np.random.rand(3).reshape(3,1)
Y_init = f_objective(X_init)

k = GPy.kern.RBF(1)
gpy_model = GPy.models.GPRegression(X_init, Y_init, k)
emukit_model = GPyModelWrapper(gpy_model)

And now we have all the elements to start optimizing $f$. We create the optimization problem instance. Note that you don't need to specify the evaluation budget of. This is because at this stage we are not running the optimization, we are just initializing the different elements of the BO algorithm.

In [9]:
from emukit.bayesian_optimization.loops import BayesianOptimizationLoop

# Creation of the object that we will use to run BO.
myBopt = BayesianOptimizationLoop(space=bounds, model=emukit_model)
In [10]:
?BayesianOptimizationLoop
Init signature:
BayesianOptimizationLoop(
    space: emukit.core.parameter_space.ParameterSpace,
    model: emukit.core.interfaces.models.IModel,
    acquisition: emukit.core.acquisition.acquisition.Acquisition = None,
    update_interval: int = 1,
    batch_size: int = 1,
    acquisition_optimizer: emukit.core.optimization.acquisition_optimizer.AcquisitionOptimizerBase = None,
)
Docstring:     
Generic outer loop that provides the framework for decision making parts of Emukit.

The loop can be used in two modes:

1. Emukit calculates the next point(s) to try and evaluates your function at these points until some stopping
   criterion is met.
2. Emukit only calculates the next points(s) to try and you evaluate your function or perform the experiment.

This object exposes the following events. See ``emukit.core.event_handler`` for details of how to subscribe:
     - ``loop_start_event`` called at the start of the `run_loop` method
     - ``iteration_end_event`` called at the end of each iteration
Init docstring:
Emukit class that implement a loop for building modular Bayesian optimization

:param space: Input space where the optimization is carried out.
:param model: The model that approximates the underlying function
:param acquisition: The acquisition function that will be used to collect new points (default, EI). If batch
                    size is greater than one, this acquisition must output positive values only.
:param update_interval: Number of iterations between optimization of model hyper-parameters. Defaults to 1.
:param batch_size: How many points to evaluate in one iteration of the optimization loop. Defaults to 1.
:param acquisition_optimizer: Optimizer selecting next evaluation points
                              by maximizing acquisition.
                              Gradient based optimizer is used if None.
                              Defaults to None.
File:           ~/miniconda3/envs/gpss2022/lib/python3.9/site-packages/emukit/bayesian_optimization/loops/bayesian_optimization_loop.py
Type:           type
Subclasses:     

You will find that the optimisation loop state is already initialized, although with just the random initial locations.

In [11]:
myBopt.loop_state.X
Out[11]:
array([[0.46759901],
       [0.32558468],
       [0.43964461]])
In [12]:
myBopt.loop_state.Y
Out[12]:
array([[0.91335901],
       [0.16091135],
       [0.38734197]])

Now we can run the optimisation loop itself for several iterations.

In [13]:
max_iter = 10
myBopt.run_loop(user_function=f_objective, stopping_condition=max_iter)

And that's it! We can re-inspect the loop state to see if it contains new data, and visualize the model and the acquisition function.

In [14]:
myBopt.loop_state.X
Out[14]:
array([[0.46759901],
       [0.32558468],
       [0.43964461],
       [0.06846958],
       [1.        ],
       [0.        ],
       [0.17209704],
       [0.18813837],
       [0.13191696],
       [0.76643402],
       [0.73489764],
       [0.74491886],
       [0.74531516]])
In [15]:
myBopt.loop_state.Y
Out[15]:
array([[ 0.91335901],
       [ 0.16091135],
       [ 0.38734197],
       [-0.01945318],
       [15.86294966],
       [ 3.24801321],
       [-0.60465149],
       [-0.51233215],
       [-0.93618376],
       [-5.69478129],
       [-5.90275939],
       [-5.89682697],
       [-5.34498455]])
In [16]:
plot_acquisition(myBopt, bounds)

You can only make the previous plot if the dimension of the problem is 1 or 2. However, you can always make a plot to see how the optimization evolved.

In [17]:
plot_convergence(myBopt.loop_state.X, myBopt.loop_state.Y)

The first plot shows the distance between the last two collected observations at each iteration. This plot is useful to evaluate the convergence of the method. The second plot shows the best found value at each iteration. It is useful to compare different methods. The fastest the curve decreases the better the method.

Exercise 1

Use Bayesian optimization to find the minimum of the function $f(x)= x^2 + 10 \sin(x)$ in the interval [-10, 10].

(a) Define the bounds of the problem, the function and check that it admits a numpy array of observations as input.

In [18]:
def g(x):
    return x**2 + 10*np.sin(x)

space_g = ParameterSpace([
    ContinuousParameter('var_1', -10.0, 10.0)
])

x = np.random.rand(n).reshape(n,1)
g(x)
Out[18]:
array([[8.94701871],
       [7.35478289],
       [8.8629034 ],
       [8.12512458],
       [6.0138235 ],
       [3.9414941 ],
       [7.55910992],
       [1.5016547 ]])

(b) Create an Emukit object for global optimization using a Matern52 kernel.

In [19]:
from emukit.model_wrappers.gpy_model_wrappers import GPyModelWrapper

X_init = np.random.rand(3).reshape(3, 1)
Y_init = g(X_init)

k_g = GPy.kern.Matern52(1)
gpy_model_g = GPy.models.GPRegression(X_init, Y_init, k_g)
emukit_model_g = GPyModelWrapper(gpy_model_g)

BO_g = BayesianOptimizationLoop(space_g, emukit_model_g)

(c) For stability reasons, constrain the noise of the model to be 10e-4.

In [20]:
gpy_model_g.noise_var = 0.0001

(d) Run the optimization for 10 iterations. Make and comment the convergence plots. Has the method converged?

In [21]:
BO_g.run_loop(g, 10)
plot_acquisition(BO_g, space_g)
plot_convergence(BO_g.loop_state.X, BO_g.loop_state.Y)
 /Users/magnus/miniconda3/envs/gpss2022/lib/python3.9/site-packages/paramz/transformations.py:111: RuntimeWarning:overflow encountered in expm1

2. Acquisition functions

In this section we are going to have a look to different acquisition functions. Emukit provides a variety of acquisition functions for Bayesian optimization, including the expected improvement ('EI') we already used, the probability of improvement ('PoI') and the lower confidence bound. Emukit uses EI by default, but any other acquisition functions can also be specified. In this section we will create these functions as separate objects and study their behavior.

In [22]:
seed(1234)
n = 10
X = np.random.rand(n).reshape(n,1)
Y = f_objective(X)
m = GPy.models.GPRegression(X,Y)
m.optimize()
m.plot([0,1])

## Now we pass this model into a GPyOpt Gaussian process model

from emukit.model_wrappers import GPyModelWrapper
model = GPyModelWrapper(m, n_restarts=5)

We define the bounds of the input space to be between zero and one.

In [23]:
space = ParameterSpace([
    ContinuousParameter('var_1', 0.0, 1.0)
])

Now let's import and create the acquisition function objects.

In [24]:
from emukit.bayesian_optimization.acquisitions import ExpectedImprovement, ProbabilityOfImprovement, NegativeLowerConfidenceBound
In [25]:
acq_EI = ExpectedImprovement(model, jitter=0)
acq_NLCB = NegativeLowerConfidenceBound(model, beta=2.0)
acq_PI = ProbabilityOfImprovement(model, jitter=0)

The objects acq_EI, acq_NLCB, acq_PI contain the acquisition functions and their gradients. By running the following piece of code you can visualize the three acquisitions. In this plot, the larger is the value of the acquisition, the better is the point.

In [26]:
# Plot the three acquisition functions (factor 0.1 added in in the LCB for visualization)
X_grid = np.linspace(0,1,200)[:, None]
plt.figure(figsize=(10,6))
plt.title('Acquisition functions')
plt.plot(X_grid, 0.1*acq_NLCB.evaluate(X_grid), label='NLCB')
plt.plot(X_grid, acq_EI.evaluate(X_grid), label='EI')
plt.plot(X_grid, acq_PI.evaluate(X_grid), label='PI')
plt.xlabel('x')
plt.ylabel('$alpha(x)$')
plt.legend();

Exercise 2

(a) According to the previous plot, what areas in the domain are worth exploring and why? How can we interpret the previous plot in terms of the exploration/exploitation trade off of each one of the three acquisitions?

In [ ]:
 

(b) Now make a plot comparing the shape of the NLCB acquisition (of GP-UCB in the literature) with values different values of the beta parameter. Use the values $[0,0.1,0.25,0.5,1,2,5]$. How does the decision about where to collect the sample change when we increase the value of the parameter?

In [27]:
beta_grid = np.array([0, 0.1, 0.25, 0.5, 1, 2, 5])

plt.figure(figsize=(10,6))
plt.title('Acquisition functions', size=15)
for param in beta_grid:
    acq_NLCB = NegativeLowerConfidenceBound(model, beta=param)
    plt.plot(X_grid, acq_NLCB.evaluate(X_grid), label=str(param))
plt.legend()
Out[27]:
<matplotlib.legend.Legend at 0x147259160>

Exercise 3

Consider the sixhumpcamel function defined as $$f(x_1,x_2) = \left(4-2.1x_1^2 + \frac{x_1^4}{3} \right)x_1^2 + x_1x_2 + (-4 +4x_2^2)x_2^2,$$

in $[-2,2]\times [-1,1]$. This function has two global minima, at $(0.0898,-0.7126)$ and $(-0.0898,0.7126)$. This function is also implemented in Emukit so, to load it simply run:

In [28]:
from emukit.test_functions import sixhumpcamel_function
f_shc, space_shc = sixhumpcamel_function()

(a) Create three objects to optimize this function using EI (with parameter equal to zero), NLCB (with parameter equal to 2) and PI (with parameter equal to zero). Use the same initial data in the three cases (hint: create a separate GPy model for each BO object and use the same X and Y).

In [29]:
from emukit.core.initial_designs import RandomDesign

n_data = 5
sampler = RandomDesign(space_shc)
X = sampler.get_samples(n_data)
Y = f_shc(X)

model_ei = GPyModelWrapper(GPy.models.GPRegression(X, Y))
acq_ei = ExpectedImprovement(model_ei, jitter=0)
bo_ei = BayesianOptimizationLoop(space_shc, model_ei, acquisition=acq_ei)

model_nlcb = GPyModelWrapper(GPy.models.GPRegression(X, Y))
acq_nlcb = NegativeLowerConfidenceBound(model_nlcb, beta=2.0)
bo_nlcb = BayesianOptimizationLoop(space_shc, model_nlcb, acquisition=acq_nlcb)

model_pi = GPyModelWrapper(GPy.models.GPRegression(X, Y))
acq_pi = ProbabilityOfImprovement(model_pi, jitter=0)
bo_pi = BayesianOptimizationLoop(space_shc, model_pi, acquisition=acq_pi)

(b) In the three cases run the optimization for 30 iterations

In [30]:
max_iter = 30
bo_ei.run_loop(f_shc, max_iter)
bo_nlcb.run_loop(f_shc, max_iter)
bo_pi.run_loop(f_shc, max_iter)

(c) Now make a plot comparing the three methods. The x axis should contain the number of iterations and y axis the best found value. Which acquisition has the best performance in this example?

In [31]:
bo_ei_best = np.minimum.accumulate(bo_ei.loop_state.Y)
bo_nlcb_best = np.minimum.accumulate(bo_nlcb.loop_state.Y)
bo_pi_best = np.minimum.accumulate(bo_pi.loop_state.Y)


plt.figure(figsize=(10,6))
plt.title('Comparison of acquisition functions', size=15)
plt.plot(bo_ei_best, label='EI')
plt.plot(bo_nlcb_best, label='NLCB')
plt.plot(bo_pi_best, label='PI')
plt.legend();

(d) Compare the models and the acquisition functions in the three cases (after the 30 iterations). What do you observe?

In [32]:
plot_acquisition(bo_ei, space_shc)
plot_acquisition(bo_nlcb, space_shc)
plot_acquisition(bo_pi, space_shc)

display(model_ei.model)
display(model_nlcb.model)
display(model_pi.model)

Model: GP regression
Objective: 35.46595935729767
Number of Parameters: 3
Number of Optimization Parameters: 3
Updates: True

GP_regression. valueconstraintspriors
rbf.variance 7.000810723107303 +ve
rbf.lengthscale 0.5989319985473173 +ve
Gaussian_noise.variance5.04722354143363e-08 +ve

Model: GP regression
Objective: 16.952289978982442
Number of Parameters: 3
Number of Optimization Parameters: 3
Updates: True

GP_regression. valueconstraintspriors
rbf.variance 11.306537689994341 +ve
rbf.lengthscale 0.6670054870327963 +ve
Gaussian_noise.variance2.3262838359004527e-14 +ve

Model: GP regression
Objective: -103.54000415054362
Number of Parameters: 3
Number of Optimization Parameters: 3
Updates: True

GP_regression. valueconstraintspriors
rbf.variance 6.152782301692156 +ve
rbf.lengthscale 0.589129404132994 +ve
Gaussian_noise.variance1.4160788501106164e-15 +ve