Notebook

Tutorial for Hyperactive (v4)¶

This is a tutorial to introduce you to the basic functionalities of Hyperactive and provide some interesting applications. It will also give an introduction to some optimization techniques. Hyperactive is a package that can optimize any python function and collect its search data.

You can learn more about Hyperactive on Github ¶

Table of contents:¶

Introduction
Optimization basics:
- Convex Optimization
- Non-convex Optimization
Advanced Usage:

This tutorial is made for version 4 of Hyperactive

In [1]:

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

import time
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, RBF, ConstantKernel

from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import load_wine, load_iris

from hyperactive import Hyperactive
from hyperactive.optimizers import BayesianOptimizer, HillClimbingOptimizer
from surfaces.visualize import plotly_surface, plotly_heatmap

color_scale = px.colors.sequential.Jet

In [2]:

def _create_grid(objective_function, search_space):
    def objective_function_np(*args):
        para = {}
        for arg, key in zip(args, search_space.keys()):
            para[key] = arg

        return objective_function(para)

    (x_all, y_all) = search_space.values()
    xi, yi = np.meshgrid(x_all, y_all)
    zi = objective_function_np(xi, yi)

    return xi, yi, zi

import plotly.graph_objects as go
from plotly.subplots import make_subplots

def compare_objective_functions(objective_function1, objective_function2):
    search_space_plot = {
        "x": list(np.arange(-5, 5, 0.2)),
        "y": list(np.arange(-5, 5, 0.2)),
    }

    xi_c, yi_c, zi_c = _create_grid(objective_function1, search_space_plot)
    xi_a, yi_a, zi_a = _create_grid(objective_function2, search_space_plot)

    fig1 = go.Surface(x=xi_c, y=yi_c, z=zi_c, colorscale=color_scale)
    fig2 = go.Surface(x=xi_a, y=yi_a, z=zi_a, colorscale=color_scale)

    fig = make_subplots(rows=1, cols=2,
                        specs=[[{'is_3d': True}, {'is_3d': True}]],
                        subplot_titles=['Convex Function', 'Non-convex Function'],
                        )

    fig.add_trace(fig1, 1, 1)
    fig.add_trace(fig2, 1, 2)
    fig.update_layout(title_text="Objective Function Surface")
    fig.show()

In [3]:

from tensorflow.keras.datasets import mnist
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Dense, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD
from keras.utils import np_utils
from tensorflow import keras

import tensorflow as tf

config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
config.log_device_placement = True

sess = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(sess)

2024-01-02 08:46:37.569045: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-02 08:46:37.650657: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include:
2024-01-02 08:46:37.650673: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-01-02 08:46:38.153488: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include:
2024-01-02 08:46:38.153539: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include:
2024-01-02 08:46:38.153543: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.

Device mapping: no known devices.

2024-01-02 08:46:38.609600: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-02 08:46:38.695715: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-01-02 08:46:38.695926: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include:
2024-01-02 08:46:38.695976: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include:
2024-01-02 08:46:38.696017: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include:
2024-01-02 08:46:38.696055: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcufft.so.10'; dlerror: libcufft.so.10: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include:
2024-01-02 08:46:38.696094: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcurand.so.10'; dlerror: libcurand.so.10: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include:
2024-01-02 08:46:38.696132: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include:
2024-01-02 08:46:38.696170: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusparse.so.11'; dlerror: libcusparse.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include:
2024-01-02 08:46:38.696273: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...

WARNING:tensorflow:From /tmp/ipykernel_1406914/2148136278.py:15: The name tf.keras.backend.set_session is deprecated. Please use tf.compat.v1.keras.backend.set_session instead.

In [4]:

# load dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

img_width = 28
img_height = 28

x_train = x_train.astype("float32")
x_train /= 255.0
x_test = x_test.astype("float32")
x_test /= 255.0

# reshape input data
x_train = x_train.reshape(x_train.shape[0], img_width, img_height, 1)
x_test = x_test.reshape(x_test.shape[0], img_width, img_height, 1)

# one hot encode outputs
y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)
num_classes = y_test.shape[1]


data = load_wine()
X_boston, y_boston = data.data, data.target

data = load_iris()
X_iris, y_iris = data.data, data.target

Introduction ¶

There are two things you need to define before starting your first optimization run:

- the objective function: 
    Contains some kind of model. It always returns a score that will be maximized during
- a search space: 
    Defines the parameter space in which the optimizer searches for the best parameter set

In this notebook you will see several different examples for objective functions.

In [5]:

def objective_function(para):
    loss = para["x"]*para["x"]
    # -x*x is an inverted parabola 
    return -loss

# We have only one dimension here
search_space = {
    "x": list(np.arange(-5, 5, 0.01)),
}

In the next step we will start the optimization run.

You only need the objective_function, search_space and the number of iterations. Each iteration will evaluate the objective function. This will generate a score, that the optimization algorithm uses to determine which position in the search space to look next. All of the calculations will be done by Hyperactive in the background. You will receive the results of the optimization run when all iterations are done.

In [6]:

hyper_0 = Hyperactive(verbosity=False)
hyper_0.add_search(objective_function, search_space, n_iter=70, initialize={"random": 2, "vertices": 2})
hyper_0.run()

search_data_0 = hyper_0.search_data(objective_function)
search_data_0[["x", "score"]]

 hyper_optimizer search_data 
        x    score
0   0.22  -0.0484
1  -1.46  -2.1316
2   4.99 -24.9001
3  -5.00 -25.0000
4   4.06 -16.4836
..   ...      ...
65 -4.73 -22.3729
66 -2.99  -8.9401
67  4.65 -21.6225
68  4.01 -16.0801
69 -0.78  -0.6084

[70 rows x 2 columns]

Out[6]:

	x	score
0	0.22	-0.0484
1	-1.46	-2.1316
2	4.99	-24.9001
3	-5.00	-25.0000
4	4.06	-16.4836
...	...	...
65	-4.73	-22.3729
66	-2.99	-8.9401
67	4.65	-21.6225
68	4.01	-16.0801
69	-0.78	-0.6084

70 rows × 2 columns

In the table above you can see the 70 iterations performed during the run. This is called the search data. In each row you can see the parameter x and the corresponding score. As we previously discussed the optimization algorithm determines which position to select next based on the score from the evaluated objective function.

When Hyperactive starts the optimization the first iterations are initializations from the initialize-dictionary. In the example above there are 4 initializations (2 random and 2 vertices). They determine the initial positions in the search space that are used to evaluate the objective funtion. As you can see in the search data the 2. and 3. iteration are the vertices (edge points) of the search space. The 0. and 1. The first rows of the search data are randomly selected. After those few initialization steps the optimization algorithm will select the next positions in the search space based on the score of the previous position(s).

The default algorithm for the optimization is the random-search. You can see the random pattern in the last few iterations of the search data. We can also see the random pattern if we plot the search data:

Random Search Optimizer¶

The random search optimizer is a very simple algorithm. It randomly selects the position in each iteration without adapting to the optimization problem (exploitation). One the other hand it is very useful to initialy explore the search space or find new regions with optima (exploration).

The following two gifs show how random search explores the search space for two different objective functions:

In [7]:

fig = px.scatter(search_data_0, x="x", y="score")
fig.show()

The plot above shows the score of each parameter set (in this case just one parameter "x"). The random search explores the search space very well, so that we can see the inverse parabola.

Objective Functions ¶

The shape of the objective function in the search space is a very important part of the optimization problem, because it heavily impacts the behaviour of the algorithm. Each optimization algorithm is well equiped to solve some kind of objective functions and perform poorly for others.

A basic classifications of optimization problems is into convex- and nonconvex-problems. Convex optimization problems have a shape, where the score improves continuously as the position gets closer to the global optimum. So it does not have any local optima in its shape. Nonconvex-problems do have these local optima.

Convex Optimization ¶

Lets take a closer look at the convex optimization problem and try out different optimization algorithms:

In [8]:

def convex_function(para):
    loss = (para["x"]*para["x"] + para["y"]*para["y"])
    return -loss


search_space = {
    "x": list(np.arange(-5, 5, 0.01)),
    "y": list(np.arange(-5, 5, 0.01)),
}

In [9]:

hyper_convex_0 = Hyperactive(verbosity=False)
hyper_convex_0.add_search(convex_function, search_space, n_iter=2000)
hyper_convex_0.run()

 hyper_optimizer search_data 
          x     y    score
0    -4.92  4.14 -41.3460
1     3.81 -2.28 -19.7145
2    -1.67 -1.67  -5.5778
3    -1.67  1.66  -5.5445
4     1.66 -1.67  -5.5445
...    ...   ...      ...
1995  0.53  0.48  -0.5113
1996 -3.74 -2.94 -22.6312
1997 -3.21  2.37 -15.9210
1998  1.99 -3.61 -16.9922
1999  0.12  0.68  -0.4768

[2000 rows x 3 columns]

In [10]:

search_data_convex_0 = hyper_convex_0.search_data(convex_function)

In [11]:

fig = px.scatter(search_data_convex_0, x="x", y="y", color="score", color_continuous_scale=color_scale)
fig.update_layout(width=900, height=800, xaxis_range=[-5, 5], yaxis_range=[-5, 5])
fig.show()

The plot above shows the samples from the search data acquired from the convex-function in a 2-dimensional search space. The score is shown by the color of each point in the scatter plot.

We were able to see, that random search is a good optimization technique to explore the search space. But the goal is often to quickly find position in the search space with a high score. Therefore we should consider other optimization techniques like the hill climbing algorithm.

Hill Climbing Optimizer¶

The hill climbing optimization algorithm works by finding a random neighbour position close to the current position. If the score of the new position is better than the current one the algorithm makes a step to the new position and returns to finding the next position. This behaviour is like someone who tries to find the highest (highest score) position in a space by only moving up and never moves down.

The hill climbing algorithm works very well with convex optimization problems, because the score continuously improves towards a direction. Hill climbing can find this direction by exploring the scores of its neighbours. Hill climbing does not work of there are local optima. It tends to get "stuck" in certain regions, where the current position is surrounded by positions with worse scores. The algorithm would need to first "go down" and later "go up" again to find other (even better) positions in the search space.

In [12]:

optimizer = HillClimbingOptimizer(rand_rest_p=0)

hyper_convex_1 = Hyperactive(verbosity=False)
hyper_convex_1.add_search(convex_function, search_space, n_iter=90, optimizer=optimizer, initialize={"vertices":1})
hyper_convex_1.run()

search_data_convex_1 = hyper_convex_1.search_data(convex_function)

 hyper_optimizer search_data 
        x             y    score
0   4.99 -5.000000e+00 -49.9001
1   4.99 -4.470000e+00 -44.8810
2   4.82 -4.910000e+00 -47.3405
3   4.82 -4.790000e+00 -46.1765
4   4.99 -5.000000e+00 -49.9001
..   ...           ...      ...
85 -0.28  4.000000e-02  -0.0800
86  0.80 -1.065814e-13  -0.6400
87  0.02 -2.500000e-01  -0.0629
88  0.43 -1.065814e-13  -0.1849
89  0.23  6.000000e-02  -0.0565

[90 rows x 3 columns]

In [13]:

fig = px.scatter(search_data_convex_1, x="x", y="y", color="score", color_continuous_scale=color_scale)
fig.update_layout(width=900, height=800, xaxis_range=[-5, 5], yaxis_range=[-5, 5])
fig.show()

The 2D-scatter plot above shows that the hill climbing algorithm convertes quickly to the optimum of the objective function in the search space. Hill climbing is specialized to find the optimum of convex functions quickly. It was able to find a good score in less than 100 iterations, while the random search used much more for a similar maximum score.

Non-convex Optimization ¶

Next we explore the performance of different optimization algorithms on the ackley-function, which is a nonconvex objective function:

In [14]:

def ackley_function(para):
    x, y = para["x"], para["y"]

    loss = (
        -20 * np.exp(-0.2 * np.sqrt(0.5 * (x * x + y * y)))
        - np.exp(0.5 * (np.cos(2 * np.pi * x) + np.cos(2 * np.pi * y)))
        + np.exp(1)
        + 20
    )

    return -loss


search_space = {
    "x": list(np.arange(-5, 5, 0.01)),
    "y": list(np.arange(-5, 5, 0.01)),
}

The ackley function is a non convex function that has a lot of local optima. This is because of the cos-terms that create wave-like patterns in both dimensions.

In the following 3D-surface plots you can see an example for the sphere-function and ackley-function. Both plots are interactive, so you can take a closer look at the shape of those objective functions:

In [15]:

compare_objective_functions(convex_function, ackley_function)

In [16]:

hyper_ackley_0 = Hyperactive(verbosity=False)
hyper_ackley_0.add_search(ackley_function, search_space, n_iter=2000)
hyper_ackley_0.run()

search_data_ackley_0 = hyper_ackley_0.search_data(ackley_function)

 hyper_optimizer search_data 
          x     y      score
0     4.14 -3.61 -12.585249
1     4.46  1.01 -11.228130
2    -1.67 -1.67  -7.779507
3    -1.67  1.66  -7.781677
4     1.66 -1.67  -7.781677
...    ...   ...        ...
1995 -4.39 -0.53 -11.600498
1996  2.15 -2.98  -8.620673
1997 -1.54 -3.53 -10.740426
1998  2.02  2.12  -7.135418
1999  2.79  1.63  -9.250090

[2000 rows x 3 columns]

In [17]:

fig = px.scatter(search_data_ackley_0, x="x", y="y", color="score", color_continuous_scale=color_scale)
fig.update_layout(width=900, height=800)
fig.show()

The plot above shows the random search exploring the ackley function. Random search is not affected by the many local optima in the search space. Lets try out the hill climbing algorithm on the ackley function and see the results.

In [18]:

optimizer = HillClimbingOptimizer(rand_rest_p=0)

hyper_ackley_1 = Hyperactive(verbosity=False)
hyper_ackley_1.add_search(ackley_function, 
                          search_space, 
                          n_iter=100, 
                          optimizer=optimizer, 
                          initialize={"vertices": 1})
hyper_ackley_1.run()

search_data_ackley_1 = hyper_ackley_1.search_data(ackley_function)

 hyper_optimizer search_data 
        x     y      score
0  -5.00  4.99 -12.637734
1  -4.55  4.99 -13.998385
2  -4.75  4.99 -13.522024
3  -4.85  4.75 -13.719183
4  -4.81  4.99 -13.233342
..   ...   ...        ...
95 -4.82  4.68 -13.984259
96 -5.00  4.99 -12.637734
97 -5.00  4.99 -12.637734
98 -4.66  4.55 -14.280916
99 -5.00  4.99 -12.637734

[100 rows x 3 columns]

In [19]:

fig = px.scatter(search_data_ackley_1, x="x", y="y", color="score", color_continuous_scale=color_scale)
fig.update_layout(width=900, height=800, xaxis_range=[-5, 5], yaxis_range=[-5, 5])
fig.show()

Maybe you already expected, that the hill climbing algorithm delivers bad results optimizing the ackley function. That does not mean, that hill climbing is a bad algorithm in general. It means that it is bad for this kind of objective functions. This is a very important idea in mathematical optimization. It is very useful to know about the properties of the objective function, because you can choose an optimization algorithm that works really well for this problem.

Repulsing Hill Climbing Optimizer¶

The repulsing hill climbing optimizer tries to improve how hill climbing solves non-convex objective functions. It does so by increasing the radius in which hill climbing selects a neighbour position if the last position wasn't an improvement over the current one. This means the hill climber will jump away from its current position of it does not find better position in its close environment.

In [20]:

from hyperactive.optimizers import RepulsingHillClimbingOptimizer

optimizer = RepulsingHillClimbingOptimizer()

hyper_ackley_2 = Hyperactive(verbosity=False)
hyper_ackley_2.add_search(ackley_function, 
                          search_space, 
                          n_iter=100, 
                          optimizer=optimizer, 
                          initialize={"vertices": 1})
hyper_ackley_2.run()

search_data_ackley_2 = hyper_ackley_2.search_data(ackley_function)

 hyper_optimizer search_data 
        x     y      score
0   4.99  4.99 -12.633040
1   4.92  4.99 -12.741621
2  -1.34  0.02  -4.914987
3   4.99  4.93 -12.712329
4  -0.81  1.72  -6.338818
..   ...   ...        ...
95 -3.54  0.89  -9.877024
96  0.50  0.26  -3.662621
97 -1.50  0.79  -6.295015
98 -1.65  0.34  -6.387955
99 -0.08  0.71  -3.271799

[100 rows x 3 columns]

In [21]:

fig = px.scatter(search_data_ackley_2, x="x", y="y", color="score", color_continuous_scale=color_scale)
fig.update_layout(width=900, height=800, xaxis_range=[-5, 5], yaxis_range=[-5, 5])
fig.show()

The plot above shows how the repulsing hill climbing optimizer explored the search space of the ackley function. It does a much better job finding new optima in the space, while also exploring local regions.

Hyperactive memory ¶

Some objective functions (especially with machein- or deep- learning models) can take a long time to evaluate, which slows down the optimization run. This means, that we want to avoid any unnecessary evaluation of the objective function. Unfortunately most optimization algorithms won't avoid positions in the search space that were already evaluated. For example:

- Random Search, which could select a position it already selected before
- Hill climbing stuck in an optimum
- Particle swarms that converge on one position

The bottom line is, that optimization algorithms don't "remember" already explored positions and won't avoid them. But Hyperactive has a feature that solves this problem, by saving the position and score in a memory-dictionary. If a position is selected Hyperactive will look up if this position is already known. If it knows the position and score it won't reevaluate the objective function but extract the score, which saves time. This is very useful for computationally expensive objective functions.

You can even pass the search data from a previous optimization run into the memory_warm_start-argument, so that the optimization run "remembers" the memory from the previous one.

In [22]:

def dtr_model(opt):
    dtr = DecisionTreeRegressor(
        max_depth=opt["max_depth"],
        min_samples_split=opt["min_samples_split"],
    )
    scores = cross_val_score(dtr, X_boston, y_boston, cv=5)

    return scores.mean()


search_space_dtr = {
    "max_depth": list(range(10, 35)),
    "min_samples_split": list(range(2, 35)),
}

In [23]:

c_time1 = time.time()

hyper_dtr_0 = Hyperactive(verbosity=False)
hyper_dtr_0.add_search(dtr_model, search_space_dtr, n_iter=300)
hyper_dtr_0.run()

d_time1 = time.time() - c_time1
print("Optimization time 1:", round(d_time1, 2))

# Hyperactive collects the search data
search_data_dtr_0 = hyper_dtr_0.search_data(dtr_model, times=True)

 hyper_optimizer search_data 
      max_depth  min_samples_split     score
0           10                 21  0.229979
1           33                  5  0.294730
2           18                 12  0.229979
3           18                 22  0.317003
4           26                 12  0.205503
..         ...                ...       ...
295         15                  2  0.131560
296         11                 12  0.317003
297         20                 32  0.317003
298         34                 32  0.292527
299         21                  9  0.207706

[300 rows x 3 columns] 

Optimization time 1: 1.1

After the first optimization run we start an additional run and pass the search data into memory_warm_start. We expect that the next run will be faster, because we already have explored 300 positions in the search space from the previous optimization run.

In [24]:

c_time2 = time.time()

hyper_dtr_1 = Hyperactive(verbosity=False)
hyper_dtr_1.add_search(dtr_model, search_space_dtr, n_iter=300, memory_warm_start=search_data_dtr_0)
hyper_dtr_1.run()

d_time2 = time.time() - c_time2
print("Optimization time 2:", round(d_time2, 2))

 hyper_optimizer search_data 
      max_depth  min_samples_split     score
0           17                 29  0.229979
1           31                 34  0.317003
2           18                 12  0.229979
3           18                 22  0.317003
4           26                 12  0.205503
..         ...                ...       ...
295         12                  6  0.183230
296         34                  6  0.245779
297         13                 21  0.317003
298         32                 33  0.292527
299         18                 18  0.292527

[300 rows x 3 columns] 

Optimization time 2: 0.76

In [25]:

print("\n The second optimization run is "+'{}'.format(round((1-d_time2/d_time1)*100,2))+"% faster than the first one.")

 The second optimization run is 31.12% faster than the first one.

In [26]:

search_data_dtr_1 = hyper_dtr_1.search_data(dtr_model, times=True)
search_data_dtr = search_data_dtr_1.append(search_data_dtr_0, ignore_index=True)

In [27]:

# times in seconds
eval_times = search_data_dtr_0["eval_times"]
eval_times_mem = search_data_dtr_1["eval_times"]

opt_times = search_data_dtr["iter_times"]-search_data_dtr["eval_times"]

In [28]:

fig = go.Figure()
fig.add_trace(go.Histogram(x=eval_times, name="evaluation time", nbinsx=15))
fig.add_trace(go.Histogram(x=eval_times_mem, name="evaluation time second run", nbinsx=15))
fig.add_trace(go.Histogram(x=opt_times, name="optimization time", nbinsx=15))
fig.show()

The evaluation- and optimization- times from the two optimization runs is shown in the histogram above. There are several interesting things to see:

- Even for simple machine learning models the optimization algorithm is much faster.
- The evaluations, which are faster than the optimization are from the memory-dictionary lookup.
- The second optimization run has much more memory lookups than the first.

Machine Learning Hyperparameter Optimization ¶

Until now we mostly optimized test functions to show how an objective function and the search space can look like. These problems were easy to solve, because the objective funtion evaluates very fast and the search space is very small. Real optimization problems often have one of those two problems: - The objective function is computationally expensive, so it takes a long time to evaluate. This increases the iteration time and slowes down the optimization progress. - The search space is very large. This can makes it very difficult to find positions with a high score.

In the first case you would want to use optimization algorithms that are very inteligent in finding new positions with high scores. You don't want to waste too much time exploring the search space, because each evaluation takes such a long time. You want to get to a good position with a high score in as few steps as possible.

In the second case you would want a fast algorithm, that looks for a good score but also explores the search space very well.

Lets take a look at a (kind of) real optimization problem. We want to optimize the hyperparameters of a gradient boosting regressor that is trained on the boston housing regression dataset.

In [29]:

def gbr_model_0(opt):
    gbr = GradientBoostingRegressor(
        n_estimators=opt["n_estimators"],
        max_depth=opt["max_depth"],
    )
    scores = cross_val_score(gbr, X_boston, y_boston, cv=5)
    score = scores.mean()
    return score


search_space_gbr_0 = {
    "n_estimators": list(range(10, 100)),
    "max_depth": list(range(2, 12)),
}

In [30]:

hyper_gbr_0 = Hyperactive(verbosity=False)
hyper_gbr_0.add_search(gbr_model_0, search_space_gbr_0, n_iter=50)
hyper_gbr_0.run()

search_data_gbr_0 = hyper_gbr_0.search_data(gbr_model_0)

 hyper_optimizer search_data 
     n_estimators  max_depth     score
0             41         11  0.286493
1             10         11  0.158985
2             39          5  0.269010
3             39          8  0.289922
4             68          5  0.285994
5             68          8  0.287513
6             10         11  0.158985
7             99         11  0.275180
8             10          2  0.130693
9             99          2  0.326558
10            83          3  0.309352
11            49          2  0.324265
12            31          6  0.284971
13            84          4  0.272758
14            53          9  0.280875
15            53          2  0.324899
16            16         10  0.260080
17            15          3  0.265057
18            66          5  0.261062
19            30          6  0.293972
20            50          3  0.315253
21            63          4  0.271137
22            33          6  0.273074
23            68         10  0.285651
24            22          5  0.269218
25            12          7  0.199622
26            42          7  0.271228
27            44          3  0.305420
28            96          5  0.276716
29            43          2  0.323368
30            12          8  0.199739
31            45         11  0.289907
32            41         11  0.286493
33            43          6  0.260725
34            77         10  0.282316
35            81          8  0.267947
36            65         10  0.260475
37            31         11  0.266904
38            47          3  0.306781
39            87          7  0.276266
40            39          4  0.279326
41            49          4  0.273381
42            59          5  0.275943
43            47         11  0.288363
44            23          4  0.263894
45            26          6  0.267758
46            11          5  0.195222
47            69         10  0.275137
48            84          2  0.325823
49            69          6  0.268853

In [31]:

fig = px.scatter(search_data_gbr_0, 
                 x="n_estimators", 
                 y="max_depth", 
                 color="score", 
                 color_continuous_scale=color_scale)

fig.update_layout(width=900, height=800)
fig.show()

The scatter plot above contains the samples from the search data from the gbr-model. It seams that high max_depth delivers bad scores but we should explore higher values for n_estimators.

Continuing the Search ¶

Hyperactive makes it very easy to continue a search. The search data you already used for data exploration can just be passed to Hyperactive. This is done in multiple ways:

- You can extract the best parameters via the "best_para"-method. This can than be passed to "initialize" to start at this position in the search space
- The search data from the "results"-method can be passed to "memory_warm_start". The search data is automaticaly added into the memory-dictionary.
- You can also pass the search data to "warm_start_smbo". This has the effect that the Bayesian optimizer can do more precise approximations in the beginning of the optimization run.

In [32]:

best_para_gbr_0 = hyper_gbr_0.best_para(gbr_model_0)
initialize = {"random": 4, "warm_start": [best_para_gbr_0]}


search_space_gbr_01 = {
    "n_estimators": list(range(10, 250, 5)),
    "max_depth": list(range(2, 8)),
}


hyper_gbr_01 = Hyperactive(verbosity=False)
hyper_gbr_01.add_search(gbr_model_0, 
                        search_space_gbr_01, 
                        n_iter=50,
                        n_jobs=2,
                        memory_warm_start=search_data_gbr_0, 
                        initialize=initialize)
hyper_gbr_01.run()

search_data_gbr_01 = hyper_gbr_01.search_data(gbr_model_0)

 hyper_optimizer search_data 
     n_estimators  max_depth     score
0             45          6  0.260725
1             25          6  0.278369
2             30          2  0.311317
3            130          6  0.288743
4            100          2  0.326558
5             80          3  0.313228
6            240          7  0.282410
7             20          2  0.289883
8            210          7  0.285183
9             25          3  0.305628
10            55          6  0.276450
11           105          4  0.290260
12           180          3  0.306254
13           175          4  0.258077
14            20          5  0.243912
15           175          7  0.264526
16           235          2  0.329066
17           200          5  0.279308
18            80          7  0.267073
19           195          2  0.329525
20            70          2  0.326380
21           165          4  0.268225
22           245          5  0.260622
23           105          3  0.308383
24           145          4  0.276206
25            75          3  0.312939
26            60          3  0.312158
27           155          3  0.310263
28           150          5  0.262560
29           155          5  0.280357
30           130          7  0.272747
31           155          3  0.310263
32           220          3  0.313533
33            40          2  0.320701
34            75          4  0.228214
35           230          7  0.272366
36           180          2  0.328918
37            75          6  0.266215
38            10          7  0.176882
39           175          6  0.271295
40            95          4  0.292781
41            25          4  0.263894
42            50          4  0.273381
43           240          3  0.312109
44            90          4  0.271041
45           145          2  0.328595
46            65          3  0.315951
47           220          4  0.275379
48            30          2  0.311317
49           175          2  0.329184 


 hyper_optimizer search_data 
     n_estimators  max_depth     score
0             40          4  0.279326
1             95          2  0.326284
2             40          3  0.305565
3            215          7  0.278450
4            100          2  0.326558
5            115          7  0.263180
6            130          3  0.312360
7            115          4  0.272206
8             50          4  0.273381
9            175          6  0.270705
10           195          5  0.275754
11           165          5  0.269701
12            75          3  0.313818
13           190          7  0.273318
14            30          7  0.288718
15           245          3  0.308790
16           105          3  0.316376
17           100          4  0.278197
18           185          5  0.267187
19           125          5  0.269066
20            35          6  0.273074
21            30          6  0.267758
22           170          4  0.280237
23           180          4  0.264906
24           120          4  0.270004
25           140          3  0.310266
26           155          7  0.291970
27           155          3  0.313230
28           130          5  0.282460
29            40          3  0.305565
30           190          5  0.250403
31           220          7  0.270510
32           190          4  0.271958
33           235          2  0.329018
34           185          3  0.312214
35           205          7  0.256337
36            65          2  0.325732
37            70          3  0.313170
38            70          3  0.313170
39           205          6  0.280859
40           125          3  0.313643
41            55          6  0.288828
42            80          7  0.272950
43           150          3  0.311705
44            40          7  0.271878
45            45          2  0.323368
46            35          5  0.278078
47           240          7  0.270968
48           130          4  0.264316
49           120          6  0.281479

In [33]:

# merge the search data from the previous run and the current run
search_data_gbr_01_ = search_data_gbr_01.append(search_data_gbr_0, ignore_index=True)
search_data_gbr_01_

Out[33]:

	n_estimators	max_depth	score
0	40	4	0.279326
1	95	2	0.326284
2	40	3	0.305565
3	215	7	0.278450
4	100	2	0.326558
...	...	...	...
145	26	6	0.267758
146	11	5	0.195222
147	69	10	0.275137
148	84	2	0.325823
149	69	6	0.268853

150 rows × 3 columns

In [34]:

fig = px.scatter(search_data_gbr_01_, 
                 x="n_estimators", 
                 y="max_depth", 
                 color="score", 
                 color_continuous_scale=color_scale)

fig.update_layout(width=900, height=800)
fig.show()

The scatter plot shows the search data from the previous run on the left side and the new search data on the bottom, because of the different search spaces.

Parallel Computing ¶

Lets throw more computational resources at this problem:

- 1 job does a hill climbing search starting at the best position from the last run
- 1 job does a hill climbing search with four random initial positions
- 2 jobs doing a random search

All of those jobs run in parallel and merge their results into one search data file.

In [35]:

best_para_gbr_01 = hyper_gbr_01.best_para(gbr_model_0)
initialize = {"warm_start": [best_para_gbr_01]}


search_space_gbr_02 = {
    "n_estimators": list(range(150, 300, 2)),
    "max_depth": list(range(2, 5)),
}

optimizer = HillClimbingOptimizer(rand_rest_p=0)

hyper_gbr_02 = Hyperactive(verbosity=False)
hyper_gbr_02.add_search(gbr_model_0, 
                        search_space_gbr_02, 
                        n_iter=50,
                        n_jobs=1,
                        optimizer=optimizer,
                        memory_warm_start=search_data_gbr_01_, 
                        initialize=initialize)
hyper_gbr_02.add_search(gbr_model_0, 
                        search_space_gbr_02, 
                        n_iter=50,
                        n_jobs=1,
                        optimizer=optimizer,
                        memory_warm_start=search_data_gbr_01_, 
                        initialize={"random": 4})
hyper_gbr_02.add_search(gbr_model_0, 
                        search_space_gbr_02, 
                        n_iter=50,
                        n_jobs=2,
                        memory_warm_start=search_data_gbr_01_)
hyper_gbr_02.run()

search_data_gbr_02 = hyper_gbr_02.search_data(gbr_model_0)

 hyper_optimizer search_data 
     n_estimators  max_depth     score
0            178          4  0.263013
1            162          4  0.280177
2            168          2  0.328659
3            248          4  0.275434
4            178          4  0.263013
5            172          4  0.289256
6            178          4  0.263013
7            168          4  0.283338
8            178          4  0.263013
9            176          4  0.258077
10           174          4  0.269407
11           168          4  0.283338
12           174          4  0.269407
13           164          4  0.281814
14           172          4  0.289256
15           178          4  0.263013
16           170          4  0.280237
17           168          4  0.283338
18           166          4  0.268225
19           178          4  0.263013
20           174          4  0.269407
21           168          4  0.283338
22           176          4  0.258077
23           170          4  0.280237
24           176          4  0.258077
25           166          4  0.268225
26           174          4  0.269407
27           182          4  0.282154
28           178          4  0.263013
29           164          4  0.281814
30           170          4  0.280237
31           176          4  0.258077
32           164          4  0.281814
33           166          4  0.268225
34           180          4  0.264906
35           176          4  0.258077
36           170          4  0.280237
37           172          4  0.289256
38           168          4  0.283338
39           168          4  0.283338
40           172          4  0.289256
41           174          4  0.269407
42           174          4  0.269407
43           174          4  0.269407
44           160          4  0.261971
45           174          4  0.269407
46           162          4  0.280177
47           178          4  0.263013
48           178          4  0.263013
49           174          4  0.269407 


 hyper_optimizer search_data 
     n_estimators  max_depth     score
0            194          2  0.329392
1            194          2  0.329392
2            198          2  0.328777
3            190          2  0.328667
4            194          2  0.329392
5            202          2  0.328845
6            192          2  0.329001
7            190          2  0.328667
8            194          2  0.329392
9            196          2  0.329525
10           200          2  0.329898
11           202          2  0.328845
12           200          2  0.329898
13           200          2  0.329898
14           196          2  0.329525
15           202          2  0.328845
16           206          2  0.329736
17           206          2  0.329736
18           198          2  0.328777
19           192          2  0.329001
20           202          2  0.328845
21           204          2  0.328983
22           200          2  0.329898
23           194          2  0.329392
24           200          2  0.329898
25           202          2  0.328845
26           200          2  0.329898
27           202          2  0.328845
28           198          2  0.328777
29           200          2  0.329898
30           198          2  0.328777
31           200          2  0.329898
32           186          2  0.329217
33           198          2  0.328777
34           200          2  0.329898
35           198          2  0.328777
36           206          2  0.329736
37           194          2  0.329392
38           196          2  0.329525
39           198          2  0.328777
40           202          2  0.328845
41           214          2  0.330057
42           196          2  0.329525
43           222          2  0.328906
44           208          2  0.329644
45           212          2  0.329418
46           218          2  0.329348
47           212          2  0.329418
48           206          2  0.329736
49           214          2  0.330057 


 hyper_optimizer search_data 
     n_estimators  max_depth     score
0            288          4  0.237881
1            294          2  0.328950
2            198          2  0.328207
3            198          2  0.328207
4            246          2  0.328708
5            246          2  0.328708
6            150          4  0.263894
7            298          2  0.329070
8            298          4  0.264539
9            150          2  0.325823
10           192          4  0.286485
11           248          4  0.275830
12           262          2  0.329647
13           286          2  0.328659
14           192          4  0.286485
15           280          4  0.238565
16           288          4  0.237881
17           282          4  0.285908
18           230          4  0.271318
19           230          4  0.271318
20           262          2  0.329647
21           200          3  0.316830
22           208          3  0.316408
23           246          3  0.308790
24           218          3  0.315333
25           150          4  0.263894
26           232          3  0.316344
27           168          3  0.310756
28           286          4  0.278173
29           162          2  0.328547
30           230          4  0.271318
31           250          2  0.329677
32           182          4  0.283171
33           194          3  0.309264
34           238          2  0.329388
35           260          2  0.329516
36           250          4  0.275661
37           266          3  0.313437
38           274          4  0.267048
39           264          3  0.311236
40           216          3  0.308580
41           196          2  0.329525
42           276          3  0.314968
43           256          3  0.314426
44           198          4  0.278145
45           252          2  0.329357
46           250          4  0.275661
47           260          3  0.311066
48           280          4  0.238565
49           176          4  0.258077 


 hyper_optimizer search_data 
     n_estimators  max_depth     score
0            284          4  0.269907
1            224          3  0.312947
2            198          2  0.329464
3            198          2  0.329464
4            246          2  0.328885
5            246          2  0.328885
6            150          4  0.263894
7            298          2  0.329071
8            150          2  0.325823
9            298          4  0.271417
10           254          2  0.329152
11           284          4  0.269907
12           200          4  0.276576
13           262          4  0.275647
14           298          3  0.308374
15           192          2  0.329062
16           236          3  0.311532
17           240          4  0.289443
18           236          2  0.329066
19           224          2  0.329289
20           222          3  0.309378
21           164          2  0.328603
22           252          2  0.329503
23           284          2  0.329192
24           294          3  0.316338
25           228          4  0.279851
26           246          3  0.308790
27           280          4  0.272449
28           268          4  0.276267
29           150          4  0.263894
30           206          2  0.329596
31           290          3  0.313317
32           194          4  0.276474
33           244          4  0.272497
34           226          2  0.329349
35           226          4  0.253493
36           174          4  0.283864
37           232          4  0.281243
38           150          4  0.263894
39           200          3  0.316185
40           216          4  0.282455
41           280          2  0.328181
42           276          3  0.309901
43           284          3  0.310843
44           252          4  0.278502
45           278          3  0.309817
46           188          2  0.329047
47           258          4  0.283216
48           258          3  0.308893
49           282          4  0.294080

In [36]:

search_data_gbr_02_ = search_data_gbr_02.append(search_data_gbr_01_, ignore_index=True)

fig = px.scatter(search_data_gbr_02_, 
                 x="n_estimators", 
                 y="max_depth", 
                 color="score", 
                 color_continuous_scale=color_scale)

fig.update_layout(width=900, height=800)
fig.show()

All the search data we collected shows a clear pattern. We should keep the max_depth at 2. In the following plot the search data is filtered to show only max_depth == 2, while showing the n_estimators dependend on the score.

In [37]:

search_data_gbr_02_f = search_data_gbr_02_[search_data_gbr_02_["score"] > 0.68]
search_data_gbr_02_f_max_depth2 = search_data_gbr_02_f[search_data_gbr_02_["max_depth"] == 2]

In [38]:

fig = px.scatter(search_data_gbr_02_f_max_depth2, 
                 x="n_estimators", 
                 y="score")

fig.update_layout(width=900, height=800)
fig.show()

The filtering and visualization of the search data in the last few plots was an example how you can explore the model and search space yourself. Hyperactive makes it very easy to collect and reuse search data. Let's take a look how to collect more data:

Collecting more Data ¶

Until now you have seen, that the objective function always returns only one variable: The score, which is always a real number. But Hyperactive has the capability to accept more variables. Those additional variables won't affect the score or the decision making of the optimization algorithm, but they will be collected in each iteration and accessed in the search data.

This feature can be very useful, because you can add any variable you want to the search data, which might help you understand the model better. To collect additional data in the objective function you just put it into a dictionary and return it alongside with the score. The key will be the column name in the search data and the value will be collected.

In [39]:

def gbr_model_1(opt):
    gbr = GradientBoostingRegressor(
        n_estimators=opt["n_estimators"],
        max_depth=opt["max_depth"],
    )
    c_time = time.time()
    scores = cross_val_score(gbr, X_boston, y_boston, cv=5)
    cv_time = time.time() - c_time
    
    # add the dictionary to collect more data
    return scores.mean(), {"cv_time": cv_time}


search_space_gbr_1 = {
    "n_estimators": list(range(10, 250, 5)),
    "max_depth": list(range(2, 8)),
}

In [40]:

hyper_gbr_1 = Hyperactive(verbosity=False)
hyper_gbr_1.add_search(gbr_model_1, search_space_gbr_1, n_iter=15, n_jobs=8, initialize={"random": 10})
hyper_gbr_1.run()

search_data_gbr_1 = hyper_gbr_1.search_data(gbr_model_1)
search_data_gbr_1.head()

 hyper_optimizer search_data 
     n_estimators  max_depth   cv_time     score
0             45          6  0.118185  0.266934
1             25          6  0.066240  0.274931
2             30          2  0.064459  0.311317
3            130          6  0.313416  0.279695
4             80          3  0.194099  0.311537
5            240          7  0.481668  0.286414
6             20          2  0.043417  0.289883
7            210          7  0.446462  0.275240
8             25          3  0.061512  0.300796
9             55          6  0.135543  0.291698
10           105          4  0.263462  0.270263
11           180          3  0.429693  0.313951
12           175          4  0.449600  0.274124
13            20          5  0.053172  0.268555
14           175          7  0.404586  0.288742 


 hyper_optimizer search_data 
     n_estimators  max_depth   cv_time     score
0            170          4  0.429751  0.273267
1            115          7  0.278648  0.296102
2             85          4  0.211211  0.279554
3            135          4  0.342682  0.267721
4            220          6  0.454925  0.268529
5            135          5  0.323710  0.270547
6             35          3  0.084315  0.306963
7             60          5  0.149962  0.260005
8             25          2  0.053171  0.303257
9            140          2  0.278299  0.328258
10           210          2  0.428252  0.330240
11            80          2  0.163116  0.326211
12            25          4  0.064566  0.293585
13           115          6  0.277513  0.270716
14            45          3  0.110079  0.313095 


 hyper_optimizer search_data 
     n_estimators  max_depth   cv_time     score
0             40          4  0.104604  0.270946
1             95          2  0.191430  0.326538
2             40          3  0.096477  0.306560
3            215          7  0.447988  0.268221
4            115          7  0.277140  0.278463
5            130          3  0.307548  0.310494
6            115          4  0.285706  0.271133
7             50          4  0.124728  0.275723
8            175          6  0.403750  0.272474
9            195          5  0.425691  0.282180
10           165          5  0.390918  0.282236
11            75          3  0.183286  0.316066
12           190          7  0.423599  0.274997
13            30          7  0.077578  0.269481
14           245          3  0.560118  0.312848 


 hyper_optimizer search_data 
     n_estimators  max_depth   cv_time     score
0             50          3  0.149075  0.310441
1            195          3  0.459050  0.316121
2            230          2  0.474948  0.328721
3             70          3  0.166681  0.308237
4             40          7  0.101941  0.262045
5            100          2  0.203159  0.326424
6            125          6  0.300996  0.285009
7            145          5  0.351124  0.281005
8            195          7  0.435357  0.282233
9             30          7  0.080386  0.262075
10           175          7  0.413364  0.268366
11           175          3  0.413322  0.310711
12           110          7  0.265733  0.275947
13           140          6  0.335133  0.279896
14            95          2  0.193122  0.326247 


 hyper_optimizer search_data 
     n_estimators  max_depth   cv_time     score
0            175          6  0.412572  0.280366
1            100          4  0.250836  0.290253
2            240          2  0.479053  0.328462
3            195          5  0.432045  0.258361
4            140          6  0.336825  0.284946
5             10          3  0.027291  0.175655
6             80          3  0.189595  0.314869
7            115          2  0.236703  0.326597
8            135          7  0.328682  0.277129
9             65          7  0.164024  0.281355
10           220          2  0.447159  0.329678
11            70          3  0.166764  0.307975
12            80          2  0.161863  0.325785
13           225          3  0.515723  0.313758
14           200          6  0.429268  0.280143 


 hyper_optimizer search_data 
     n_estimators  max_depth   cv_time     score
0            220          3  0.528340  0.311830
1            125          7  0.306709  0.280501
2            195          2  0.394514  0.329501
3            150          4  0.377696  0.264062
4            235          5  0.466406  0.286418
5             40          5  0.101020  0.265324
6            150          5  0.367667  0.268419
7            235          3  0.550423  0.308684
8             10          2  0.024037  0.130693
9            100          3  0.237929  0.311202
10            65          2  0.132017  0.325816
11           195          4  0.463978  0.271931
12           170          7  0.396408  0.288230
13           180          3  0.419057  0.311722
14            35          7  0.088092  0.282675 


 hyper_optimizer search_data 
     n_estimators  max_depth   cv_time     score
0            180          7  0.414299  0.262485
1            190          2  0.376027  0.329066
2             80          4  0.199736  0.262852
3            205          4  0.481331  0.288552
4            180          3  0.418108  0.310108
5            240          5  0.469577  0.278504
6             85          6  0.205165  0.270781
7            190          7  0.421955  0.264020
8            190          4  0.460415  0.281676
9            210          7  0.442884  0.276648
10           105          4  0.261704  0.285338
11            10          7  0.029574  0.179460
12           135          7  0.324964  0.269716
13            90          5  0.217990  0.278937
14           130          4  0.323176  0.286018 


 hyper_optimizer search_data 
     n_estimators  max_depth   cv_time     score
0            190          5  0.448730  0.280003
1            165          6  0.392935  0.287462
2            125          5  0.304988  0.283130
3            210          3  0.486952  0.315794
4            115          4  0.286834  0.263519
5            110          7  0.266535  0.282376
6             55          4  0.139961  0.283277
7            240          4  0.567436  0.278804
8             85          6  0.214452  0.285471
9            225          3  0.516657  0.308658
10           160          4  0.397832  0.260815
11           100          4  0.248304  0.281050
12           225          2  0.444328  0.329629
13            90          3  0.213261  0.310404
14            85          3  0.200542  0.311639

Out[40]:

	n_estimators	max_depth	cv_time	score
0	40	4	0.104604	0.270946
1	95	2	0.191430	0.326538
2	40	3	0.096477	0.306560
3	215	7	0.447988	0.268221
4	115	7	0.277140	0.278463

In [41]:

fig = px.scatter(search_data_gbr_1, 
                 x="n_estimators", 
                 y="max_depth", 
                 color="score", 
                 size='cv_time', 
                 color_continuous_scale=color_scale)

fig.update_layout(width=900, height=800)
fig.show()

The scatter plot above shows the samples od the search data, but adds a visualization of the cross-validation-time with the size of the scatter-points.

Managing multiple objectives ¶

In the last chapter you were able to collect additional data during the optimization run. This data did not affect the score. But you can still try to create one score that represents information from multiple scores. In the following example we want to optimize a model to get a high score and at the same time a low training time.

In [42]:

def gbr_model_2(opt):
    gbr = GradientBoostingRegressor(
        n_estimators=opt["n_estimators"],
        max_depth=opt["max_depth"],
    )
    c_time = time.time()
    scores = cross_val_score(gbr, X_boston, y_boston, cv=5)
    cv_time = time.time() - c_time
    
    score_cv_avg = scores.mean()
    score_cv_std = scores.std()
    
    # the score is calculated from the cv-score and the cv-training time
    score = score_cv_avg / (cv_time**0.1)
    
    # independed from the score we want some additional data
    return score, {"cv_time": cv_time, 
                   "score_cv_avg": score_cv_avg,
                   "score_cv_std": score_cv_std,
                  "scores": scores,
                  }


search_space_gbr_2 = {
    "n_estimators": list(range(10, 250, 5)),
    "max_depth": list(range(2, 12)),
}

The objective function above enables us to return a score that is composed of multiple variables. At the same time, we also want to collect data about the variables the score is composed from. This helps us understand the score later during the data visualization.

In [43]:

hyper_gbr_2 = Hyperactive(verbosity=False)
hyper_gbr_2.add_search(gbr_model_2, search_space_gbr_2, n_iter=15, n_jobs=8, initialize={"random": 10})
hyper_gbr_2.run()

search_data_gbr_2 = hyper_gbr_2.search_data(gbr_model_2)
search_data_gbr_2.head()

 hyper_optimizer search_data 
     n_estimators  max_depth   cv_time     score  score_cv_avg  score_cv_std  \
0             45         10  0.136457  0.323857      0.265371      0.330783   
1             25         10  0.064831  0.357932      0.272257      0.336918   
2             30          3  0.072968  0.403566      0.310619      0.380542   
3            130         10  0.312861  0.318940      0.283952      0.349164   
4             80          4  0.200744  0.313594      0.267074      0.332184   
5            240          2  0.484914  0.354671      0.329908      0.406825   
6             10          2  0.023994  0.189776      0.130693      0.160954   
7             70          4  0.178738  0.328887      0.276865      0.341739   
8            175          6  0.407856  0.292649      0.267545      0.332618   
9             95         10  0.234092  0.309486      0.267658      0.332724   
10            55         10  0.135570  0.332052      0.271908      0.336840   
11            95          2  0.192210  0.385327      0.326744      0.402988   
12           130         10  0.312861  0.318940      0.283952      0.349164   
13           220          3  0.505374  0.336861      0.314639      0.385742   
14           200          9  0.434827  0.301456      0.277367      0.342242   

                                               scores  
0   [0.0, 0.7606979573331243, 0.0, 0.5661560583542...  
1   [0.0, 0.756928785905888, 0.0, 0.60435498531228...  
2   [0.0, 0.7618800535985606, 0.0, 0.7912142241900...  
3   [0.0, 0.7591975497210028, 0.0, 0.6605601524950...  
4   [0.0, 0.7592408879023933, 0.0, 0.5761300406532...  
5   [0.0, 0.7498086663521787, 0.0, 0.8997297401252...  
6   [0.0, 0.3000208717930798, 0.0, 0.3534418130637...  
7   [0.0, 0.7593214379685758, 0.0, 0.6250028639511...  
8   [0.0, 0.7591973311141902, 0.0, 0.5785281255638...  
9   [0.0, 0.7592063189382285, 0.0, 0.5790821541598...  
10  [0.0, 0.7597763989791204, 0.0, 0.5997639441451...  
11  [0.0, 0.7417446308421538, 0.0, 0.8919733614070...  
12  [0.0, 0.7591975497210028, 0.0, 0.6605601524950...  
13  [0.0, 0.7591973311141902, 0.0, 0.8139963950064...  
14  [0.0, 0.7591973311141902, 0.0, 0.6276383127284...   


 hyper_optimizer search_data 
     n_estimators  max_depth   cv_time     score  score_cv_avg  score_cv_std  \
0            170          7  0.413711  0.311161      0.284875      0.350164   
1            115          5  0.277382  0.332062      0.292097      0.358213   
2            105          8  0.256741  0.320797      0.280014      0.344981   
3            125         11  0.300331  0.323629      0.286951      0.352436   
4            135          9  0.323920  0.319626      0.285552      0.350901   
5             35          4  0.088649  0.354309      0.278067      0.343188   
6             60          9  0.147740  0.348017      0.287442      0.352994   
7             25          2  0.054084  0.405979      0.303257      0.372563   
8            140          3  0.329806  0.345044      0.308817      0.378309   
9            210          2  0.414936  0.359749      0.329456      0.406356   
10            80          3  0.188538  0.369897      0.313055      0.383698   
11            25          6  0.064801  0.370514      0.281814      0.346738   
12           115         11  0.275769  0.296294      0.260482      0.326254   
13            45          4  0.113226  0.354023      0.284725      0.350083   
14           210          8  0.440172  0.283969      0.261597      0.327226   

                                               scores  
0   [0.0, 0.7591973314288802, 0.0, 0.6651778198072...  
1   [0.0, 0.7591984186362237, 0.0, 0.7012863352115...  
2   [0.0, 0.7592004620942316, 0.0, 0.6408702197439...  
3   [0.0, 0.7591977059680856, 0.0, 0.6755583577510...  
4   [0.0, 0.7591974574568147, 0.0, 0.6685633062254...  
5   [0.0, 0.7621839309593008, 0.0, 0.6281502604805...  
6   [0.0, 0.7595464902301229, 0.0, 0.6776650535732...  
7   [0.0, 0.7118804031055922, 0.0, 0.8044043323926...  
8   [0.0, 0.7591974029750898, 0.0, 0.7848865763114...  
9   [0.0, 0.7476038105049081, 0.0, 0.8996785062807...  
10  [0.0, 0.7592408879023933, 0.0, 0.8060341746144...  
11  [0.0, 0.756928785905888, 0.0, 0.65214339070086...  
12  [0.0, 0.7591984186362237, 0.0, 0.5432121318684...  
13  [0.0, 0.7606979573331243, 0.0, 0.6629268123550...  
14  [0.0, 0.7591973311141902, 0.0, 0.5487874146446...   


 hyper_optimizer search_data 
     n_estimators  max_depth   cv_time     score  score_cv_avg  score_cv_std  \
0            180         11  0.415904  0.292008      0.267482      0.332559   
1             15          5  0.041179  0.314475      0.228589      0.285959   
2             95         11  0.229800  0.316754      0.273436      0.338287   
3             95         10  0.232698  0.339406      0.293358      0.359660   
4             60          9  0.148902  0.323756      0.267614      0.332720   
5             85         11  0.207217  0.334708      0.285962      0.351350   
6            190         11  0.419905  0.315123      0.288932      0.354635   
7             90          6  0.221628  0.308532      0.265376      0.330614   
8             90          2  0.199622  0.383714      0.326609      0.402842   
9            230          8  0.476602  0.282412      0.262240      0.327792   
10           245          6  0.475372  0.292884      0.271894      0.336772   
11           155          8  0.368208  0.314119      0.284252      0.349489   
12            90          3  0.211561  0.368240      0.315264      0.386551   
13            20          3  0.049844  0.386807      0.286586      0.351208   
14            60         11  0.147057  0.354909      0.292998      0.359256   

                                               scores  
0   [0.0, 0.7591973311141902, 0.0, 0.5782101756953...  
1   [0.0, 0.6635783675193356, 0.0, 0.4793645774808...  
2   [0.0, 0.7592063189382285, 0.0, 0.6079760893920...  
3   [0.0, 0.7592063189382285, 0.0, 0.7075857737728...  
4   [0.0, 0.7595464902301229, 0.0, 0.5785220373474...  
5   [0.0, 0.7592230854704327, 0.0, 0.6705870066605...  
6   [0.0, 0.7591973311141902, 0.0, 0.6854609574533...  
7   [0.0, 0.7592125490494327, 0.0, 0.5676686658178...  
8   [0.0, 0.7411685862901474, 0.0, 0.8918770859385...  
9   [0.0, 0.7591973311141902, 0.0, 0.5520006952587...  
10  [0.0, 0.7591973311141902, 0.0, 0.6002726153686...  
11  [0.0, 0.7591973405896815, 0.0, 0.6620637527677...  
12  [0.0, 0.7592125490494327, 0.0, 0.8171062246438...  
13  [0.0, 0.7358195863474217, 0.0, 0.6971118540768...  
14  [0.0, 0.7595464902301229, 0.0, 0.7054459913405...   


 hyper_optimizer search_data 
     n_estimators  max_depth   cv_time     score  score_cv_avg  score_cv_std  \
0            220          5  0.455640  0.304106      0.281117      0.346140   
1            125         11  0.302580  0.316119      0.280501      0.345492   
2             10          9  0.030269  0.196829      0.138736      0.179144   
3            110          9  0.265424  0.302174      0.264637      0.329940   
4             40          9  0.100850  0.344517      0.273891      0.338938   
5            150          8  0.358505  0.323525      0.291983      0.358083   
6            235          5  0.472267  0.292650      0.271498      0.336387   
7             10          3  0.028313  0.244344      0.171081      0.209606   
8            100          5  0.252887  0.325643      0.283814      0.349017   
9             65          2  0.134255  0.398686      0.326155      0.402016   
10           195          7  0.432069  0.310959      0.285929      0.351313   
11           170         10  0.398265  0.306941      0.279944      0.344908   
12            60          3  0.143180  0.376253      0.309791      0.379532   
13           230          3  0.526460  0.331901      0.311275      0.381422   
14           225          2  0.451513  0.357549      0.330219      0.407264   

                                               scores  
0   [0.0, 0.7591973311141902, 0.0, 0.6463874468818...  
1   [0.0, 0.7591977059680856, 0.0, 0.6433080143749...  
2   [0.0, 0.43657746289686405, 0.0, 0.257100661469...  
3   [0.0, 0.7591991773750808, 0.0, 0.5639876512819...  
4   [0.0, 0.761449900875621, 0.0, 0.60800697241692...  
5   [0.0, 0.7591973518071372, 0.0, 0.7007177747400...  
6   [0.0, 0.7591973311141902, 0.0, 0.5982950718976...  
7   [0.0, 0.43657746289686405, 0.0, 0.418826802922...  
8   [0.0, 0.759202637170055, 0.0, 0.65986925166211...  
9   [0.0, 0.7437668317273509, 0.0, 0.8870068552202...  
10  [0.0, 0.7591973311141902, 0.0, 0.6704501104566...  
11  [0.0, 0.7591973314288802, 0.0, 0.6405251282294...  
12  [0.0, 0.7595464902301229, 0.0, 0.7894066997405...  
13  [0.0, 0.7591973311141902, 0.0, 0.7971785803182...  
14  [0.0, 0.7497741442819785, 0.0, 0.9013226961820...   


 hyper_optimizer search_data 
     n_estimators  max_depth   cv_time     score  score_cv_avg  score_cv_std  \
0             40          6  0.117105  0.339244      0.273760      0.338809   
1             95          2  0.192001  0.385126      0.326538      0.402701   
2             40          5  0.100304  0.346020      0.274937      0.339974   
3            215          7  0.456671  0.290090      0.268221      0.333253   
4            225          8  0.460565  0.300552      0.278131      0.343026   
5             65          7  0.159255  0.353656      0.294300      0.360753   
6            100          4  0.248682  0.298836      0.260015      0.325850   
7            100         10  0.254327  0.326997      0.285156      0.350470   
8            180         11  0.416745  0.300223      0.275062      0.339906   
9            150          9  0.362403  0.289249      0.261331      0.326992   
10           165          5  0.390241  0.257560      0.234429      0.307286   
11            55         11  0.136349  0.343151      0.281158      0.346221   
12           245          3  0.558418  0.334188      0.315273      0.386564   
13           210          4  0.489017  0.307314      0.286098      0.351498   
14           105          5  0.255243  0.330143      0.288003      0.353600   

                                               scores  
0   [0.0, 0.761449900875621, 0.0, 0.60735008583143...  
1   [0.0, 0.74171543851855, 0.0, 0.890972364821101...  
2   [0.0, 0.761449900875621, 0.0, 0.61323489059761...  
3   [0.0, 0.7591973311141902, 0.0, 0.5819101634954...  
4   [0.0, 0.7591973311141902, 0.0, 0.6314568449037...  
5   [0.0, 0.7594060236112754, 0.0, 0.7120962035144...  
6   [0.0, 0.759202637170055, 0.0, 0.54087068362118...  
7   [0.0, 0.759202637170055, 0.0, 0.66657920962673...  
8   [0.0, 0.7591973311141902, 0.0, 0.6161127493205...  
9   [0.0, 0.7591973518071372, 0.0, 0.5474566324285...  
10  [0.0, 0.7591973319805099, 0.0, 0.4129485460500...  
11  [0.0, 0.7597763989791204, 0.0, 0.6460118139438...  
12  [0.0, 0.7591973311141902, 0.0, 0.8171690398746...  
13  [0.0, 0.7591973311141902, 0.0, 0.6712930359609...  
14  [0.0, 0.7592004620942316, 0.0, 0.6808160058318...   


 hyper_optimizer search_data 
     n_estimators  max_depth   cv_time     score  score_cv_avg  score_cv_std  \
0            175         10  0.435712  0.300426      0.276476      0.341333   
1            100          7  0.242967  0.335800      0.291498      0.357530   
2            240          3  0.553861  0.332909      0.313809      0.384670   
3            195          9  0.428541  0.292784      0.268997      0.333985   
4            140         10  0.339210  0.320703      0.287839      0.353418   
5             10          4  0.029351  0.241386      0.169619      0.207891   
6             80          5  0.195049  0.313984      0.266638      0.331779   
7            115          2  0.231999  0.377976      0.326597      0.402624   
8            135          4  0.336672  0.302481      0.271280      0.336175   
9            215          2  0.426309  0.358897      0.329566      0.406412   
10            70          5  0.170563  0.334608      0.280366      0.345358   
11            80          2  0.164777  0.390169      0.325794      0.401664   
12           225          5  0.458695  0.289719      0.267997      0.333041   
13           200         10  0.437606  0.297075      0.273511      0.338360   
14            85          8  0.204948  0.331356      0.282787      0.347915   

                                               scores  
0   [0.0, 0.7591973311141902, 0.0, 0.6231820929314...  
1   [0.0, 0.759202637170055, 0.0, 0.69828739724825...  
2   [0.0, 0.7591973311141902, 0.0, 0.8098490909401...  
3   [0.0, 0.7591973311141902, 0.0, 0.5857860644759...  
4   [0.0, 0.7591974029750898, 0.0, 0.6799969200704...  
5   [0.0, 0.43657746289686405, 0.0, 0.411517709726...  
6   [0.0, 0.7592408879023933, 0.0, 0.5739502865654...  
7   [0.0, 0.743909637104236, 0.0, 0.88907769969297...  
8   [0.0, 0.7591974574568147, 0.0, 0.5972046427982...  
9   [0.0, 0.7489188791452149, 0.0, 0.8989108289244...  
10  [0.0, 0.7593214379685758, 0.0, 0.6425066168565...  
11  [0.0, 0.741652812212936, 0.0, 0.88731529803996...  
12  [0.0, 0.7591973311141902, 0.0, 0.5807874747754...  
13  [0.0, 0.7591973311141902, 0.0, 0.6083562144719...  
14  [0.0, 0.7592230854704327, 0.0, 0.6547128579433...   


 hyper_optimizer search_data 

 hyper_optimizer search_data 
      n_estimators  max_depth   cv_time     score  score_cv_avg  score_cv_std  \
0            190          8  0.425644  0.304549      0.279615      0.344565   
1            165         10  0.395935  0.313447      0.285710      0.351074   
2            125          8  0.301297  0.314241      0.278716      0.343631   
3            210          5  0.442195  0.302296      0.278608      0.343518   
4            115          6  0.277537  0.301964      0.265636      0.330850   
5            110          4  0.274258  0.325823      0.286284      0.351703   
6            105          6  0.252859  0.289422      0.252243      0.319459   
7             85         11  0.205357  0.341245      0.291284      0.357288   
8            225          4  0.513295  0.301982      0.282499      0.347606   
9            160          7  0.378928  0.291803      0.264817      0.330102   
10           100          7  0.240047  0.317631      0.275392      0.340238   
11           225          2  0.443083  0.356974      0.329067      0.405642   
12            90          4  0.223649  0.326463      0.281055      0.346075   
13            85          5  0.204921  0.314538      0.268431      0.333452   
14            35          4  0.088334  0.331140      0.259791      0.326065   

                                               scores  
0   [0.0, 0.7591973311141902, 0.0, 0.6388801546981...  
1   [0.0, 0.7591973319805099, 0.0, 0.6693539423234...  
2   [0.0, 0.7591977059680856, 0.0, 0.6343847529870...  
3   [0.0, 0.7591973311141902, 0.0, 0.6338418345754...  
4   [0.0, 0.7591984186362237, 0.0, 0.5689810757456...  
5   [0.0, 0.7591991773750808, 0.0, 0.6722232597348...  
6   [0.0, 0.7592004620942316, 0.0, 0.5020131961419...  
7   [0.0, 0.7592230854704327, 0.0, 0.6971975231665...  
8   [0.0, 0.7591973311141902, 0.0, 0.6532994289108...  
9   [0.0, 0.7591973339658764, 0.0, 0.5648870415883...  
10  [0.0, 0.759202637170055, 0.0, 0.61775959245503...  
11  [0.0, 0.749907029558438, 0.0, 0.89542940208227...  
12  [0.0, 0.7592125490494327, 0.0, 0.6460601110964...  
13  [0.0, 0.7592230854704327, 0.0, 0.5829309386913...  
14  [0.0, 0.7621839309593008, 0.0, 0.5367704553759...   

    n_estimators  max_depth   cv_time     score  score_cv_avg  score_cv_std  \
0             50          4  0.134709  0.325120      0.266062      0.331350   
1            195          4  0.465079  0.288511      0.267248      0.332341   
2            230          2  0.460997  0.355187      0.328721      0.405174   
3             70          5  0.171615  0.311101      0.260830      0.326572   
4             40          6  0.103232  0.328847      0.262045      0.327909   
5             25          7  0.068596  0.341303      0.261078      0.326479   
6            195          8  0.429352  0.293719      0.269907      0.334852   
7            140         11  0.341563  0.308038      0.276663      0.341524   
8            230          3  0.534215  0.333307      0.313052      0.383695   
9            235         10  0.466711  0.302454      0.280262      0.345240   
10           215         10  0.461447  0.298496      0.276281      0.341135   
11            50          7  0.124703  0.330690      0.268540      0.333653   
12           215          8  0.447357  0.311976      0.287864      0.353445   
13           170          6  0.404659  0.303095      0.276878      0.341742   
14            20          4  0.054090  0.362930      0.271103      0.334056   

                                               scores  
0   [0.0, 0.7601428859741948, 0.0, 0.5701674479267...  
1   [0.0, 0.7591973311141902, 0.0, 0.5770435925038...  
2   [0.0, 0.7496839489591742, 0.0, 0.8939188384891...  
3   [0.0, 0.7593214379685758, 0.0, 0.5448284353802...  
4   [0.0, 0.761449900875621, 0.0, 0.54877415913990...  
5   [0.0, 0.756928785905888, 0.0, 0.54846060837347...  
6   [0.0, 0.7591973311141902, 0.0, 0.5903374255550...  
7   [0.0, 0.7591974029750898, 0.0, 0.6241196406447...  
8   [0.0, 0.7591973311141902, 0.0, 0.8060615508503...  
9   [0.0, 0.7591973311141902, 0.0, 0.6421115372831...  
10  [0.0, 0.7591973311141902, 0.0, 0.6222059494211...  
11  [0.0, 0.7601428859741948, 0.0, 0.5825584181845...  
12  [0.0, 0.7591973311141902, 0.0, 0.6801219906510...  
13  [0.0, 0.7591973314288802, 0.0, 0.6251904453111...  
14  [0.0, 0.7358195863474217, 0.0, 0.6196937978507...

Out[43]:

	n_estimators	max_depth	cv_time	score	score_cv_avg	score_cv_std	scores
0	40	6	0.117105	0.339244	0.273760	0.338809	[0.0, 0.761449900875621, 0.0, 0.60735008583143...
1	95	2	0.192001	0.385126	0.326538	0.402701	[0.0, 0.74171543851855, 0.0, 0.890972364821101...
2	40	5	0.100304	0.346020	0.274937	0.339974	[0.0, 0.761449900875621, 0.0, 0.61323489059761...
3	215	7	0.456671	0.290090	0.268221	0.333253	[0.0, 0.7591973311141902, 0.0, 0.5819101634954...
4	225	8	0.460565	0.300552	0.278131	0.343026	[0.0, 0.7591973311141902, 0.0, 0.6314568449037...

In [44]:

fig = px.scatter(search_data_gbr_2, 
                 x="n_estimators", 
                 y="max_depth", 
                 color="score", 
                 size='cv_time', 
                 color_continuous_scale=color_scale)

fig.update_layout(width=800, height=700)
fig.show()

In [45]:

fig = px.scatter(search_data_gbr_2, 
                 x="cv_time", 
                 y="score_cv_avg", 
                 color="score", 
                 size='score_cv_std', 
                 color_continuous_scale=color_scale)

fig.update_layout(width=800, height=700)
fig.show()

Non-numerical Search Spaces ¶

This chapter describes a very unique and helpful feature of Hyperactive: non-numerical values in the search space. You are not constrained to use numeric values in your search space, but also strings or even functions. Because of this you can do some really interesting stuff like:

- hyperparameter optimization of any parameter
- preprocessing-optimization
- neural architecture search

Lets take a look at the following example:

In [46]:

def mlp_model(opt): 
    scaler = MinMaxScaler()
    X_norm = scaler.fit_transform(X_iris)
    
    mlp = MLPClassifier(
        hidden_layer_sizes=opt["hidden_layer_sizes"],
        activation=opt["activation"],
        alpha=opt["alpha"],
        learning_rate_init=opt["learning_rate_init"],

    )
    scores = cross_val_score(mlp, X_norm, y_iris, cv=5)

    return scores.mean()


search_space_mlp = {
    "hidden_layer_sizes": list(range(10, 100, 10)),
    "activation": ["identity", "logistic", "tanh", "relu"],
    "solver":  ["lbfgs", "sgd", "adam"],
    "alpha": [1/(10**x) for x in range(1, 9)],
    "learning_rate_init": [1/(10**x) for x in range(1, 9)],

}

In [47]:

hyper_mlp_0 = Hyperactive(verbosity=False)
hyper_mlp_0.add_search(mlp_model, search_space_mlp, n_iter=40)
hyper_mlp_0.run()

mlp_search_data = hyper_mlp_0.search_data(mlp_model)
mlp_search_data.head()

 hyper_optimizer search_data 
     hidden_layer_sizes activation solver         alpha  learning_rate_init  \
0                   20       tanh    sgd  1.000000e-01        1.000000e-02   
1                   40       tanh   adam  1.000000e-07        1.000000e-03   
2                   50   logistic    sgd  1.000000e-04        1.000000e-04   
3                   90       relu  lbfgs  1.000000e-08        1.000000e-08   
4                   90       relu  lbfgs  1.000000e-01        1.000000e-01   
5                   10       relu  lbfgs  1.000000e-08        1.000000e-08   
6                   90       relu   adam  1.000000e-01        1.000000e-01   
7                   90       tanh   adam  1.000000e-05        1.000000e-06   
8                   60       relu  lbfgs  1.000000e-08        1.000000e-08   
9                   40       relu    sgd  1.000000e-02        1.000000e-04   
10                  70       tanh   adam  1.000000e-02        1.000000e-03   
11                  30   identity  lbfgs  1.000000e-04        1.000000e-04   
12                  40       tanh  lbfgs  1.000000e-03        1.000000e-04   
13                  80   logistic  lbfgs  1.000000e-02        1.000000e-01   
14                  20       relu   adam  1.000000e-07        1.000000e-05   
15                  60       relu    sgd  1.000000e-07        1.000000e-01   
16                  60       tanh   adam  1.000000e-03        1.000000e-01   
17                  10       relu   adam  1.000000e-01        1.000000e-03   
18                  20   identity    sgd  1.000000e-01        1.000000e-01   
19                  30   logistic  lbfgs  1.000000e-02        1.000000e-06   
20                  40       tanh  lbfgs  1.000000e-01        1.000000e-03   
21                  70   logistic  lbfgs  1.000000e-02        1.000000e-06   
22                  40   identity   adam  1.000000e-06        1.000000e-08   
23                  70   logistic    sgd  1.000000e-07        1.000000e-05   
24                  30   identity   adam  1.000000e-07        1.000000e-01   
25                  50   logistic    sgd  1.000000e-07        1.000000e-08   
26                  20       relu  lbfgs  1.000000e-03        1.000000e-06   
27                  80   logistic  lbfgs  1.000000e-05        1.000000e-08   
28                  10       tanh    sgd  1.000000e-05        1.000000e-08   
29                  30   identity  lbfgs  1.000000e-04        1.000000e-03   
30                  40       tanh    sgd  1.000000e-08        1.000000e-07   
31                  90       tanh    sgd  1.000000e-08        1.000000e-03   
32                  50       tanh   adam  1.000000e-03        1.000000e-01   
33                  80       relu  lbfgs  1.000000e-05        1.000000e-07   
34                  50   logistic   adam  1.000000e-08        1.000000e-04   
35                  70       tanh    sgd  1.000000e-03        1.000000e-03   
36                  60       tanh   adam  1.000000e-03        1.000000e-03   
37                  70       relu    sgd  1.000000e-06        1.000000e-07   
38                  20       tanh   adam  1.000000e-06        1.000000e-07   
39                  20   identity  lbfgs  1.000000e-04        1.000000e-01   

       score  
0   0.973333  
1   0.900000  
2   0.420000  
3   0.306667  
4   0.973333  
5   0.393333  
6   0.973333  
7   0.386667  
8   0.313333  
9   0.466667  
10  0.920000  
11  0.433333  
12  0.480000  
13  0.973333  
14  0.380000  
15  0.973333  
16  0.973333  
17  0.740000  
18  0.973333  
19  0.333333  
20  0.900000  
21  0.266667  
22  0.340000  
23  0.333333  
24  0.973333  
25  0.333333  
26  0.360000  
27  0.320000  
28  0.400000  
29  0.866667  
30  0.253333  
31  0.940000  
32  0.973333  
33  0.333333  
34  0.406667  
35  0.926667  
36  0.926667  
37  0.266667  
38  0.306667  
39  0.973333

Out[47]:

	hidden_layer_sizes	activation	solver	alpha	learning_rate_init	score
0	20	tanh	sgd	1.000000e-01	1.000000e-02	0.973333
1	40	tanh	adam	1.000000e-07	1.000000e-03	0.900000
2	50	logistic	sgd	1.000000e-04	1.000000e-04	0.420000
3	90	relu	lbfgs	1.000000e-08	1.000000e-08	0.306667
4	90	relu	lbfgs	1.000000e-01	1.000000e-01	0.973333

In [48]:

parameter_names = list(search_space_mlp.keys())

mlp_search_data = mlp_search_data.sort_values('hidden_layer_sizes', ascending=False)

fig = px.parallel_categories(mlp_search_data, 
                             color="score", 
                             color_continuous_scale=color_scale, 
                             dimensions=parameter_names, 
                             )
fig.update_layout(width=950, height=700)
fig.show()

Deep Learning Optimization ¶

The optimization of deep learning models can be very difficult because the evaluation times of the objective functions are very high, due to the long training times. There is also the challenge of finding a way to find the optimal structure/architecture of the neural network. Hyperactive can help with both of those problems.

The optimization of the structure/architecture of a neural network is called neural architecture search. Because Hyperactive can handle functions in its search spaces performing nas is very easy.

In [49]:

def deep_learning_model(params):
    filters_0 = params["filters.0"]
    kernel_size_0 = params["kernel_size.0"]
    
    model = Sequential()
    model.add(Conv2D(filters_0, (kernel_size_0, kernel_size_0), input_shape=(img_width, img_height, 1), activation="relu"))
    
    model.add(MaxPooling2D(pool_size=(2, 2)))
        
    # the next two lines are layers that are put in during the optimization run
    model = params["layer.0"](params, model)
    model = params["layer.1"](params, model)

    model.add(Flatten())
    model.add(Dense(params["dense.0"], activation="relu"))
    model.add(Dense(num_classes, activation="softmax"))

    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.fit(
        x_train,
        y_train,
        validation_data=(x_test, y_test),
        epochs=5,
        verbose=False,
    )
    _, score = model.evaluate(x=x_test, y=y_test, verbose=False)

    return score

The following functions are the layers and layer-compositions that we will use in the search space. The params-argument enables the optimization of parameters inside the layer-function. There is also a no_layer-function because we want to test of it might be better for the score of the neural network if its number of layers is reduced.

In [50]:

def Conv2D_MaxPooling2D_layer(params, model):
    filters_1 = params["layer.0.filters"]
    kernel_size_1 = params["layer.0.kernel_size"]
    model.add(Conv2D(filters_1, (kernel_size_1, kernel_size_1), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    return model

def Conv2D_layer(params, model):
    filters_1 = params["layer.0.filters"]
    kernel_size_1 = params["layer.0.kernel_size"]
    model.add(Conv2D(filters_1, (kernel_size_1, kernel_size_1), activation='relu'))
    return model

def Dropout_layer(params, model):
    model.add(Dropout(params["layer.1.rate"]))
    return model

def no_layer(params, model):
    return model

In the search space you can see that the layer-functions are put inside lists. During the optimization run Hyperactive will select those layer-functions similar to any other variable inside the search space.

In [51]:

# you can put the layers into lists like any other variable
search_space_dl = {
    "filters.0": list(range(7, 15)),
    "kernel_size.0": list(range(3, 6)),
    
    "layer.0": [Conv2D_MaxPooling2D_layer, Conv2D_layer, no_layer],
    "layer.0.filters": list(range(5, 12)),
    "layer.0.kernel_size": list(range(3, 6)),
    
    "layer.1": [Dropout_layer, no_layer],
    "layer.1.rate": list(np.arange(0.2, 0.8, 0.1)),

    "dense.0": list(range(10, 200, 20)),
}

Bayesian Optimization¶

Bayesian optimization is a global optimization technique that uses a machine learning model (surrogate model) to approximate the objective function. It relies on a gaussian process regressor fitting to known positions and scores in the search space and predicting where to search next. It follows the following steps:

- fit the gaussian process regressor to the training data (positions in search space) and the target (score of each position).
- the regressor makes a prediction of every position in the search space
- from the predictions an acquisition function is calculated that determines which position to evaluate next
- after the evaluation the algorithm adds the position and score to the training data

Since the regressor is trained in every iteration the optimization step takes a long time compared to other algorithms. This is why it is often used for objective functions with a long evaluation time. The long optimization time does not matter if the evaluation time is even longer. In those cases it is much more important that each new position is carfully selected to avoid wasted time of an evaluation that has a low score.

The following plots show an example of the path a bayesian optimization algorithm would take in different objective functions:

In [52]:

optimizer = BayesianOptimizer()

hyper_dl = Hyperactive(verbosity=False)
hyper_dl.add_search(deep_learning_model, search_space_dl, n_iter=30, optimizer=optimizer)
hyper_dl.run()

dl_search_data = hyper_dl.search_data(deep_learning_model, times=True)

2024-01-02 08:47:40.065310: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-01-02 08:47:40.065497: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...

 hyper_optimizer search_data 
     filters.0  kernel_size.0  \
0          11              3   
1          13              4   
2          10              4   
3           7              5   
4           7              5   
5          14              5   
6           7              3   
7           7              3   
8           8              3   
9          14              5   
10         13              3   
11         11              3   
12         11              4   
13         12              5   
14         10              3   
15         11              3   
16         10              4   
17         11              3   
18         12              3   
19         11              3   
20         12              3   
21         11              3   
22         11              3   
23         11              3   
24         12              3   
25         10              3   
26         10              3   
27         10              3   
28         11              3   
29         10              3   

                                              layer.0  layer.0.filters  \
0   <function Conv2D_MaxPooling2D_layer at 0x7f3a6...                5   
1   <function Conv2D_MaxPooling2D_layer at 0x7f3a6...               11   
2           <function Conv2D_layer at 0x7f3a6331a670>                8   
3               <function no_layer at 0x7f3a6331a8b0>               11   
4               <function no_layer at 0x7f3a6331a8b0>               11   
5   <function Conv2D_MaxPooling2D_layer at 0x7f3a6...               11   
6   <function Conv2D_MaxPooling2D_layer at 0x7f3a6...                5   
7               <function no_layer at 0x7f3a6331a8b0>                8   
8   <function Conv2D_MaxPooling2D_layer at 0x7f3a6...                6   
9   <function Conv2D_MaxPooling2D_layer at 0x7f3a6...                5   
10  <function Conv2D_MaxPooling2D_layer at 0x7f3a6...               10   
11          <function Conv2D_layer at 0x7f3a6331a670>                9   
12  <function Conv2D_MaxPooling2D_layer at 0x7f3a6...                9   
13              <function no_layer at 0x7f3a6331a8b0>                8   
14  <function Conv2D_MaxPooling2D_layer at 0x7f3a6...                9   
15  <function Conv2D_MaxPooling2D_layer at 0x7f3a6...                9   
16  <function Conv2D_MaxPooling2D_layer at 0x7f3a6...                8   
17  <function Conv2D_MaxPooling2D_layer at 0x7f3a6...               11   
18  <function Conv2D_MaxPooling2D_layer at 0x7f3a6...               10   
19          <function Conv2D_layer at 0x7f3a6331a670>               10   
20  <function Conv2D_MaxPooling2D_layer at 0x7f3a6...               10   
21          <function Conv2D_layer at 0x7f3a6331a670>               10   
22  <function Conv2D_MaxPooling2D_layer at 0x7f3a6...               10   
23              <function no_layer at 0x7f3a6331a8b0>                9   
24  <function Conv2D_MaxPooling2D_layer at 0x7f3a6...               11   
25  <function Conv2D_MaxPooling2D_layer at 0x7f3a6...               11   
26  <function Conv2D_MaxPooling2D_layer at 0x7f3a6...               10   
27  <function Conv2D_MaxPooling2D_layer at 0x7f3a6...               11   
28  <function Conv2D_MaxPooling2D_layer at 0x7f3a6...               10   
29  <function Conv2D_MaxPooling2D_layer at 0x7f3a6...               10   

    layer.0.kernel_size                                     layer.1  \
0                     4       <function no_layer at 0x7f3a6331a8b0>   
1                     5  <function Dropout_layer at 0x7f3a6331a820>   
2                     4  <function Dropout_layer at 0x7f3a6331a820>   
3                     5       <function no_layer at 0x7f3a6331a8b0>   
4                     5       <function no_layer at 0x7f3a6331a8b0>   
5                     3  <function Dropout_layer at 0x7f3a6331a820>   
6                     3       <function no_layer at 0x7f3a6331a8b0>   
7                     3       <function no_layer at 0x7f3a6331a8b0>   
8                     5       <function no_layer at 0x7f3a6331a8b0>   
9                     5  <function Dropout_layer at 0x7f3a6331a820>   
10                    5  <function Dropout_layer at 0x7f3a6331a820>   
11                    5  <function Dropout_layer at 0x7f3a6331a820>   
12                    3  <function Dropout_layer at 0x7f3a6331a820>   
13                    3  <function Dropout_layer at 0x7f3a6331a820>   
14                    4  <function Dropout_layer at 0x7f3a6331a820>   
15                    5  <function Dropout_layer at 0x7f3a6331a820>   
16                    4  <function Dropout_layer at 0x7f3a6331a820>   
17                    4  <function Dropout_layer at 0x7f3a6331a820>   
18                    4  <function Dropout_layer at 0x7f3a6331a820>   
19                    3  <function Dropout_layer at 0x7f3a6331a820>   
20                    3       <function no_layer at 0x7f3a6331a8b0>   
21                    4  <function Dropout_layer at 0x7f3a6331a820>   
22                    4  <function Dropout_layer at 0x7f3a6331a820>   
23                    3  <function Dropout_layer at 0x7f3a6331a820>   
24                    4  <function Dropout_layer at 0x7f3a6331a820>   
25                    5  <function Dropout_layer at 0x7f3a6331a820>   
26                    4  <function Dropout_layer at 0x7f3a6331a820>   
27                    3  <function Dropout_layer at 0x7f3a6331a820>   
28                    5  <function Dropout_layer at 0x7f3a6331a820>   
29                    5       <function no_layer at 0x7f3a6331a8b0>   

    layer.1.rate  dense.0   score  
0            0.4      150  0.9879  
1            0.2      110  0.9898  
2            0.5       90  0.9905  
3            0.2      190  0.9871  
4            0.2       10  0.9746  
5            0.8       10  0.9750  
6            0.8      190  0.9836  
7            0.2      130  0.9864  
8            0.5       70  0.9853  
9            0.3       70  0.9824  
10           0.2      190  0.9887  
11           0.4      150  0.9895  
12           0.3      150  0.9900  
13           0.2      190  0.9860  
14           0.4      130  0.9905  
15           0.2      130  0.9893  
16           0.4      130  0.9860  
17           0.3      150  0.9900  
18           0.3      130  0.9908  
19           0.4      130  0.9909  
20           0.3      150  0.9891  
21           0.3      130  0.9902  
22           0.4      130  0.9885  
23           0.4      130  0.9870  
24           0.2      130  0.9882  
25           0.3      190  0.9904  
26           0.3      170  0.9905  
27           0.3      190  0.9842  
28           0.3      150  0.9892  
29           0.3      170  0.9856

In [53]:

# we need to replace the functions with their names for the plot
def func2str(row):
    return row.__name__

dl_search_data["layer.0"] = dl_search_data["layer.0"].apply(func2str)
dl_search_data["layer.1"] = dl_search_data["layer.1"].apply(func2str)

dl_search_data = dl_search_data.drop(["eval_times", "iter_times"], axis=1)

In [ ]:

In [54]:

score_max = np.amax(search_data_0["score"])
score_std = search_data_0["score"].std()
dl_search_data_f = dl_search_data[abs(search_data_0["score"]-score_max) < score_std*2]

In [55]:

dl_search_data

Out[55]:

	filters.0	kernel_size.0	layer.0	layer.0.filters	layer.0.kernel_size	layer.1	layer.1.rate	dense.0	score
0	11	3	Conv2D_MaxPooling2D_layer	5	4	no_layer	0.4	150	0.9879
1	13	4	Conv2D_MaxPooling2D_layer	11	5	Dropout_layer	0.2	110	0.9898
2	10	4	Conv2D_layer	8	4	Dropout_layer	0.5	90	0.9905
3	7	5	no_layer	11	5	no_layer	0.2	190	0.9871
4	7	5	no_layer	11	5	no_layer	0.2	10	0.9746
5	14	5	Conv2D_MaxPooling2D_layer	11	3	Dropout_layer	0.8	10	0.9750
6	7	3	Conv2D_MaxPooling2D_layer	5	3	no_layer	0.8	190	0.9836
7	7	3	no_layer	8	3	no_layer	0.2	130	0.9864
8	8	3	Conv2D_MaxPooling2D_layer	6	5	no_layer	0.5	70	0.9853
9	14	5	Conv2D_MaxPooling2D_layer	5	5	Dropout_layer	0.3	70	0.9824
10	13	3	Conv2D_MaxPooling2D_layer	10	5	Dropout_layer	0.2	190	0.9887
11	11	3	Conv2D_layer	9	5	Dropout_layer	0.4	150	0.9895
12	11	4	Conv2D_MaxPooling2D_layer	9	3	Dropout_layer	0.3	150	0.9900
13	12	5	no_layer	8	3	Dropout_layer	0.2	190	0.9860
14	10	3	Conv2D_MaxPooling2D_layer	9	4	Dropout_layer	0.4	130	0.9905
15	11	3	Conv2D_MaxPooling2D_layer	9	5	Dropout_layer	0.2	130	0.9893
16	10	4	Conv2D_MaxPooling2D_layer	8	4	Dropout_layer	0.4	130	0.9860
17	11	3	Conv2D_MaxPooling2D_layer	11	4	Dropout_layer	0.3	150	0.9900
18	12	3	Conv2D_MaxPooling2D_layer	10	4	Dropout_layer	0.3	130	0.9908
19	11	3	Conv2D_layer	10	3	Dropout_layer	0.4	130	0.9909
20	12	3	Conv2D_MaxPooling2D_layer	10	3	no_layer	0.3	150	0.9891
21	11	3	Conv2D_layer	10	4	Dropout_layer	0.3	130	0.9902
22	11	3	Conv2D_MaxPooling2D_layer	10	4	Dropout_layer	0.4	130	0.9885
23	11	3	no_layer	9	3	Dropout_layer	0.4	130	0.9870
24	12	3	Conv2D_MaxPooling2D_layer	11	4	Dropout_layer	0.2	130	0.9882
25	10	3	Conv2D_MaxPooling2D_layer	11	5	Dropout_layer	0.3	190	0.9904
26	10	3	Conv2D_MaxPooling2D_layer	10	4	Dropout_layer	0.3	170	0.9905
27	10	3	Conv2D_MaxPooling2D_layer	11	3	Dropout_layer	0.3	190	0.9842
28	11	3	Conv2D_MaxPooling2D_layer	10	5	Dropout_layer	0.3	150	0.9892
29	10	3	Conv2D_MaxPooling2D_layer	10	5	no_layer	0.3	170	0.9856

In [56]:

parameter_names = list(dl_search_data_f.keys())

fig = px.parallel_categories(dl_search_data, 
                             color="score", 
                             color_continuous_scale=color_scale, 
                             dimensions=parameter_names, 
                             )
fig.update_layout(width=950, height=700)
fig.show()