This is a tutorial to introduce you to the basic functionalities of Hyperactive and provide some interesting applications. It will also give an introduction to some optimization techniques. Hyperactive is a package that can optimize any python function and collect its search data.
This tutorial is made for version 4 of Hyperactive
def warn(*args, **kwargs):
pass
import warnings
warnings.warn = warn
import time
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, RBF, ConstantKernel
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import load_wine, load_iris
from hyperactive import Hyperactive
from hyperactive.optimizers import BayesianOptimizer, HillClimbingOptimizer
from surfaces.visualize import plotly_surface, plotly_heatmap
color_scale = px.colors.sequential.Jet
def _create_grid(objective_function, search_space):
def objective_function_np(*args):
para = {}
for arg, key in zip(args, search_space.keys()):
para[key] = arg
return objective_function(para)
(x_all, y_all) = search_space.values()
xi, yi = np.meshgrid(x_all, y_all)
zi = objective_function_np(xi, yi)
return xi, yi, zi
import plotly.graph_objects as go
from plotly.subplots import make_subplots
def compare_objective_functions(objective_function1, objective_function2):
search_space_plot = {
"x": list(np.arange(-5, 5, 0.2)),
"y": list(np.arange(-5, 5, 0.2)),
}
xi_c, yi_c, zi_c = _create_grid(objective_function1, search_space_plot)
xi_a, yi_a, zi_a = _create_grid(objective_function2, search_space_plot)
fig1 = go.Surface(x=xi_c, y=yi_c, z=zi_c, colorscale=color_scale)
fig2 = go.Surface(x=xi_a, y=yi_a, z=zi_a, colorscale=color_scale)
fig = make_subplots(rows=1, cols=2,
specs=[[{'is_3d': True}, {'is_3d': True}]],
subplot_titles=['Convex Function', 'Non-convex Function'],
)
fig.add_trace(fig1, 1, 1)
fig.add_trace(fig2, 1, 2)
fig.update_layout(title_text="Objective Function Surface")
fig.show()
from tensorflow.keras.datasets import mnist
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Dense, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD
from keras.utils import np_utils
from tensorflow import keras
import tensorflow as tf
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
config.log_device_placement = True
sess = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(sess)
2024-01-02 08:46:37.569045: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2024-01-02 08:46:37.650657: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include: 2024-01-02 08:46:37.650673: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. 2024-01-02 08:46:38.153488: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include: 2024-01-02 08:46:38.153539: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include: 2024-01-02 08:46:38.153543: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
Device mapping: no known devices.
2024-01-02 08:46:38.609600: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2024-01-02 08:46:38.695715: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2024-01-02 08:46:38.695926: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include: 2024-01-02 08:46:38.695976: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include: 2024-01-02 08:46:38.696017: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include: 2024-01-02 08:46:38.696055: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcufft.so.10'; dlerror: libcufft.so.10: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include: 2024-01-02 08:46:38.696094: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcurand.so.10'; dlerror: libcurand.so.10: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include: 2024-01-02 08:46:38.696132: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include: 2024-01-02 08:46:38.696170: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusparse.so.11'; dlerror: libcusparse.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include: 2024-01-02 08:46:38.696273: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform. Skipping registering GPU devices...
WARNING:tensorflow:From /tmp/ipykernel_1406914/2148136278.py:15: The name tf.keras.backend.set_session is deprecated. Please use tf.compat.v1.keras.backend.set_session instead.
# load dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()
img_width = 28
img_height = 28
x_train = x_train.astype("float32")
x_train /= 255.0
x_test = x_test.astype("float32")
x_test /= 255.0
# reshape input data
x_train = x_train.reshape(x_train.shape[0], img_width, img_height, 1)
x_test = x_test.reshape(x_test.shape[0], img_width, img_height, 1)
# one hot encode outputs
y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)
num_classes = y_test.shape[1]
data = load_wine()
X_boston, y_boston = data.data, data.target
data = load_iris()
X_iris, y_iris = data.data, data.target
There are two things you need to define before starting your first optimization run:
- the objective function:
Contains some kind of model. It always returns a score that will be maximized during
- a search space:
Defines the parameter space in which the optimizer searches for the best parameter set
In this notebook you will see several different examples for objective functions.
def objective_function(para):
loss = para["x"]*para["x"]
# -x*x is an inverted parabola
return -loss
# We have only one dimension here
search_space = {
"x": list(np.arange(-5, 5, 0.01)),
}
In the next step we will start the optimization run.
You only need the objective_function, search_space and the number of iterations. Each iteration will evaluate the objective function. This will generate a score, that the optimization algorithm uses to determine which position in the search space to look next. All of the calculations will be done by Hyperactive in the background. You will receive the results of the optimization run when all iterations are done.
hyper_0 = Hyperactive(verbosity=False)
hyper_0.add_search(objective_function, search_space, n_iter=70, initialize={"random": 2, "vertices": 2})
hyper_0.run()
search_data_0 = hyper_0.search_data(objective_function)
search_data_0[["x", "score"]]
hyper_optimizer search_data x score 0 0.22 -0.0484 1 -1.46 -2.1316 2 4.99 -24.9001 3 -5.00 -25.0000 4 4.06 -16.4836 .. ... ... 65 -4.73 -22.3729 66 -2.99 -8.9401 67 4.65 -21.6225 68 4.01 -16.0801 69 -0.78 -0.6084 [70 rows x 2 columns]
x | score | |
---|---|---|
0 | 0.22 | -0.0484 |
1 | -1.46 | -2.1316 |
2 | 4.99 | -24.9001 |
3 | -5.00 | -25.0000 |
4 | 4.06 | -16.4836 |
... | ... | ... |
65 | -4.73 | -22.3729 |
66 | -2.99 | -8.9401 |
67 | 4.65 | -21.6225 |
68 | 4.01 | -16.0801 |
69 | -0.78 | -0.6084 |
70 rows × 2 columns
In the table above you can see the 70 iterations performed during the run. This is called the search data. In each row you can see the parameter x
and the corresponding score. As we previously discussed the optimization algorithm determines which position to select next based on the score from the evaluated objective function.
When Hyperactive starts the optimization the first iterations are initializations from the initialize
-dictionary. In the example above there are 4 initializations (2 random and 2 vertices). They determine the initial positions in the search space that are used to evaluate the objective funtion. As you can see in the search data the 2. and 3. iteration are the vertices (edge points) of the search space. The 0. and 1.
The first rows of the search data are randomly selected. After those few initialization steps the optimization algorithm will select the next positions in the search space based on the score of the previous position(s).
The default algorithm for the optimization is the random-search. You can see the random pattern in the last few iterations of the search data. We can also see the random pattern if we plot the search data:
The random search optimizer is a very simple algorithm. It randomly selects the position in each iteration without adapting to the optimization problem (exploitation). One the other hand it is very useful to initialy explore the search space or find new regions with optima (exploration).
The following two gifs show how random search explores the search space for two different objective functions:
fig = px.scatter(search_data_0, x="x", y="score")
fig.show()
The plot above shows the score of each parameter set (in this case just one parameter "x"). The random search explores the search space very well, so that we can see the inverse parabola.
The shape of the objective function in the search space is a very important part of the optimization problem, because it heavily impacts the behaviour of the algorithm. Each optimization algorithm is well equiped to solve some kind of objective functions and perform poorly for others.
A basic classifications of optimization problems is into convex- and nonconvex-problems. Convex optimization problems have a shape, where the score improves continuously as the position gets closer to the global optimum. So it does not have any local optima in its shape. Nonconvex-problems do have these local optima.
Lets take a closer look at the convex optimization problem and try out different optimization algorithms:
def convex_function(para):
loss = (para["x"]*para["x"] + para["y"]*para["y"])
return -loss
search_space = {
"x": list(np.arange(-5, 5, 0.01)),
"y": list(np.arange(-5, 5, 0.01)),
}
hyper_convex_0 = Hyperactive(verbosity=False)
hyper_convex_0.add_search(convex_function, search_space, n_iter=2000)
hyper_convex_0.run()
hyper_optimizer search_data x y score 0 -4.92 4.14 -41.3460 1 3.81 -2.28 -19.7145 2 -1.67 -1.67 -5.5778 3 -1.67 1.66 -5.5445 4 1.66 -1.67 -5.5445 ... ... ... ... 1995 0.53 0.48 -0.5113 1996 -3.74 -2.94 -22.6312 1997 -3.21 2.37 -15.9210 1998 1.99 -3.61 -16.9922 1999 0.12 0.68 -0.4768 [2000 rows x 3 columns]
search_data_convex_0 = hyper_convex_0.search_data(convex_function)
fig = px.scatter(search_data_convex_0, x="x", y="y", color="score", color_continuous_scale=color_scale)
fig.update_layout(width=900, height=800, xaxis_range=[-5, 5], yaxis_range=[-5, 5])
fig.show()
The plot above shows the samples from the search data acquired from the convex-function in a 2-dimensional search space. The score is shown by the color of each point in the scatter plot.
We were able to see, that random search is a good optimization technique to explore the search space. But the goal is often to quickly find position in the search space with a high score. Therefore we should consider other optimization techniques like the hill climbing algorithm.
The hill climbing optimization algorithm works by finding a random neighbour position close to the current position. If the score of the new position is better than the current one the algorithm makes a step to the new position and returns to finding the next position. This behaviour is like someone who tries to find the highest (highest score) position in a space by only moving up and never moves down.
The hill climbing algorithm works very well with convex optimization problems, because the score continuously improves towards a direction. Hill climbing can find this direction by exploring the scores of its neighbours. Hill climbing does not work of there are local optima. It tends to get "stuck" in certain regions, where the current position is surrounded by positions with worse scores. The algorithm would need to first "go down" and later "go up" again to find other (even better) positions in the search space.
optimizer = HillClimbingOptimizer(rand_rest_p=0)
hyper_convex_1 = Hyperactive(verbosity=False)
hyper_convex_1.add_search(convex_function, search_space, n_iter=90, optimizer=optimizer, initialize={"vertices":1})
hyper_convex_1.run()
search_data_convex_1 = hyper_convex_1.search_data(convex_function)
hyper_optimizer search_data x y score 0 4.99 -5.000000e+00 -49.9001 1 4.99 -4.470000e+00 -44.8810 2 4.82 -4.910000e+00 -47.3405 3 4.82 -4.790000e+00 -46.1765 4 4.99 -5.000000e+00 -49.9001 .. ... ... ... 85 -0.28 4.000000e-02 -0.0800 86 0.80 -1.065814e-13 -0.6400 87 0.02 -2.500000e-01 -0.0629 88 0.43 -1.065814e-13 -0.1849 89 0.23 6.000000e-02 -0.0565 [90 rows x 3 columns]
fig = px.scatter(search_data_convex_1, x="x", y="y", color="score", color_continuous_scale=color_scale)
fig.update_layout(width=900, height=800, xaxis_range=[-5, 5], yaxis_range=[-5, 5])
fig.show()
The 2D-scatter plot above shows that the hill climbing algorithm convertes quickly to the optimum of the objective function in the search space. Hill climbing is specialized to find the optimum of convex functions quickly. It was able to find a good score in less than 100 iterations, while the random search used much more for a similar maximum score.
Next we explore the performance of different optimization algorithms on the ackley-function, which is a nonconvex objective function:
def ackley_function(para):
x, y = para["x"], para["y"]
loss = (
-20 * np.exp(-0.2 * np.sqrt(0.5 * (x * x + y * y)))
- np.exp(0.5 * (np.cos(2 * np.pi * x) + np.cos(2 * np.pi * y)))
+ np.exp(1)
+ 20
)
return -loss
search_space = {
"x": list(np.arange(-5, 5, 0.01)),
"y": list(np.arange(-5, 5, 0.01)),
}
The ackley function is a non convex function that has a lot of local optima. This is because of the cos-terms that create wave-like patterns in both dimensions.
In the following 3D-surface plots you can see an example for the sphere-function and ackley-function. Both plots are interactive, so you can take a closer look at the shape of those objective functions:
compare_objective_functions(convex_function, ackley_function)
hyper_ackley_0 = Hyperactive(verbosity=False)
hyper_ackley_0.add_search(ackley_function, search_space, n_iter=2000)
hyper_ackley_0.run()
search_data_ackley_0 = hyper_ackley_0.search_data(ackley_function)
hyper_optimizer search_data x y score 0 4.14 -3.61 -12.585249 1 4.46 1.01 -11.228130 2 -1.67 -1.67 -7.779507 3 -1.67 1.66 -7.781677 4 1.66 -1.67 -7.781677 ... ... ... ... 1995 -4.39 -0.53 -11.600498 1996 2.15 -2.98 -8.620673 1997 -1.54 -3.53 -10.740426 1998 2.02 2.12 -7.135418 1999 2.79 1.63 -9.250090 [2000 rows x 3 columns]
fig = px.scatter(search_data_ackley_0, x="x", y="y", color="score", color_continuous_scale=color_scale)
fig.update_layout(width=900, height=800)
fig.show()
The plot above shows the random search exploring the ackley function. Random search is not affected by the many local optima in the search space. Lets try out the hill climbing algorithm on the ackley function and see the results.
optimizer = HillClimbingOptimizer(rand_rest_p=0)
hyper_ackley_1 = Hyperactive(verbosity=False)
hyper_ackley_1.add_search(ackley_function,
search_space,
n_iter=100,
optimizer=optimizer,
initialize={"vertices": 1})
hyper_ackley_1.run()
search_data_ackley_1 = hyper_ackley_1.search_data(ackley_function)
hyper_optimizer search_data x y score 0 -5.00 4.99 -12.637734 1 -4.55 4.99 -13.998385 2 -4.75 4.99 -13.522024 3 -4.85 4.75 -13.719183 4 -4.81 4.99 -13.233342 .. ... ... ... 95 -4.82 4.68 -13.984259 96 -5.00 4.99 -12.637734 97 -5.00 4.99 -12.637734 98 -4.66 4.55 -14.280916 99 -5.00 4.99 -12.637734 [100 rows x 3 columns]
fig = px.scatter(search_data_ackley_1, x="x", y="y", color="score", color_continuous_scale=color_scale)
fig.update_layout(width=900, height=800, xaxis_range=[-5, 5], yaxis_range=[-5, 5])
fig.show()
Maybe you already expected, that the hill climbing algorithm delivers bad results optimizing the ackley function. That does not mean, that hill climbing is a bad algorithm in general. It means that it is bad for this kind of objective functions. This is a very important idea in mathematical optimization. It is very useful to know about the properties of the objective function, because you can choose an optimization algorithm that works really well for this problem.
The repulsing hill climbing optimizer tries to improve how hill climbing solves non-convex objective functions. It does so by increasing the radius in which hill climbing selects a neighbour position if the last position wasn't an improvement over the current one. This means the hill climber will jump away from its current position of it does not find better position in its close environment.
from hyperactive.optimizers import RepulsingHillClimbingOptimizer
optimizer = RepulsingHillClimbingOptimizer()
hyper_ackley_2 = Hyperactive(verbosity=False)
hyper_ackley_2.add_search(ackley_function,
search_space,
n_iter=100,
optimizer=optimizer,
initialize={"vertices": 1})
hyper_ackley_2.run()
search_data_ackley_2 = hyper_ackley_2.search_data(ackley_function)
hyper_optimizer search_data x y score 0 4.99 4.99 -12.633040 1 4.92 4.99 -12.741621 2 -1.34 0.02 -4.914987 3 4.99 4.93 -12.712329 4 -0.81 1.72 -6.338818 .. ... ... ... 95 -3.54 0.89 -9.877024 96 0.50 0.26 -3.662621 97 -1.50 0.79 -6.295015 98 -1.65 0.34 -6.387955 99 -0.08 0.71 -3.271799 [100 rows x 3 columns]
fig = px.scatter(search_data_ackley_2, x="x", y="y", color="score", color_continuous_scale=color_scale)
fig.update_layout(width=900, height=800, xaxis_range=[-5, 5], yaxis_range=[-5, 5])
fig.show()
The plot above shows how the repulsing hill climbing optimizer explored the search space of the ackley function. It does a much better job finding new optima in the space, while also exploring local regions.
Some objective functions (especially with machein- or deep- learning models) can take a long time to evaluate, which slows down the optimization run. This means, that we want to avoid any unnecessary evaluation of the objective function. Unfortunately most optimization algorithms won't avoid positions in the search space that were already evaluated. For example:
- Random Search, which could select a position it already selected before
- Hill climbing stuck in an optimum
- Particle swarms that converge on one position
The bottom line is, that optimization algorithms don't "remember" already explored positions and won't avoid them. But Hyperactive has a feature that solves this problem, by saving the position and score in a memory-dictionary. If a position is selected Hyperactive will look up if this position is already known. If it knows the position and score it won't reevaluate the objective function but extract the score, which saves time. This is very useful for computationally expensive objective functions.
You can even pass the search data from a previous optimization run into the memory_warm_start
-argument, so that the optimization run "remembers" the memory from the previous one.
def dtr_model(opt):
dtr = DecisionTreeRegressor(
max_depth=opt["max_depth"],
min_samples_split=opt["min_samples_split"],
)
scores = cross_val_score(dtr, X_boston, y_boston, cv=5)
return scores.mean()
search_space_dtr = {
"max_depth": list(range(10, 35)),
"min_samples_split": list(range(2, 35)),
}
c_time1 = time.time()
hyper_dtr_0 = Hyperactive(verbosity=False)
hyper_dtr_0.add_search(dtr_model, search_space_dtr, n_iter=300)
hyper_dtr_0.run()
d_time1 = time.time() - c_time1
print("Optimization time 1:", round(d_time1, 2))
# Hyperactive collects the search data
search_data_dtr_0 = hyper_dtr_0.search_data(dtr_model, times=True)
hyper_optimizer search_data max_depth min_samples_split score 0 10 21 0.229979 1 33 5 0.294730 2 18 12 0.229979 3 18 22 0.317003 4 26 12 0.205503 .. ... ... ... 295 15 2 0.131560 296 11 12 0.317003 297 20 32 0.317003 298 34 32 0.292527 299 21 9 0.207706 [300 rows x 3 columns] Optimization time 1: 1.1
After the first optimization run we start an additional run and pass the search data into memory_warm_start
. We expect that the next run will be faster, because we already have explored 300 positions in the search space from the previous optimization run.
c_time2 = time.time()
hyper_dtr_1 = Hyperactive(verbosity=False)
hyper_dtr_1.add_search(dtr_model, search_space_dtr, n_iter=300, memory_warm_start=search_data_dtr_0)
hyper_dtr_1.run()
d_time2 = time.time() - c_time2
print("Optimization time 2:", round(d_time2, 2))
hyper_optimizer search_data max_depth min_samples_split score 0 17 29 0.229979 1 31 34 0.317003 2 18 12 0.229979 3 18 22 0.317003 4 26 12 0.205503 .. ... ... ... 295 12 6 0.183230 296 34 6 0.245779 297 13 21 0.317003 298 32 33 0.292527 299 18 18 0.292527 [300 rows x 3 columns] Optimization time 2: 0.76
print("\n The second optimization run is "+'{}'.format(round((1-d_time2/d_time1)*100,2))+"% faster than the first one.")
The second optimization run is 31.12% faster than the first one.
search_data_dtr_1 = hyper_dtr_1.search_data(dtr_model, times=True)
search_data_dtr = search_data_dtr_1.append(search_data_dtr_0, ignore_index=True)
# times in seconds
eval_times = search_data_dtr_0["eval_times"]
eval_times_mem = search_data_dtr_1["eval_times"]
opt_times = search_data_dtr["iter_times"]-search_data_dtr["eval_times"]
fig = go.Figure()
fig.add_trace(go.Histogram(x=eval_times, name="evaluation time", nbinsx=15))
fig.add_trace(go.Histogram(x=eval_times_mem, name="evaluation time second run", nbinsx=15))
fig.add_trace(go.Histogram(x=opt_times, name="optimization time", nbinsx=15))
fig.show()
The evaluation- and optimization- times from the two optimization runs is shown in the histogram above. There are several interesting things to see:
- Even for simple machine learning models the optimization algorithm is much faster.
- The evaluations, which are faster than the optimization are from the memory-dictionary lookup.
- The second optimization run has much more memory lookups than the first.
Until now we mostly optimized test functions to show how an objective function and the search space can look like. These problems were easy to solve, because the objective funtion evaluates very fast and the search space is very small. Real optimization problems often have one of those two problems: - The objective function is computationally expensive, so it takes a long time to evaluate. This increases the iteration time and slowes down the optimization progress. - The search space is very large. This can makes it very difficult to find positions with a high score.
In the first case you would want to use optimization algorithms that are very inteligent in finding new positions with high scores. You don't want to waste too much time exploring the search space, because each evaluation takes such a long time. You want to get to a good position with a high score in as few steps as possible.
In the second case you would want a fast algorithm, that looks for a good score but also explores the search space very well.
Lets take a look at a (kind of) real optimization problem. We want to optimize the hyperparameters of a gradient boosting regressor that is trained on the boston housing regression dataset.
def gbr_model_0(opt):
gbr = GradientBoostingRegressor(
n_estimators=opt["n_estimators"],
max_depth=opt["max_depth"],
)
scores = cross_val_score(gbr, X_boston, y_boston, cv=5)
score = scores.mean()
return score
search_space_gbr_0 = {
"n_estimators": list(range(10, 100)),
"max_depth": list(range(2, 12)),
}
hyper_gbr_0 = Hyperactive(verbosity=False)
hyper_gbr_0.add_search(gbr_model_0, search_space_gbr_0, n_iter=50)
hyper_gbr_0.run()
search_data_gbr_0 = hyper_gbr_0.search_data(gbr_model_0)
hyper_optimizer search_data n_estimators max_depth score 0 41 11 0.286493 1 10 11 0.158985 2 39 5 0.269010 3 39 8 0.289922 4 68 5 0.285994 5 68 8 0.287513 6 10 11 0.158985 7 99 11 0.275180 8 10 2 0.130693 9 99 2 0.326558 10 83 3 0.309352 11 49 2 0.324265 12 31 6 0.284971 13 84 4 0.272758 14 53 9 0.280875 15 53 2 0.324899 16 16 10 0.260080 17 15 3 0.265057 18 66 5 0.261062 19 30 6 0.293972 20 50 3 0.315253 21 63 4 0.271137 22 33 6 0.273074 23 68 10 0.285651 24 22 5 0.269218 25 12 7 0.199622 26 42 7 0.271228 27 44 3 0.305420 28 96 5 0.276716 29 43 2 0.323368 30 12 8 0.199739 31 45 11 0.289907 32 41 11 0.286493 33 43 6 0.260725 34 77 10 0.282316 35 81 8 0.267947 36 65 10 0.260475 37 31 11 0.266904 38 47 3 0.306781 39 87 7 0.276266 40 39 4 0.279326 41 49 4 0.273381 42 59 5 0.275943 43 47 11 0.288363 44 23 4 0.263894 45 26 6 0.267758 46 11 5 0.195222 47 69 10 0.275137 48 84 2 0.325823 49 69 6 0.268853
fig = px.scatter(search_data_gbr_0,
x="n_estimators",
y="max_depth",
color="score",
color_continuous_scale=color_scale)
fig.update_layout(width=900, height=800)
fig.show()
The scatter plot above contains the samples from the search data from the gbr-model. It seams that high max_depth
delivers bad scores but we should explore higher values for n_estimators
.
Hyperactive makes it very easy to continue a search. The search data you already used for data exploration can just be passed to Hyperactive. This is done in multiple ways:
- You can extract the best parameters via the "best_para"-method. This can than be passed to "initialize" to start at this position in the search space
- The search data from the "results"-method can be passed to "memory_warm_start". The search data is automaticaly added into the memory-dictionary.
- You can also pass the search data to "warm_start_smbo". This has the effect that the Bayesian optimizer can do more precise approximations in the beginning of the optimization run.
best_para_gbr_0 = hyper_gbr_0.best_para(gbr_model_0)
initialize = {"random": 4, "warm_start": [best_para_gbr_0]}
search_space_gbr_01 = {
"n_estimators": list(range(10, 250, 5)),
"max_depth": list(range(2, 8)),
}
hyper_gbr_01 = Hyperactive(verbosity=False)
hyper_gbr_01.add_search(gbr_model_0,
search_space_gbr_01,
n_iter=50,
n_jobs=2,
memory_warm_start=search_data_gbr_0,
initialize=initialize)
hyper_gbr_01.run()
search_data_gbr_01 = hyper_gbr_01.search_data(gbr_model_0)
hyper_optimizer search_data n_estimators max_depth score 0 45 6 0.260725 1 25 6 0.278369 2 30 2 0.311317 3 130 6 0.288743 4 100 2 0.326558 5 80 3 0.313228 6 240 7 0.282410 7 20 2 0.289883 8 210 7 0.285183 9 25 3 0.305628 10 55 6 0.276450 11 105 4 0.290260 12 180 3 0.306254 13 175 4 0.258077 14 20 5 0.243912 15 175 7 0.264526 16 235 2 0.329066 17 200 5 0.279308 18 80 7 0.267073 19 195 2 0.329525 20 70 2 0.326380 21 165 4 0.268225 22 245 5 0.260622 23 105 3 0.308383 24 145 4 0.276206 25 75 3 0.312939 26 60 3 0.312158 27 155 3 0.310263 28 150 5 0.262560 29 155 5 0.280357 30 130 7 0.272747 31 155 3 0.310263 32 220 3 0.313533 33 40 2 0.320701 34 75 4 0.228214 35 230 7 0.272366 36 180 2 0.328918 37 75 6 0.266215 38 10 7 0.176882 39 175 6 0.271295 40 95 4 0.292781 41 25 4 0.263894 42 50 4 0.273381 43 240 3 0.312109 44 90 4 0.271041 45 145 2 0.328595 46 65 3 0.315951 47 220 4 0.275379 48 30 2 0.311317 49 175 2 0.329184 hyper_optimizer search_data n_estimators max_depth score 0 40 4 0.279326 1 95 2 0.326284 2 40 3 0.305565 3 215 7 0.278450 4 100 2 0.326558 5 115 7 0.263180 6 130 3 0.312360 7 115 4 0.272206 8 50 4 0.273381 9 175 6 0.270705 10 195 5 0.275754 11 165 5 0.269701 12 75 3 0.313818 13 190 7 0.273318 14 30 7 0.288718 15 245 3 0.308790 16 105 3 0.316376 17 100 4 0.278197 18 185 5 0.267187 19 125 5 0.269066 20 35 6 0.273074 21 30 6 0.267758 22 170 4 0.280237 23 180 4 0.264906 24 120 4 0.270004 25 140 3 0.310266 26 155 7 0.291970 27 155 3 0.313230 28 130 5 0.282460 29 40 3 0.305565 30 190 5 0.250403 31 220 7 0.270510 32 190 4 0.271958 33 235 2 0.329018 34 185 3 0.312214 35 205 7 0.256337 36 65 2 0.325732 37 70 3 0.313170 38 70 3 0.313170 39 205 6 0.280859 40 125 3 0.313643 41 55 6 0.288828 42 80 7 0.272950 43 150 3 0.311705 44 40 7 0.271878 45 45 2 0.323368 46 35 5 0.278078 47 240 7 0.270968 48 130 4 0.264316 49 120 6 0.281479
# merge the search data from the previous run and the current run
search_data_gbr_01_ = search_data_gbr_01.append(search_data_gbr_0, ignore_index=True)
search_data_gbr_01_
n_estimators | max_depth | score | |
---|---|---|---|
0 | 40 | 4 | 0.279326 |
1 | 95 | 2 | 0.326284 |
2 | 40 | 3 | 0.305565 |
3 | 215 | 7 | 0.278450 |
4 | 100 | 2 | 0.326558 |
... | ... | ... | ... |
145 | 26 | 6 | 0.267758 |
146 | 11 | 5 | 0.195222 |
147 | 69 | 10 | 0.275137 |
148 | 84 | 2 | 0.325823 |
149 | 69 | 6 | 0.268853 |
150 rows × 3 columns
fig = px.scatter(search_data_gbr_01_,
x="n_estimators",
y="max_depth",
color="score",
color_continuous_scale=color_scale)
fig.update_layout(width=900, height=800)
fig.show()
The scatter plot shows the search data from the previous run on the left side and the new search data on the bottom, because of the different search spaces.
Lets throw more computational resources at this problem:
- 1 job does a hill climbing search starting at the best position from the last run
- 1 job does a hill climbing search with four random initial positions
- 2 jobs doing a random search
All of those jobs run in parallel and merge their results into one search data file.
best_para_gbr_01 = hyper_gbr_01.best_para(gbr_model_0)
initialize = {"warm_start": [best_para_gbr_01]}
search_space_gbr_02 = {
"n_estimators": list(range(150, 300, 2)),
"max_depth": list(range(2, 5)),
}
optimizer = HillClimbingOptimizer(rand_rest_p=0)
hyper_gbr_02 = Hyperactive(verbosity=False)
hyper_gbr_02.add_search(gbr_model_0,
search_space_gbr_02,
n_iter=50,
n_jobs=1,
optimizer=optimizer,
memory_warm_start=search_data_gbr_01_,
initialize=initialize)
hyper_gbr_02.add_search(gbr_model_0,
search_space_gbr_02,
n_iter=50,
n_jobs=1,
optimizer=optimizer,
memory_warm_start=search_data_gbr_01_,
initialize={"random": 4})
hyper_gbr_02.add_search(gbr_model_0,
search_space_gbr_02,
n_iter=50,
n_jobs=2,
memory_warm_start=search_data_gbr_01_)
hyper_gbr_02.run()
search_data_gbr_02 = hyper_gbr_02.search_data(gbr_model_0)
hyper_optimizer search_data n_estimators max_depth score 0 178 4 0.263013 1 162 4 0.280177 2 168 2 0.328659 3 248 4 0.275434 4 178 4 0.263013 5 172 4 0.289256 6 178 4 0.263013 7 168 4 0.283338 8 178 4 0.263013 9 176 4 0.258077 10 174 4 0.269407 11 168 4 0.283338 12 174 4 0.269407 13 164 4 0.281814 14 172 4 0.289256 15 178 4 0.263013 16 170 4 0.280237 17 168 4 0.283338 18 166 4 0.268225 19 178 4 0.263013 20 174 4 0.269407 21 168 4 0.283338 22 176 4 0.258077 23 170 4 0.280237 24 176 4 0.258077 25 166 4 0.268225 26 174 4 0.269407 27 182 4 0.282154 28 178 4 0.263013 29 164 4 0.281814 30 170 4 0.280237 31 176 4 0.258077 32 164 4 0.281814 33 166 4 0.268225 34 180 4 0.264906 35 176 4 0.258077 36 170 4 0.280237 37 172 4 0.289256 38 168 4 0.283338 39 168 4 0.283338 40 172 4 0.289256 41 174 4 0.269407 42 174 4 0.269407 43 174 4 0.269407 44 160 4 0.261971 45 174 4 0.269407 46 162 4 0.280177 47 178 4 0.263013 48 178 4 0.263013 49 174 4 0.269407 hyper_optimizer search_data n_estimators max_depth score 0 194 2 0.329392 1 194 2 0.329392 2 198 2 0.328777 3 190 2 0.328667 4 194 2 0.329392 5 202 2 0.328845 6 192 2 0.329001 7 190 2 0.328667 8 194 2 0.329392 9 196 2 0.329525 10 200 2 0.329898 11 202 2 0.328845 12 200 2 0.329898 13 200 2 0.329898 14 196 2 0.329525 15 202 2 0.328845 16 206 2 0.329736 17 206 2 0.329736 18 198 2 0.328777 19 192 2 0.329001 20 202 2 0.328845 21 204 2 0.328983 22 200 2 0.329898 23 194 2 0.329392 24 200 2 0.329898 25 202 2 0.328845 26 200 2 0.329898 27 202 2 0.328845 28 198 2 0.328777 29 200 2 0.329898 30 198 2 0.328777 31 200 2 0.329898 32 186 2 0.329217 33 198 2 0.328777 34 200 2 0.329898 35 198 2 0.328777 36 206 2 0.329736 37 194 2 0.329392 38 196 2 0.329525 39 198 2 0.328777 40 202 2 0.328845 41 214 2 0.330057 42 196 2 0.329525 43 222 2 0.328906 44 208 2 0.329644 45 212 2 0.329418 46 218 2 0.329348 47 212 2 0.329418 48 206 2 0.329736 49 214 2 0.330057 hyper_optimizer search_data n_estimators max_depth score 0 288 4 0.237881 1 294 2 0.328950 2 198 2 0.328207 3 198 2 0.328207 4 246 2 0.328708 5 246 2 0.328708 6 150 4 0.263894 7 298 2 0.329070 8 298 4 0.264539 9 150 2 0.325823 10 192 4 0.286485 11 248 4 0.275830 12 262 2 0.329647 13 286 2 0.328659 14 192 4 0.286485 15 280 4 0.238565 16 288 4 0.237881 17 282 4 0.285908 18 230 4 0.271318 19 230 4 0.271318 20 262 2 0.329647 21 200 3 0.316830 22 208 3 0.316408 23 246 3 0.308790 24 218 3 0.315333 25 150 4 0.263894 26 232 3 0.316344 27 168 3 0.310756 28 286 4 0.278173 29 162 2 0.328547 30 230 4 0.271318 31 250 2 0.329677 32 182 4 0.283171 33 194 3 0.309264 34 238 2 0.329388 35 260 2 0.329516 36 250 4 0.275661 37 266 3 0.313437 38 274 4 0.267048 39 264 3 0.311236 40 216 3 0.308580 41 196 2 0.329525 42 276 3 0.314968 43 256 3 0.314426 44 198 4 0.278145 45 252 2 0.329357 46 250 4 0.275661 47 260 3 0.311066 48 280 4 0.238565 49 176 4 0.258077 hyper_optimizer search_data n_estimators max_depth score 0 284 4 0.269907 1 224 3 0.312947 2 198 2 0.329464 3 198 2 0.329464 4 246 2 0.328885 5 246 2 0.328885 6 150 4 0.263894 7 298 2 0.329071 8 150 2 0.325823 9 298 4 0.271417 10 254 2 0.329152 11 284 4 0.269907 12 200 4 0.276576 13 262 4 0.275647 14 298 3 0.308374 15 192 2 0.329062 16 236 3 0.311532 17 240 4 0.289443 18 236 2 0.329066 19 224 2 0.329289 20 222 3 0.309378 21 164 2 0.328603 22 252 2 0.329503 23 284 2 0.329192 24 294 3 0.316338 25 228 4 0.279851 26 246 3 0.308790 27 280 4 0.272449 28 268 4 0.276267 29 150 4 0.263894 30 206 2 0.329596 31 290 3 0.313317 32 194 4 0.276474 33 244 4 0.272497 34 226 2 0.329349 35 226 4 0.253493 36 174 4 0.283864 37 232 4 0.281243 38 150 4 0.263894 39 200 3 0.316185 40 216 4 0.282455 41 280 2 0.328181 42 276 3 0.309901 43 284 3 0.310843 44 252 4 0.278502 45 278 3 0.309817 46 188 2 0.329047 47 258 4 0.283216 48 258 3 0.308893 49 282 4 0.294080
search_data_gbr_02_ = search_data_gbr_02.append(search_data_gbr_01_, ignore_index=True)
fig = px.scatter(search_data_gbr_02_,
x="n_estimators",
y="max_depth",
color="score",
color_continuous_scale=color_scale)
fig.update_layout(width=900, height=800)
fig.show()
All the search data we collected shows a clear pattern. We should keep the max_depth
at 2. In the following plot the search data is filtered to show only max_depth
== 2, while showing the n_estimators
dependend on the score.
search_data_gbr_02_f = search_data_gbr_02_[search_data_gbr_02_["score"] > 0.68]
search_data_gbr_02_f_max_depth2 = search_data_gbr_02_f[search_data_gbr_02_["max_depth"] == 2]
fig = px.scatter(search_data_gbr_02_f_max_depth2,
x="n_estimators",
y="score")
fig.update_layout(width=900, height=800)
fig.show()
The filtering and visualization of the search data in the last few plots was an example how you can explore the model and search space yourself. Hyperactive makes it very easy to collect and reuse search data. Let's take a look how to collect more data:
Until now you have seen, that the objective function always returns only one variable: The score, which is always a real number. But Hyperactive has the capability to accept more variables. Those additional variables won't affect the score or the decision making of the optimization algorithm, but they will be collected in each iteration and accessed in the search data.
This feature can be very useful, because you can add any variable you want to the search data, which might help you understand the model better. To collect additional data in the objective function you just put it into a dictionary and return it alongside with the score. The key will be the column name in the search data and the value will be collected.
def gbr_model_1(opt):
gbr = GradientBoostingRegressor(
n_estimators=opt["n_estimators"],
max_depth=opt["max_depth"],
)
c_time = time.time()
scores = cross_val_score(gbr, X_boston, y_boston, cv=5)
cv_time = time.time() - c_time
# add the dictionary to collect more data
return scores.mean(), {"cv_time": cv_time}
search_space_gbr_1 = {
"n_estimators": list(range(10, 250, 5)),
"max_depth": list(range(2, 8)),
}
hyper_gbr_1 = Hyperactive(verbosity=False)
hyper_gbr_1.add_search(gbr_model_1, search_space_gbr_1, n_iter=15, n_jobs=8, initialize={"random": 10})
hyper_gbr_1.run()
search_data_gbr_1 = hyper_gbr_1.search_data(gbr_model_1)
search_data_gbr_1.head()
hyper_optimizer search_data n_estimators max_depth cv_time score 0 45 6 0.118185 0.266934 1 25 6 0.066240 0.274931 2 30 2 0.064459 0.311317 3 130 6 0.313416 0.279695 4 80 3 0.194099 0.311537 5 240 7 0.481668 0.286414 6 20 2 0.043417 0.289883 7 210 7 0.446462 0.275240 8 25 3 0.061512 0.300796 9 55 6 0.135543 0.291698 10 105 4 0.263462 0.270263 11 180 3 0.429693 0.313951 12 175 4 0.449600 0.274124 13 20 5 0.053172 0.268555 14 175 7 0.404586 0.288742 hyper_optimizer search_data n_estimators max_depth cv_time score 0 170 4 0.429751 0.273267 1 115 7 0.278648 0.296102 2 85 4 0.211211 0.279554 3 135 4 0.342682 0.267721 4 220 6 0.454925 0.268529 5 135 5 0.323710 0.270547 6 35 3 0.084315 0.306963 7 60 5 0.149962 0.260005 8 25 2 0.053171 0.303257 9 140 2 0.278299 0.328258 10 210 2 0.428252 0.330240 11 80 2 0.163116 0.326211 12 25 4 0.064566 0.293585 13 115 6 0.277513 0.270716 14 45 3 0.110079 0.313095 hyper_optimizer search_data n_estimators max_depth cv_time score 0 40 4 0.104604 0.270946 1 95 2 0.191430 0.326538 2 40 3 0.096477 0.306560 3 215 7 0.447988 0.268221 4 115 7 0.277140 0.278463 5 130 3 0.307548 0.310494 6 115 4 0.285706 0.271133 7 50 4 0.124728 0.275723 8 175 6 0.403750 0.272474 9 195 5 0.425691 0.282180 10 165 5 0.390918 0.282236 11 75 3 0.183286 0.316066 12 190 7 0.423599 0.274997 13 30 7 0.077578 0.269481 14 245 3 0.560118 0.312848 hyper_optimizer search_data n_estimators max_depth cv_time score 0 50 3 0.149075 0.310441 1 195 3 0.459050 0.316121 2 230 2 0.474948 0.328721 3 70 3 0.166681 0.308237 4 40 7 0.101941 0.262045 5 100 2 0.203159 0.326424 6 125 6 0.300996 0.285009 7 145 5 0.351124 0.281005 8 195 7 0.435357 0.282233 9 30 7 0.080386 0.262075 10 175 7 0.413364 0.268366 11 175 3 0.413322 0.310711 12 110 7 0.265733 0.275947 13 140 6 0.335133 0.279896 14 95 2 0.193122 0.326247 hyper_optimizer search_data n_estimators max_depth cv_time score 0 175 6 0.412572 0.280366 1 100 4 0.250836 0.290253 2 240 2 0.479053 0.328462 3 195 5 0.432045 0.258361 4 140 6 0.336825 0.284946 5 10 3 0.027291 0.175655 6 80 3 0.189595 0.314869 7 115 2 0.236703 0.326597 8 135 7 0.328682 0.277129 9 65 7 0.164024 0.281355 10 220 2 0.447159 0.329678 11 70 3 0.166764 0.307975 12 80 2 0.161863 0.325785 13 225 3 0.515723 0.313758 14 200 6 0.429268 0.280143 hyper_optimizer search_data n_estimators max_depth cv_time score 0 220 3 0.528340 0.311830 1 125 7 0.306709 0.280501 2 195 2 0.394514 0.329501 3 150 4 0.377696 0.264062 4 235 5 0.466406 0.286418 5 40 5 0.101020 0.265324 6 150 5 0.367667 0.268419 7 235 3 0.550423 0.308684 8 10 2 0.024037 0.130693 9 100 3 0.237929 0.311202 10 65 2 0.132017 0.325816 11 195 4 0.463978 0.271931 12 170 7 0.396408 0.288230 13 180 3 0.419057 0.311722 14 35 7 0.088092 0.282675 hyper_optimizer search_data n_estimators max_depth cv_time score 0 180 7 0.414299 0.262485 1 190 2 0.376027 0.329066 2 80 4 0.199736 0.262852 3 205 4 0.481331 0.288552 4 180 3 0.418108 0.310108 5 240 5 0.469577 0.278504 6 85 6 0.205165 0.270781 7 190 7 0.421955 0.264020 8 190 4 0.460415 0.281676 9 210 7 0.442884 0.276648 10 105 4 0.261704 0.285338 11 10 7 0.029574 0.179460 12 135 7 0.324964 0.269716 13 90 5 0.217990 0.278937 14 130 4 0.323176 0.286018 hyper_optimizer search_data n_estimators max_depth cv_time score 0 190 5 0.448730 0.280003 1 165 6 0.392935 0.287462 2 125 5 0.304988 0.283130 3 210 3 0.486952 0.315794 4 115 4 0.286834 0.263519 5 110 7 0.266535 0.282376 6 55 4 0.139961 0.283277 7 240 4 0.567436 0.278804 8 85 6 0.214452 0.285471 9 225 3 0.516657 0.308658 10 160 4 0.397832 0.260815 11 100 4 0.248304 0.281050 12 225 2 0.444328 0.329629 13 90 3 0.213261 0.310404 14 85 3 0.200542 0.311639
n_estimators | max_depth | cv_time | score | |
---|---|---|---|---|
0 | 40 | 4 | 0.104604 | 0.270946 |
1 | 95 | 2 | 0.191430 | 0.326538 |
2 | 40 | 3 | 0.096477 | 0.306560 |
3 | 215 | 7 | 0.447988 | 0.268221 |
4 | 115 | 7 | 0.277140 | 0.278463 |
fig = px.scatter(search_data_gbr_1,
x="n_estimators",
y="max_depth",
color="score",
size='cv_time',
color_continuous_scale=color_scale)
fig.update_layout(width=900, height=800)
fig.show()
The scatter plot above shows the samples od the search data, but adds a visualization of the cross-validation-time with the size of the scatter-points.
In the last chapter you were able to collect additional data during the optimization run. This data did not affect the score. But you can still try to create one score that represents information from multiple scores. In the following example we want to optimize a model to get a high score and at the same time a low training time.
def gbr_model_2(opt):
gbr = GradientBoostingRegressor(
n_estimators=opt["n_estimators"],
max_depth=opt["max_depth"],
)
c_time = time.time()
scores = cross_val_score(gbr, X_boston, y_boston, cv=5)
cv_time = time.time() - c_time
score_cv_avg = scores.mean()
score_cv_std = scores.std()
# the score is calculated from the cv-score and the cv-training time
score = score_cv_avg / (cv_time**0.1)
# independed from the score we want some additional data
return score, {"cv_time": cv_time,
"score_cv_avg": score_cv_avg,
"score_cv_std": score_cv_std,
"scores": scores,
}
search_space_gbr_2 = {
"n_estimators": list(range(10, 250, 5)),
"max_depth": list(range(2, 12)),
}
The objective function above enables us to return a score that is composed of multiple variables. At the same time, we also want to collect data about the variables the score is composed from. This helps us understand the score later during the data visualization.
hyper_gbr_2 = Hyperactive(verbosity=False)
hyper_gbr_2.add_search(gbr_model_2, search_space_gbr_2, n_iter=15, n_jobs=8, initialize={"random": 10})
hyper_gbr_2.run()
search_data_gbr_2 = hyper_gbr_2.search_data(gbr_model_2)
search_data_gbr_2.head()
hyper_optimizer search_data n_estimators max_depth cv_time score score_cv_avg score_cv_std \ 0 45 10 0.136457 0.323857 0.265371 0.330783 1 25 10 0.064831 0.357932 0.272257 0.336918 2 30 3 0.072968 0.403566 0.310619 0.380542 3 130 10 0.312861 0.318940 0.283952 0.349164 4 80 4 0.200744 0.313594 0.267074 0.332184 5 240 2 0.484914 0.354671 0.329908 0.406825 6 10 2 0.023994 0.189776 0.130693 0.160954 7 70 4 0.178738 0.328887 0.276865 0.341739 8 175 6 0.407856 0.292649 0.267545 0.332618 9 95 10 0.234092 0.309486 0.267658 0.332724 10 55 10 0.135570 0.332052 0.271908 0.336840 11 95 2 0.192210 0.385327 0.326744 0.402988 12 130 10 0.312861 0.318940 0.283952 0.349164 13 220 3 0.505374 0.336861 0.314639 0.385742 14 200 9 0.434827 0.301456 0.277367 0.342242 scores 0 [0.0, 0.7606979573331243, 0.0, 0.5661560583542... 1 [0.0, 0.756928785905888, 0.0, 0.60435498531228... 2 [0.0, 0.7618800535985606, 0.0, 0.7912142241900... 3 [0.0, 0.7591975497210028, 0.0, 0.6605601524950... 4 [0.0, 0.7592408879023933, 0.0, 0.5761300406532... 5 [0.0, 0.7498086663521787, 0.0, 0.8997297401252... 6 [0.0, 0.3000208717930798, 0.0, 0.3534418130637... 7 [0.0, 0.7593214379685758, 0.0, 0.6250028639511... 8 [0.0, 0.7591973311141902, 0.0, 0.5785281255638... 9 [0.0, 0.7592063189382285, 0.0, 0.5790821541598... 10 [0.0, 0.7597763989791204, 0.0, 0.5997639441451... 11 [0.0, 0.7417446308421538, 0.0, 0.8919733614070... 12 [0.0, 0.7591975497210028, 0.0, 0.6605601524950... 13 [0.0, 0.7591973311141902, 0.0, 0.8139963950064... 14 [0.0, 0.7591973311141902, 0.0, 0.6276383127284... hyper_optimizer search_data n_estimators max_depth cv_time score score_cv_avg score_cv_std \ 0 170 7 0.413711 0.311161 0.284875 0.350164 1 115 5 0.277382 0.332062 0.292097 0.358213 2 105 8 0.256741 0.320797 0.280014 0.344981 3 125 11 0.300331 0.323629 0.286951 0.352436 4 135 9 0.323920 0.319626 0.285552 0.350901 5 35 4 0.088649 0.354309 0.278067 0.343188 6 60 9 0.147740 0.348017 0.287442 0.352994 7 25 2 0.054084 0.405979 0.303257 0.372563 8 140 3 0.329806 0.345044 0.308817 0.378309 9 210 2 0.414936 0.359749 0.329456 0.406356 10 80 3 0.188538 0.369897 0.313055 0.383698 11 25 6 0.064801 0.370514 0.281814 0.346738 12 115 11 0.275769 0.296294 0.260482 0.326254 13 45 4 0.113226 0.354023 0.284725 0.350083 14 210 8 0.440172 0.283969 0.261597 0.327226 scores 0 [0.0, 0.7591973314288802, 0.0, 0.6651778198072... 1 [0.0, 0.7591984186362237, 0.0, 0.7012863352115... 2 [0.0, 0.7592004620942316, 0.0, 0.6408702197439... 3 [0.0, 0.7591977059680856, 0.0, 0.6755583577510... 4 [0.0, 0.7591974574568147, 0.0, 0.6685633062254... 5 [0.0, 0.7621839309593008, 0.0, 0.6281502604805... 6 [0.0, 0.7595464902301229, 0.0, 0.6776650535732... 7 [0.0, 0.7118804031055922, 0.0, 0.8044043323926... 8 [0.0, 0.7591974029750898, 0.0, 0.7848865763114... 9 [0.0, 0.7476038105049081, 0.0, 0.8996785062807... 10 [0.0, 0.7592408879023933, 0.0, 0.8060341746144... 11 [0.0, 0.756928785905888, 0.0, 0.65214339070086... 12 [0.0, 0.7591984186362237, 0.0, 0.5432121318684... 13 [0.0, 0.7606979573331243, 0.0, 0.6629268123550... 14 [0.0, 0.7591973311141902, 0.0, 0.5487874146446... hyper_optimizer search_data n_estimators max_depth cv_time score score_cv_avg score_cv_std \ 0 180 11 0.415904 0.292008 0.267482 0.332559 1 15 5 0.041179 0.314475 0.228589 0.285959 2 95 11 0.229800 0.316754 0.273436 0.338287 3 95 10 0.232698 0.339406 0.293358 0.359660 4 60 9 0.148902 0.323756 0.267614 0.332720 5 85 11 0.207217 0.334708 0.285962 0.351350 6 190 11 0.419905 0.315123 0.288932 0.354635 7 90 6 0.221628 0.308532 0.265376 0.330614 8 90 2 0.199622 0.383714 0.326609 0.402842 9 230 8 0.476602 0.282412 0.262240 0.327792 10 245 6 0.475372 0.292884 0.271894 0.336772 11 155 8 0.368208 0.314119 0.284252 0.349489 12 90 3 0.211561 0.368240 0.315264 0.386551 13 20 3 0.049844 0.386807 0.286586 0.351208 14 60 11 0.147057 0.354909 0.292998 0.359256 scores 0 [0.0, 0.7591973311141902, 0.0, 0.5782101756953... 1 [0.0, 0.6635783675193356, 0.0, 0.4793645774808... 2 [0.0, 0.7592063189382285, 0.0, 0.6079760893920... 3 [0.0, 0.7592063189382285, 0.0, 0.7075857737728... 4 [0.0, 0.7595464902301229, 0.0, 0.5785220373474... 5 [0.0, 0.7592230854704327, 0.0, 0.6705870066605... 6 [0.0, 0.7591973311141902, 0.0, 0.6854609574533... 7 [0.0, 0.7592125490494327, 0.0, 0.5676686658178... 8 [0.0, 0.7411685862901474, 0.0, 0.8918770859385... 9 [0.0, 0.7591973311141902, 0.0, 0.5520006952587... 10 [0.0, 0.7591973311141902, 0.0, 0.6002726153686... 11 [0.0, 0.7591973405896815, 0.0, 0.6620637527677... 12 [0.0, 0.7592125490494327, 0.0, 0.8171062246438... 13 [0.0, 0.7358195863474217, 0.0, 0.6971118540768... 14 [0.0, 0.7595464902301229, 0.0, 0.7054459913405... hyper_optimizer search_data n_estimators max_depth cv_time score score_cv_avg score_cv_std \ 0 220 5 0.455640 0.304106 0.281117 0.346140 1 125 11 0.302580 0.316119 0.280501 0.345492 2 10 9 0.030269 0.196829 0.138736 0.179144 3 110 9 0.265424 0.302174 0.264637 0.329940 4 40 9 0.100850 0.344517 0.273891 0.338938 5 150 8 0.358505 0.323525 0.291983 0.358083 6 235 5 0.472267 0.292650 0.271498 0.336387 7 10 3 0.028313 0.244344 0.171081 0.209606 8 100 5 0.252887 0.325643 0.283814 0.349017 9 65 2 0.134255 0.398686 0.326155 0.402016 10 195 7 0.432069 0.310959 0.285929 0.351313 11 170 10 0.398265 0.306941 0.279944 0.344908 12 60 3 0.143180 0.376253 0.309791 0.379532 13 230 3 0.526460 0.331901 0.311275 0.381422 14 225 2 0.451513 0.357549 0.330219 0.407264 scores 0 [0.0, 0.7591973311141902, 0.0, 0.6463874468818... 1 [0.0, 0.7591977059680856, 0.0, 0.6433080143749... 2 [0.0, 0.43657746289686405, 0.0, 0.257100661469... 3 [0.0, 0.7591991773750808, 0.0, 0.5639876512819... 4 [0.0, 0.761449900875621, 0.0, 0.60800697241692... 5 [0.0, 0.7591973518071372, 0.0, 0.7007177747400... 6 [0.0, 0.7591973311141902, 0.0, 0.5982950718976... 7 [0.0, 0.43657746289686405, 0.0, 0.418826802922... 8 [0.0, 0.759202637170055, 0.0, 0.65986925166211... 9 [0.0, 0.7437668317273509, 0.0, 0.8870068552202... 10 [0.0, 0.7591973311141902, 0.0, 0.6704501104566... 11 [0.0, 0.7591973314288802, 0.0, 0.6405251282294... 12 [0.0, 0.7595464902301229, 0.0, 0.7894066997405... 13 [0.0, 0.7591973311141902, 0.0, 0.7971785803182... 14 [0.0, 0.7497741442819785, 0.0, 0.9013226961820... hyper_optimizer search_data n_estimators max_depth cv_time score score_cv_avg score_cv_std \ 0 40 6 0.117105 0.339244 0.273760 0.338809 1 95 2 0.192001 0.385126 0.326538 0.402701 2 40 5 0.100304 0.346020 0.274937 0.339974 3 215 7 0.456671 0.290090 0.268221 0.333253 4 225 8 0.460565 0.300552 0.278131 0.343026 5 65 7 0.159255 0.353656 0.294300 0.360753 6 100 4 0.248682 0.298836 0.260015 0.325850 7 100 10 0.254327 0.326997 0.285156 0.350470 8 180 11 0.416745 0.300223 0.275062 0.339906 9 150 9 0.362403 0.289249 0.261331 0.326992 10 165 5 0.390241 0.257560 0.234429 0.307286 11 55 11 0.136349 0.343151 0.281158 0.346221 12 245 3 0.558418 0.334188 0.315273 0.386564 13 210 4 0.489017 0.307314 0.286098 0.351498 14 105 5 0.255243 0.330143 0.288003 0.353600 scores 0 [0.0, 0.761449900875621, 0.0, 0.60735008583143... 1 [0.0, 0.74171543851855, 0.0, 0.890972364821101... 2 [0.0, 0.761449900875621, 0.0, 0.61323489059761... 3 [0.0, 0.7591973311141902, 0.0, 0.5819101634954... 4 [0.0, 0.7591973311141902, 0.0, 0.6314568449037... 5 [0.0, 0.7594060236112754, 0.0, 0.7120962035144... 6 [0.0, 0.759202637170055, 0.0, 0.54087068362118... 7 [0.0, 0.759202637170055, 0.0, 0.66657920962673... 8 [0.0, 0.7591973311141902, 0.0, 0.6161127493205... 9 [0.0, 0.7591973518071372, 0.0, 0.5474566324285... 10 [0.0, 0.7591973319805099, 0.0, 0.4129485460500... 11 [0.0, 0.7597763989791204, 0.0, 0.6460118139438... 12 [0.0, 0.7591973311141902, 0.0, 0.8171690398746... 13 [0.0, 0.7591973311141902, 0.0, 0.6712930359609... 14 [0.0, 0.7592004620942316, 0.0, 0.6808160058318... hyper_optimizer search_data n_estimators max_depth cv_time score score_cv_avg score_cv_std \ 0 175 10 0.435712 0.300426 0.276476 0.341333 1 100 7 0.242967 0.335800 0.291498 0.357530 2 240 3 0.553861 0.332909 0.313809 0.384670 3 195 9 0.428541 0.292784 0.268997 0.333985 4 140 10 0.339210 0.320703 0.287839 0.353418 5 10 4 0.029351 0.241386 0.169619 0.207891 6 80 5 0.195049 0.313984 0.266638 0.331779 7 115 2 0.231999 0.377976 0.326597 0.402624 8 135 4 0.336672 0.302481 0.271280 0.336175 9 215 2 0.426309 0.358897 0.329566 0.406412 10 70 5 0.170563 0.334608 0.280366 0.345358 11 80 2 0.164777 0.390169 0.325794 0.401664 12 225 5 0.458695 0.289719 0.267997 0.333041 13 200 10 0.437606 0.297075 0.273511 0.338360 14 85 8 0.204948 0.331356 0.282787 0.347915 scores 0 [0.0, 0.7591973311141902, 0.0, 0.6231820929314... 1 [0.0, 0.759202637170055, 0.0, 0.69828739724825... 2 [0.0, 0.7591973311141902, 0.0, 0.8098490909401... 3 [0.0, 0.7591973311141902, 0.0, 0.5857860644759... 4 [0.0, 0.7591974029750898, 0.0, 0.6799969200704... 5 [0.0, 0.43657746289686405, 0.0, 0.411517709726... 6 [0.0, 0.7592408879023933, 0.0, 0.5739502865654... 7 [0.0, 0.743909637104236, 0.0, 0.88907769969297... 8 [0.0, 0.7591974574568147, 0.0, 0.5972046427982... 9 [0.0, 0.7489188791452149, 0.0, 0.8989108289244... 10 [0.0, 0.7593214379685758, 0.0, 0.6425066168565... 11 [0.0, 0.741652812212936, 0.0, 0.88731529803996... 12 [0.0, 0.7591973311141902, 0.0, 0.5807874747754... 13 [0.0, 0.7591973311141902, 0.0, 0.6083562144719... 14 [0.0, 0.7592230854704327, 0.0, 0.6547128579433... hyper_optimizer search_data hyper_optimizer search_data n_estimators max_depth cv_time score score_cv_avg score_cv_std \ 0 190 8 0.425644 0.304549 0.279615 0.344565 1 165 10 0.395935 0.313447 0.285710 0.351074 2 125 8 0.301297 0.314241 0.278716 0.343631 3 210 5 0.442195 0.302296 0.278608 0.343518 4 115 6 0.277537 0.301964 0.265636 0.330850 5 110 4 0.274258 0.325823 0.286284 0.351703 6 105 6 0.252859 0.289422 0.252243 0.319459 7 85 11 0.205357 0.341245 0.291284 0.357288 8 225 4 0.513295 0.301982 0.282499 0.347606 9 160 7 0.378928 0.291803 0.264817 0.330102 10 100 7 0.240047 0.317631 0.275392 0.340238 11 225 2 0.443083 0.356974 0.329067 0.405642 12 90 4 0.223649 0.326463 0.281055 0.346075 13 85 5 0.204921 0.314538 0.268431 0.333452 14 35 4 0.088334 0.331140 0.259791 0.326065 scores 0 [0.0, 0.7591973311141902, 0.0, 0.6388801546981... 1 [0.0, 0.7591973319805099, 0.0, 0.6693539423234... 2 [0.0, 0.7591977059680856, 0.0, 0.6343847529870... 3 [0.0, 0.7591973311141902, 0.0, 0.6338418345754... 4 [0.0, 0.7591984186362237, 0.0, 0.5689810757456... 5 [0.0, 0.7591991773750808, 0.0, 0.6722232597348... 6 [0.0, 0.7592004620942316, 0.0, 0.5020131961419... 7 [0.0, 0.7592230854704327, 0.0, 0.6971975231665... 8 [0.0, 0.7591973311141902, 0.0, 0.6532994289108... 9 [0.0, 0.7591973339658764, 0.0, 0.5648870415883... 10 [0.0, 0.759202637170055, 0.0, 0.61775959245503... 11 [0.0, 0.749907029558438, 0.0, 0.89542940208227... 12 [0.0, 0.7592125490494327, 0.0, 0.6460601110964... 13 [0.0, 0.7592230854704327, 0.0, 0.5829309386913... 14 [0.0, 0.7621839309593008, 0.0, 0.5367704553759... n_estimators max_depth cv_time score score_cv_avg score_cv_std \ 0 50 4 0.134709 0.325120 0.266062 0.331350 1 195 4 0.465079 0.288511 0.267248 0.332341 2 230 2 0.460997 0.355187 0.328721 0.405174 3 70 5 0.171615 0.311101 0.260830 0.326572 4 40 6 0.103232 0.328847 0.262045 0.327909 5 25 7 0.068596 0.341303 0.261078 0.326479 6 195 8 0.429352 0.293719 0.269907 0.334852 7 140 11 0.341563 0.308038 0.276663 0.341524 8 230 3 0.534215 0.333307 0.313052 0.383695 9 235 10 0.466711 0.302454 0.280262 0.345240 10 215 10 0.461447 0.298496 0.276281 0.341135 11 50 7 0.124703 0.330690 0.268540 0.333653 12 215 8 0.447357 0.311976 0.287864 0.353445 13 170 6 0.404659 0.303095 0.276878 0.341742 14 20 4 0.054090 0.362930 0.271103 0.334056 scores 0 [0.0, 0.7601428859741948, 0.0, 0.5701674479267... 1 [0.0, 0.7591973311141902, 0.0, 0.5770435925038... 2 [0.0, 0.7496839489591742, 0.0, 0.8939188384891... 3 [0.0, 0.7593214379685758, 0.0, 0.5448284353802... 4 [0.0, 0.761449900875621, 0.0, 0.54877415913990... 5 [0.0, 0.756928785905888, 0.0, 0.54846060837347... 6 [0.0, 0.7591973311141902, 0.0, 0.5903374255550... 7 [0.0, 0.7591974029750898, 0.0, 0.6241196406447... 8 [0.0, 0.7591973311141902, 0.0, 0.8060615508503... 9 [0.0, 0.7591973311141902, 0.0, 0.6421115372831... 10 [0.0, 0.7591973311141902, 0.0, 0.6222059494211... 11 [0.0, 0.7601428859741948, 0.0, 0.5825584181845... 12 [0.0, 0.7591973311141902, 0.0, 0.6801219906510... 13 [0.0, 0.7591973314288802, 0.0, 0.6251904453111... 14 [0.0, 0.7358195863474217, 0.0, 0.6196937978507...
n_estimators | max_depth | cv_time | score | score_cv_avg | score_cv_std | scores | |
---|---|---|---|---|---|---|---|
0 | 40 | 6 | 0.117105 | 0.339244 | 0.273760 | 0.338809 | [0.0, 0.761449900875621, 0.0, 0.60735008583143... |
1 | 95 | 2 | 0.192001 | 0.385126 | 0.326538 | 0.402701 | [0.0, 0.74171543851855, 0.0, 0.890972364821101... |
2 | 40 | 5 | 0.100304 | 0.346020 | 0.274937 | 0.339974 | [0.0, 0.761449900875621, 0.0, 0.61323489059761... |
3 | 215 | 7 | 0.456671 | 0.290090 | 0.268221 | 0.333253 | [0.0, 0.7591973311141902, 0.0, 0.5819101634954... |
4 | 225 | 8 | 0.460565 | 0.300552 | 0.278131 | 0.343026 | [0.0, 0.7591973311141902, 0.0, 0.6314568449037... |
fig = px.scatter(search_data_gbr_2,
x="n_estimators",
y="max_depth",
color="score",
size='cv_time',
color_continuous_scale=color_scale)
fig.update_layout(width=800, height=700)
fig.show()
fig = px.scatter(search_data_gbr_2,
x="cv_time",
y="score_cv_avg",
color="score",
size='score_cv_std',
color_continuous_scale=color_scale)
fig.update_layout(width=800, height=700)
fig.show()
This chapter describes a very unique and helpful feature of Hyperactive: non-numerical values in the search space. You are not constrained to use numeric values in your search space, but also strings or even functions. Because of this you can do some really interesting stuff like:
- hyperparameter optimization of any parameter
- preprocessing-optimization
- neural architecture search
Lets take a look at the following example:
def mlp_model(opt):
scaler = MinMaxScaler()
X_norm = scaler.fit_transform(X_iris)
mlp = MLPClassifier(
hidden_layer_sizes=opt["hidden_layer_sizes"],
activation=opt["activation"],
alpha=opt["alpha"],
learning_rate_init=opt["learning_rate_init"],
)
scores = cross_val_score(mlp, X_norm, y_iris, cv=5)
return scores.mean()
search_space_mlp = {
"hidden_layer_sizes": list(range(10, 100, 10)),
"activation": ["identity", "logistic", "tanh", "relu"],
"solver": ["lbfgs", "sgd", "adam"],
"alpha": [1/(10**x) for x in range(1, 9)],
"learning_rate_init": [1/(10**x) for x in range(1, 9)],
}
hyper_mlp_0 = Hyperactive(verbosity=False)
hyper_mlp_0.add_search(mlp_model, search_space_mlp, n_iter=40)
hyper_mlp_0.run()
mlp_search_data = hyper_mlp_0.search_data(mlp_model)
mlp_search_data.head()
hyper_optimizer search_data hidden_layer_sizes activation solver alpha learning_rate_init \ 0 20 tanh sgd 1.000000e-01 1.000000e-02 1 40 tanh adam 1.000000e-07 1.000000e-03 2 50 logistic sgd 1.000000e-04 1.000000e-04 3 90 relu lbfgs 1.000000e-08 1.000000e-08 4 90 relu lbfgs 1.000000e-01 1.000000e-01 5 10 relu lbfgs 1.000000e-08 1.000000e-08 6 90 relu adam 1.000000e-01 1.000000e-01 7 90 tanh adam 1.000000e-05 1.000000e-06 8 60 relu lbfgs 1.000000e-08 1.000000e-08 9 40 relu sgd 1.000000e-02 1.000000e-04 10 70 tanh adam 1.000000e-02 1.000000e-03 11 30 identity lbfgs 1.000000e-04 1.000000e-04 12 40 tanh lbfgs 1.000000e-03 1.000000e-04 13 80 logistic lbfgs 1.000000e-02 1.000000e-01 14 20 relu adam 1.000000e-07 1.000000e-05 15 60 relu sgd 1.000000e-07 1.000000e-01 16 60 tanh adam 1.000000e-03 1.000000e-01 17 10 relu adam 1.000000e-01 1.000000e-03 18 20 identity sgd 1.000000e-01 1.000000e-01 19 30 logistic lbfgs 1.000000e-02 1.000000e-06 20 40 tanh lbfgs 1.000000e-01 1.000000e-03 21 70 logistic lbfgs 1.000000e-02 1.000000e-06 22 40 identity adam 1.000000e-06 1.000000e-08 23 70 logistic sgd 1.000000e-07 1.000000e-05 24 30 identity adam 1.000000e-07 1.000000e-01 25 50 logistic sgd 1.000000e-07 1.000000e-08 26 20 relu lbfgs 1.000000e-03 1.000000e-06 27 80 logistic lbfgs 1.000000e-05 1.000000e-08 28 10 tanh sgd 1.000000e-05 1.000000e-08 29 30 identity lbfgs 1.000000e-04 1.000000e-03 30 40 tanh sgd 1.000000e-08 1.000000e-07 31 90 tanh sgd 1.000000e-08 1.000000e-03 32 50 tanh adam 1.000000e-03 1.000000e-01 33 80 relu lbfgs 1.000000e-05 1.000000e-07 34 50 logistic adam 1.000000e-08 1.000000e-04 35 70 tanh sgd 1.000000e-03 1.000000e-03 36 60 tanh adam 1.000000e-03 1.000000e-03 37 70 relu sgd 1.000000e-06 1.000000e-07 38 20 tanh adam 1.000000e-06 1.000000e-07 39 20 identity lbfgs 1.000000e-04 1.000000e-01 score 0 0.973333 1 0.900000 2 0.420000 3 0.306667 4 0.973333 5 0.393333 6 0.973333 7 0.386667 8 0.313333 9 0.466667 10 0.920000 11 0.433333 12 0.480000 13 0.973333 14 0.380000 15 0.973333 16 0.973333 17 0.740000 18 0.973333 19 0.333333 20 0.900000 21 0.266667 22 0.340000 23 0.333333 24 0.973333 25 0.333333 26 0.360000 27 0.320000 28 0.400000 29 0.866667 30 0.253333 31 0.940000 32 0.973333 33 0.333333 34 0.406667 35 0.926667 36 0.926667 37 0.266667 38 0.306667 39 0.973333
hidden_layer_sizes | activation | solver | alpha | learning_rate_init | score | |
---|---|---|---|---|---|---|
0 | 20 | tanh | sgd | 1.000000e-01 | 1.000000e-02 | 0.973333 |
1 | 40 | tanh | adam | 1.000000e-07 | 1.000000e-03 | 0.900000 |
2 | 50 | logistic | sgd | 1.000000e-04 | 1.000000e-04 | 0.420000 |
3 | 90 | relu | lbfgs | 1.000000e-08 | 1.000000e-08 | 0.306667 |
4 | 90 | relu | lbfgs | 1.000000e-01 | 1.000000e-01 | 0.973333 |
parameter_names = list(search_space_mlp.keys())
mlp_search_data = mlp_search_data.sort_values('hidden_layer_sizes', ascending=False)
fig = px.parallel_categories(mlp_search_data,
color="score",
color_continuous_scale=color_scale,
dimensions=parameter_names,
)
fig.update_layout(width=950, height=700)
fig.show()
The optimization of deep learning models can be very difficult because the evaluation times of the objective functions are very high, due to the long training times. There is also the challenge of finding a way to find the optimal structure/architecture of the neural network. Hyperactive can help with both of those problems.
The optimization of the structure/architecture of a neural network is called neural architecture search. Because Hyperactive can handle functions in its search spaces performing nas is very easy.
def deep_learning_model(params):
filters_0 = params["filters.0"]
kernel_size_0 = params["kernel_size.0"]
model = Sequential()
model.add(Conv2D(filters_0, (kernel_size_0, kernel_size_0), input_shape=(img_width, img_height, 1), activation="relu"))
model.add(MaxPooling2D(pool_size=(2, 2)))
# the next two lines are layers that are put in during the optimization run
model = params["layer.0"](params, model)
model = params["layer.1"](params, model)
model.add(Flatten())
model.add(Dense(params["dense.0"], activation="relu"))
model.add(Dense(num_classes, activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(
x_train,
y_train,
validation_data=(x_test, y_test),
epochs=5,
verbose=False,
)
_, score = model.evaluate(x=x_test, y=y_test, verbose=False)
return score
The following functions are the layers and layer-compositions that we will use in the search space. The params
-argument enables the optimization of parameters inside the layer-function. There is also a no_layer
-function because we want to test of it might be better for the score of the neural network if its number of layers is reduced.
def Conv2D_MaxPooling2D_layer(params, model):
filters_1 = params["layer.0.filters"]
kernel_size_1 = params["layer.0.kernel_size"]
model.add(Conv2D(filters_1, (kernel_size_1, kernel_size_1), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
return model
def Conv2D_layer(params, model):
filters_1 = params["layer.0.filters"]
kernel_size_1 = params["layer.0.kernel_size"]
model.add(Conv2D(filters_1, (kernel_size_1, kernel_size_1), activation='relu'))
return model
def Dropout_layer(params, model):
model.add(Dropout(params["layer.1.rate"]))
return model
def no_layer(params, model):
return model
In the search space you can see that the layer-functions are put inside lists. During the optimization run Hyperactive will select those layer-functions similar to any other variable inside the search space.
# you can put the layers into lists like any other variable
search_space_dl = {
"filters.0": list(range(7, 15)),
"kernel_size.0": list(range(3, 6)),
"layer.0": [Conv2D_MaxPooling2D_layer, Conv2D_layer, no_layer],
"layer.0.filters": list(range(5, 12)),
"layer.0.kernel_size": list(range(3, 6)),
"layer.1": [Dropout_layer, no_layer],
"layer.1.rate": list(np.arange(0.2, 0.8, 0.1)),
"dense.0": list(range(10, 200, 20)),
}
Bayesian optimization is a global optimization technique that uses a machine learning model (surrogate model) to approximate the objective function. It relies on a gaussian process regressor fitting to known positions and scores in the search space and predicting where to search next. It follows the following steps:
- fit the gaussian process regressor to the training data (positions in search space) and the target (score of each position).
- the regressor makes a prediction of every position in the search space
- from the predictions an acquisition function is calculated that determines which position to evaluate next
- after the evaluation the algorithm adds the position and score to the training data
Since the regressor is trained in every iteration the optimization step takes a long time compared to other algorithms. This is why it is often used for objective functions with a long evaluation time. The long optimization time does not matter if the evaluation time is even longer. In those cases it is much more important that each new position is carfully selected to avoid wasted time of an evaluation that has a low score.
The following plots show an example of the path a bayesian optimization algorithm would take in different objective functions:
optimizer = BayesianOptimizer()
hyper_dl = Hyperactive(verbosity=False)
hyper_dl.add_search(deep_learning_model, search_space_dl, n_iter=30, optimizer=optimizer)
hyper_dl.run()
dl_search_data = hyper_dl.search_data(deep_learning_model, times=True)
2024-01-02 08:47:40.065310: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2024-01-02 08:47:40.065497: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform. Skipping registering GPU devices...
hyper_optimizer search_data filters.0 kernel_size.0 \ 0 11 3 1 13 4 2 10 4 3 7 5 4 7 5 5 14 5 6 7 3 7 7 3 8 8 3 9 14 5 10 13 3 11 11 3 12 11 4 13 12 5 14 10 3 15 11 3 16 10 4 17 11 3 18 12 3 19 11 3 20 12 3 21 11 3 22 11 3 23 11 3 24 12 3 25 10 3 26 10 3 27 10 3 28 11 3 29 10 3 layer.0 layer.0.filters \ 0 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 5 1 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 11 2 <function Conv2D_layer at 0x7f3a6331a670> 8 3 <function no_layer at 0x7f3a6331a8b0> 11 4 <function no_layer at 0x7f3a6331a8b0> 11 5 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 11 6 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 5 7 <function no_layer at 0x7f3a6331a8b0> 8 8 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 6 9 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 5 10 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 10 11 <function Conv2D_layer at 0x7f3a6331a670> 9 12 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 9 13 <function no_layer at 0x7f3a6331a8b0> 8 14 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 9 15 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 9 16 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 8 17 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 11 18 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 10 19 <function Conv2D_layer at 0x7f3a6331a670> 10 20 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 10 21 <function Conv2D_layer at 0x7f3a6331a670> 10 22 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 10 23 <function no_layer at 0x7f3a6331a8b0> 9 24 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 11 25 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 11 26 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 10 27 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 11 28 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 10 29 <function Conv2D_MaxPooling2D_layer at 0x7f3a6... 10 layer.0.kernel_size layer.1 \ 0 4 <function no_layer at 0x7f3a6331a8b0> 1 5 <function Dropout_layer at 0x7f3a6331a820> 2 4 <function Dropout_layer at 0x7f3a6331a820> 3 5 <function no_layer at 0x7f3a6331a8b0> 4 5 <function no_layer at 0x7f3a6331a8b0> 5 3 <function Dropout_layer at 0x7f3a6331a820> 6 3 <function no_layer at 0x7f3a6331a8b0> 7 3 <function no_layer at 0x7f3a6331a8b0> 8 5 <function no_layer at 0x7f3a6331a8b0> 9 5 <function Dropout_layer at 0x7f3a6331a820> 10 5 <function Dropout_layer at 0x7f3a6331a820> 11 5 <function Dropout_layer at 0x7f3a6331a820> 12 3 <function Dropout_layer at 0x7f3a6331a820> 13 3 <function Dropout_layer at 0x7f3a6331a820> 14 4 <function Dropout_layer at 0x7f3a6331a820> 15 5 <function Dropout_layer at 0x7f3a6331a820> 16 4 <function Dropout_layer at 0x7f3a6331a820> 17 4 <function Dropout_layer at 0x7f3a6331a820> 18 4 <function Dropout_layer at 0x7f3a6331a820> 19 3 <function Dropout_layer at 0x7f3a6331a820> 20 3 <function no_layer at 0x7f3a6331a8b0> 21 4 <function Dropout_layer at 0x7f3a6331a820> 22 4 <function Dropout_layer at 0x7f3a6331a820> 23 3 <function Dropout_layer at 0x7f3a6331a820> 24 4 <function Dropout_layer at 0x7f3a6331a820> 25 5 <function Dropout_layer at 0x7f3a6331a820> 26 4 <function Dropout_layer at 0x7f3a6331a820> 27 3 <function Dropout_layer at 0x7f3a6331a820> 28 5 <function Dropout_layer at 0x7f3a6331a820> 29 5 <function no_layer at 0x7f3a6331a8b0> layer.1.rate dense.0 score 0 0.4 150 0.9879 1 0.2 110 0.9898 2 0.5 90 0.9905 3 0.2 190 0.9871 4 0.2 10 0.9746 5 0.8 10 0.9750 6 0.8 190 0.9836 7 0.2 130 0.9864 8 0.5 70 0.9853 9 0.3 70 0.9824 10 0.2 190 0.9887 11 0.4 150 0.9895 12 0.3 150 0.9900 13 0.2 190 0.9860 14 0.4 130 0.9905 15 0.2 130 0.9893 16 0.4 130 0.9860 17 0.3 150 0.9900 18 0.3 130 0.9908 19 0.4 130 0.9909 20 0.3 150 0.9891 21 0.3 130 0.9902 22 0.4 130 0.9885 23 0.4 130 0.9870 24 0.2 130 0.9882 25 0.3 190 0.9904 26 0.3 170 0.9905 27 0.3 190 0.9842 28 0.3 150 0.9892 29 0.3 170 0.9856
# we need to replace the functions with their names for the plot
def func2str(row):
return row.__name__
dl_search_data["layer.0"] = dl_search_data["layer.0"].apply(func2str)
dl_search_data["layer.1"] = dl_search_data["layer.1"].apply(func2str)
dl_search_data = dl_search_data.drop(["eval_times", "iter_times"], axis=1)
score_max = np.amax(search_data_0["score"])
score_std = search_data_0["score"].std()
dl_search_data_f = dl_search_data[abs(search_data_0["score"]-score_max) < score_std*2]
dl_search_data
filters.0 | kernel_size.0 | layer.0 | layer.0.filters | layer.0.kernel_size | layer.1 | layer.1.rate | dense.0 | score | |
---|---|---|---|---|---|---|---|---|---|
0 | 11 | 3 | Conv2D_MaxPooling2D_layer | 5 | 4 | no_layer | 0.4 | 150 | 0.9879 |
1 | 13 | 4 | Conv2D_MaxPooling2D_layer | 11 | 5 | Dropout_layer | 0.2 | 110 | 0.9898 |
2 | 10 | 4 | Conv2D_layer | 8 | 4 | Dropout_layer | 0.5 | 90 | 0.9905 |
3 | 7 | 5 | no_layer | 11 | 5 | no_layer | 0.2 | 190 | 0.9871 |
4 | 7 | 5 | no_layer | 11 | 5 | no_layer | 0.2 | 10 | 0.9746 |
5 | 14 | 5 | Conv2D_MaxPooling2D_layer | 11 | 3 | Dropout_layer | 0.8 | 10 | 0.9750 |
6 | 7 | 3 | Conv2D_MaxPooling2D_layer | 5 | 3 | no_layer | 0.8 | 190 | 0.9836 |
7 | 7 | 3 | no_layer | 8 | 3 | no_layer | 0.2 | 130 | 0.9864 |
8 | 8 | 3 | Conv2D_MaxPooling2D_layer | 6 | 5 | no_layer | 0.5 | 70 | 0.9853 |
9 | 14 | 5 | Conv2D_MaxPooling2D_layer | 5 | 5 | Dropout_layer | 0.3 | 70 | 0.9824 |
10 | 13 | 3 | Conv2D_MaxPooling2D_layer | 10 | 5 | Dropout_layer | 0.2 | 190 | 0.9887 |
11 | 11 | 3 | Conv2D_layer | 9 | 5 | Dropout_layer | 0.4 | 150 | 0.9895 |
12 | 11 | 4 | Conv2D_MaxPooling2D_layer | 9 | 3 | Dropout_layer | 0.3 | 150 | 0.9900 |
13 | 12 | 5 | no_layer | 8 | 3 | Dropout_layer | 0.2 | 190 | 0.9860 |
14 | 10 | 3 | Conv2D_MaxPooling2D_layer | 9 | 4 | Dropout_layer | 0.4 | 130 | 0.9905 |
15 | 11 | 3 | Conv2D_MaxPooling2D_layer | 9 | 5 | Dropout_layer | 0.2 | 130 | 0.9893 |
16 | 10 | 4 | Conv2D_MaxPooling2D_layer | 8 | 4 | Dropout_layer | 0.4 | 130 | 0.9860 |
17 | 11 | 3 | Conv2D_MaxPooling2D_layer | 11 | 4 | Dropout_layer | 0.3 | 150 | 0.9900 |
18 | 12 | 3 | Conv2D_MaxPooling2D_layer | 10 | 4 | Dropout_layer | 0.3 | 130 | 0.9908 |
19 | 11 | 3 | Conv2D_layer | 10 | 3 | Dropout_layer | 0.4 | 130 | 0.9909 |
20 | 12 | 3 | Conv2D_MaxPooling2D_layer | 10 | 3 | no_layer | 0.3 | 150 | 0.9891 |
21 | 11 | 3 | Conv2D_layer | 10 | 4 | Dropout_layer | 0.3 | 130 | 0.9902 |
22 | 11 | 3 | Conv2D_MaxPooling2D_layer | 10 | 4 | Dropout_layer | 0.4 | 130 | 0.9885 |
23 | 11 | 3 | no_layer | 9 | 3 | Dropout_layer | 0.4 | 130 | 0.9870 |
24 | 12 | 3 | Conv2D_MaxPooling2D_layer | 11 | 4 | Dropout_layer | 0.2 | 130 | 0.9882 |
25 | 10 | 3 | Conv2D_MaxPooling2D_layer | 11 | 5 | Dropout_layer | 0.3 | 190 | 0.9904 |
26 | 10 | 3 | Conv2D_MaxPooling2D_layer | 10 | 4 | Dropout_layer | 0.3 | 170 | 0.9905 |
27 | 10 | 3 | Conv2D_MaxPooling2D_layer | 11 | 3 | Dropout_layer | 0.3 | 190 | 0.9842 |
28 | 11 | 3 | Conv2D_MaxPooling2D_layer | 10 | 5 | Dropout_layer | 0.3 | 150 | 0.9892 |
29 | 10 | 3 | Conv2D_MaxPooling2D_layer | 10 | 5 | no_layer | 0.3 | 170 | 0.9856 |
parameter_names = list(dl_search_data_f.keys())
fig = px.parallel_categories(dl_search_data,
color="score",
color_continuous_scale=color_scale,
dimensions=parameter_names,
)
fig.update_layout(width=950, height=700)
fig.show()