Using the OPTaaS Python Client, you can optimize any scikit-learn pipeline. For each step or estimator in the pipeline, OPTaaS just needs to know what parameters to optimize and what constraints will apply to them.
We have provided pre-defined parameters and constraints for some of the most widely used estimators, such as Random Forest and XGBoost (there is a tutorial on how to use them), but you can easily optimize any estimator, whether or not it's part of the sklearn library. Here's an example:
First we create a class that extends our OptimizableBaseEstimator mixin. You'll notice there is an abstract method that we will need to implement:
from mindfoundry.optaas.client.sklearn_pipelines.mixin import OptimizableBaseEstimator
from mindfoundry.optaas.client.sklearn_pipelines.parameter_maker import SklearnParameterMaker
from mindfoundry.optaas.client.sklearn_pipelines.utils import ParametersAndConstraints
class MyEstimator(OptimizableBaseEstimator):
    """Skeleton estimator showing the abstract method required by OptimizableBaseEstimator."""

    def make_parameters_and_constraints(self, sk: SklearnParameterMaker, **kwargs) -> ParametersAndConstraints:
        # To be implemented: return a (parameters, constraints) tuple telling
        # OPTaaS which hyperparameters to optimize and how they interact.
        pass
For each of our estimator's hyperparameters that we wish to optimize, we will create a corresponding OPTaaS parameter.
The first argument to our method is a SklearnParameterMaker. We will use this to create our parameters, i.e. we call sk.CategoricalParameter instead of just CategoricalParameter.
This will ensure that each parameter is automatically assigned a unique id and a default value, which is based on the values set in the estimator's constructor. The parameter name should be exactly the same as the name of the argument in our constructor:
class MyEstimator(OptimizableBaseEstimator):
    """Example estimator with a single categorical hyperparameter."""

    def __init__(self, cat_param='abc'):
        self.cat_param = cat_param

    def make_parameters_and_constraints(self, sk: SklearnParameterMaker, **kwargs) -> ParametersAndConstraints:
        # The parameter name must match the constructor argument exactly, so
        # that the default can be derived from the constructed instance.
        cat = sk.CategoricalParameter("cat_param", values=['abc', 'def', 'ghi'])
        parameters = [cat]
        constraints = []
        return parameters, constraints
# Show each instance alongside the default OPTaaS infers from its constructor
# arguments (outputs reproduced below).
display(f"{MyEstimator()} (default = 'abc')")
display(f"{MyEstimator(cat_param='def')} (default = 'def')")
"MyEstimator(cat_param='abc') (default = 'abc')"
"MyEstimator(cat_param='def') (default = 'def')"
Convenience methods and constants are provided to help us model all the different scenarios we might come across:
from mindfoundry.optaas.client.sklearn_pipelines.utils import SMALLEST_NUMBER_ABOVE_ZERO, LARGEST_NUMBER_BELOW_ONE
class MyEstimator(OptimizableBaseEstimator):
    """Showcase of the parameter types available via SklearnParameterMaker."""

    def make_parameters_and_constraints(self, sk: SklearnParameterMaker, **kwargs) -> ParametersAndConstraints:
        # A float value in the range (0, 1) exclusive
        exclusive_float = sk.FloatParameter(
            "float_param", minimum=SMALLEST_NUMBER_ABOVE_ZERO, maximum=LARGEST_NUMBER_BELOW_ONE)

        # Either an integer or the string 'auto' (there is also FloatOrAuto for floats)
        int_or_auto = sk.IntOrAuto("int_or_auto", minimum=0, maximum=10)

        # Either an integer or None
        optional_int = sk.IntParameter("int_or_none", minimum=0, maximum=10, optional=True)

        # An integer or a float or a string
        multi_type = sk.ChoiceParameter("multi_type_param", choices=[
            sk.IntParameter("an_int", minimum=5, maximum=10),
            sk.FloatParameter("a_float", minimum=0, maximum=1),
            sk.CategoricalParameter("a_string", values=['abc', 'xyz'])
        ])

        # A list of values, e.g. [1, 0.2, 'c']
        list_of_stuff = sk.GroupParameter("list_of_stuff", items=[
            sk.IntParameter("an_int", minimum=0, maximum=5),
            sk.FloatParameter("a_float", minimum=0, maximum=0.5),
            sk.CategoricalParameter("a_string", values=['a', 'b', 'c']),
        ])

        # A dict value, e.g. {'alpha': 0.5, 'beta': 13}
        dict_param = sk.DictParameter("dict_param", items=[
            sk.FloatParameter('alpha', minimum=0.5, maximum=1),
            sk.IntParameter('beta', minimum=10, maximum=20)
        ])

        parameters = [exclusive_float, int_or_auto, optional_int,
                      multi_type, list_of_stuff, dict_param]
        return parameters, []
Some estimators require additional information in order to optimize their hyperparameters, e.g. for PCA and ICA we need to know how many features are in our dataset, so that we can set a maximum value for the n_components parameter.
These arguments are provided when a Task is created, and they are made available here as kwargs. We can use the get_required_kwarg method to raise an error if an argument has not been provided:
from sklearn.decomposition import PCA as BasePCA
class PCA(BasePCA, OptimizableBaseEstimator):
    """sklearn PCA made optimizable; requires a 'feature_count' kwarg to bound n_components."""

    def make_parameters_and_constraints(self, sk: SklearnParameterMaker, **kwargs) -> ParametersAndConstraints:
        # 'feature_count' must have been supplied when the task was created.
        feature_count = self.get_required_kwarg(kwargs, 'feature_count')
        if self.svd_solver == 'arpack':
            # arpack requires n_components strictly below the feature count.
            max_n_components = feature_count - 1
        else:
            max_n_components = feature_count
        parameters = [
            sk.IntParameter('n_components', minimum=1, maximum=max_n_components),
            sk.BoolParameter('whiten'),
        ]
        return parameters, []
Where necessary, we can also implement some constraints to prevent OPTaaS from generating a configuration which our constructor would not accept:
from mindfoundry.optaas.client.constraint import Constraint
class MyEstimator(OptimizableBaseEstimator):
    """Example estimator whose constructor arguments are inter-dependent.

    The dependency is expressed to OPTaaS via a Constraint so that it never
    generates a configuration the constructor would reject.
    """

    def __init__(self, cat_param='abc', bool_param=True):
        if cat_param == 'abc' and not bool_param:
            raise ValueError('Invalid combination of arguments')
        self.cat_param = cat_param
        self.bool_param = bool_param

    def make_parameters_and_constraints(self, sk: SklearnParameterMaker, **kwargs) -> ParametersAndConstraints:
        cat_param = sk.CategoricalParameter("cat_param", values=['abc', 'def', 'ghi'])
        bool_param = sk.BoolParameter("bool_param")
        # Mirror the constructor check: when cat_param is 'abc', bool_param must be True.
        # '== True' is deliberate - it builds a constraint expression, not a plain bool.
        constraint = Constraint(when=cat_param == 'abc', then=bool_param == True)
        return [cat_param, bool_param], [constraint]

    def fit(self, X=None, y=None):
        # Bug fix: the original 'def fit():' omitted 'self', so calling
        # estimator.fit(...) on an instance would raise a TypeError.
        # sklearn-compatible estimators take (self, X, y).
        pass
We now create a task using our new estimator. As you can see, all the parameters and constraints have been generated as expected, and the defaults have been set.
from mindfoundry.optaas.client.client import OPTaaSClient
from mindfoundry.optaas.client.sklearn_pipelines.mixin import OptimizablePipeline
# Connect to the OPTaaS service (requires a valid API key).
client = OPTaaSClient('https://optaas.mindfoundry.ai', '<Your OPTaaS API key>')

# Extra kwargs (here feature_count) are forwarded to each estimator's
# make_parameters_and_constraints via **kwargs.
task = client.create_sklearn_task(
    title='My Custom Estimator Task',
    pipeline=OptimizablePipeline([
        ('pca', PCA(svd_solver='arpack')),
        ('my estimator', MyEstimator(cat_param='def'))
    ]),
    feature_count=20
)

display(task.parameters, task.constraints)
[{'id': 'pipeline', 'name': 'pipeline', 'type': 'group', 'items': [{'id': 'pipeline__pca', 'name': 'pca', 'type': 'group', 'items': [{'id': 'pipeline__pca__n_components', 'name': 'n_components', 'type': 'integer', 'minimum': 1, 'maximum': 19}, {'id': 'pipeline__pca__whiten', 'name': 'whiten', 'type': 'boolean', 'default': False}]}, {'id': 'pipeline__my-estimator', 'name': 'my estimator', 'type': 'group', 'items': [{'id': 'pipeline__my-estimator__cat_param', 'name': 'cat_param', 'type': 'categorical', 'default': 'def', 'enum': ['abc', 'def', 'ghi']}, {'id': 'pipeline__my-estimator__bool_param', 'name': 'bool_param', 'type': 'boolean', 'default': True}]}]}]
["if #pipeline__my-estimator__cat_param == 'abc' then #pipeline__my-estimator__bool_param == true"]
We can now generate some configurations for our task and use them to create pipelines:
# Ask OPTaaS for 5 configurations and build a runnable pipeline from each one.
configurations = task.generate_configurations(5)
for configuration in configurations:
    pipeline = task.make_pipeline(configuration)
    display(pipeline)
Pipeline(memory=None, steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None, svd_solver='arpack', tol=0.0, whiten=False)), ('my estimator', MyEstimator(bool_param=True, cat_param='def'))])
Pipeline(memory=None, steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=13, random_state=None, svd_solver='arpack', tol=0.0, whiten=False)), ('my estimator', MyEstimator(bool_param=True, cat_param='def'))])
Pipeline(memory=None, steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=8, random_state=None, svd_solver='arpack', tol=0.0, whiten=True)), ('my estimator', MyEstimator(bool_param=True, cat_param='abc'))])
Pipeline(memory=None, steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=14, random_state=None, svd_solver='arpack', tol=0.0, whiten=True)), ('my estimator', MyEstimator(bool_param=False, cat_param='def'))])
Pipeline(memory=None, steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=13, random_state=None, svd_solver='arpack', tol=0.0, whiten=False)), ('my estimator', MyEstimator(bool_param=True, cat_param='ghi'))])
Any estimator can be an optional step in a pipeline by simply calling optional_step(estimator), as demonstrated [here](Scikit-learn Pipelines.ipynb). However, if you want your estimator to always be optional, you can simply use the OptionalStepMixin:
from mindfoundry.optaas.client.sklearn_pipelines.mixin import OptionalStepMixin
class MyOptionalEstimator(MyEstimator, OptionalStepMixin):
    """Same as MyEstimator, but always treated as an optional pipeline step."""
    pass