This page introduces hyperparameters and distributions in Neuraxle. You can find the Hyperparameter Distribution API here, and the Hyperparameter Samples API here.
A hyperparameter is a parameter drawn from a prior distribution. Neuraxle provides a few built-in distributions and is also compatible with scipy distributions.
Create a Uniform Distribution:
from neuraxle.hyperparams.distributions import Uniform
hd = Uniform(
min_included=-10,
max_included=10,
null_default_value=0
)
Sample the random variable using rvs:
sample = hd.rvs()
print(sample)
-5.873595345744127
Nullify the random variable using nullify:
nullified_sample = hd.nullify()
assert nullified_sample == hd.null_default_value
Get the probability distribution function value at x
using pdf:
pdf = hd.pdf(1)
print('pdf: {}'.format(pdf))
pdf: 0.05
Get the cumulative probability distribution function value at x
using cdf
cdf = hd.cdf(1)
print('cdf: {}'.format(cdf))
cdf: 0.55
In Neuraxle, each step has hyperparams of type HyperparameterSamples, and spaces of type HyperparameterSpace.
Consider a simple pipeline that contains 2 MultiplyByN steps, and one PCA component inside a nested pipeline:
from sklearn.decomposition import PCA
from neuraxle.hyperparams.distributions import RandInt
from neuraxle.hyperparams.space import HyperparameterSpace, HyperparameterSamples
from neuraxle.pipeline import Pipeline
from neuraxle.steps.numpy import MultiplyByN
p = Pipeline([
('step1', MultiplyByN(2)),
('step2', MultiplyByN(2)),
Pipeline([
PCA(n_components=4)
])
])
We can set or update the hyperparams and spaces by doing the following:
p.set_hyperparams(HyperparameterSamples({
'step1__multiply_by': 42,
'step2__multiply_by': -10,
'Pipeline__PCA__n_components': 2
}))
p.update_hyperparams(HyperparameterSamples({
'Pipeline__PCA__n_components': 3
}))
p.set_hyperparams_space(HyperparameterSpace({
'step1__multiply_by': RandInt(42, 50),
'step2__multiply_by': RandInt(-10, 0),
'Pipeline__PCA__n_components': RandInt(2, 3)
}))
Pipeline ( Pipeline( name=Pipeline, hyperparameters=HyperparameterSamples([('step1', RecursiveDict([('multiply_by', 42)])), ('step2', RecursiveDict([('multiply_by', -10)])), ('Pipeline', RecursiveDict([('PCA', RecursiveDict([('n_components', 3)]))]))]) )( [('step1', MultiplyByN( name=step1, hyperparameters=HyperparameterSamples([('multiply_by', 42)]) )), ('step2', MultiplyByN( name=step2, hyperparameters=HyperparameterSamples([('multiply_by', -10)]) )), ('Pipeline', Pipeline ( Pipeline( name=Pipeline, hyperparameters=HyperparameterSamples([('PCA', RecursiveDict([('n_components', 3)]))]) )( [('PCA', <neuraxle.steps.sklearn.SKLearnWrapper(PCA(...)) object 0x108906810>)] ) ))] ) )
We can sample the space of random variables:
samples = p.get_hyperparams_space().rvs()
assert 42 <= samples['step1__multiply_by'] <= 50
assert -10 <= samples['step2__multiply_by'] <= 0
assert samples['Pipeline__PCA__n_components'] in [2, 3]
We can get all hyperparams:
samples = p.get_hyperparams()
assert 42 <= samples['step1__multiply_by'] <= 50
assert -10 <= samples['step2__multiply_by'] <= 0
assert samples['Pipeline__PCA__n_components'] in [2, 3]
assert p['Pipeline']['PCA'].get_wrapped_sklearn_predictor().n_components in [2, 3]
To define a scipy distribution that is compatible with Neuraxle, you need to wrap the scipy distribution with ScipyDistributionWrapper:
from neuraxle.hyperparams.scipy_distributions import ScipyDistributionWrapper, BaseContinuousDistribution, BaseDiscreteDistribution
from scipy.integrate import quad
from scipy.special import factorial
from scipy.stats import rv_continuous, norm, rv_discrete, rv_histogram, truncnorm, randint
import numpy as np
import math
hd = ScipyDistributionWrapper(
scipy_distribution=randint(low=0, high=10),
is_continuous=False,
null_default_value=0
)
For discrete distributions that inherit from rv_discrete, you only need to implement _pmf. The rest is taken care of automatically by scipy.
For example, here is a discrete Poisson distribution:
class Poisson(BaseDiscreteDistribution):
def __init__(self, min_included: float, max_included: float, null_default_value: float = None, mu=0.6):
super().__init__(
min_included=min_included,
max_included=max_included,
name='poisson',
null_default_value=null_default_value
)
self.mu = mu
def _pmf(self, x):
return math.exp(-self.mu) * self.mu ** x / factorial(x)
For continuous distributions that inherit from rv_continuous, you only need to implement the _pdf function. The rest is taken care of automatically by scipy.
For example, here is a continuous Gaussian distribution:
class Gaussian(BaseContinuousDistribution):
def __init__(self, min_included: int, max_included: int, null_default_value: float = None):
self.max_included = max_included
self.min_included = min_included
BaseContinuousDistribution.__init__(
self,
name='gaussian',
min_included=min_included,
max_included=max_included,
null_default_value=null_default_value
)
def _pdf(self, x):
return math.exp(-x ** 2 / 2.) / np.sqrt(2.0 * np.pi)
If you want to add more properties to compute your distributions, just add them to self. They will be available in all of the scipy private methods you can override, such as _pmf and _pdf.
class LogNormal(BaseContinuousDistribution):
def __init__(
self,
log2_space_mean: float,
log2_space_std: float,
hard_clip_min: float,
hard_clip_max: float,
null_default_value: float = None
):
if null_default_value is None:
null_default_value = hard_clip_min
if hard_clip_min is None:
hard_clip_min = np.nan
if hard_clip_max is None:
hard_clip_max = np.nan
self.log2_space_mean = log2_space_mean
self.log2_space_std = log2_space_std
super().__init__(
name='log_normal',
min_included=hard_clip_min,
max_included=hard_clip_max,
null_default_value=null_default_value
)
def _pdf(self, x):
if x <= 0:
return 0.
cdf_min = 0.
cdf_max = 1.
pdf_x = 1 / (x * math.log(2) * self.log2_space_std * math.sqrt(2 * math.pi)) * math.exp(
-(math.log2(x) - self.log2_space_mean) ** 2 / (2 * self.log2_space_std ** 2))
return pdf_x / (cdf_max - cdf_min)
All of the scipy distribution methods are available:
def get_many_samples_for(hd, num_trial):
return [hd.rvs() for _ in range(num_trial)]
samples = get_many_samples_for(hd, 1000)
for s in samples:
assert type(s) == int
hd = Gaussian(min_included=0, max_included=10, null_default_value=0)
assert 0.0 <= hd.rvs() <= 10.0
assert hd.pdf(10) < 0.001
assert hd.pdf(0) < 0.42
assert 0.55 > hd.cdf(5.0) > 0.45
assert hd.cdf(0) == 0.0
assert hd.logpdf(5) == -13.418938533204672
assert hd.logcdf(5) == -0.6931477538632531
assert hd.sf(5) == 0.5000002866515718
assert hd.logsf(5) == -0.693146607256966
assert np.all(hd.ppf([0.0, 0.01, 0.05, 0.1, 1 - 0.10, 1 - 0.05, 1 - 0.01, 1.0], 10))
assert np.isclose(hd.moment(2), 50.50000000091249)
assert hd.stats()[0]
assert hd.stats()[1]
assert np.array_equal(hd.entropy(), np.array(0.7094692666023363))
assert hd.median()
assert hd.mean() == 5.398942280397029
assert np.isclose(hd.std(), 4.620759921685374)
assert np.isclose(hd.var(), 21.35142225385382)
assert np.isclose(hd.expect(), 0.39894228040143276)
interval = hd.interval(alpha=[0.25, 0.50])
assert np.all(interval[0])
assert np.all(interval[1])
assert hd.support() == (0, 10)
SKLearnWrapper wraps sklearn predictors so that they can be compatible with Neuraxle. When you set the hyperparams of an SKLearnWrapper, it automatically sets the params of the sklearn predictor for you:
from neuraxle.hyperparams.distributions import Choice
from neuraxle.hyperparams.distributions import RandInt
from neuraxle.hyperparams.space import HyperparameterSpace
from neuraxle.steps.sklearn import SKLearnWrapper
from sklearn.tree import DecisionTreeClassifier
decision_tree_classifier = SKLearnWrapper(
DecisionTreeClassifier(),
HyperparameterSpace({
'criterion': Choice(['gini', 'entropy']),
'splitter': Choice(['best', 'random']),
'min_samples_leaf': RandInt(2, 5),
'min_samples_split': RandInt(1, 3)
})
).set_hyperparams(HyperparameterSamples({
'criterion': 'gini',
'splitter': 'best',
'min_samples_leaf': 3,
'min_samples_split': 3
}))