
getml.hyperopt

Automatically find the best parameters for your pipelines.

Enterprise edition

This feature is exclusive to the Enterprise edition and is not available in the Community edition. Discover the benefits of the Enterprise edition and compare the features of both editions.

For licensing information and technical support, please contact us.

Example

The easiest way to conduct a hyperparameter optimization is to use the built-in tuning routines. Note that these tuning routines usually take about a day to complete unless, as in this example, very small data sets are used.

from getml import data
from getml import datasets
from getml import engine
from getml import feature_learning
from getml.feature_learning import aggregations
from getml.feature_learning import loss_functions
from getml import hyperopt
from getml import pipeline
from getml import predictors

# ----------------

engine.set_project("examples")

# ----------------

population_table, peripheral_table = datasets.make_numerical()

# ----------------
# Construct placeholders

population_placeholder = data.Placeholder("POPULATION")
peripheral_placeholder = data.Placeholder("PERIPHERAL")
population_placeholder.join(peripheral_placeholder, "join_key", "time_stamp")

# ----------------

feature_learner1 = feature_learning.Multirel(
    aggregation=[
        aggregations.COUNT,
        aggregations.SUM
    ],
    loss_function=loss_functions.SquareLoss,
    num_features=10,
    share_aggregations=1.0,
    max_length=1,
    num_threads=0
)

# ----------------

feature_learner2 = feature_learning.Relboost(
    loss_function=loss_functions.SquareLoss,
    num_features=10
)

# ----------------

predictor = predictors.LinearRegression()

# ----------------

pipe = pipeline.Pipeline(
    population=population_placeholder,
    peripheral=[peripheral_placeholder],
    feature_learners=[feature_learner1, feature_learner2],
    predictors=[predictor]
)

# ----------------

tuned_pipeline = hyperopt.tune_feature_learners(
    pipeline=pipe,
    population_table_training=population_table,
    population_table_validation=population_table,
    peripheral_tables=[peripheral_table]
)

# ----------------

tuned_pipeline = hyperopt.tune_predictors(
    pipeline=tuned_pipeline,
    population_table_training=population_table,
    population_table_validation=population_table,
    peripheral_tables=[peripheral_table]
)

If you want to define the hyperparameter space and the tuning routine yourself, this is how you can do that:

from getml import data
from getml import datasets
from getml import engine
from getml import feature_learning
from getml.feature_learning import aggregations
from getml.feature_learning import loss_functions
from getml import hyperopt
from getml import pipeline
from getml import predictors

# ----------------

engine.set_project("examples")

# ----------------

population_table, peripheral_table = datasets.make_numerical()

# ----------------
# Construct placeholders

population_placeholder = data.Placeholder("POPULATION")
peripheral_placeholder = data.Placeholder("PERIPHERAL")
population_placeholder.join(peripheral_placeholder, "join_key", "time_stamp")

# ----------------
# Base model - any parameters not included
# in param_space will be taken from this.

feature_learner1 = feature_learning.Multirel(
    aggregation=[
        aggregations.COUNT,
        aggregations.SUM
    ],
    loss_function=loss_functions.SquareLoss,
    num_features=10,
    share_aggregations=1.0,
    max_length=1,
    num_threads=0
)

# ----------------
# Base model - any parameters not included
# in param_space will be taken from this.

feature_learner2 = feature_learning.Relboost(
    loss_function=loss_functions.SquareLoss,
    num_features=10
)

# ----------------
# Base model - any parameters not included
# in param_space will be taken from this.

predictor = predictors.LinearRegression()

# ----------------

pipe = pipeline.Pipeline(
    population=population_placeholder,
    peripheral=[peripheral_placeholder],
    feature_learners=[feature_learner1, feature_learner2],
    predictors=[predictor]
)

# ----------------
# Build a hyperparameter space.
# We have two feature learners and one
# predictor, so this is how we must
# construct our hyperparameter space.
# If we only wanted to optimize the predictor,
# we could just leave out the feature_learners.

param_space = {
    "feature_learners": [
        {
            "num_features": [10, 50],
        },
        {
            "max_depth": [1, 10],
            "min_num_samples": [100, 500],
            "num_features": [10, 50],
            "reg_lambda": [0.0, 0.1],
            "shrinkage": [0.01, 0.4]
        }],
    "predictors": [
        {
            "reg_lambda": [0.0, 10.0]
        }
    ]
}

# ----------------
# Wrap a GaussianHyperparameterSearch around the reference model

gaussian_search = hyperopt.GaussianHyperparameterSearch(
    pipeline=pipe,
    param_space=param_space,
    n_iter=30,
    score=pipeline.scores.rsquared
)

gaussian_search.fit(
    population_table_training=population_table,
    population_table_validation=population_table,
    peripheral_tables=[peripheral_table]
)

# ----------------

# We want 5 additional iterations.
gaussian_search.n_iter = 5

# We do not want another burn-in-phase,
# so we set ratio_iter to 0.
gaussian_search.ratio_iter = 0.0

# This widens the hyperparameter space.
gaussian_search.param_space["feature_learners"][1]["num_features"] = [10, 100]

# This narrows the hyperparameter space.
gaussian_search.param_space["predictors"][0]["reg_lambda"] = [0.0, 0.0]

# This continues the hyperparameter search using the previous iterations as
# prior knowledge.
gaussian_search.fit(
    population_table_training=population_table,
    population_table_validation=population_table,
    peripheral_tables=[peripheral_table]
)

# ----------------

all_hyp = hyperopt.list_hyperopts()

best_pipeline = gaussian_search.best_pipeline

list_hyperopts

list_hyperopts() -> List[str]

Lists all hyperparameter optimization objects present in the Engine.

Note that this function only lists hyperopts which are part of the current project. See set_project for changing projects.

To subsequently load one of them, use load_hyperopt.

RETURNS DESCRIPTION
List[str]

list containing the names of all hyperopts.
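
Example

A minimal sketch (assuming the Engine is running and the "examples" project from above is set) that lists the hyperopts stored in the current project:

from getml import engine
from getml import hyperopt

engine.set_project("examples")

# Print the names of all hyperopts stored in the current project.
for name in hyperopt.list_hyperopts():
    print(name)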

Source code in getml/hyperopt/helpers.py
def list_hyperopts() -> List[str]:
    """Lists all hyperparameter optimization objects present in the Engine.

    Note that this function only lists hyperopts which are part of the
    current project. See [`set_project`][getml.engine.set_project] for
    changing projects.

    To subsequently load one of them, use
    [`load_hyperopt`][getml.hyperopt.load_hyperopt.load_hyperopt].

    Returns:
        list containing the names of all hyperopts.

    """

    cmd: Dict[str, Any] = {}
    cmd["type_"] = "list_hyperopts"
    cmd["name_"] = ""

    with comm.send_and_get_socket(cmd) as sock:
        msg = comm.recv_string(sock)
        if msg != "Success!":
            comm.handle_engine_exception(msg)
        json_str = comm.recv_string(sock)

    return json.loads(json_str)["names"]

tune_feature_learners

tune_feature_learners(
    pipeline: Pipeline,
    container: Container,
    train: str = "train",
    validation: str = "validation",
    n_iter: int = 0,
    score: Optional[str] = None,
    num_threads: int = 0,
) -> Pipeline

A high-level interface for optimizing the feature learners of a Pipeline.

Efficiently optimizes the hyperparameters for the set of feature learners (from feature_learning) of a given pipeline by breaking each feature learner's hyperparameter space down into carefully curated subspaces and optimizing the hyperparameters for each subspace in a sequential multi-step process. For further details about the recipes behind the tuning routines, refer to the documentation on tuning routines.

PARAMETER DESCRIPTION
pipeline

Base pipeline used to derive all models fitted and scored during the hyperparameter optimization. It defines the data schema and any hyperparameters that are not optimized.

TYPE: Pipeline

container

The data container used for the hyperparameter tuning.

TYPE: Container

train

The name of the subset in 'container' used for training.

TYPE: str DEFAULT: 'train'

validation

The name of the subset in 'container' used for validation.

TYPE: str DEFAULT: 'validation'

n_iter

The number of iterations.

TYPE: int DEFAULT: 0

score

The score to optimize. Must be from metrics.

TYPE: Optional[str] DEFAULT: None

num_threads

The number of parallel threads to use. If set to 0, the number of threads will be inferred.

TYPE: int DEFAULT: 0

RETURNS DESCRIPTION
Pipeline

Pipeline containing tuned versions of the feature learners.

Example

We assume that you have already set up your Pipeline and Container.

tuned_pipeline = getml.hyperopt.tune_feature_learners(
    pipeline=base_pipeline,
    container=container)
Source code in getml/hyperopt/tuning.py
def tune_feature_learners(
    pipeline: Pipeline,
    container: Container,
    train: str = "train",
    validation: str = "validation",
    n_iter: int = 0,
    score: Optional[str] = None,
    num_threads: int = 0,
) -> Pipeline:
    """
    A high-level interface for optimizing the feature learners of a
    [`Pipeline`][getml.pipeline.Pipeline].

    Efficiently optimizes the hyperparameters for the set of feature learners
    (from [`feature_learning`][getml.feature_learning]) of a given pipeline by breaking each
    feature learner's hyperparameter space down into carefully curated
    subspaces: `hyperopt_tuning_subspaces` and optimizing the hyperparameters for
    each subspace in a sequential multi-step process.  For further details about
    the actual recipes behind the tuning routines refer
    to tuning routines: `hyperopt_tuning`.

    Args:
        pipeline:
            Base pipeline used to derive all models fitted and scored
            during the hyperparameter optimization. It defines the data
            schema and any hyperparameters that are not optimized.

        container:
            The data container used for the hyperparameter tuning.

        train:
            The name of the subset in 'container' used for training.

        validation:
            The name of the subset in 'container' used for validation.

        n_iter:
            The number of iterations.

        score:
            The score to optimize. Must be from
            [`metrics`][getml.pipeline.metrics].

        num_threads:
            The number of parallel threads to use. If set to 0,
            the number of threads will be inferred.

    Returns:
        Pipeline containing tuned versions of the feature learners.

    ??? example
        We assume that you have already set up your
        [`Pipeline`][getml.Pipeline] and
        [`Container`][getml.data.Container].

        ```python
        tuned_pipeline = getml.hyperopt.tune_feature_learners(
            pipeline=base_pipeline,
            container=container)
        ```
    """

    if not isinstance(pipeline, getml.pipeline.Pipeline):
        raise TypeError("'pipeline' must be a pipeline!")

    pipeline._validate()

    if not score:
        score = _infer_score(pipeline)

    tuned_feature_learners = []

    for feature_learner in pipeline.feature_learners:
        tuned_pipeline = _tune_feature_learner(
            feature_learner=feature_learner,
            pipeline=pipeline,
            container=container,
            train=train,
            validation=validation,
            n_iter=n_iter,
            score=score,
            num_threads=num_threads,
        )

        assert len(tuned_pipeline.feature_learners) == 1, (
            "Expected exactly one feature learner, got "
            + str(len(tuned_pipeline.feature_learners))
        )

        tuned_feature_learners.append(tuned_pipeline.feature_learners[0])

    return _make_final_pipeline(
        pipeline,
        tuned_feature_learners,
        copy.deepcopy(pipeline.predictors),
        container,
        train,
        validation,
    )

tune_predictors

tune_predictors(
    pipeline: Pipeline,
    container: Container,
    train: str = "train",
    validation: str = "validation",
    n_iter: int = 0,
    score: Optional[str] = None,
    num_threads: int = 0,
) -> Pipeline

A high-level interface for optimizing the predictors of a Pipeline.

Efficiently optimizes the hyperparameters for the set of predictors (from getml.predictors) of a given pipeline by breaking each predictor's hyperparameter space down into carefully curated subspaces and optimizing the hyperparameters for each subspace in a sequential multi-step process. For further details about the recipes behind the tuning routines, refer to the documentation on tuning routines.

PARAMETER DESCRIPTION
pipeline

Base pipeline used to derive all models fitted and scored during the hyperparameter optimization. It defines the data schema and any hyperparameters that are not optimized.

TYPE: Pipeline

container

The data container used for the hyperparameter tuning.

TYPE: Container

train

The name of the subset in 'container' used for training.

TYPE: str DEFAULT: 'train'

validation

The name of the subset in 'container' used for validation.

TYPE: str DEFAULT: 'validation'

n_iter

The number of iterations.

TYPE: int DEFAULT: 0

score

The score to optimize. Must be from metrics.

TYPE: Optional[str] DEFAULT: None

num_threads

The number of parallel threads to use. If set to 0, the number of threads will be inferred.

TYPE: int DEFAULT: 0

Example

We assume that you have already set up your Pipeline and Container.

tuned_pipeline = getml.hyperopt.tune_predictors(
    pipeline=base_pipeline,
    container=container)
RETURNS DESCRIPTION
Pipeline

Pipeline containing tuned predictors.

Source code in getml/hyperopt/tuning.py
def tune_predictors(
    pipeline: Pipeline,
    container: Container,
    train: str = "train",
    validation: str = "validation",
    n_iter: int = 0,
    score: Optional[str] = None,
    num_threads: int = 0,
) -> Pipeline:
    """
    A high-level interface for optimizing the predictors of a
    [`Pipeline`][getml.Pipeline].

    Efficiently optimizes the hyperparameters for the set of predictors (from
    [`getml.predictors`][getml.predictors]) of a given pipeline by breaking each
    predictor's
    hyperparameter space down into carefully curated
    subspaces: `hyperopt_tuning_subspaces` and optimizing the hyperparameters for
    each subspace in a sequential multi-step process.  For further details about
    the actual recipes behind the tuning routines refer to
    tuning routines: `hyperopt_tuning`.

    Args:
        pipeline:
            Base pipeline used to derive all models fitted and scored
            during the hyperparameter optimization. It defines the data
            schema and any hyperparameters that are not optimized.

        container:
            The data container used for the hyperparameter tuning.

        train:
            The name of the subset in 'container' used for training.

        validation:
            The name of the subset in 'container' used for validation.

        n_iter:
            The number of iterations.

        score:
            The score to optimize. Must be from
            [`metrics`][getml.pipeline.metrics].

        num_threads:
            The number of parallel threads to use. If set to 0,
            the number of threads will be inferred.

    ??? example
        We assume that you have already set up your
        [`Pipeline`][getml.Pipeline] and
        [`Container`][getml.data.Container].

        ```python
        tuned_pipeline = getml.hyperopt.tune_predictors(
            pipeline=base_pipeline,
            container=container)
        ```

    Returns:
        Pipeline containing tuned predictors.

    """

    if not isinstance(pipeline, getml.pipeline.Pipeline):
        raise TypeError("'pipeline' must be a pipeline!")

    pipeline._validate()

    if not score:
        score = _infer_score(pipeline)

    tuned_predictors = []

    for predictor in pipeline.predictors:
        tuned_pipeline = _tune_predictor(
            predictor=predictor,
            pipeline=pipeline,
            container=container,
            train=train,
            validation=validation,
            n_iter=n_iter,
            score=score,
            num_threads=num_threads,
        )

        assert len(tuned_pipeline.predictors) == 1, (
            "Expected exactly one predictor, got " + str(len(tuned_pipeline.predictors))
        )

        tuned_predictors.append(tuned_pipeline.predictors[0])

    return _make_final_pipeline(
        pipeline,
        copy.deepcopy(pipeline.feature_learners),
        tuned_predictors,
        container,
        train,
        validation,
    )

exists

exists(name: str) -> bool

Determines whether a hyperopt exists.

PARAMETER DESCRIPTION
name

The name of the hyperopt.

TYPE: str

RETURNS DESCRIPTION
bool

A boolean indicating whether a hyperopt named 'name' exists.
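
Example

A minimal sketch (the name "my_search" is a placeholder for a hyperopt previously saved in the current project):

from getml import hyperopt

# Only load the hyperopt if it actually exists in the current project.
if hyperopt.exists("my_search"):
    my_search = hyperopt.load_hyperopt("my_search")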

Source code in getml/hyperopt/helpers.py
def exists(name: str) -> bool:
    """Determines whether a hyperopt exists.

    Args:
        name: The name of the hyperopt.

    Returns:
        A boolean indicating whether a hyperopt named 'name' exists.
    """
    if not isinstance(name, str):
        raise TypeError("'name' must be of type str")

    return name in list_hyperopts()

delete

delete(name: str) -> None

If a hyperopt named 'name' exists, it is deleted.

PARAMETER DESCRIPTION
name

The name of the hyperopt.

TYPE: str
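
Example

A minimal sketch (the name "my_search" is a placeholder for an existing hyperopt):

from getml import hyperopt

# Deleting is a no-op if no hyperopt of that name exists.
hyperopt.delete("my_search")
assert not hyperopt.exists("my_search")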

Source code in getml/hyperopt/helpers.py
def delete(name: str) -> None:
    """
    If a hyperopt named 'name' exists, it is deleted.

    Args:
        name: The name of the hyperopt.
    """

    if not exists(name):
        return

    cmd: Dict[str, Any] = {}

    cmd["type_"] = "Hyperopt.delete"
    cmd["name_"] = name

    with comm.send_and_get_socket(cmd) as sock:
        msg = comm.recv_string(sock)
        if msg != "Success!":
            comm.handle_engine_exception(msg)

load_hyperopt

load_hyperopt(
    name: str,
) -> Union[GaussianHyperparameterSearch, LatinHypercubeSearch, RandomSearch]

Loads a hyperparameter optimization object from the getML Engine into Python.

PARAMETER DESCRIPTION
name

The name of the hyperopt to be loaded.

TYPE: str

RETURNS DESCRIPTION
Union[GaussianHyperparameterSearch, LatinHypercubeSearch, RandomSearch]

The hyperopt object.
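
Example

A minimal sketch (assuming a previous session stored a hyperopt named "my_search" in the "examples" project):

from getml import engine
from getml import hyperopt

engine.set_project("examples")

# Reload a stored hyperopt by name to inspect or continue it in Python.
if "my_search" in hyperopt.list_hyperopts():
    gaussian_search = hyperopt.load_hyperopt("my_search")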

Source code in getml/hyperopt/load_hyperopt.py
def load_hyperopt(
    name: str,
) -> Union[GaussianHyperparameterSearch, LatinHypercubeSearch, RandomSearch]:
    """Loads a hyperparameter optimization object from the getML Engine into Python.

    Args:
        name:
            The name of the hyperopt to be loaded.

    Returns:
        The hyperopt object.

    """
    # This will be overwritten by .refresh(...) anyway
    dummy_pipeline = _make_dummy("123456")

    dummy_param_space = {"predictors": [{"reg_lambda": [0.0, 1.0]}]}

    json_obj = _get_json_obj(name)

    if json_obj["type_"] == "GaussianHyperparameterSearch":
        return GaussianHyperparameterSearch(
            param_space=dummy_param_space, pipeline=dummy_pipeline
        )._parse_json_obj(json_obj)

    if json_obj["type_"] == "LatinHypercubeSearch":
        return LatinHypercubeSearch(
            param_space=dummy_param_space, pipeline=dummy_pipeline
        )._parse_json_obj(json_obj)

    if json_obj["type_"] == "RandomSearch":
        return RandomSearch(
            param_space=dummy_param_space, pipeline=dummy_pipeline
        )._parse_json_obj(json_obj)

    raise ValueError("Unknown type: '" + json_obj["type_"] + "'!")

kernels

Collection of kernel functions to be used by the hyperparameter optimizations.

exp module-attribute

exp = 'exp'

An exponential kernel yielding non-differentiable sample paths.

gauss module-attribute

gauss = 'gauss'

A Gaussian kernel yielding analytic (infinitely differentiable) sample paths.

matern32 module-attribute

matern32 = 'matern32'

A Matérn 3/2 kernel yielding once-differentiable sample paths.

matern52 module-attribute

matern52 = 'matern52'

A Matérn 5/2 kernel yielding twice-differentiable sample paths.
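
The kernel constants are plain strings; a minimal sketch, assuming only that the constants are importable as documented above:

from getml.hyperopt import kernels

# Each constant is just the string identifier the Engine expects.
assert kernels.matern32 == "matern32"

# Smoother kernels yield smoother surrogate models of the score surface:
# exp < matern32 < matern52 < gauss in terms of differentiability.
print(kernels.exp, kernels.matern32, kernels.matern52, kernels.gauss)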

optimization

Collection of optimization algorithms to be used by the hyperparameter optimizations.

bfgs module-attribute

bfgs = 'bfgs'

Broyden-Fletcher-Goldfarb-Shanno optimization algorithm.

The BFGS algorithm is a quasi-Newton method that requires the function to be differentiable.

nelder_mead module-attribute

nelder_mead = 'nelderMead'

Nelder-Mead optimization algorithm.

Nelder-Mead is a direct search method that does not require functions to be differentiable.
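
Both constants are plain strings; a minimal sketch, assuming only that the constants are importable as documented above:

from getml.hyperopt import optimization

# The Engine expects these exact string identifiers.
print(optimization.bfgs)         # 'bfgs' - requires a differentiable objective
print(optimization.nelder_mead)  # 'nelderMead' - derivative-free direct search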

burn_in

Collection of burn-in algorithms to be used by the hyperparameter optimizations.

latin_hypercube module-attribute

latin_hypercube = 'latinHypercube'

Samples from the hyperparameter space almost randomly, but ensures that the different draws are sufficiently different from each other.

random module-attribute

random = 'random'

Samples from the hyperparameter space at random.
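
The constants from kernels, optimization, and burn_in are typically supplied to the GaussianHyperparameterSearch constructor. The sketch below reuses pipe and param_space from the example at the top of this page; the keyword names gaussian_kernel, optimization_algorithm, and surrogate_burn_in_algorithm are assumptions and should be checked against the GaussianHyperparameterSearch reference:

from getml import hyperopt
from getml.hyperopt import burn_in, kernels, optimization

# NOTE: pipe and param_space are assumed to be defined as in the example
# above, and the keyword names below are assumptions; consult the
# GaussianHyperparameterSearch reference for the exact signature.
gaussian_search = hyperopt.GaussianHyperparameterSearch(
    pipeline=pipe,
    param_space=param_space,
    n_iter=30,
    gaussian_kernel=kernels.matern52,                     # twice-differentiable surrogate
    optimization_algorithm=optimization.nelder_mead,      # derivative-free acquisition search
    surrogate_burn_in_algorithm=burn_in.latin_hypercube,  # space-filling burn-in draws
)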