"""
The :mod:`physlearn.supervised.utils._estimator_checks` module provides
basic utilities for automated estimator checking.
"""
# Author: Alex Wozniakowski
# License: MIT
import Levenshtein
import os
import re
import warnings
import numpy as np
from physlearn.loss import LOSS_FUNCTIONS
from physlearn.supervised.utils._definition import (_BAYESOPTCV_INIT_PARAMS,
_ESTIMATOR_DICT,
_OPTIMIZE_METHOD,
_PIPELINE_PARAMS,
_SEARCH_METHOD)
[docs]def _basic_autocorrect(init_choice: str, candidate_choices: list) -> str:
"""Chooses the candidate string that minimizes the edit distance.
Parameters
----------
init_choice : str
Specify the initial choice as a string, e.g., the Scikit-Learn class
Ridge as 'ridge', 'Ridge', 'RIDGE', etc.
candidate_choices : list
A list of candidate choices, where each candidate is a string.
Returns
-------
out_choice : str
Notes
-----
The edit distance between the initial choice and each possible choice corresponds
to the Levenshtein distance, which uses the operations of insertion, removal, or
substitution to count the distance.
"""
assert isinstance(init_choice, str)
assert isinstance(candidate_choices, list)
min_dist = np.inf
out_choice = init_choice
for candidate_choice in candidate_choices:
dist = Levenshtein.distance(init_choice, candidate_choice)
if dist < min_dist:
min_dist = dist
out_choice = candidate_choice
if min_dist > 0:
warnings.warn(f'{init_choice} was misspelled, so we replaced it with {out_choice}.',
UserWarning)
return out_choice
def _check_estimator_choice(estimator_choice: str, estimator_type: str,
                            estimator_choices=None) -> str:
    """Autocorrects an estimator name against the known estimator names.

    Parameters
    ----------
    estimator_choice : str
        The requested estimator, e.g., the Scikit-Learn class Ridge
        given as 'ridge', 'Ridge', 'RIDGE', etc.

    estimator_type : str
        The supervised learning task, e.g., regression. It is used as the
        key into ``_ESTIMATOR_DICT`` when no candidates are supplied.

    estimator_choices : list or None, optional (default=None)
        The candidate estimator names, each one a string. If None, then
        the keys of ``_ESTIMATOR_DICT[estimator_type]`` are used.

    Returns
    -------
    estimator_choice : str
        The candidate selected by ``_basic_autocorrect`` for the stripped,
        lower-cased estimator choice.
    """

    assert isinstance(estimator_choice, str) and isinstance(estimator_type, str)

    if estimator_choices is None:
        # Fall back on every estimator registered for this task.
        estimator_choices = list(_ESTIMATOR_DICT[estimator_type].keys())
    else:
        assert isinstance(estimator_choices, list)
        for candidate in estimator_choices:
            assert isinstance(candidate, str)

    normalized_choice = estimator_choice.strip().lower()
    return _basic_autocorrect(init_choice=normalized_choice,
                              candidate_choices=estimator_choices)
def _check_stacking_layer(stacking_layer: dict, estimator_type: str) -> dict:
    """Autocorrects the estimator names in the first and second stacking layers.

    Parameters
    ----------
    stacking_layer : dict
        The estimator(s) in the first stacking layer, under the key
        'estimators' or 'regressors', and the final estimator in the second
        stacking layer, under the key 'final_estimator' or 'final_regressor'.

    estimator_type : str
        The supervised learning task, e.g., regression.

    Returns
    -------
    stacking_layer : dict
        The same dictionary object, with its values replaced in place
        by the autocorrected estimator names.

    Raises
    ------
    KeyError
        If the dictionary contains a key other than the four listed above.
    """

    assert isinstance(stacking_layer, dict)
    assert isinstance(estimator_type, str)

    known_estimators = list(_ESTIMATOR_DICT[estimator_type].keys())

    def _closest_estimator(name):
        # Swap in the known estimator name with minimal edit distance.
        return _check_estimator_choice(estimator_choice=name.strip().lower(),
                                       estimator_type=estimator_type,
                                       estimator_choices=known_estimators)

    first_layer_keys = ('estimators', 'regressors')
    final_layer_keys = ('final_estimator', 'final_regressor')

    # Only existing keys are reassigned, so the dictionary
    # keeps its size while it is being iterated over.
    for key, layer in stacking_layer.items():
        if key in first_layer_keys:
            stacking_layer[key] = [_closest_estimator(name) for name in layer]
        elif key in final_layer_keys:
            stacking_layer[key] = _closest_estimator(layer)
        else:
            raise KeyError('The key: %s is not a valid choice for stacking_layer.'
                           % key)

    return stacking_layer
[docs]def _check_line_search_options(line_search_options: dict) -> None:
"""Checks the line search computation options for base boosting.
Parameters
----------
init_guess : int, float, or ndarray
The initial guess for the expansion coefficient.
opt_method : str
Choice of optimization method. If ``'minimize'``, then
:class:`scipy.optimize.minimize`, else if ``'basinhopping'``,
then :class:`scipy.optimize.basinhopping`.
method : str or None
The type of solver utilized in the optimization method.
tol : float or None
The epsilon tolerance for terminating the optimization method.
options : dict or None
A dictionary of solver options.
niter : int or None
The number of iterations in basin-hopping.
T : float or None
The temperature paramter utilized in basin-hopping,
which determines the accept or reject criterion.
loss : str
The loss function utilized in the line search computation, where 'ls'
denotes the squared error loss function, 'lad' denotes the absolute error
loss function, 'huber' denotes the Huber loss function, and 'quantile'
denotes the quantile loss function.
regularization : int or float
The regularization strength in the line search computation.
"""
for search_key, search_option in line_search_options.items():
if search_key == 'init_guess':
assert isinstance(search_option, (int, float, np.array))
elif search_key == 'opt_method':
assert search_option in ['minimize', 'basinhopping']
elif search_key == 'method':
assert search_option in _OPTIMIZE_METHOD
elif search_key == 'tol':
assert isinstance(search_option, float)
elif search_key == 'options':
assert isinstance(search_option, dict)
elif search_key == 'niter':
if search_option is not None:
assert isinstance(search_option, int)
elif search_key == 'T':
if search_option is not None:
assert isinstance(search_option, float)
elif search_key == 'loss':
assert search_option in LOSS_FUNCTIONS
elif search_key == 'regularization':
assert isinstance(search_option, (int, float))
else:
raise KeyError('The key: %s is not a valid line search option.'
% (search_key))
[docs]def _check_bayesoptcv_param_type(pbounds: dict) -> dict:
"""Checks if the Bayesian optimization utility changed the (hyper)parameter type.
Parameters
----------
pbounds: dict
A dictionary, wherein the keys are the (hyper)parameter names
and the values are the (hyper)parameter values.
Returns
-------
pbounds : dict
Notes
-----
During the sequential Bayesian optimization, the utility occasionally sets
the value of a (hyper)parameter with type int to a value with type float.
"""
assert isinstance(pbounds, dict)
for key, param in pbounds.items():
if key in _BAYESOPTCV_INIT_PARAMS:
pbounds[key] = int(value)
return pbounds
[docs]def _preprocess_hyperparams(raw_params: dict, multi_target: bool,
chain: bool) -> dict:
"""Preprocesses the (hyper)parameters.
The preprocessing is determined by the regression task, and the assumption
on the single-targets, if the task is multi-target regression.
Parameters
----------
raw_params : dict
The user provided (hyper)parameters.
multi_target : bool
Distinguishes between single-target and multi-target regression.
If True, then the expected task is multi-target regression.
chain : bool
Distinguishes between independent single-target regression
subtasks and chaining. If true, then the expected multi-target
combination is chaining.
Returns
-------
out_params : dict
"""
assert isinstance(raw_params, (dict))
out_params = {}
if multi_target and chain:
for key, value in raw_params.items():
if re.match('tr__', key):
out_params[key] = value
elif re.match('reg__', key):
out_params['reg__base_estimator__' + key[5:]] = value
elif key in _PIPELINE_PARAMS:
out_params[key] = value
else:
raise KeyError('The key: %s is not a valid (hyper)parameter name.'
% (key))
elif multi_target and not chain:
for key, value in raw_params.items():
if re.match('tr__', key):
out_params[key] = value
elif re.match('reg__', key):
out_params['reg__estimator__' + key[5:]] = value
elif key in _PIPELINE_PARAMS:
out_params[key] = value
else:
raise KeyError('The key: %s is not a valid (hyper)parameter name.'
% (key))
else:
for key, value in raw_params.items():
if re.match('tr__', key):
out_params[key] = value
elif re.match('reg__', key):
out_params[key] = value
elif key in _PIPELINE_PARAMS:
out_params[key] = value
else:
raise KeyError('The key: %s is not a valid (hyper)parameter name.'
% (key))
return out_params
def _check_search_method(search_method: str) -> str:
    """Autocorrects the name of the (hyper)parameter search method.

    Parameters
    ----------
    search_method : str
        The requested Scikit-learn or Bayesian optimization
        (hyper)parameter search method.

    Returns
    -------
    search_method : str
        The entry of ``_SEARCH_METHOD`` selected by ``_basic_autocorrect``
        for the stripped, lower-cased search method.
    """

    normalized_method = search_method.strip().lower()
    return _basic_autocorrect(init_choice=normalized_method,
                              candidate_choices=_SEARCH_METHOD)