# Source code for physlearn.supervised.utils._estimator_checks

"""
The :mod:`physlearn.supervised.utils._estimator_checks` module provides
basic utilities for automated estimator checking. 
"""

# Author: Alex Wozniakowski
# License: MIT

import Levenshtein
import os
import re
import warnings

import numpy as np

from physlearn.loss import LOSS_FUNCTIONS
from physlearn.supervised.utils._definition import (_BAYESOPTCV_INIT_PARAMS,
                                                    _ESTIMATOR_DICT,
                                                    _OPTIMIZE_METHOD,
                                                    _PIPELINE_PARAMS,
                                                    _SEARCH_METHOD)


def _basic_autocorrect(init_choice: str, candidate_choices: list) -> str:
    """Returns the candidate string closest to the initial choice.

    Parameters
    ----------
    init_choice : str
        The initial choice as a string, e.g., the Scikit-Learn
        class Ridge written as 'ridge', 'Ridge', 'RIDGE', etc.

    candidate_choices : list
        A list of candidate choices, where each candidate is a string.

    Returns
    -------
    out_choice : str
        The first candidate that attains the minimal edit distance, or
        ``init_choice`` itself when ``candidate_choices`` is empty.

    Notes
    -----
    The edit distance is the Levenshtein distance, which counts the
    insertions, removals, and substitutions needed to turn one string
    into the other. A ``UserWarning`` is issued whenever the minimal
    distance is positive; this includes the empty candidate list, where
    the distance stays at infinity.
    """

    assert isinstance(init_choice, str)
    assert isinstance(candidate_choices, list)

    distances = [Levenshtein.distance(init_choice, candidate)
                 for candidate in candidate_choices]

    if distances:
        smallest = min(distances)
        # list.index returns the first match, so ties are resolved
        # in favor of the earliest candidate.
        closest = candidate_choices[distances.index(smallest)]
    else:
        smallest = np.inf
        closest = init_choice

    if smallest > 0:
        warnings.warn(f'{init_choice} was misspelled, so we replaced it with {closest}.',
                      UserWarning)

    return closest


def _check_estimator_choice(estimator_choice: str, estimator_type: str,
                            estimator_choices=None) -> str:
    """Returns the candidate estimator closest to the given choice.

    Parameters
    ----------
    estimator_choice : str
        The estimator choice as a string, e.g., the Scikit-Learn
        class Ridge written as 'ridge', 'Ridge', 'RIDGE', etc.

    estimator_type : str
        The supervised learning task, e.g., regression.

    estimator_choices : list or None, optional (default=None)
        A list of estimator choices, where each estimator is a string.
        If None, then the keys of ``_ESTIMATOR_DICT[estimator_type]``
        are used as the candidates.

    Returns
    -------
    estimator_choice : str
        The autocorrected estimator choice.
    """

    assert all(isinstance(arg, str) for arg in [estimator_choice, estimator_type])

    if estimator_choices is None:
        estimator_choices = list(_ESTIMATOR_DICT[estimator_type].keys())
    else:
        assert isinstance(estimator_choices, list)
        assert all(isinstance(choices, str) for choices in estimator_choices)

    # Surrounding whitespace and letter case are dropped before matching.
    normalized_choice = estimator_choice.strip().lower()
    return _basic_autocorrect(init_choice=normalized_choice,
                              candidate_choices=estimator_choices)


def _check_stacking_layer(stacking_layer: dict, estimator_type: str) -> dict:
    """Autocorrects the first and second stacking layer estimators.

    Parameters
    ----------
    stacking_layer : dict
        The estimator(s) in the first stacking layer, under the key
        'estimators' or 'regressors', and the final estimator in the
        second stacking layer, under the key 'final_estimator' or
        'final_regressor'.

    estimator_type : str
        The supervised learning task, e.g., regression.

    Returns
    -------
    stacking_layer : dict
        The same dictionary object, with its values replaced in place
        by the autocorrected estimator names.

    Raises
    ------
    KeyError
        If ``stacking_layer`` contains any other key.
    """

    assert isinstance(stacking_layer, dict)
    assert isinstance(estimator_type, str)

    estimator_choices = list(_ESTIMATOR_DICT[estimator_type].keys())

    def _closest_estimator(name):
        # Replaces a name with the minimal edit distance candidate.
        return _check_estimator_choice(estimator_choice=name.strip().lower(),
                                       estimator_type=estimator_type,
                                       estimator_choices=estimator_choices)

    first_layer_keys = ('estimators', 'regressors')
    final_layer_keys = ('final_estimator', 'final_regressor')

    # Only the values of existing keys are reassigned, so the
    # dictionary keeps its size while it is being iterated.
    for key, layer in stacking_layer.items():
        if key in first_layer_keys:
            stacking_layer[key] = [_closest_estimator(est) for est in layer]
        elif key in final_layer_keys:
            stacking_layer[key] = _closest_estimator(layer)
        else:
            raise KeyError('The key: %s is not a valid choice for stacking_layer.'
                           % (key))

    return stacking_layer


def _check_line_search_options(line_search_options: dict) -> None:
    """Checks the line search computation options for base boosting.

    Parameters
    ----------
    line_search_options : dict
        The line search options, keyed by option name. The supported
        keys and the accepted values are:

        init_guess : int, float, or ndarray
            The initial guess for the expansion coefficient.

        opt_method : str
            Choice of optimization method. If ``'minimize'``, then
            :class:`scipy.optimize.minimize`, else if ``'basinhopping'``,
            then :class:`scipy.optimize.basinhopping`.

        method : str or None
            The type of solver utilized in the optimization method.

        tol : float or None
            The epsilon tolerance for terminating the optimization method.

        options : dict or None
            A dictionary of solver options.

        niter : int or None
            The number of iterations in basin-hopping.

        T : float or None
            The temperature parameter utilized in basin-hopping, which
            determines the accept or reject criterion.

        loss : str
            The loss function utilized in the line search computation,
            where 'ls' denotes the squared error loss function, 'lad'
            denotes the absolute error loss function, 'huber' denotes the
            Huber loss function, and 'quantile' denotes the quantile loss
            function.

        regularization : int or float
            The regularization strength in the line search computation.

    Raises
    ------
    AssertionError
        If the value of a supported key has an invalid type or value.

    KeyError
        If a key is not one of the supported line search options.
    """

    for search_key, search_option in line_search_options.items():
        if search_key == 'init_guess':
            # np.ndarray is the array type; np.array is a factory function,
            # which isinstance rejects with a TypeError.
            assert isinstance(search_option, (int, float, np.ndarray))
        elif search_key == 'opt_method':
            assert search_option in ['minimize', 'basinhopping']
        elif search_key == 'method':
            # None is a documented value, so it skips the membership check.
            if search_option is not None:
                assert search_option in _OPTIMIZE_METHOD
        elif search_key == 'tol':
            if search_option is not None:
                assert isinstance(search_option, float)
        elif search_key == 'options':
            if search_option is not None:
                assert isinstance(search_option, dict)
        elif search_key == 'niter':
            if search_option is not None:
                assert isinstance(search_option, int)
        elif search_key == 'T':
            if search_option is not None:
                assert isinstance(search_option, float)
        elif search_key == 'loss':
            assert search_option in LOSS_FUNCTIONS
        elif search_key == 'regularization':
            assert isinstance(search_option, (int, float))
        else:
            raise KeyError(f'The key: {search_key} is not a valid line search option.')


def _check_bayesoptcv_param_type(pbounds: dict) -> dict:
    """Checks if the Bayesian optimization utility changed the (hyper)parameter type.

    Parameters
    ----------
    pbounds : dict
        A dictionary, wherein the keys are the (hyper)parameter names
        and the values are the (hyper)parameter values.

    Returns
    -------
    pbounds : dict
        The same dictionary object, wherein the value of every key
        listed in ``_BAYESOPTCV_INIT_PARAMS`` was converted with ``int``.

    Notes
    -----
    During the sequential Bayesian optimization, the utility
    occasionally sets the value of a (hyper)parameter with
    type int to a value with type float.
    """

    assert isinstance(pbounds, dict)

    # Only the values of existing keys are reassigned, so the
    # dictionary keeps its size while it is being iterated.
    for key, param in pbounds.items():
        if key in _BAYESOPTCV_INIT_PARAMS:
            pbounds[key] = int(param)

    return pbounds


def _preprocess_hyperparams(raw_params: dict, multi_target: bool, chain: bool) -> dict:
    """Preprocesses the (hyper)parameters.

    The preprocessing depends on the regression task and, for
    multi-target regression, on the assumption made about the
    single-targets.

    Parameters
    ----------
    raw_params : dict
        The user provided (hyper)parameters.

    multi_target : bool
        Distinguishes between single-target and multi-target regression.
        If True, then the expected task is multi-target regression.

    chain : bool
        Distinguishes between independent single-target regression subtasks
        and chaining. If True, then the expected multi-target combination
        is chaining.

    Returns
    -------
    out_params : dict
        A new dictionary. Keys starting with 'tr__' and keys in
        ``_PIPELINE_PARAMS`` are copied unchanged. For keys starting with
        'reg__', that prefix is replaced with 'reg__base_estimator__' in
        the chaining case and with 'reg__estimator__' in the independent
        multi-target case; otherwise the key is copied unchanged.

    Raises
    ------
    KeyError
        If a key matches none of the cases above.
    """

    assert isinstance(raw_params, (dict))

    # Replacement for the leading 'reg__', or None to keep the key as is.
    if multi_target and chain:
        reg_prefix = 'reg__base_estimator__'
    elif multi_target and not chain:
        reg_prefix = 'reg__estimator__'
    else:
        reg_prefix = None

    out_params = {}
    for key, value in raw_params.items():
        if re.match('tr__', key):
            out_key = key
        elif re.match('reg__', key):
            # key[5:] is the part of the name after the 'reg__' prefix.
            out_key = key if reg_prefix is None else reg_prefix + key[5:]
        elif key in _PIPELINE_PARAMS:
            out_key = key
        else:
            raise KeyError('The key: %s is not a valid (hyper)parameter name.'
                           % (key))
        out_params[out_key] = value

    return out_params


def _check_search_method(search_method: str) -> str:
    """Returns the (hyper)parameter search method closest to the given name.

    Parameters
    ----------
    search_method : str
        The Scikit-learn or Bayesian optimization (hyper)parameter
        search method.

    Returns
    -------
    search_method : str
        The candidate in ``_SEARCH_METHOD`` that minimizes the edit distance.
    """

    # Surrounding whitespace and letter case are dropped before matching.
    normalized_method = search_method.strip().lower()
    return _basic_autocorrect(init_choice=normalized_method,
                              candidate_choices=_SEARCH_METHOD)