Source code for physlearn.supervised.regression

"""
The :mod:`physlearn.supervised.regression` module provides machine learning
utilities, which solve single-target and multi-target regression tasks. It
includes the :class:`physlearn.BaseRegressor` and :class:`physlearn.Regressor`
classes.
"""

# Author: Alex Wozniakowski
# License: MIT

from __future__ import annotations

import joblib
import re
import typing

import numpy as np
import pandas as pd

import sklearn.base
import sklearn.metrics
import sklearn.metrics._scorer
import sklearn.model_selection
import sklearn.model_selection._split
import sklearn.model_selection._validation
import sklearn.utils
import sklearn.utils.estimator_checks
import sklearn.utils.metaestimators
import sklearn.utils.multiclass
import sklearn.utils.validation

from collections import defaultdict
from dataclasses import dataclass, field

from physlearn.base import AdditionalRegressorMixin
from physlearn.loss import LOSS_FUNCTIONS
from physlearn.pipeline import make_pipeline
from physlearn.supervised.interface import RegressorDictionaryInterface
from physlearn.supervised.utils._data_checks import (_n_features, _n_targets,
                                                     _n_samples, _validate_data)
from physlearn.supervised.utils._definition import (_MULTI_TARGET, _REGRESSOR_DICT,
                                                    _SEARCH_METHOD, _SCORE_CHOICE,
                                                    _SCORE_MULTIOUTPUT)
from physlearn.supervised.utils._estimator_checks import (_check_bayesoptcv_param_type,
                                                          _check_estimator_choice,
                                                          _check_search_method,
                                                          _check_stacking_layer,
                                                          _preprocess_hyperparams)
from physlearn.supervised.utils._search import _search_method

DataFrame_or_Series = typing.Union[pd.DataFrame, pd.Series]
pandas_or_numpy = typing.Union[pd.DataFrame, pd.Series, np.ndarray]
str_list_or_tuple = typing.Union[str, list, tuple]


[docs]@dataclass
class BaseRegressor(sklearn.base.BaseEstimator, sklearn.base.RegressorMixin,
                    AdditionalRegressorMixin):
    """Base class for regressor amalgamation.

    The object is designed to amalgamate regressors from 
    `Scikit-learn <https://scikit-learn.org/>`_,
    `LightGBM <https://lightgbm.readthedocs.io/en/latest/index.html>`_,
    `XGBoost <https://xgboost.readthedocs.io/en/latest/>`_,
    `CatBoost <https://catboost.ai/>`_,
    and `Mlxtend <http://rasbt.github.io/mlxtend/>`_ into a unified framework,
    which follows the Scikit-learn API. Important methods include ``fit``,
    ``predict``, ``score``, ``dump``, ``load``, ``cross_validate``, and
    ``cross_val_score``.

    Parameters
    ----------
    regressor_choice : str, optional (default='ridge')
        Specifies the case-insensitive regressor choice.

    cv : int, cross-validation generator, an iterable, or None, optional (default=5)
        Determines the cross-validation strategy if the regressor choice is stacking,
        if the task is multi-target regression and the single-targets are chained,
        and as the default in the k-fold cross-validation methods.

    random_state : int, RandomState instance, or None, optional (default=0)
        Determines the random number generation in the regressor choice
        :class:`mlxtend.regressor.StackingCVRegressor` and in the modified
        pipeline construction.

    verbose : int, optional (default=0)
        Determines verbosity in either regressor choice:
        :class:`mlxtend.regressor.StackingRegressor` and
        :class:`mlxtend.regressor.StackingCVRegressor`, in the modified
        pipeline construction, and in the k-fold cross-validation methods.

    n_jobs : int or None, optional (default=-1)
        The number of jobs to run in parallel if the regressor choice is stacking
        or voting, in the modified pipeline construction, and in the k-fold
        cross-validation methods.

    score_multioutput : str, optional (default='raw_values')
        Defines aggregating of multiple output values in the score method,
        wherein the string must be either ``'raw_values'``, ``'uniform_average'``, or
        ``'variance_weighted'``.

    scoring : str, callable, list/tuple, or dict, optional (default='neg_mean_absolute_error')
        Determines scoring in the k-fold cross-validation methods.

    return_train_score : bool, optional (default=True)
        Determines whether to return the training scores from the k-fold
        cross-validation methods.

    auto_target : bool, optional (default=True)
        Determines whether to automatically handle the pipeline steps or let
        the user specify the steps.

    pipeline_transform : str, list, tuple, or None, optional (default=None)
        Choice of transform(s) used in the modified pipeline construction.
        If the specified choice is a string, then it must be a default option,
        where ``'standardscaler'``, ``'boxcox'``, ``'yeojohnson'``, ``'quantileuniform'``,
        and ``'quantilenormal'`` denote :class:`sklearn.preprocessing.StandardScaler`,
        :class:`sklearn.preprocessing.PowerTransformer` with ``method='box-cox'``
        or ``method='yeo-johnson'``, and :class:`sklearn.preprocessing.QuantileTransformer`
        with ``output_distribution='uniform'`` or ``output_distribution='normal'``,
        respectively.

    pipeline_memory : str or object with the joblib.Memory interface, optional (default=None)
        Enables fitted transform caching in the modified pipeline construction.

    params : dict, list, or None, optional (default=None)
        The choice of (hyper)parameters for the regressor choice.
        If None, then the default (hyper)parameters are utilized.

    target_index : int, or None, optional (default=None)
        Specifies the single-target regression subtask in the multi-target
        regression task.

    chain_order : list or None
        Determines the target order in :class:`sklearn.multioutput.RegressorChain`
        during the modified pipeline construction.

    stacking_options : dict or None, optional (default=None)
        A dictionary of stacking options, whereby ``layers``
        must be specified:

        layers :obj:`dict`
            A dictionary of stacking layer(s).

        shuffle :obj:`bool` or None, (default=True)
            Determines whether to shuffle the training data in
            :class:`mlxtend.regressor.StackingCVRegressor`.

        refit :obj:`bool` or None, (default=True)
            Determines whether to clone and refit the regressors in
            :class:`mlxtend.regressor.StackingCVRegressor`.

        passthrough :obj:`bool` or None, (default=True)
            Determines whether to concatenate the original features with
            the first stacking layer predictions in
            :class:`sklearn.ensemble.StackingRegressor`,
            :class:`mlxtend.regressor.StackingRegressor`, or
            :class:`mlxtend.regressor.StackingCVRegressor`.

        meta_features : :obj:`bool` or None, (default=True)
            Determines whether to make the concatenated features
            accessible through the attribute ``train_meta_features_``
            in :class:`mlxtend.regressor.StackingRegressor` and
            :class:`mlxtend.regressor.StackingCVRegressor`.

        voting_weights : :obj:`ndarray` of shape (n_regressors,) or None, (default=None)
            Sequence of weights for :class:`sklearn.ensemble.VotingRegressor`.

    base_boosting_options : dict or None, optional (default=None)
        A dictionary of base boosting options used in the modified pipeline construction,
        wherein the following options must be specified:

        n_estimators :obj:`int`
            The number of basis functions in the noise term of the additive expansion.
            Note that this option may also be specified as ``n_regressors``.

        boosting_loss :obj:`str` 
            The loss function utilized in the pseudo-residual computation, where 'ls'
            denotes the squared error loss function, 'lad' denotes the absolute error
            loss function, 'huber' denotes the Huber loss function, and 'quantile'
            denotes the quantile loss function.

        line_search_options :obj:`dict` 
            init_guess :obj:`int`, :obj:`float`, or :obj:`ndarray`
                The initial guess for the expansion coefficient.

            opt_method :obj:`str`
                Choice of optimization method. If ``'minimize'``, then
                :class:`scipy.optimize.minimize`, else if ``'basinhopping'``,
                then :class:`scipy.optimize.basinhopping`.

            method :obj:`str` or None
                The type of solver utilized in the optimization method.

            tol :obj:`float` or None
                The epsilon tolerance for terminating the optimization method.

            options :obj:`dict` or None
                A dictionary of solver options.

            niter :obj:`int` or None
                The number of iterations in basin-hopping.

            T :obj:`float` or None
                The temperature paramter utilized in basin-hopping,
                which determines the accept or reject criterion.

            loss :obj:`str`
                The loss function utilized in the line search computation, where 'ls'
                denotes the squared error loss function, 'lad' denotes the absolute error
                loss function, 'huber' denotes the Huber loss function, and 'quantile'
                denotes the quantile loss function.

            regularization :obj:`int` or :obj:`float`
                The regularization strength in the line search computation.

    Notes
    -----
    The ``score`` method differs from the Scikit-learn usage, as the method is designed
    to abstract the regressor metrics, e.g., :class:`sklearn.metrics.mean_absolute_error`.

    See Also
    --------
    :class:`physlearn.pipeline.ModifiedPipeline` : Class for creating a pipeline.
    :class:`physlearn.supervised.regression.Regressor` : Main class for regressor amalgamation.

    Examples
    --------
    >>> import pandas as pd
    >>> from sklearn.datasets import load_boston
    >>> from sklearn.model_selection import train_test_split
    >>> from physlearn import BaseRegressor
    >>> X, y = load_boston(return_X_y=True)
    >>> X, y = pd.DataFrame(X), pd.Series(y)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            random_state=42)
    >>> reg = BaseRegressor(regressor_choice='lgbmregressor',
                            pipeline_transform='standardscaler')
    >>> y_pred = reg.fit(X_train, y_train).predict(X_test)
    >>> reg.score(y_test, y_pred)
    array([11.63706835])
    """

    regressor_choice: str = field(default='ridge')
    cv: int = field(default=5)
    random_state: int = field(default=0)
    verbose: int = field(default=0)
    n_jobs: int = field(default=-1)
    score_multioutput: str = field(default='raw_values')
    scoring: str = field(default='neg_mean_absolute_error')
    return_train_score: bool = field(default=True)
    auto_target: bool = field(default=True)
    pipeline_transform: typing.Union[str, list, tuple] = field(default=None)
    pipeline_memory: str = field(default=None)
    params: typing.Union[dict, list] = field(default=None)
    target_index: int = field(default=None)
    chain_order: list = field(default=None)
    stacking_options: dict = field(default=None)
    base_boosting_options: dict = field(default=None)

    def __post_init__(self):
        self._validate_regressor_options()
        self._get_regressor()

    def _validate_regressor_options(self):
        self.regressor_choice = _check_estimator_choice(estimator_choice=self.regressor_choice,
                                                        estimator_type='regression')

        assert isinstance(self.cv, int) and self.cv > 1
        assert isinstance(self.random_state, int) and self.random_state >= 0
        assert isinstance(self.verbose, int) and self.verbose >= 0
        assert isinstance(self.n_jobs, int)
        assert isinstance(self.score_multioutput, str)
        assert isinstance(self.scoring, str)
        assert isinstance(self.return_train_score, bool)

        if self.pipeline_transform is not None:
            assert any(isinstance(self.pipeline_transform, built_in)
                   for built_in in (str, list, tuple))

        if self.pipeline_memory is not None:
            assert isinstance(self.pipeline_memory, bool)

        if self.params is not None:
            assert isinstance(self.params, (dict, list))

        if self.target_index is not None:
            assert isinstance(self.target_index, int)

        if self.chain_order is not None:
            assert isinstance(self.chain_order, list)

        if self.stacking_options is not None:
            for key, option in self.stacking_options.items():
                if key == 'layers':
                    self.stacking_options[key] = _check_stacking_layer(stacking_layer=option,
                                                                       estimator_type='regression')
                elif key not in ['shuffle', 'refit', 'passthrough', 'meta_features']:
                    raise KeyError('The key: %s is not a stacking option.'
                                   % (key))

        if self.base_boosting_options is not None:
            # The options are checked in the
            # ModifiedPipeline constructor.
            assert isinstance(self.base_boosting_options, dict)

[docs]    def _get_regressor(self):
        """Helper method which instantiates the regressor choice."""

        reg = RegressorDictionaryInterface(regressor_choice=self.regressor_choice,
                                           params=self.params,
                                           stacking_options=self.stacking_options)

        kwargs = dict(cv=self.cv,
                      verbose=self.verbose,
                      random_state=self.random_state,
                      n_jobs=self.n_jobs,
                      stacking_options=self.stacking_options)

        # The (hyper)parameters must be set
        # before retrieval.
        self._regressor = reg.set_params(**kwargs)
        self.params = reg.get_params(regressor=self._regressor)

    @property
    def check_regressor(self):
        """Checks if regressor adheres to scikit-learn conventions.

        Namely, it runs :class:`sklearn.utils.estimator_checks.check_estimator`.
        """

        return sklearn.utils.estimator_checks.check_estimator(self._regressor)

[docs]    def get_params(self, deep=True) -> dict:
        """Retrieves the (hyper)parameters.

        Parameters
        ----------
        deep : bool, optional (default=True)
            Although we do not use this parameter, it is required as
            various Scikit-learn utilities require it.

        Returns
        -------
        self.params : dict
            (Hyper)parameter names mapped to their values.
        """

        return self.params

[docs]    def set_params(self, **params) -> BaseRegressor:
        """Sets the regressor's (hyper)parameters.

        Parameters
        ----------
        **params : dict
            The regressor's (hyper)parameters.

        Returns
        -------
        self : BaseRegressor
            The base regressor object.
        """

        if not params:
            # Simple optimization to gain speed (inspect is slow)
            return self
        valid_params = self.get_params(deep=True)

        nested_params = defaultdict(dict)  # grouped by prefix
        for key, value in params.items():
            key, delim, sub_key = key.partition('__')
            if key not in valid_params:
                raise ValueError('Invalid parameter %s for regressor %s. '
                                 'Check the list of available parameters '
                                 'with `regressor.get_params().keys()`.'
                                 % (key, self))

            if delim:
                nested_params[key][sub_key] = value
            else:
                setattr(self._regressor, key, value)
                valid_params[key] = value

        for key, sub_params in nested_params.items():
            valid_params[key].set_params(**sub_params)

        return self

[docs]    def _validate_data(self, X=None, y=None):
        """Checks the validity of the data representation(s).

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        Returns
        -------
        out : validated data
        """

        if X is not None and y is not None:
            if not hasattr(self, '_validated_data'):
                out = _validate_data(X=X, y=y)
                setattr(self, '_validated_data', True)
            else:
                out = X, y
        elif X is not None:
            if not hasattr(self, '_validated_data'):
                out = _validate_data(X=X)
            else:
                out = X
        elif y is not None:
            if not hasattr(self, '_validated_data'):
                out = _validate_data(y=y)
            else:
                out = y
        else:
            raise ValueError('Both the data matrix X and the target matrix y are None. '
                             'Thus, there is no data to validate.')

        return out


[docs]    def dump(self, value, filename) -> list:
        """Serializes the value with joblib.

        Parameters
        ----------
        value: any Python object
            The object to store to disk.

        filename : str, joblib.pathlib.Path, or file object
            The file object or path of the file.

        Returns
        -------
        filenames: list of str
            The list of file names in which the data is stored.
        """

        assert isinstance(filename, str)
        joblib.dump(value=value, filename=filename)

[docs]    def load(self, filename):
        """Deserializes the file object.

        Parameters
        ----------
        filename : str, joblib.pathlib.Path, or file object
            The file object or path of the file.

        Returns
        -------
        joblib.load : any Python object
            The object stored in the file.
        """

        assert isinstance(filename, str)        
        return joblib.load(filename=filename)
    
[docs]    def get_pipeline(self, y: DataFrame_or_Series, n_quantiles=None):
        """Creates pipe attribute for downstream tasks.

        This method constructs a ModifiedPipeline from the given base regressor.

        Parameters
        ----------
        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s). The targets are used to
            determine the type of the target, and the number of samples if the
            ``pipeline_transform`` involves quantile transformers.

        n_quantiles : int or None, optional (default=None)
            Number of quantiles in :class:`sklearn.preprocessing.QuantileTransformer`, if
            ``pipeline_transform`` is either ```quantileuniform``` or ```quantilenormal```.

        Attributes
        ----------
        pipe : :class:`physlearn.pipeline.ModifiedPipeline`
            A ModifiedPipeline object.
        """

        y = self._validate_data(y=y)

        if n_quantiles is None and isinstance(self.pipeline_transform, str):
            if re.search('quantile', self.pipeline_transform):
                n_quantiles = _n_samples(y)

        kwargs = dict(random_state=self.random_state,
                      verbose=self.verbose,
                      n_jobs=self.n_jobs,
                      cv=self.cv,
                      memory=self.pipeline_memory,
                      auto_target = self.auto_target,
                      target_index=self.target_index,
                      target_type = sklearn.utils.multiclass.type_of_target(y),
                      n_quantiles=n_quantiles,
                      chain_order=self.chain_order,
                      base_boosting_options=self.base_boosting_options)

        self.pipe =  make_pipeline(estimator=self._regressor,
                                   transform=self.pipeline_transform,
                                   **kwargs)

[docs]    def regattr(self, attr: str) -> str:
        """Gets a regressor's attribute from the ModifiedPipeline object.

        The pipe attribute must exist in order to use this method. 

        Parameters
        ----------
        attr : str
            The name of the regressor's attribute.

        Returns
        -------
        attr : type of attribute
        """

        assert hasattr(self, 'pipe') and isinstance(attr, str)

        try:
            attr = {f'target {index}': getattr(self.pipe, attr)
                   for index, self.pipe
                   in enumerate(self.pipe.named_steps['reg'].estimators_)}
            return attr
        except:
            raise AttributeError('%s needs to have an estimators_ attribute '
                                 'in order to access the attribute: %s.'
                                 % (self.pipe.named_steps['reg'], attr))

[docs]    def _check_target_index(self, y: DataFrame_or_Series) -> DataFrame_or_Series:
        """Automates subtask slicing in multi-target regression.

        Parameters
        ----------
        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s). The targets are used to
            determine the type of the target, and the number of samples if the
            ``pipeline_transform`` involves quantile transformers.

        Returns
        -------
        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
        """

        y = self._validate_data(y=y)

        if self.target_index is not None and \
        sklearn.utils.multiclass.type_of_target(y) in _MULTI_TARGET:
            # Selects a particular single-target
            return y.iloc[:, self.target_index]
        else:
            return y

[docs]    @staticmethod
    def _fit(regressor, X: DataFrame_or_Series, y: DataFrame_or_Series,
             sample_weight=None, **fit_params):
        """Helper fit method.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        sample_weight : float, ndarray, or None, optional (default=None)
            Individual weights for each example. If the weight is a float, then
            every example will have the same weight.

        **fit_params : dict of string -> object
            If base boosting, then these parameters are passed to the stagewise
            ``_fit_stages`` method.
        """

        if sample_weight is not None:
            try:
                regressor.fit(X=X, y=y, sample_weight=sample_weight)
            except TypeError as exc:
                if 'unexpected keyword argument sample_weight' in str(exc):
                    raise TypeError('%s does not support sample weights.'
                                    % (regressor.__class__.__name__)) from exc
        elif sample_weight is None and fit_params:
            try:
                regressor.fit(X=X, y=y, **fit_params)
            except ValueError:
                raise ('%s is not a valid fit parameter for this regressor.'
                       % (fit_params.values()))
        else:
            regressor.fit(X=X, y=y)

[docs]    def fit(self, X: DataFrame_or_Series, y: DataFrame_or_Series,
            sample_weight=None) -> ModifiedPipeline:
        """Fits the ModifiedPipeline object.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        sample_weight : float, ndarray, or None, optional (default=None)
            Individual weights for each example. If the weight is a float, then
            every example will have the same weight.

        Returns
        -------
        self.pipe : ModifiedPipeline
            The induced pipeline object.
        """

        X, y = self._validate_data(X=X, y=y)

        # Automates single-target slicing.
        y = self._check_target_index(y=y)

        if not hasattr(self, 'pipe'):
            self.get_pipeline(y=y)

        self._fit(regressor=self.pipe, X=X, y=y,
                  sample_weight=sample_weight)

        return self.pipe

[docs]    def predict(self, X: DataFrame_or_Series) -> DataFrame_or_Series:
        """Generates predictions with the ModifiedPipeline object.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        Returns
        -------
        y_pred : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The predictions generated by the induced ModifiedPipeline object.
        """

        assert hasattr(self, 'pipe')
        X = self._validate_data(X=X)

        return self.pipe.predict(X=X)

[docs]    def score(self, y_true: pandas_or_numpy, y_pred: pandas_or_numpy, scoring='mse',
              multioutput='raw_values') -> pandas_or_numpy:
        """Computes the supervised score.

        Parameters
        ----------
        y_true : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The observed target matrix, where each row corresponds to an example and the
            column(s) correspond to the observed single-target(s).

        y_pred : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The predicted target matrix, where each row corresponds to an example and the
            column(s) correspond to the predicted single-target(s).

        scoring : str, optional (default='mse')
            The scoring name, which may be `mae`, `mse`, `rmse`, `r2`, `ev`, or
            `msle`.

        multioutput : str, optional (default='raw_values')
            Defines aggregating of multiple output values, wherein the string
            must be either ``'raw_values'``, ``'uniform_average'``, or
            ``'variance_weighted'``.

        Returns
        -------
        score : float or ndarray of floats
            The computed score.
        """

        assert any(scoring for method in _SCORE_CHOICE) and isinstance(scoring, str)

        if scoring in ['r2', 'ev']:
            possible_multioutputs = _SCORE_MULTIOUTPUT + ['variance_weighted']
            assert any(multioutput for output in possible_multioutputs)
        else:
            possible_multioutputs = _SCORE_MULTIOUTPUT
            assert any(multioutput for output in possible_multioutputs)

        # Automates single-target slicing
        y_true = self._check_target_index(y=y_true)

        if scoring == 'mae':
            score = sklearn.metrics.mean_absolute_error(y_true=y_true,
                                                        y_pred=y_pred,
                                                        multioutput=multioutput)
        elif scoring == 'mse':
            score = sklearn.metrics.mean_squared_error(y_true=y_true,
                                                       y_pred=y_pred,
                                                       multioutput=multioutput)
        elif scoring == 'rmse':
            score = np.sqrt(sklearn.metrics.mean_squared_error(y_true=y_true,
                                                               y_pred=y_pred,
                                                               multioutput=multioutput))
        elif scoring == 'r2':
            score = sklearn.metrics.r2_score(y_true=y_true,
                                             y_pred=y_pred,
                                             multioutput=multioutput)
        elif scoring == 'ev':
            score = sklearn.metrics.explained_variance_score(y_true=y_true,
                                                             y_pred=y_pred,
                                                             multioutput=multioutput)
        elif scoring == 'msle':
            try:
                score = sklearn.metrics.mean_squared_log_error(y_true=y_true,
                                                               y_pred=y_pred,
                                                               multioutput=multioutput)
            except:
                # Sklearn will raise a ValueError if either
                # statement is true, so we circumvent
                # this error and score with a NaN.
                score = np.nan

        return score

[docs]    def _estimate_fold_size(self, y: DataFrame_or_Series, cv) -> int:
        """Helper method to estimate cross-validation fold size.

        Parameters
        ----------
        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        cv : int, cross-validation generator, or an iterable
            Used in order to determine the fold size.

        Returns
        -------
        estimate : int
        """

        n_samples = _n_samples(y)
        if isinstance(cv, int):
            fold_size =  np.full(shape=n_samples,
                                 fill_value=n_samples // cv,
                                 dtype=np.int)
        else:
            fold_size =  np.full(shape=n_samples,
                                 fill_value=n_samples // cv.n_splits,
                                 dtype=np.int)
        return n_samples - (np.max(fold_size) + 1)

[docs]    def _modified_cross_validate(self, X: DataFrame_or_Series, y: DataFrame_or_Series,
                                 return_regressor=False, error_score=np.nan,
                                 return_incumbent_score=False, cv=None,
                                 fit_params=None) -> dict:
        """Performs (augmented) cross-validation.

        If ``return_incumbent_score`` is True, then the incumbent is scored
        on the withheld folds. Otherwise, the behavior is the same as in
        Scikit-learn.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        return_regressor : bool, optional (default=False)
            Determines whether to return the induced regressor.

        error_score : 'raise' or numeric, optional (default=np.nan)
            The assigned value if an error occurs while inducing a regressor.
            If set to 'raise', then the specific error is raised. Else if set
            to a numeric value, then FitFailedWarning is raised.

        return_incumbent_score : bool, optional (default=True)
            Determines whether to score the incumbent on the withheld folds,
            whereby the incumbent is assumed to be an example in the design
            matrix.

        cv : int, cross-validation generator, an iterable, or None, optional (default=None)
            Determines the cross-validation strategy. If None, then the default
            is 5-fold cross-validation.

        fit_params : dict, optional (default=None)
            (Hyper)parameters to pass to the regressor's fit method.

        Returns
        -------
        scores : dict of float arrays of shape (n_splits,)
            Array of scores for each run of the cross-validation procedure.

        References
        ----------
        - Alex Wozniakowski, Jayne Thompson, Mile Gu, and Felix C. Binder.
          "A new formulation of gradient boosting",
          Machine Learning: Science and Technology, 2 045022 (2021).
        """

        X, y = self._validate_data(X=X, y=y)

        # Automates single-target slicing.
        y = self._check_target_index(y=y)

        X, y, groups = sklearn.utils.validation.indexable(X, y, None)

        if cv is None:
            cv = self.cv

        if not hasattr(self, 'pipe'):
            self.get_pipeline(y=y,
                              n_quantiles=self._estimate_fold_size(y=y,
                                                                   cv=cv))

        cv = sklearn.model_selection._split.check_cv(cv=cv, y=y,
                                                     classifier=False)

        if isinstance(self.scoring, str):
            scorers = sklearn.metrics._scorer.check_scoring(estimator=self.pipe,
                                                            scoring=self.scoring)
        else:
            scorers, _ = sklearn.metrics._scorer._check_multimetric_scoring(estimator=self.pipe,
                                                                            scoring=self.scoring)

        parallel = joblib.Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                                   pre_dispatch='2*n_jobs')

        results = parallel(
            joblib.delayed(sklearn.model_selection._validation._fit_and_score)(
                estimator=sklearn.base.clone(self.pipe), X=X, y=y,
                scorer=scorers, train=train, test=test, verbose=self.verbose,
                parameters=None, fit_params=fit_params,
                return_train_score=self.return_train_score,
                return_parameters=False, return_n_test_samples=False,
                return_times=True, return_estimator=return_regressor,
                error_score=np.nan)
            for train, test in cv.split(X, y, groups))

        if return_incumbent_score:
            if self.target_index is not None:
                y_pred = X.iloc[:, self.target_index]
            else:
                y_pred = X

            incumbent_test_score = parallel(
                joblib.delayed(self.score)(
                    y_true=y.loc[test], y_pred=y_pred.loc[test])
                for _, test in cv.split(X, y, groups))

            if self.scoring == 'neg_mean_absolute_error':
                incumbent_test_score = [score['mae'].values[0] for score in incumbent_test_score]
            elif self.scoring == 'neg_mean_squared_error':
                incumbent_test_score = [score['mse'].values[0] for score in incumbent_test_score]


        results = sklearn.model_selection._validation._aggregate_score_dicts(results)

        ret = {}
        ret['fit_time'] = results["fit_time"]
        ret['score_time'] = results["score_time"]

        if return_regressor:
            ret['regressor'] = results["estimator"]

        test_scores_dict = sklearn.model_selection._validation._normalize_score_results(
            results["test_scores"])

        if self.return_train_score:
            train_scores_dict = sklearn.model_selection._validation._normalize_score_results(
                results["train_scores"])

        for name in test_scores_dict:
            ret['test_%s' % name] = test_scores_dict[name]
            if self.return_train_score:
                key = 'train_%s' % name
                ret[key] = train_scores_dict[name]

        if return_incumbent_score:
            ret['incumbent_test_score'] = incumbent_test_score

        return ret

[docs]    def cross_validate(self, X: DataFrame_or_Series, y: DataFrame_or_Series,
                       return_regressor=False, error_score=np.nan,
                       return_incumbent_score=False, cv=None,
                       fit_params=None) -> pd.DataFrame:
        """Performs (augmented) cross-validation, and wraps the result in a DataFrame.

        If ``return_incumbent_score`` is True, then the incumbent is scored
        on the withheld folds. Otherwise, the behavior is the same as in
        Scikit-learn.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        return_regressor : bool, optional (default=False)
            Determines whether to return the induced regressor.

        error_score : 'raise' or numeric, optional (default=np.nan)
            The assigned value if an error occurs while inducing a regressor.
            If set to 'raise', then the specific error is raised. Else if set
            to a numeric value, then FitFailedWarning is raised.

        return_incumbent_score : bool, optional (default=True)
            Determines whether to score the incumbent on the withheld folds,
            whereby the incumbent is assumed to be an example in the design
            matrix.

        cv : int, cross-validation generator, an iterable, or None, optional (default=None)
            Determines the cross-validation strategy. If None, then the default
            is 5-fold cross-validation.

        fit_params : dict, optional (default=None)
            (Hyper)parameters to pass to the regressor's fit method.

        Returns
        -------
        scores : pd.DataFrame
            DataFrame of scores for each run of the cross-validation procedure.

        Notes
        -----
        Scikit-learn returns negative scores for some metrics, such as
        mean absolute error (MAE) or mean squared error (MSE). However,
        we only return nonnegativie scores.

        References
        ----------
        - Alex Wozniakowski, Jayne Thompson, Mile Gu, and Felix C. Binder.
          "A new formulation of gradient boosting",
          Machine Learning: Science and Technology, 2 045022 (2021).
        """

        scores = self._modified_cross_validate(X=X, y=y,
                                               return_regressor=return_regressor,
                                               error_score=error_score,
                                               return_incumbent_score=return_incumbent_score,
                                               cv=cv, fit_params=fit_params)

        if re.match('neg', self.scoring):
            scores['train_score'] = np.abs(scores['train_score'])
            scores['test_score'] = np.abs(scores['test_score'])

        return pd.DataFrame(scores)

[docs]    def cross_val_score(self, X: DataFrame_or_Series, y: DataFrame_or_Series,
                        error_score=np.nan, return_incumbent_score=False,
                        cv=None, fit_params=None) -> DataFrame_or_Series:
        """Performs (augmented) cross-validation, then returns the withheld fold score.

        If ``return_incumbent_score`` is True, then the incumbent is scored
        on the withheld folds. Otherwise, the behavior is the same as in
        Scikit-learn.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        error_score : 'raise' or numeric, optional (default=np.nan)
            The assigned value if an error occurs while inducing a regressor.
            If set to 'raise', then the specific error is raised. Else if set
            to a numeric value, then FitFailedWarning is raised.

        return_incumbent_score : bool, optional (default=True)
            Determines whether to score the incumbent on the withheld folds,
            whereby the incumbent is assumed to be an example in the design
            matrix.

        cv : int, cross-validation generator, an iterable, or None, optional (default=None)
            Determines the cross-validation strategy. If None, then the default
            is 5-fold cross-validation.

        fit_params : dict, optional (default=None)
            (Hyper)parameters to pass to the regressor's fit method.

        Returns
        -------
        scores : pd.Series or pd.DataFrame
            The withheld fold scores for each run of the cross-validation procedure.

        Notes
        -----
        Scikit-learn returns negative scores for some metrics, such as
        mean absolute error (MAE) or mean squared error (MSE). However,
        we only return nonnegativie scores.

        References
        ----------
        - Alex Wozniakowski, Jayne Thompson, Mile Gu, and Felix C. Binder.
          "A new formulation of gradient boosting",
          Machine Learning: Science and Technology, 2 045022 (2021).
        """

        scores = self.cross_validate(X=X, y=y,
                                     error_score=error_score,
                                     return_incumbent_score=return_incumbent_score,
                                     cv=cv, fit_params=fit_params)

        if return_incumbent_score:
            return scores[['test_score', 'incumbent_test_score']]
        else:
            return scores['test_score']


[docs]@dataclass
class Regressor(BaseRegressor):
    """Main class for regressor amalgamation.

    The object is designed to amalgamate regressors from 
    `Scikit-learn <https://scikit-learn.org/>`_,
    `LightGBM <https://lightgbm.readthedocs.io/en/latest/index.html>`_,
    `XGBoost <https://xgboost.readthedocs.io/en/latest/>`_,
    `CatBoost <https://catboost.ai/>`_,
    and `Mlxtend <http://rasbt.github.io/mlxtend/>`_ into a unified framework,
    which follows the Scikit-learn API. Important methods include ``fit``,
    ``predict``, ``score``, ``baseboostcv``, ``search``, ``dump``, ``load``,
    ``cross_val_score``, and ``nested_cross_validate``.

    Parameters
    ----------
    regressor_choice : str, optional (default='ridge')
        Specifies the case-insensitive regressor choice.

    cv : int, cross-validation generator, an iterable, or None, optional (default=5)
        Determines the cross-validation strategy if the regressor choice is stacking,
        if the task is multi-target regression and the single-targets are chained,
        and as the default in the k-fold cross-validation methods.

    random_state : int, RandomState instance, or None, optional (default=0)
        Determines the random number generation in the regressor choice
        :class:`mlxtend.regressor.StackingCVRegressor` and in the modified
        pipeline construction.

    verbose : int, optional (default=1)
        Determines verbosity in either regressor choice:
        :class:`mlxtend.regressor.StackingRegressor` and
        :class:`mlxtend.regressor.StackingCVRegressor`, in the modified
        pipeline construction, and in the k-fold cross-validation methods.

    n_jobs : int or None, optional (default=-1)
        The number of jobs to run in parallel if the regressor choice is stacking
        or voting, in the modified pipeline construction, and in the k-fold
        cross-validation methods.

    score_multioutput : str, optional (default='raw_values')
        Defines aggregating of multiple output values in the score method,
        wherein the string must be either ``'raw_values'``, ``'uniform_average'``, or
        ``'variance_weighted'``.

    scoring : str, callable, list/tuple, or dict, optional (default='neg_mean_absolute_error')
        Determines scoring in the k-fold cross-validation methods.

    refit : bool, optional (default=True)
        Determines whether to return the refit regressor in the search method.

    randomizedcv_n_iter : int, optional (default=20)
        Determines the number of (hyper)parameter settings that are
        sampled in the search method, when the chosen search is
        ``'randomizedsearchcv'``, e.g., RandomizedSearchCV from
        Scikit-learn.

    bayesoptcv_init_points : int, optional (default=2)
        Determines the number of random exploration steps in the search method,
        when the chose search method is ``'bayesoptcv'``, e.g., `Bayesian
        Optimization <https://github.com/fmfn/BayesianOptimization>`_.
        Increasing the number corresponds to diversifying the exploration
        space.

    bayesoptcv_n_iter : int, optional (default=20)
        Determines the number of Bayesian optimization steps in the search method,
        when the chose search method is ``'bayesoptcv'``, e.g., `Bayesian
        Optimization <https://github.com/fmfn/BayesianOptimization>`_.

    return_train_score : bool, optional (default=True)
        Determines whether to return the training scores from the k-fold
        cross-validation methods.

    pipeline_transform : str, list, tuple, or None, optional (default='quantilenormal')
        Choice of transform(s) used in the modified pipeline construction.
        If the specified choice is a string, then it must be a default option,
        where ``'standardscaler'``, ``'boxcox'``, ``'yeojohnson'``, ``'quantileuniform'``,
        and ``'quantilenormal'`` denote :class:`sklearn.preprocessing.StandardScaler`,
        :class:`sklearn.preprocessing.PowerTransformer` with ``method='box-cox'``
        or ``method='yeo-johnson'``, and :class:`sklearn.preprocessing.QuantileTransformer`
        with ``output_distribution='uniform'`` or ``output_distribution='normal'``,
        respectively.

    pipeline_memory : str or object with the joblib.Memory interface, optional (default=None)
        Enables fitted transform caching in the modified pipeline construction.

    params : dict, list, or None, optional (default=None)
        The choice of (hyper)parameters for the regressor choice.
        If None, then the default (hyper)parameters are utilized.

    target_index : int, or None, optional (default=None)
        Specifies the single-target regression subtask in the multi-target
        regression task.

    chain_order : list or None
        Determines the target order in :class:`sklearn.multioutput.RegressorChain`
        during the modified pipeline construction.

    stacking_options : dict or None, optional (default=None)
        A dictionary of stacking options, whereby ``layers``
        must be specified:

        layers :obj:`dict`
            A dictionary of stacking layer(s).

        shuffle :obj:`bool` or None, (default=True)
            Determines whether to shuffle the training data in
            :class:`mlxtend.regressor.StackingCVRegressor`.

        refit :obj:`bool` or None, (default=True)
            Determines whether to clone and refit the regressors in
            :class:`mlxtend.regressor.StackingCVRegressor`.

        passthrough :obj:`bool` or None, (default=True)
            Determines whether to concatenate the original features with
            the first stacking layer predictions in
            :class:`sklearn.ensemble.StackingRegressor`,
            :class:`mlxtend.regressor.StackingRegressor`, or
            :class:`mlxtend.regressor.StackingCVRegressor`.

        meta_features : :obj:`bool` or None, (default=True)
            Determines whether to make the concatenated features
            accessible through the attribute ``train_meta_features_``
            in :class:`mlxtend.regressor.StackingRegressor` and
            :class:`mlxtend.regressor.StackingCVRegressor`.

        voting_weights : :obj:`ndarray` of shape (n_regressors,) or None, (default=None)
            Sequence of weights for :class:`sklearn.ensemble.VotingRegressor`.

    base_boosting_options : dict or None, optional (default=None)
        A dictionary of base boosting options used in the modified pipeline construction,
        wherein the following options must be specified:

        n_estimators :obj:`int`
            The number of basis functions in the noise term of the additive expansion.
            Note that this option may also be specified as ``n_regressors``.

        boosting_loss :obj:`str` 
            The loss function utilized in the pseudo-residual computation, where 'ls'
            denotes the squared error loss function, 'lad' denotes the absolute error
            loss function, 'huber' denotes the Huber loss function, and 'quantile'
            denotes the quantile loss function.

        line_search_options :obj:`dict` 
            init_guess :obj:`int`, :obj:`float`, or :obj:`ndarray`
                The initial guess for the expansion coefficient.

            opt_method :obj:`str`
                Choice of optimization method. If ``'minimize'``, then
                :class:`scipy.optimize.minimize`, else if ``'basinhopping'``,
                then :class:`scipy.optimize.basinhopping`.

            method :obj:`str` or None
                The type of solver utilized in the optimization method.

            tol :obj:`float` or None
                The epsilon tolerance for terminating the optimization method.

            options :obj:`dict` or None
                A dictionary of solver options.

            niter :obj:`int` or None
                The number of iterations in basin-hopping.

            T :obj:`float` or None
                The temperature paramter utilized in basin-hopping,
                which determines the accept or reject criterion.

            loss :obj:`str`
                The loss function utilized in the line search computation, where 'ls'
                denotes the squared error loss function, 'lad' denotes the absolute error
                loss function, 'huber' denotes the Huber loss function, and 'quantile'
                denotes the quantile loss function.

            regularization :obj:`int` or :obj:`float`
                The regularization strength in the line search computation.

    Notes
    -----
    The ``score`` method differs from the Scikit-learn usage, as the method is designed
    to abstract the regressor metrics, e.g., :class:`sklearn.metrics.mean_absolute_error`.
    Moreover, it computes multiple metrics, and returns the scores in a pandas object.

    See Also
    --------
    :class:`physlearn.pipeline.ModifiedPipeline` : Class for creating a pipeline.
    :class:`physlearn.supervised.regression.BaseRegressor` : Base class for regressor amalgamation.

    Examples
    --------
    >>> import pandas as pd
    >>> from sklearn.datasets import load_boston
    >>> from sklearn.decomposition import PCA, TruncatedSVD
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn.pipeline import FeatureUnion
    >>> from physlearn import Regressor
    >>> X, y = load_boston(return_X_y=True)
    >>> X, y = pd.DataFrame(X), pd.Series(y)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            random_state=42)
    >>> transformer_list = [('pca', PCA(n_components=1)),
                            ('svd', TruncatedSVD(n_components=2))]
    >>> union = FeatureUnion(transformer_list=transformer_list, n_jobs=-1)
    >>> stack = dict(regressors=['kneighborsregressor', 'bayesianridge'],
                     final_regressor='lasso')
    >>> reg = Regressor(regressor_choice='stackingregressor',
                        pipeline_transform=('tr', union),
                        stacking_options=dict(layers=stack))
    >>> y_pred = reg.fit(X_train, y_train).predict(X_test)
    >>> reg.score(y_test, y_pred)
                 mae        mse      rmse        r2       ev      msle
    target
    0       4.775145  42.874253  6.547843  0.387748  0.40836  0.079818
    """

    verbose: int = field(default=1)
    pipeline_transform: str_list_or_tuple = field(default='quantilenormal')
    refit: bool = field(default=True)
    randomizedcv_n_iter: int = field(default=20)
    bayesoptcv_init_points: int = field(default=2)
    bayesoptcv_n_iter: int = field(default=20)

    def __post_init__(self):
        self._validate_regressor_options()
        self._validate_search_options()
        self._get_regressor()

    def _validate_search_options(self):
        assert isinstance(self.refit, bool)
        assert isinstance(self.randomizedcv_n_iter, int)
        assert isinstance(self.bayesoptcv_init_points, int)
        assert isinstance(self.bayesoptcv_n_iter, int)

    @property
    def check_regressor(self):
        """Checks if regressor adheres to scikit-learn conventions.

        Namely, it runs :class:`sklearn.utils.estimator_checks.check_estimator`.
        Scikit-learn and Mlxtend stacking regressors, as well as LightGBM,
        XGBoost, and CatBoost regressor do not adhere to the convention.
        """
        try:
            super().check_regressor
        except:
            raise TypeError('%s does not adhere to the Scikit-learn estimator convention.'
                            % (_REGRESSOR_DICT[self.regressor_choice]))

[docs]    def get_params(self, deep=True) -> dict:
        """Retrieves the (hyper)parameters.

        Parameters
        ----------
        deep : bool, optional (default=True)
            Although we do not use this parameter, it is required as
            various Scikit-learn utilities require it.

        Returns
        -------
        self.params : dict
            (Hyper)parameter names mapped to their values.
        """

        return super().get_params(deep=deep)

[docs]    def set_params(self, **params) -> BaseRegressor:
        """Sets the regressor's (hyper)parameters.

        Parameters
        ----------
        **params : dict
            The regressor's (hyper)parameters.

        Returns
        -------
        self : BaseRegressor
            The base regressor object.
        """

        return super().set_params(**params)

[docs]    def dump(self, value, filename) -> list:
        """Serializes the value with joblib.

        Parameters
        ----------
        value: any Python object
            The object to store to disk.

        filename : str, joblib.pathlib.Path, or file object
            The file object or path of the file.

        Returns
        -------
        filenames: list of str
            The list of file names in which the data is stored.
        """

        super().dump(value=value, filename=filename)

[docs]    def load(self, filename):
        """Deserializes the file object.

        Parameters
        ----------
        filename : str, joblib.pathlib.Path, or file object
            The file object or path of the file.

        Returns
        -------
        joblib.load : any Python object
            The object stored in the file.
        """

        return super().load(filename=filename)

[docs]    def regattr(self, attr: str) -> str:
        """Gets a regressor's attribute from the ModifiedPipeline object.

        The pipe attribute must exist in order to use this method. 

        Parameters
        ----------
        attr : str
            The name of the regressor's attribute.

        Returns
        -------
        attr : type of attribute
        """

        return super().regattr(attr=attr)

[docs]    def fit(self, X: DataFrame_or_Series, y: DataFrame_or_Series,
            sample_weight=None) -> ModifiedPipeline:
        """Fits the ModifiedPipeline object.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        sample_weight : float, ndarray, or None, optional (default=None)
            Individual weights for each example. If the weight is a float, then
            every example will have the same weight.

        Returns
        -------
        self.pipe : ModifiedPipeline
            The induced pipeline object.
        """

        return super().fit(X=X, y=y, sample_weight=sample_weight)

[docs]    def _inbuilt_model_selection_step(self, X: DataFrame_or_Series,
                                      y: DataFrame_or_Series) -> None:
        """Performs augmented cross-validation.

        This method is designed to be utilized within
        :meth:`physlearn.supervised.regression.Regressor.baseboostcv`,
        as the inbuilt model selection step.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        Attributes
        ----------
        _return_incumbent : bool
            This flag implies that the incumbent won the inbuilt model
            selection step.

        Returns
        -------
        None

        References
        ----------
        - Alex Wozniakowski, Jayne Thompson, Mile Gu, and Felix C. Binder.
          "A new formulation of gradient boosting",
          Machine Learning: Science and Technology, 2 045022 (2021).
        """

        cross_val_score = super().cross_val_score(X=X, y=y,
                                                  return_incumbent_score=True)
        mean_cross_val_score = cross_val_score.mean(axis=0)

        if mean_cross_val_score[0] >= mean_cross_val_score[1]:
            # Base boosting did not improve performance.
            setattr(self, '_return_incumbent', True)

[docs]    def baseboostcv(self, X: DataFrame_or_Series, y: DataFrame_or_Series,
                    **fit_params) -> typing.Union[Regressor, ModifiedPipeline]:
        """Base boosting with inbuilt cross-validation.

        This method starts with inbuilt cross-validation, which scores both
        the incumbent and the candidate base boosting algorithm. If the
        incumbent wins, then the explict model of the domain is the single-target
        regressor. Otherwise, base boosting greedily boosts the explict model of
        the domain in a stagewise fashion.

        In essence, this method acts as a fit method.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        **fit_params : dict of string -> object
            If base boosting, then these parameters are passed to the stagewise
            ``_fit_stages`` method.

        Attributes
        ----------
        return_incumbent_ : bool
            This flag implies that the incumbent won the inbuilt model
            selection step, and it notifies the predict method.

        Returns
        -------
        single-target regressor : Regressor or ModifiedPipeline

        References
        ----------
        - Alex Wozniakowski, Jayne Thompson, Mile Gu, and Felix C. Binder.
          "A new formulation of gradient boosting",
          Machine Learning: Science and Technology, 2 045022 (2021).
        """

        X, y = super()._validate_data(X=X, y=y)

        # Automates single-target slicing
        y = super()._check_target_index(y=y)

        # Performs augmented k-fold cross-validation, then it
        # selects either the incumbent or the candidate.
        self._inbuilt_model_selection_step(X=X, y=y)

        if not hasattr(self, 'pipe'):
            super().get_pipeline(y=y)

        if not hasattr(self, '_return_incumbent'):
            # This checks if the candidate was chosen
            # in model selection.
            super()._fit(regressor=self.pipe, X=X, y=y, **fit_params)
            return self.pipe
        else:
            setattr(self, 'return_incumbent_', True) 
            return self

[docs]    def predict(self, X: DataFrame_or_Series) -> pd.DataFrame:
        """Generates predictions with the ModifiedPipeline object.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        Returns
        -------
        y_pred : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The predictions generated by the induced ModifiedPipeline object.

        References
        ----------
        - Alex Wozniakowski, Jayne Thompson, Mile Gu, and Felix C. Binder.
          "A new formulation of gradient boosting",
          Machine Learning: Science and Technology, 2 045022 (2021).
        """

        X = self._validate_data(X=X)

        if hasattr(self, 'return_incumbent_'):
            # This checks if the incumbent was chosen in the
            # inbuilt model selection of base boosting with
            # augmented cross-validation.
            if self.target_index is not None:
                y_pred = X.iloc[:, self.target_index]
            else:
                y_pred = X
        else:
            assert hasattr(self, 'pipe')
            y_pred = self.pipe.predict(X=X)

        return y_pred

[docs]    def score(self, y_true: DataFrame_or_Series, y_pred: DataFrame_or_Series,
              path=None) -> pd.DataFrame:
        """Computes the DataFrame of supervised scores.

        The scoring metrics include mean squared error, mean absolute error,
        root mean squared error, R^2, explained variance, and mean squared
        logarithmic error. If the observed or predicted single-targets contain
        negative values, then the mean squared logarithmic error is not included,
        as the score is considered a NaN.

        Parameters
        ----------
        y_true : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The observed target matrix, where each row corresponds to an example
            and the column(s) correspond to the observed single-target(s).

        y_pred : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The predicted target matrix, where each row corresponds to an example
            and the column(s) correspond to the predicted single-target(s).

        path : str or file handle, optional (default=None)
            The file path or object, if the scoring DataFrame is to be saved
            to a comma-seperated values (csv) file.

        Returns
        -------
        scores : pd.DataFrame or pd.Series
            The pandas object of computed scores.
        """

        assert any(self.score_multioutput for output in _SCORE_MULTIOUTPUT)

        scores = {}
        for scoring in _SCORE_CHOICE:
            scores[scoring] = super().score(y_true=y_true,
                                            y_pred=y_pred,
                                            scoring=scoring,
                                            multioutput=self.score_multioutput)

        if self.score_multioutput == 'raw_values':
            scores = pd.DataFrame(scores).dropna(how='any', axis=1)
            scores.index.name = 'target'
            
            # Shifts the index origin by one.
            if self.target_index is not None:
                scores.index = pd.RangeIndex(start=self.target_index + 1,
                                             stop=self.target_index + 2,
                                             step=1)
        else:
            scores = pd.Series(scores).dropna(how='any', axis=0)

        if path is not None:
            assert isinstance(path, str)
            scores.to_csv(path_or_buf=path)

        return scores

[docs]    def cross_validate(self, X: DataFrame_or_Series, y: DataFrame_or_Series,
                       return_regressor=False, error_score=np.nan,
                       return_incumbent_score=False, cv=None,
                       fit_params=None) -> pd.DataFrame:
        """Performs (augmented) cross-validation, and wraps the result in a DataFrame.

        If ``return_incumbent_score`` is True, then the incumbent is scored
        on the withheld folds. Otherwise, the behavior is the same as in
        Scikit-learn.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        return_regressor : bool, optional (default=False)
            Determines whether to return the induced regressor.

        error_score : 'raise' or numeric, optional (default=np.nan)
            The assigned value if an error occurs while inducing a regressor.
            If set to 'raise', then the specific error is raised. Else if set
            to a numeric value, then FitFailedWarning is raised.

        return_incumbent_score : bool, optional (default=True)
            Determines whether to score the incumbent on the withheld folds,
            whereby the incumbent is assumed to be an example in the design
            matrix.

        cv : int, cross-validation generator, an iterable, or None, optional (default=None)
            Determines the cross-validation strategy. If None, then the default
            is 5-fold cross-validation.

        fit_params : dict, optional (default=None)
            (Hyper)parameters to pass to the regressor's fit method.

        Returns
        -------
        scores : pd.DataFrame
            DataFrame of scores for each run of the cross-validation procedure.

        Notes
        -----
        Scikit-learn returns negative scores for some metrics, such as
        mean absolute error (MAE) or mean squared error (MSE). However,
        we only return nonnegativie scores.

        References
        ----------
        - Alex Wozniakowski, Jayne Thompson, Mile Gu, and Felix C. Binder.
          "A new formulation of gradient boosting",
          Machine Learning: Science and Technology, 2 045022 (2021).
        """

        return super().cross_validate(X=X, y=y,
                                       return_regressor=return_regressor,
                                       error_score=error_score,
                                       return_incumbent_score=return_incumbent_score,
                                       cv=cv,
                                       fit_params=fit_params)

[docs]    def cross_val_score(self, X: DataFrame_or_Series, y: DataFrame_or_Series,
                        error_score=np.nan, return_incumbent_score=False,
                        cv=None, fit_params=None) -> DataFrame_or_Series:
        """Performs (augmented) cross-validation, then returns the withheld fold score.

        If ``return_incumbent_score`` is True, then the incumbent is scored
        on the withheld folds. Otherwise, the behavior is the same as in
        Scikit-learn.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        error_score : 'raise' or numeric, optional (default=np.nan)
            The assigned value if an error occurs while inducing a regressor.
            If set to 'raise', then the specific error is raised. Else if set
            to a numeric value, then FitFailedWarning is raised.

        return_incumbent_score : bool, optional (default=True)
            Determines whether to score the incumbent on the withheld folds,
            whereby the incumbent is assumed to be an example in the design
            matrix.

        cv : int, cross-validation generator, an iterable, or None, optional (default=None)
            Determines the cross-validation strategy. If None, then the default
            is 5-fold cross-validation.

        fit_params : dict, optional (default=None)
            (Hyper)parameters to pass to the regressor's fit method.

        Returns
        -------
        scores : pd.Series or pd.DataFrame
            The withheld fold scores for each run of the cross-validation procedure.

        Notes
        -----
        Scikit-learn returns negative scores for some metrics, such as
        mean absolute error (MAE) or mean squared error (MSE). However,
        we only return nonnegativie scores.

        References
        ----------
        - Alex Wozniakowski, Jayne Thompson, Mile Gu, and Felix C. Binder.
          "A new formulation of gradient boosting",
          Machine Learning: Science and Technology, 2 045022 (2021).
        """

        return super().cross_val_score(X=X, y=y,
                                       return_regressor=return_regressor,
                                       error_score=error_score,
                                       return_incumbent_score=return_incumbent_score,
                                       cv=cv,
                                       fit_params=fit_params)

[docs]    def _preprocess_search_params(self, y: DataFrame_or_Series, search_params: dict) -> dict:
        """Helper method for preprocessing (hyper)parameters.

        This method automatically preprocesses (hyper)parameter names for the
        exhaustive search method by determining whether the task is single-target
        or multi-target regression. In the latter case, it further determines the
        user's assumption on the single-targets's independence. Namely, it asks if
        the user wishes to chain the single-targets.

        Parameters
        ----------
        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        search_params : dict
            Dictionary with (hyper)parameter names as keys, and either lists of
            (hyper)parameter settings to try as values or tuples of (hyper)parameter
            lower and upper bounds to try as values.

        Returns
        -------
        search_params : dict
            The preprocessed (hyper)parameters.
        """

        if self.auto_target:
            if sklearn.utils.multiclass.type_of_target(y) in _MULTI_TARGET:
                if self.chain_order is not None:
                    search_params = _preprocess_hyperparams(raw_params=search_params,
                                                            multi_target=True,
                                                            chain=True)
                else:
                    search_params = _preprocess_hyperparams(raw_params=search_params,
                                                            multi_target=True,
                                                            chain=False)
            else:
                search_params = _preprocess_hyperparams(raw_params=search_params,
                                                        multi_target=False,
                                                        chain=False)
        else:
            search_params = _preprocess_hyperparams(raw_params=search_params,
                                                        multi_target=False,
                                                        chain=False)

        return search_params

[docs]    def _search(self, X: DataFrame_or_Series, y: DataFrame_or_Series, search_params: dict,
                search_method='gridsearchcv', cv=None) -> None:
        """Helper (hyper)parameter search method.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        search_params : dict
            Dictionary with (hyper)parameter names as keys, and either lists of
            (hyper)parameter settings to try as values or tuples of (hyper)parameter
            lower and upper bounds to try as values.

        search_method : str, optional (default='gridsearchcv')
            Specifies the search method. If ``'gridsearchcv'``, ``'randomizedsearchcv'``,
            or ``'bayesoptcv'`` then the search method is GridSearchCV, RandomizedSearchCV,
            or Bayesian Optimization.

        cv : int, cross-validation generator, an iterable, or None, optional (default=None)
            Determines the cross-validation strategy. If None, then the default
            is 5-fold cross-validation.

        Attributes
        ----------
        _method : GridSearchCV, RandomizedSearchCV, BayesianOptimization
            An instance of the (hyper)parameter search object.
        """

        search_method = _check_search_method(search_method=search_method)
        search_params = self._preprocess_search_params(y=y, search_params=search_params)

        if cv is None:
            cv = self.cv

        if not hasattr(self, 'pipe'):
            self.get_pipeline(y=y,
                              n_quantiles=super()._estimate_fold_size(y=y,
                                                                      cv=cv))

        self._method = _search_method(search_method=search_method,
                                      pipeline=self.pipe,
                                      search_params=search_params,
                                      scoring=self.scoring,
                                      refit=self.refit,
                                      n_jobs=self.n_jobs,
                                      cv=cv,
                                      verbose=self.verbose,
                                      pre_dispatch='2*n_jobs',
                                      error_score=np.nan,
                                      return_train_score=self.return_train_score,
                                      randomizedcv_n_iter=self.randomizedcv_n_iter,
                                      X=X, y=y,
                                      init_points=self.bayesoptcv_init_points,
                                      bayesoptcv_n_iter=self.bayesoptcv_n_iter)

[docs]    def search(self, X: DataFrame_or_Series, y: DataFrame_or_Series, search_params: dict,
                search_method='gridsearchcv', cv=None, path=None) -> None:
        """(Hyper)parameter search method.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        search_params : dict
            Dictionary with (hyper)parameter names as keys, and either lists of
            (hyper)parameter settings to try as values or tuples of (hyper)parameter
            lower and upper bounds to try as values.

        search_method : str, optional (default='gridsearchcv')
            Specifies the search method. If ``'gridsearchcv'``, ``'randomizedsearchcv'``,
            or ``'bayesoptcv'`` then the search method is GridSearchCV, RandomizedSearchCV,
            or Bayesian Optimization.

        cv : int, cross-validation generator, an iterable, or None, optional (default=None)
            Determines the cross-validation strategy. If None, then the default
            is 5-fold cross-validation.

        path : str or file handle, optional (default=None)
            The file path or object, if the scoring DataFrame is to be saved
            to a comma-seperated values (csv) file.

        Attributes
        ----------
        best_params_ : pd.Series
            The optimal (hyper)parameters.

        best_score_ : pd.Series
            The scores for the optimal (hyper)parameters.

        search_summary_ : pd.DataFrame
            Bundles the ``best_params_``, ``best_score_``, and ``refit_time``
            into one attribute.

        Notes
        -----
        Scikit-learn returns negative scores for some metrics, such as
        mean absolute error (MAE) or mean squared error (MSE). However,
        we only return nonnegativie scores.
        """

        X, y = super()._validate_data(X=X, y=y)

        # Automates single-target slicing.
        y = self._check_target_index(y=y)

        self._search(X=X, y=y, search_params=search_params,
                     search_method=search_method, cv=cv)

        if search_method == 'bayesoptcv' and self.refit:
            self.pipe = sklearn.base.clone(sklearn.base.clone(self.pipe).set_params(
                **_check_bayesoptcv_param_type(pbounds=self._method.max['params'])))
            self.pipe.fit(X=X, y=y)
        else:
            try:
                self._method.fit(X=X, y=y)
            except:
                raise AttributeError('Performing the search requires the '
                                     'attribute: %s. However, the attribute '
                                     'is not set.'
                                     % (_method))

        if search_method in ['gridsearchcv', 'randomizedsearchcv']:
            self.best_params_ = pd.Series(self._method.best_params_)
            self.best_score_ = pd.Series({'best_score': self._method.best_score_})
        elif search_method == 'bayesoptcv':
            try:
                self.best_params_ = pd.Series(self._method.max['params'])
                self.best_score_ = pd.Series({'best_score': self._method.max['target']})
            except:
                raise AttributeError('In order to set the attributes: %s and %s, '
                                     'there must be the attribute: %s.'
                                     % (best_params_, best_score_, optimization))

        if re.match('neg', self.scoring):
            self.best_score_.loc['best_score'] *= -1.0

        self.search_summary_ = pd.concat([self.best_score_, self.best_params_], axis=0)

        _sklearn_list = ['best_estimator_', 'cv_results_', 'refit_time_']
        if all(hasattr(self._method, attr) for attr in _sklearn_list):
            self.pipe = self._method.best_estimator_
            self.best_regressor_ = self._method.best_estimator_
            self.pipe = self._method.best_estimator_
            self.cv_results_ = pd.DataFrame(self._method.cv_results_)
            self.refit_time_ = pd.Series({'refit_time':self._method.refit_time_})
            self.search_summary_ = pd.concat([self.search_summary_, self.refit_time_], axis=0)

        if path is not None:
            assert isinstance(path, str)
            self.search_summary_.to_csv(path_or_buf=path, header=True)

[docs]    def _search_and_score(self, pipeline: ModifiedPipeline, X: DataFrame_or_Series,
                          y: DataFrame_or_Series, scorer: dict,
                          train: list, test: list, verbose: int,
                          search_params: dict, search_method='gridsearchcv',
                          cv=None) -> tuple:
        """Helper method for nested cross-validation.

        Exhaustively searches over the specified (hyper)parameters in the inner
        loop then scores the best performing regressor in the outer loop.

        Parameters
        ----------
        pipeline : ModifiedPipeline
            A ModifiedPipeline object.

        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        scorer : dict
            A dict mapping each scorer name to its validated scorer.

        train : list
            A list of indices for the training folds.

        test : list
            A list of indices for the withheld folds.

        verbose : int
            Determines verbosity.

        search_params : dict
            Dictionary with (hyper)parameter names as keys, and either lists of
            (hyper)parameter settings to try as values or tuples of (hyper)parameter
            lower and upper bounds to try as values.

        search_method : str, optional (default='gridsearchcv')
            Specifies the search method. If ``'gridsearchcv'``, ``'randomizedsearchcv'``,
            or ``'bayesoptcv'`` then the search method is GridSearchCV, RandomizedSearchCV,
            or Bayesian Optimization.

        cv : int, cross-validation generator, an iterable, or None, optional (default=None)
            Determines the cross-validation strategy. If None, then the default
            is 5-fold cross-validation.

        Returns
        -------
        score : tuple

        Notes
        -----
        Scikit-learn returns negative scores for some metrics, such as
        mean absolute error (MAE) or mean squared error (MSE). However,
        we only return nonnegativie scores.
        """

        X_train, y_train = sklearn.utils.metaestimators._safe_split(estimator=pipeline,
                                                                    X=X, y=y,
                                                                    indices=train)
        X_test, y_test = sklearn.utils.metaestimators._safe_split(estimator=pipeline,
                                                                  X=X, y=y,
                                                                  indices=test,
                                                                  train_indices=train)

        self.search(X=X_train, y=y_train, search_params=search_params,
                    search_method=search_method, cv=cv)

        if not self.refit:
            self.pipe = sklearn.base.clone(sklearn.base.clone(self.pipe).set_params(
                **self.best_params_))
            self.pipe._fit(X=X_train, y=y_train)

        test_score = sklearn.model_selection._validation._score(estimator=self.pipe,
                                                                X_test=X_test,
                                                                y_test=y_test,
                                                                scorer=scorer)

        return (self.best_score_.values, test_score)

[docs]    def nested_cross_validate(self, X: DataFrame_or_Series, y: DataFrame_or_Series,
                              search_params: dict, search_method='gridsearchcv',
                              outer_cv=None, inner_cv=None,
                              return_inner_loop_score=False) ->typing.Union[pd.Series, tuple]:
        """Performs a nested cross-validation procedure.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        search_params : dict
            Dictionary with (hyper)parameter names as keys, and either lists of
            (hyper)parameter settings to try as values or tuples of (hyper)parameter
            lower and upper bounds to try as values.

        search_method : str, optional (default='gridsearchcv')
            Specifies the search method. If ``'gridsearchcv'``, ``'randomizedsearchcv'``,
            or ``'bayesoptcv'`` then the search method is GridSearchCV, RandomizedSearchCV,
            or Bayesian Optimization.

        outer_cv : int, cross-validation generator, an iterable, or None, optional (default=None)
            Determines the outer loop cross-validation strategy. If None, then the default
            is 5-fold cross-validation.

        inner_cv : int, cross-validation generator, an iterable, or None, optional (default=None)
            Determines the inner loop cross-validation strategy. If None, then the default
            is 5-fold cross-validation.

        return_inner_loop_score : bool, optional (default=False)
            If True, then we return the inner loop score in addition to the
            outer loop score.

        Returns
        -------
        score : pd.Series or tuple

        Notes
        -----
        The procedure does not compute the single best set of (hyper)parameters,
        as each inner loop may return a different set of optimal (hyper)parameters.

        Scikit-learn returns negative scores for some metrics, such as
        mean absolute error (MAE) or mean squared error (MSE). However,
        we only return nonnegativie scores.

        References
        ----------
        Jacques Wainer and Gavin Cawley. "Nested cross-validation when selecting
        classifiers is overzealous for most practical applications," arXiv preprint
        arXiv:1809.09446 (2018).
        """

        X, y = super()._validate_data(X=X, y=y)

        # Automates single-target slicing
        y = self._check_target_index(y=y)

        X, y, groups = sklearn.utils.validation.indexable(X, y, None)

        if outer_cv is None:
            outer_cv = self.cv

        if inner_cv is None:
            inner_cv = self.cv

        if not hasattr(self, 'pipe'):
            self.get_pipeline(y=y,
                              n_quantiles=super()._estimate_fold_size(y=y,
                                                                      cv=outer_cv))

        outer_cv = sklearn.model_selection._split.check_cv(cv=outer_cv, y=y,
                                                           classifier=False)

        if isinstance(self.scoring, str):
            scorers = sklearn.metrics._scorer.check_scoring(estimator=self.pipe,
                                                            scoring=self.scoring)
        else:
            scorers, _ = sklearn.metrics._scorer._check_multimetric_scoring(estimator=self.pipe,
                                                                            scoring=self.scoring)

        parallel = joblib.Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                                   pre_dispatch='2*n_jobs')

        # Parallelized nested cross-validation: the helper method utilizes
        # the search method to select a regressor from the inner loop, then
        # the performance of this regressor is evaluated in the outer loop.
        scores = parallel(
            joblib.delayed(self._search_and_score)(
                pipeline=sklearn.base.clone(self.pipe), X=X, y=y, scorer=scorers,
                train=train, test=test, verbose=self.verbose, search_params=search_params,
                search_method='gridsearchcv', cv=inner_cv)
            for train, test in outer_cv.split(X, y, groups))

        outer_loop_scores = pd.Series([np.abs(pair[1]) for pair in scores])

        if return_inner_loop_score:
            inner_loop_scores = pd.Series(np.concatenate([np.abs(pair[0]) for pair in scores]))
            return outer_loop_scores, inner_loop_scores
        else:
            return outer_loop_scores
        
[docs]    def subsample(self, X: DataFrame_or_Series, y: DataFrame_or_Series,
                  subsample_proportion=None) -> tuple: 
        """Subsamples from the design and target matrices.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        subsample_proportion : float or None, optional (default=None)
            Determines the proportion of observations to use in the
            subsampling procedure.

        Returns
        -------
        out : tuple
            A tuple with the X and y data.
        """

        if subsample_proportion is not None:
            assert subsample_proportion > 0 and subsample_proportion < 1
            out = sklearn.utils.resample(X, y, replace=False,
                                         n_samples=int(len(X) * subsample_proportion),
                                         random_state=self.random_state)
        else:
            out = sklearn.utils.resample(X, y, replace=False,
                                         n_samples=len(X),
                                         random_state=self.random_state)

        return out
Table Of Contents

Source code for physlearn.supervised.regression