# Source code for physlearn.pipeline

"""
The :mod:`physlearn.pipeline` module enhances the original Scikit-learn
pipeline with an implementation of base boosting. It includes a
:class:`physlearn.pipeline.ModifiedPipeline` class, as well as a
:func:`physlearn.pipeline.make_pipeline` convenience
function.
"""

# Author: Alex Wozniakowski
# License: MIT

from __future__ import annotations

import copy
import typing
from dataclasses import dataclass, field

import joblib
import numpy as np
import pandas as pd
import scipy.optimize
import sklearn.base
import sklearn.metrics
import sklearn.multioutput
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.utils
import sklearn.utils.metaestimators
import sklearn.utils.multiclass

from physlearn.loss import LOSS_FUNCTIONS
from physlearn.supervised.utils._data_checks import _n_targets
from physlearn.supervised.utils._definition import (_CATBOOST_FLAG, _CHAIN_FLAG,
                                                    _MULTI_TARGET,
                                                    _PIPELINE_TRANSFORM_CHOICE,
                                                    _SCORE_CHOICE)
from physlearn.supervised.utils._estimator_checks import _check_line_search_options

DataFrame_or_Series = typing.Union[pd.DataFrame, pd.Series]
pandas_or_numpy = typing.Union[pd.DataFrame, pd.Series, np.ndarray]


@dataclass
class ModifiedPipeline(sklearn.pipeline.Pipeline):
    """Custom pipeline object that supports base boosting.

    The object inherits from the original Scikit-learn Pipeline, thus it is
    designed to sequentially compose a list of named transforms and a final
    estimator into a new estimator. The modification extends this
    functionality such that the composed estimator supports base boosting.
    In other words, the ``base_boosting_options`` parameter enables a user
    to boost an explicit model of the domain by fitting an additive
    expansion, wherein the intercept term is generated by the explicit
    model. As such, the final estimator may be any estimator contained
    in the dictionary of estimators, i.e., the final estimator is not
    restricted to the decision tree hypothesis class.

    Parameters
    ----------
    steps : list
        List of tuples, wherein the preceding tuple(s) (name, transform)
        are transform(s) and the last tuple (name, estimator) is an
        estimator.

    memory : str or object with the joblib.Memory interface, optional (default=None)
        Enables fitted transform caching.

    verbose : int, optional (default=0)
        Determines verbosity.

    n_jobs : int or None, optional (default=None)
        The number of jobs to run in parallel.

    target_index : int or None, optional (default=None)
        Specifies the single-target subtask in the multi-target task.

    base_boosting_options : dict or None, optional (default=None)
        A dictionary of base boosting options, wherein the following options
        must be specified:

        n_estimators :obj:`int`
            The number of basis functions in the noise term of the additive
            expansion. Note that this option may also be specified as
            ``n_regressors``; see the example below.

        boosting_loss :obj:`str` 
            The loss function utilized in the pseudo-residual computation,
            where 'ls' denotes the squared error loss function, 'lad'
            denotes the absolute error loss function, 'huber' denotes
            the Huber loss function, and 'quantile' denotes the quantile
            loss function.

        line_search_options :obj:`dict` 
            init_guess :obj:`int`, :obj:`float`, or :obj:`ndarray`
                The initial guess for the expansion coefficient.

            opt_method :obj:`str`
                Choice of optimization method. If ``'minimize'``, then
                :class:`scipy.optimize.minimize`, else if ``'basinhopping'``,
                then :class:`scipy.optimize.basinhopping`.

            method :obj:`str` or None
                The type of solver utilized in the optimization method.

            tol :obj:`float` or None
                The epsilon tolerance for terminating the optimization method.

            options :obj:`dict` or None
                A dictionary of solver options.

            niter :obj:`int` or None
                The number of iterations in basin-hopping.

            T :obj:`float` or None
                The temperature parameter utilized in basin-hopping,
                which determines the accept or reject criterion.

            loss :obj:`str`
                The loss function utilized in the line search computation,
                where 'ls' denotes the squared error loss function, 'lad'
                denotes the absolute error loss function, 'huber' denotes
                the Huber loss function, and 'quantile' denotes the quantile
                loss function.

            regularization :obj:`int` or :obj:`float`
                The regularization strength in the line search computation.

    See Also
    --------
    :func:`physlearn.pipeline.make_pipeline` : Convenience function for constructing a modified pipeline.
    :mod:`physlearn.supervised.utils._definition` : Dictionary of final estimator options.

    Examples
    --------
    >>> from sklearn.linear_model import Ridge
    >>> from sklearn.preprocessing import StandardScaler
    >>> from physlearn import ModifiedPipeline
    >>> from physlearn.datasets import load_benchmark
    >>> X_train, X_test, y_train, y_test = load_benchmark(return_split=True)
    >>> line_search_options = dict(init_guess=1, opt_method='minimize',
                                   method='Nelder-Mead', tol=1e-7,
                                   options={"maxiter": 10000},
                                   niter=None, T=None, loss='lad',
                                   regularization=0.1)
    >>> base_boosting_options = dict(n_regressors=3, boosting_loss='lad',
                                     line_search_options=line_search_options)
    >>> pipe = ModifiedPipeline(steps=[('scaler', StandardScaler()), ('reg', Ridge())],
                                base_boosting_options=base_boosting_options)
    >>> pipe.fit(X_train, y_train)
    >>> pipe.score(X_test, y_test).round(decimals=2)
        mae    mse  rmse    r2    ev
    0  2.17  10.01  3.16  0.97  0.98
    1  1.17   3.09  1.76  0.99  0.99
    2  0.78   1.20  1.09  1.00  1.00
    3  0.83   1.12  1.06  1.00  1.00
    4  0.99   2.00  1.42  1.00  1.00

    References
    ----------
    - Alex Wozniakowski, Jayne Thompson, Mile Gu, and Felix C. Binder.
      "A new formulation of gradient boosting",
      Machine Learning: Science and Technology, 2 045022 (2021).

    - John Tukey. "Exploratory Data Analysis", Addison-Wesley (1977).
    
    - Jerome Friedman. "Greedy function approximation: A gradient boosting machine,"
      Annals of Statistics, 29(5):1189–1232 (2001).
    
    - Trevor Hastie, Robert Tibshirani, and Jerome Friedman.
      "The Elements of Statistical Learning", Springer (2009).

    - Lars Buitinck et al.
      "API design for machine learning software: experiences from the scikit-learn project"
      arXiv preprint arXiv:1309.0238 (2013).
    """

    # Names of the constructor parameters that have no default value.
    # NOTE(review): presumably consumed by scikit-learn's estimator
    # tooling -- confirm against the installed scikit-learn version.
    _required_parameters = ['steps']

    # Dataclass fields; their order defines the generated ``__init__``
    # signature, so it must not be changed.

    # List of (name, transform) tuples followed by a final (name, estimator).
    steps: list
    # A str or an object with the joblib.Memory interface; enables
    # fitted transform caching.
    memory: typing.Any = field(default=None)
    # Determines verbosity.
    verbose: int = field(default=0)
    # The number of jobs to run in parallel.
    n_jobs: typing.Optional[int] = field(default=None)
    # Specifies the single-target subtask in the multi-target task.
    target_index: typing.Optional[int] = field(default=None)
    # Dictionary of base boosting options; None leaves base boosting off.
    base_boosting_options: typing.Optional[dict] = field(default=None)

    def __post_init__(self) -> None:
        """Validates the pipeline steps once the dataclass ``__init__`` has run.

        The validation itself is delegated to ``_validate_steps``, which is
        not defined in this class.
        NOTE(review): presumably inherited from the Scikit-learn Pipeline --
        confirm.
        """
        self._validate_steps()

    def _validate_base_boosting_options(self):
        if self.base_boosting_options is not None:
            for key, option in self.base_boosting_options.items():
                if key in ['n_estimators', 'n_regressors']:
                    if option is not None:
                        assert isinstance(option, int) and option > 0
                        self.n_estimators = option
                    else:
                        raise ValueError('The value of key: %s cannot be None. '
                                         'Specify a positive integer for the number '
                                         'of basis functions to fit in the additive '
                                         'expansion.'
                                         % (key))
                elif key == 'boosting_loss':
                    if option is not None:
                        assert option in LOSS_FUNCTIONS
                        self.boosting_loss = option
                    else:
                        raise ValueError('The value of key: %s cannot be None. '
                                         'The choice of loss function is required '
                                         'in the pseudo-residual computation.'
                                         % (key))
                elif key == 'line_search_options':
                    if option is not None:
                        assert isinstance(option, dict)
                        _check_line_search_options(line_search_options=option)
                        self.line_search_options = option
                    else:
                        raise ValueError('The value of key: %s cannot be None. '
                                         'These options determine the line search '
                                         'optimization procedure.'
                                         % (key))
                else:
                    raise KeyError('The key: %s is not a base boosting option.'
                                   % (key))
        else:
            # This attribute instructs the fit method to avoid
            # base boosting. Consequently, the predict method
            # will not generate predictions from the decomposition
            # into a smooth term and a noise term.
            setattr(self, '_default_fit', True)

    @staticmethod
    def line_search(function: typing.Callable[[np.ndarray], float],
                    init_guess: typing.Union[int, float],
                    opt_method: str, method=None, tol=None,
                    options=None, niter=None, T=None) -> np.ndarray:
        """Computes the expansion coefficient in :meth:`physlearn.pipeline.ModifiedPipeline._fit_stages`.

        Parameters
        ----------
        function : callable
            The objective function for the line search.

        init_guess : int, float, or ndarray
            The initial guess for the expansion coefficient.

        opt_method : str
            Choice of optimization method. If ``'minimize'``, then
            :class:`scipy.optimize.minimize`, else if ``'basinhopping'``,
            then :class:`scipy.optimize.basinhopping`.

        method : str or None, optional (default=None)
            The type of solver utilized in the optimization method.

        tol : float or None, optional (default=None)
            The epsilon tolerance for terminating the optimization method.

        options : dict or None, optional (default=None)
            A dictionary of solver options.

        niter : int or None, optional (default=None)
            The number of iterations in basin-hopping.

        T : float or None, optional (default=None)
            The temperature paramter utilized in basin-hopping,
            which determines the accept or reject criterion.

        Returns
        -------
        res.x[0] : float
            The expansion coefficient, i.e., the first element in the solution array.

        Notes
        -----
        The supported optimization methods include: :class:`scipy.optimize.minimize`
        and :class:`scipy.optimize.basinhopping`; see the Scipy optimization
        `documentation <https://docs.scipy.org/doc/scipy/reference/optimize.html>`_ for
        further details.
        """

        if opt_method == 'minimize':
            assert method is not None and tol is not None
            minimize_object = scipy.optimize.minimize(fun=function,
                                                      x0=init_guess,
                                                      method=method,
                                                      tol=tol,
                                                      options=options)            
        elif opt_method == 'basinhopping':
            assert niter is not None and T is not None
            minimize_object = scipy.optimize.basinhopping(func=function,
                                                          x0=init_guess,
                                                          niter=niter,
                                                          T=T)
        else:
            raise ValueError('The optimization method: %s has not been implemented.'
                             % (opt_method))

        return minimize_object.x[0]

[docs]    def _fit_stage(self, X: DataFrame_or_Series, pseudo_residual: pandas_or_numpy,
                   **fit_params_last_step) -> None:
        """Induces a basis function, which is a map from the domain to the pseudo-residual space.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        pseudo_residual : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The negative gradient of the loss function.

        **fit_params_last_step : dict of string -> object
            Parameters passed to the estimator's ``fit`` method during the stage.
        """

        for k in range(self.loss.K):
            if self._final_estimator.__class__ in _CHAIN_FLAG:
                self._final_estimator.fit(X=X, Y=pseudo_residual, **fit_params_last_step)
            else:
                self._final_estimator.fit(X=X, y=pseudo_residual, **fit_params_last_step)

[docs]    def _fit_stages(self, X: DataFrame_or_Series, y: pandas_or_numpy,
                    init_expansion: pandas_or_numpy,
                    **fit_params_last_step) -> None:
        """Fits the additive expansion in a greedy stagewise fashion.

        This method transfers prior domain knowledge to gradient boosting through the
        ``init_expansion`` parameter, and it is designed to be utilized within
        :meth:`physlearn.pipeline.ModifiedPipeline.fit`. The induced basis functions
        and the learned expansion coefficients can be retrieved with the ``estimators_``
        and the ``coefs_`` attributes, respectively.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        init_expansion : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The smooth term in the additive expansion, i.e., the initial guess
            in gradient boosting.

        **fit_params_last_step : dict of string -> object
            Parameters passed to the estimator's ``fit`` method during each stage.

        Attributes
        ----------
        estimators_ : list
            A list of induced basis functions.
        coefs_ : list
            A list of learned expansion coefficients.

        Notes
        -----
        This greedy stagewise algorithm fits an additive expansion, which differs
        from the standard additve expansion. Namely, the constant term is a random
        variable, which depends upon the input example, e.g., an element in
        ``init_expansion``.
        """

        self._estimators = []
        self._coefs = []

        if getattr(self, "_estimator_type", None) == 'regressor':
            # The boosting loss attribute determines, which loss function
            # is employed in the negative gradient computation.
            if self.boosting_loss == 'huber':
                self.loss = LOSS_FUNCTIONS[self.boosting_loss](alpha=0.9)
            else:
                self.loss = LOSS_FUNCTIONS[self.boosting_loss]()
        
        pseudo_residual = self.loss.negative_gradient(y=y,
                                                      raw_predictions=init_expansion)

        # Greedily builds the additive expansion in a stagewise fashion,
        # wherein gradient boosting initializes with init_expansion. Thereby,
        # enabling the transfer of prior domain knowledge to gradient boosting.
        current_expansion = init_expansion
        for k in range(self.n_estimators):
            self._fit_stage(X=X, pseudo_residual=pseudo_residual,
                            **fit_params_last_step)

            # Copy the basis function for the predict method.
            # We use deepcopy instead of clone, since a clone
            # instance will not have invoked the fit method.
            self._estimators.append(copy.deepcopy(self))

            # Generate predictions for the line search computation.
            y_pred = self._final_estimator.predict(X=X)

            # This loss key determines, which loss function
            # is employed in the line search computation.
            if self.line_search_options['loss'] == 'huber':
                line_search_loss = LOSS_FUNCTIONS[self.line_search_options['loss']](alpha=0.9)
            else:
                line_search_loss = LOSS_FUNCTIONS[self.line_search_options['loss']]()
            
            def regularized_loss(alpha):
                current_expansion_ref = current_expansion
                loss = line_search_loss(y=y, 
                                        raw_predictions=current_expansion_ref.add(alpha*y_pred))
                return loss + self.line_search_options['regularization']*np.abs(alpha)

            coef = self.line_search(function=regularized_loss,
                                    init_guess=self.line_search_options['init_guess'],
                                    opt_method=self.line_search_options['opt_method'],
                                    method=self.line_search_options['method'],
                                    tol=self.line_search_options['tol'],
                                    options=self.line_search_options['options'],
                                    niter=self.line_search_options['niter'],
                                    T=self.line_search_options['T'])
            
            # Store the learned expansion coefficient for the predict method.
            self._coefs.append(coef)

            # These computations are not required in the last stage.
            if self.n_estimators - 1 > k:
                current_expansion = current_expansion.add(coef*y_pred)
                pseudo_residual = self.loss.negative_gradient(y=pseudo_residual,
                                                              raw_predictions=current_expansion)

        self.estimators_ = self._estimators
        self.coefs_ = self._coefs

[docs]    def fit(self, X: DataFrame_or_Series, y: pandas_or_numpy,
            **fit_params) -> ModifiedPipeline:
        """Sequentially fits the transform(s) then the final estimator.

        This method supports base boosting.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of each step of the stagewise
            ``_fit_stage`` method.

        Returns
        -------
        self : ModifiedPipeline
            The induced pipeline object.
        """

        fit_params_steps = self._check_fit_params(**fit_params)

        if X.ndim == 1:
            Xt = self._fit(X=X.values.reshape(-1, 1), y=y, **fit_params_steps)
        else:
            Xt = self._fit(X=X, y=y, **fit_params_steps)
        
        with sklearn.utils._print_elapsed_time('Pipeline',
                                               self._log_message(len(self.steps) - 1)):
            # This check distinguishes between a
            # default fit and base boosting.
            self._validate_base_boosting_options()
            if self._final_estimator != 'passthrough':
                fit_params_last_step = fit_params_steps[self.steps[-1][0]]
                if not hasattr(self, '_default_fit'):
                    if self.target_index is not None and \
                    sklearn.utils.multiclass.type_of_target(y) == 'continuous':
                        smooth_term = X.iloc[:, self.target_index]
                    else:
                        smooth_term = X
                    self._fit_stages(X=Xt, y=y, init_expansion=smooth_term,
                                     **fit_params_last_step)
                else:
                    if self._final_estimator.__class__ in _CHAIN_FLAG:
                        self._final_estimator.fit(X=Xt, Y=y, **fit_params_last_step)
                    else:
                        self._final_estimator.fit(X=Xt, y=y, **fit_params_last_step)

        return self

[docs]    def _predict(self, estimator, Xt: pandas_or_numpy, coef: float,
                 **predict_params) -> np.ndarray:
        """Helper method for parallelizing the noise term predictions.

        Parameters
        ----------
        estimator : estimator
            An estimator that follows the Scikit-learn API.

        Xt : array-like of shape = [n_samples, n_features]
            The transformed design matrix.

        coef : float
            The learned expansion coefficient.

        **predict_params : dict of string -> object
            Parameters to the ``predict`` called at the end of all
            transformations in the pipeline.

        Returns
        -------
        y_pred : ndarray
            A Numpy array of predictions.
        """

        return coef * estimator.steps[-1][-1].predict(X=Xt, **predict_params)

    @sklearn.utils.metaestimators.available_if(sklearn.pipeline._final_estimator_has('predict'))
    def predict(self, X: DataFrame_or_Series, **predict_params) -> DataFrame_or_Series:
        """Applies transform(s) to the data, then predicts with the final estimator.

        The method supports base boosting. If the pipeline was fit with base
        boosting, i.e., the ``coefs_`` and ``estimators_`` attributes exist,
        then the prediction is the smooth term plus the sum of every noise term.
        Otherwise, the prediction is generated by the final estimator alone.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        **predict_params : dict of string -> object
            Parameters to the ``predict`` method, which are called after completing
            all of the pipeline transformations. In base boosting, the parameters
            are forwarded to the ``predict`` method of each basis function.

        Returns
        -------
        y_pred : DataFrame or Series
            A pandas DataFrame or Series of predictions.

        Notes
        -----
        In base boosting, we decompose the predictions in accord with Tukey's
        notion of reroughing. Namely, data = smooth + rough.
        """

        Xt = X
        for _, name, transform in self._iter(with_final=False):
            if Xt.ndim == 1:
                Xt = transform.transform(X=Xt.values.reshape(-1, 1))
            else:
                Xt = transform.transform(X=Xt)

        # The smooth term is read from the raw design matrix. Without
        # reroughing, only its index is utilized to label the predictions.
        if self.target_index is not None:
            smooth_term = X.iloc[:, self.target_index]
        else:
            smooth_term = X

        if hasattr(self, 'coefs_') and hasattr(self, 'estimators_'):
            # Tukey's reroughing: smooth term plus noise term(s).
            parallel = joblib.Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                                       pre_dispatch='2*n_jobs')

            noise_terms = parallel(
                joblib.delayed(self._predict)(
                    estimator=estimator, Xt=Xt, coef=coef, **predict_params)
                for coef, estimator in zip(self.coefs_, self.estimators_))

            # Every stage contributes to the additive expansion, so
            # the noise terms are accumulated instead of keeping the first.
            noise_term = sum(noise_terms[1:], noise_terms[0])
            y_pred = smooth_term.add(noise_term)
        else:
            # Generate predictions without reroughing.
            final_estimator = self.steps[-1][-1]
            # The class flagged by _CATBOOST_FLAG takes the design
            # matrix under the keyword ``data`` instead of ``X``.
            if self._final_estimator.__class__ == _CATBOOST_FLAG:
                raw_predictions = final_estimator.predict(data=Xt, **predict_params)
            else:
                raw_predictions = final_estimator.predict(X=Xt, **predict_params)
            y_pred = pd.DataFrame(raw_predictions, index=smooth_term.index)

        return y_pred

    @sklearn.utils.metaestimators.available_if(sklearn.pipeline._final_estimator_has('score'))
    def score(self, X: DataFrame_or_Series, y: DataFrame_or_Series,
              multioutput='raw_values', **predict_params) -> DataFrame_or_Series:
        """Computes the supervised score.

        The predictions are generated with the ``predict`` method, and they
        are scored with each choice in ``_SCORE_CHOICE`` among ``'mae'``,
        ``'mse'``, ``'rmse'``, ``'r2'``, ``'ev'``, and ``'msle'``.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The design matrix, where each row corresponds to an example and the
            column(s) correspond to the feature(s).

        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        multioutput : str, optional (default='raw_values')
            Defines aggregating of multiple output values, wherein the string
            must be either ``'raw_values'`` or ``'uniform_average'``.

        **predict_params : dict of string -> object
            Parameters to the ``predict`` method, which are called after completing
            all of the pipeline transformations.

        Returns
        -------
        scores : pd.DataFrame or pd.Series
            The pandas object of computed scores, wherein a score that
            contains a NaN is dropped. The object is a DataFrame if
            ``multioutput='raw_values'``, else it is a Series.

        Raises
        ------
        ValueError
            If ``multioutput`` is neither ``'raw_values'`` nor ``'uniform_average'``.
        """

        if multioutput not in ('raw_values', 'uniform_average'):
            raise ValueError('The multioutput: %s must be either raw_values '
                             'or uniform_average.'
                             % (multioutput,))

        y_pred = self.predict(X=X, **predict_params)
        y_true = y

        def root_mean_squared_error(y_true, y_pred, multioutput):
            return np.sqrt(sklearn.metrics.mean_squared_error(y_true=y_true,
                                                              y_pred=y_pred,
                                                              multioutput=multioutput))

        # The metrics that are computed without any special handling.
        scorers = {'mae': sklearn.metrics.mean_absolute_error,
                   'mse': sklearn.metrics.mean_squared_error,
                   'rmse': root_mean_squared_error,
                   'r2': sklearn.metrics.r2_score,
                   'ev': sklearn.metrics.explained_variance_score}

        scores = {}
        for scoring in _SCORE_CHOICE:
            if scoring == 'msle':
                try:
                    scores[scoring] = sklearn.metrics.mean_squared_log_error(y_true=y_true,
                                                                             y_pred=y_pred,
                                                                             multioutput=multioutput)
                except ValueError:
                    # Sklearn raises a ValueError if the metric is not
                    # applicable to the data, so we circumvent this error
                    # and score with a NaN, which is dropped below.
                    scores[scoring] = np.nan
            elif scoring in scorers:
                scores[scoring] = scorers[scoring](y_true=y_true,
                                                   y_pred=y_pred,
                                                   multioutput=multioutput)

        if multioutput == 'raw_values':
            return pd.DataFrame(scores).dropna(how='any', axis=1)
        else:
            return pd.Series(scores).dropna(how='any', axis=0)


def make_pipeline(estimator, transform=None, **kwargs) -> ModifiedPipeline:
    """Constructs a ModifiedPipeline from the given base estimator.

    Parameters
    ----------
    estimator : estimator
        A base estimator that follows the Scikit-learn API.

    transform : str, list, tuple, or None, optional (default=None)
        Choice of transform(s). If the specified choice is a string,
        then it must be a default option, where ``'standardscaler'``,
        ``'boxcox'``, ``'yeojohnson'``, ``'quantileuniform'``, and
        ``'quantilenormal'`` denote :class:`sklearn.preprocessing.StandardScaler`,
        :class:`sklearn.preprocessing.PowerTransformer` with ``method='box-cox'``
        or ``method='yeo-johnson'``, and :class:`sklearn.preprocessing.QuantileTransformer`
        with ``output_distribution='uniform'`` or ``output_distribution='normal'``,
        respectively.

    memory : str or object with the joblib.Memory interface
        Enables fitted transform caching.

    verbose : int
        Determines verbosity.

    n_jobs : int or None
        The number of jobs to run in parallel.

    auto_target : bool, optional (default=True)
        Determines whether to automatically handle the pipeline steps or let
        the user specify the steps.

    target_index : int or None
        Specifies the single-target subtask in the multi-target task.

    target_type : str
        Specifies the type of target according to :class:`sklearn.utils.multiclass.type_of_target`.

    base_boosting_options : dict or None
        A dictionary of base boosting options, wherein the following options
        must be specified:

        n_estimators :obj:`int`
            The number of basis functions in the noise term of the additive expansion.

        boosting_loss :obj:`str` 
            The loss function utilized in the pseudo-residual computation, where 'ls'
            denotes the squared error loss function, 'lad' denotes the absolute error
            loss function, 'huber' denotes the Huber loss function, and 'quantile'
            denotes the quantile loss function.

        line_search_options :obj:`dict` 
            init_guess :obj:`int`, :obj:`float`, or :obj:`ndarray`
                The initial guess for the expansion coefficient.

            opt_method :obj:`str`
                Choice of optimization method. If ``'minimize'``, then
                :class:`scipy.optimize.minimize`, else if ``'basinhopping'``,
                then :class:`scipy.optimize.basinhopping`.

            method :obj:`str` or None
                The type of solver utilized in the optimization method.

            tol :obj:`float` or None
                The epsilon tolerance for terminating the optimization method.

            options :obj:`dict` or None
                A dictionary of solver options.

            niter :obj:`int` or None
                The number of iterations in basin-hopping.

            T :obj:`float` or None
                The temperature parameter utilized in basin-hopping,
                which determines the accept or reject criterion.

            loss :obj:`str`
                The loss function utilized in the line search computation, where 'ls'
                denotes the squared error loss function, 'lad' denotes the absolute error
                loss function, 'huber' denotes the Huber loss function, and 'quantile'
                denotes the quantile loss function.

            regularization :obj:`int` or :obj:`float`
                The regularization strength in the line search computation.

    random_state : int, RandomState instance, or None
        Determines the random number generation in
        :class:`sklearn.preprocessing.QuantileTransformer`, if ``pipeline_transform``
        is either ``'quantileuniform'`` or ``'quantilenormal'``, and also in
        :class:`sklearn.multioutput.RegressorChain`.

    n_quantiles : int or None
        Number of quantiles in :class:`sklearn.preprocessing.QuantileTransformer`, if
        ``pipeline_transform`` is either ``'quantileuniform'`` or ``'quantilenormal'``.

    cv : int, cross-validation generator, an iterable, or None
        Determines which targets are utilized in :class:`sklearn.multioutput.RegressorChain`.

    chain_order : list or None
        Determines the target order in :class:`sklearn.multioutput.RegressorChain`.

    Returns
    -------
    pipe : ModifiedPipeline

    See Also
    --------
    :class:`physlearn.pipeline.ModifiedPipeline` : Class for creating a modified pipeline of
        transforms with a final estimator, which supports base boosting.

    Examples
    --------
    >>> import pandas as pd
    >>> from sklearn.datasets import make_regression
    >>> from sklearn.linear_model import Ridge
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn.utils.multiclass import type_of_target
    >>> from physlearn import make_pipeline, Regressor
    >>> X, y = make_regression(n_targets=3, random_state=42)
    >>> X, y = pd.DataFrame(X), pd.DataFrame(y)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
    ...                                                     random_state=42)
    >>> pipe = make_pipeline(Ridge(), 'yeojohnson',
    ...                      target_type=type_of_target(y))
    >>> pipe.fit(X_train, y_train)
    >>> pipe.score(X_test, y_test).round(decimals=2)
          mae       mse    rmse    r2    ev
    0   58.68   5884.12   76.71  0.67  0.67
    1  101.19  14627.70  120.95  0.36  0.36
    2   96.31  14450.54  120.21  0.40  0.40

    References
    ----------
    - Alex Wozniakowski, Jayne Thompson, Mile Gu, and Felix C. Binder.
      "A new formulation of gradient boosting",
      Machine Learning: Science and Technology, 2 045022 (2021).

    - Jerome Friedman. "Greedy function approximation: A gradient boosting machine,"
      Annals of Statistics, 29(5):1189–1232 (2001).
    
    - Trevor Hastie, Robert Tibshirani, and Jerome Friedman.
      "The Elements of Statistical Learning", Springer (2009).
    """

    memory = kwargs.pop('memory', None)
    verbose = kwargs.pop('verbose', None)
    n_jobs = kwargs.pop('n_jobs', None)
    auto_target = kwargs.pop('auto_target', None)
    target_index = kwargs.pop('target_index', None)
    target_type = kwargs.pop('target_type', None)
    base_boosting_options = kwargs.pop('base_boosting_options', None)
    random_state = kwargs.pop('random_state', None)
    n_quantiles = kwargs.pop('n_quantiles', None)
    cv = kwargs.pop('cv', None)
    chain_order = kwargs.pop('chain_order', None)
    if kwargs:
        raise TypeError('Unknown keyword arguments: %s'
                        % (list(kwargs.keys())[0]))

    if transform is not None:
        if isinstance(transform, tuple):
            steps = [transform]
        elif isinstance(transform, list):
            steps = transform
        elif transform in _PIPELINE_TRANSFORM_CHOICE:
            # Utilize the built-in transform options.
            if transform == 'standardscaler':
                transform = sklearn.preprocessing.StandardScaler()
            elif transform == 'boxcox':
                transform = sklearn.preprocessing.PowerTransformer(method='box-cox')
            elif transform == 'yeojohnson':
                transform = sklearn.preprocessing.PowerTransformer(method='yeo-johnson')
            elif transform == 'quantileuniform':
                transform = sklearn.preprocessing.QuantileTransformer(n_quantiles=n_quantiles,
                                                                      output_distribution='uniform',
                                                                      random_state=random_state)
            elif transform == 'quantilenormal':  
                transform = sklearn.preprocessing.QuantileTransformer(n_quantiles=n_quantiles,
                                                                      output_distribution='normal',
                                                                      random_state=random_state)
            steps = [('tr', transform)]
        else:
            raise TypeError('The transform: %s was not a default str option, '
                            'tuple with (name, transform), or a list of such '
                            'tuple(s).'
                            % (transform))
    else:
        steps = []

    # Distinguishes between single-target and multi-target regression.
    if auto_target:
        if target_type in _MULTI_TARGET and target_index is None:
            if chain_order is not None:
                estimator = sklearn.multioutput.RegressorChain(base_estimator=estimator,
                                                               order=chain_order,
                                                               cv=cv,
                                                               random_state=random_state)
                steps.append(('reg', estimator))
            else:
                estimator = sklearn.multioutput.MultiOutputRegressor(estimator=estimator,
                                                                     n_jobs=n_jobs)
                steps.append(('reg', estimator))
        else:
            steps.append(('reg', estimator))
    else:
        steps.append(('reg', estimator))

    return ModifiedPipeline(steps=steps,
                            memory=memory,
                            verbose=verbose,
                            n_jobs=n_jobs,
                            target_index=target_index,
                            base_boosting_options=base_boosting_options)
Table Of Contents

Source code for physlearn.pipeline