# Source code for physlearn.loss

"""
The :mod:`physlearn.loss` module enables computation of the average loss
or the negative gradient in either the single-target or the multi-target
regression setting, whereby data can be represented heterogeneously with
NumPy or pandas. It includes the :class:`physlearn.LeastSquaresError`,
:class:`physlearn.LeastAbsoluteError`, :class:`physlearn.HuberLossFunction`,
and :class:`physlearn.QuantileLossFunction` classes, as well as the helper
function :func:`physlearn.loss._difference`.
"""

# Author: Alex Wozniakowski
# License: MIT

import typing

import numpy as np
import pandas as pd

import sklearn.ensemble
import sklearn.metrics

# Alias for the containers accepted and returned throughout this module:
# pandas DataFrame/Series or a NumPy ndarray.
pandas_or_numpy = typing.Union[
    pd.DataFrame,
    pd.Series,
    np.ndarray,
]


def _difference(y: pandas_or_numpy, raw_predictions: pandas_or_numpy) -> pandas_or_numpy:
    """Subtract the raw predictions from the single-target(s).

    The function supports heterogeneous usage of NumPy and pandas
    data representations. When ``y`` is a pandas object, the subtraction
    is positional: any pandas labels carried by ``raw_predictions`` are
    discarded (via its ``values`` attribute) and the labels of ``y`` are
    kept on the result.

    Parameters
    ----------
    y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
        The target matrix, where each row corresponds to an example and the
        column(s) correspond to the single-target(s).

    raw_predictions : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
        The estimate matrix, where each row corresponds to an example and the
        column(s) correspond to the prediction(s) for the single-target(s).

    Returns
    -------
    diff : DataFrame, Series, or ndarray
        The difference between the single-target(s) and the raw predictions.

    Examples
    --------
    >>> import pandas as pd
    >>> from sklearn.datasets import load_linnerud
    >>> from physlearn.loss import _difference
    >>> X, y = load_linnerud(return_X_y=True)
    >>> _difference(y=pd.DataFrame(y), raw_predictions=X).iloc[:2]
           0      1     2
    0  186.0 -126.0 -10.0
    1  187.0  -73.0  -8.0
    """

    if isinstance(y, (pd.DataFrame, pd.Series)):
        # Use the underlying array of the predictions, when there is one,
        # so that pandas does not try to align on index/column labels.
        unlabeled = getattr(raw_predictions, 'values', raw_predictions)
        return y.subtract(unlabeled)

    # NumPy target: plain arithmetic (pandas takes over if the
    # predictions happen to be a pandas object).
    return y - raw_predictions


class LeastSquaresError(sklearn.ensemble._gb_losses.LeastSquaresError):
    """Least squares loss function.

    The object modifies the original Scikit-learn LeastSquaresError such that
    the average loss and pseudo-residual computations support heterogeneous
    usage of NumPy and pandas data representations. Moreover, the modification
    supports both single-target and multi-target data.

    References
    ----------
    - Alex Wozniakowski, Jayne Thompson, Mile Gu, and Felix C. Binder.
      "A new formulation of gradient boosting",
      Machine Learning: Science and Technology, 2 045022 (2021).

    - Jerome Friedman. "Greedy function approximation: A gradient boosting machine,"
      Annals of Statistics, 29(5):1189–1232 (2001).
    """

    def __call__(self, y: pandas_or_numpy, raw_predictions: pandas_or_numpy,
                 sample_weight=None) -> float:
        """Computes the average loss.

        The computation is delegated to
        :func:`sklearn.metrics.mean_squared_error` with its default
        multi-output averaging.

        Parameters
        ----------
        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        raw_predictions : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The estimate matrix, where each row corresponds to an example and the
            column(s) correspond to the prediction(s) for the single-target(s).

        sample_weight : array-like of shape = [n_samples] or None, optional (default=None)
            Individual weights for each example. It is forwarded unchanged
            to :func:`sklearn.metrics.mean_squared_error`.

        Returns
        -------
        mse : float
            The (weighted) mean squared error.

        Examples
        --------
        >>> from sklearn.datasets import load_linnerud
        >>> from physlearn import LeastSquaresError
        >>> X, y = load_linnerud(return_X_y=True)
        >>> ls = LeastSquaresError()
        >>> ls(y=y, raw_predictions=X)
        16048.6
        """

        # sample_weight=None is the metric's own default, so no branching
        # on the weights is needed.
        return sklearn.metrics.mean_squared_error(y_true=y, y_pred=raw_predictions,
                                                  sample_weight=sample_weight)

    def negative_gradient(self, y: pandas_or_numpy,
                          raw_predictions: pandas_or_numpy) -> pandas_or_numpy:
        """Computes the pseudo-residuals.

        For the least squares loss the pseudo-residuals are the plain
        differences between the targets and the raw predictions.

        Parameters
        ----------
        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        raw_predictions : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The estimate matrix, where each row corresponds to an example and the
            column(s) correspond to the prediction(s) for the single-target(s).

        Returns
        -------
        residual : DataFrame, Series, or ndarray

        Examples
        --------
        >>> import pandas as pd
        >>> from sklearn.datasets import load_linnerud
        >>> from physlearn import LeastSquaresError
        >>> X, y = load_linnerud(return_X_y=True)
        >>> ls = LeastSquaresError()
        >>> ls.negative_gradient(y=pd.DataFrame(y), raw_predictions=X).iloc[:2]
               0      1     2
        0  186.0 -126.0 -10.0
        1  187.0  -73.0  -8.0
        """

        return _difference(y=y, raw_predictions=raw_predictions)


class LeastAbsoluteError(sklearn.ensemble._gb_losses.LeastAbsoluteError):
    """Absolute error loss function.

    The object modifies the original Scikit-learn LeastAbsoluteError such that
    the average loss and pseudo-residual computations support heterogeneous
    usage of NumPy and pandas data representations. Moreover, the modification
    supports both single-target and multi-target data.

    References
    ----------
    - Alex Wozniakowski, Jayne Thompson, Mile Gu, and Felix C. Binder.
      "A new formulation of gradient boosting",
      Machine Learning: Science and Technology, 2 045022 (2021).

    - Jerome Friedman. "Greedy function approximation: A gradient boosting machine,"
      Annals of Statistics, 29(5):1189–1232 (2001).
    """

    def __call__(self, y: pandas_or_numpy, raw_predictions: pandas_or_numpy,
                 sample_weight=None) -> float:
        """Computes the average loss.

        The computation is delegated to
        :func:`sklearn.metrics.mean_absolute_error` with its default
        multi-output averaging.

        Parameters
        ----------
        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        raw_predictions : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The estimate matrix, where each row corresponds to an example and the
            column(s) correspond to the prediction(s) for the single-target(s).

        sample_weight : array-like of shape = [n_samples] or None, optional (default=None)
            Individual weights for each example. It is forwarded unchanged
            to :func:`sklearn.metrics.mean_absolute_error`.

        Returns
        -------
        mae : float
            The (weighted) mean absolute error.

        Examples
        --------
        >>> from sklearn.datasets import load_linnerud
        >>> from physlearn import LeastAbsoluteError
        >>> X, y = load_linnerud(return_X_y=True)
        >>> lad = LeastAbsoluteError()
        >>> lad(y=y, raw_predictions=X)
        104.23333333333333
        """

        # sample_weight=None is the metric's own default, so no branching
        # on the weights is needed.
        return sklearn.metrics.mean_absolute_error(y_true=y, y_pred=raw_predictions,
                                                   sample_weight=sample_weight)

    def negative_gradient(self, y: pandas_or_numpy,
                          raw_predictions: pandas_or_numpy) -> pandas_or_numpy:
        """Computes the pseudo-residuals.

        For the absolute error loss the pseudo-residuals are the signs of
        the differences between the targets and the raw predictions.

        Parameters
        ----------
        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        raw_predictions : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The estimate matrix, where each row corresponds to an example and the
            column(s) correspond to the prediction(s) for the single-target(s).

        Returns
        -------
        residual : DataFrame, Series, or ndarray

        Examples
        --------
        >>> import pandas as pd
        >>> from sklearn.datasets import load_linnerud
        >>> from physlearn import LeastAbsoluteError
        >>> X, y = load_linnerud(return_X_y=True)
        >>> lad = LeastAbsoluteError()
        >>> lad.negative_gradient(y=pd.DataFrame(y), raw_predictions=X).iloc[:2]
             0    1    2
        0  1.0 -1.0 -1.0
        1  1.0 -1.0 -1.0
        """

        # np.sign is a ufunc, so it preserves the container type of the
        # difference (DataFrame, Series, or ndarray) without branching.
        return np.sign(_difference(y=y, raw_predictions=raw_predictions))


class HuberLossFunction(sklearn.ensemble._gb_losses.HuberLossFunction):
    """Huber loss function.

    The object modifies the original Scikit-learn HuberLossFunction such that
    the average loss and pseudo-residual computations support heterogeneous
    usage of NumPy and pandas data representations. Moreover, the modification
    supports both single-target and multi-target data.

    References
    ----------
    - Alex Wozniakowski, Jayne Thompson, Mile Gu, and Felix C. Binder.
      "A new formulation of gradient boosting",
      Machine Learning: Science and Technology, 2 045022 (2021).

    - Jerome Friedman. "Greedy function approximation: A gradient boosting machine,"
      Annals of Statistics, 29(5):1189–1232 (2001).
    """

    def _delta(self, difference: pandas_or_numpy,
               sample_weight=None) -> np.float64:
        """Computes the delta threshold.

        This threshold determines whether to use the squared error or
        the absolute error loss function. It is the ``self.alpha``
        percentile of the absolute differences.

        Parameters
        ----------
        difference : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The difference between the single-target(s) and the raw prediction(s).

        sample_weight : array-like of shape = [n_samples] or None, optional (default=None)
            Individual weights for each example. When given, the weighted
            percentile helper of Scikit-learn is used.

        Returns
        -------
        delta : np.float64
        """

        # np.abs is a ufunc and therefore handles pandas and NumPy alike.
        abs_diff = np.abs(difference)
        if sample_weight is None:
            return np.percentile(abs_diff, self.alpha * 100)
        return sklearn.utils.stats._weighted_percentile(abs_diff, sample_weight,
                                                        self.alpha * 100)

    def __call__(self, y: pandas_or_numpy, raw_predictions: pandas_or_numpy,
                 sample_weight=None) -> pandas_or_numpy:
        """Computes the average loss.

        Each difference ``d`` contributes ``0.5 * d**2`` when ``|d| <= delta``
        and ``delta * (|d| - delta / 2)`` otherwise, where ``delta`` is the
        threshold returned by :meth:`_delta`. With sample weights, every
        element-wise loss is multiplied by the weight of its example and the
        total is divided by the sum of the weights.

        Parameters
        ----------
        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        raw_predictions : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The estimate matrix, where each row corresponds to an example and the
            column(s) correspond to the prediction(s) for the single-target(s).

        sample_weight : array-like of shape = [n_samples] or None, optional (default=None)
            Individual weights for each example.

        Returns
        -------
        huber : float or Series
            A scalar when the difference is a Series or an ndarray, and a
            Series with one entry per column when it is a DataFrame.

        Examples
        --------
        >>> from sklearn.datasets import load_linnerud
        >>> from physlearn import HuberLossFunction
        >>> X, y = load_linnerud(return_X_y=True)
        >>> huber = HuberLossFunction()
        >>> huber(y=y, raw_predictions=X)
        7989.893
        """

        diff = _difference(y=y, raw_predictions=raw_predictions)
        delta = self._delta(difference=diff, sample_weight=sample_weight)

        # Build both regimes as fresh (floating point) objects instead of
        # assigning into ``diff``; in-place assignment would silently
        # truncate the losses when ``diff`` has an integer dtype.
        abs_diff = np.abs(diff)
        is_outlier = abs_diff > delta
        linear = delta * (abs_diff - delta / 2)
        quadratic = 0.5 * diff ** 2

        is_pandas = isinstance(diff, (pd.DataFrame, pd.Series))
        if is_pandas:
            loss = quadratic.where(~is_outlier, linear)
        else:
            loss = np.where(is_outlier, linear, quadratic)

        if sample_weight is None:
            return loss.mean()

        # Weight the whole element-wise loss (both regimes) per example,
        # broadcasting the weights along the sample axis.
        weights = np.asarray(sample_weight, dtype=float)
        if is_pandas:
            weighted = loss.multiply(weights, axis=0)
        else:
            weighted = loss * weights.reshape((-1,) + (1,) * (loss.ndim - 1))
        return weighted.sum() / weights.sum()

    def negative_gradient(self, y: pandas_or_numpy, raw_predictions: pandas_or_numpy,
                          sample_weight=None) -> pandas_or_numpy:
        """Computes the pseudo-residuals.

        The pseudo-residuals are the differences between the targets and the
        raw predictions, limited to the interval ``[-delta, delta]``, where
        ``delta`` is the threshold returned by :meth:`_delta`.

        Parameters
        ----------
        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        raw_predictions : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The estimate matrix, where each row corresponds to an example and the
            column(s) correspond to the prediction(s) for the single-target(s).

        sample_weight : array-like of shape = [n_samples] or None, optional (default=None)
            Individual weights for each example. They are only used to
            compute the delta threshold.

        Returns
        -------
        residual : DataFrame, Series, or ndarray

        Examples
        --------
        >>> import pandas as pd
        >>> from sklearn.datasets import load_linnerud
        >>> from physlearn import HuberLossFunction
        >>> X, y = load_linnerud(return_X_y=True)
        >>> huber = HuberLossFunction()
        >>> huber.negative_gradient(y=pd.DataFrame(y), raw_predictions=X).iloc[:2]
               0      1     2
        0  186.0 -126.0 -10.0
        1  187.0  -73.0  -8.0
        """

        diff = _difference(y=y, raw_predictions=raw_predictions)
        delta = self._delta(difference=diff, sample_weight=sample_weight)

        # Replacing every |d| > delta by delta * sign(d) is exactly a clip
        # to [-delta, delta]; ``clip`` exists on both pandas objects and
        # ndarrays and does not write into ``diff``.
        return diff.clip(-delta, delta)


class QuantileLossFunction(sklearn.ensemble._gb_losses.QuantileLossFunction):
    """Quantile loss function.

    The object modifies the original Scikit-learn QuantileLossFunction such that
    the average loss and pseudo-residual computations support heterogeneous
    usage of NumPy and pandas data representations. Moreover, the modification
    supports both single-target and multi-target data.

    References
    ----------
    - Alex Wozniakowski, Jayne Thompson, Mile Gu, and Felix C. Binder.
      "A new formulation of gradient boosting",
      Machine Learning: Science and Technology, 2 045022 (2021).

    - Jerome Friedman. "Greedy function approximation: A gradient boosting machine,"
      Annals of Statistics, 29(5):1189–1232 (2001).
    """

    def __call__(self, y: pandas_or_numpy, raw_predictions: pandas_or_numpy,
                 sample_weight=None) -> pandas_or_numpy:
        """Computes the average loss.

        Positive differences (target above prediction) are scaled by
        ``self.alpha`` and the remaining differences by ``-(1 - self.alpha)``.
        The total is divided by the number of examples, or by the sum of the
        sample weights when weights are given.

        Parameters
        ----------
        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        raw_predictions : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The estimate matrix, where each row corresponds to an example and the
            column(s) correspond to the prediction(s) for the single-target(s).

        sample_weight : array-like of shape = [n_samples] or None, optional (default=None)
            Individual weights for each example.

        Returns
        -------
        quantile : float or Series
            A scalar when the difference is a Series or an ndarray, and a
            Series with one entry per column when it is a DataFrame.

        Examples
        --------
        >>> from sklearn.datasets import load_linnerud
        >>> from physlearn import QuantileLossFunction
        >>> X, y = load_linnerud(return_X_y=True)
        >>> quantile = QuantileLossFunction()
        >>> quantile(y=y, raw_predictions=X)
        174.27
        """

        diff = _difference(y=y, raw_predictions=raw_predictions)
        # Derive the mask from the difference rather than from
        # ``y > raw_predictions``: the difference is computed positionally,
        # whereas comparing two differently labelled pandas objects raises.
        above = diff > 0

        if sample_weight is None:
            normalizer = diff.shape[0]
        else:
            # Scale every row by the weight of its example, broadcasting
            # the weights along the sample axis.
            weights = np.asarray(sample_weight, dtype=float)
            if isinstance(diff, (pd.DataFrame, pd.Series)):
                diff = diff.multiply(weights, axis=0)
            else:
                diff = diff * weights.reshape((-1,) + (1,) * (diff.ndim - 1))
            normalizer = weights.sum()

        over = diff[above].sum()
        under = diff[~above].sum()
        return (self.alpha * over - (1 - self.alpha) * under) / normalizer

    def negative_gradient(self, y: pandas_or_numpy,
                          raw_predictions: pandas_or_numpy) -> pandas_or_numpy:
        """Computes the pseudo-residuals.

        The pseudo-residual is ``self.alpha`` wherever the target exceeds
        the raw prediction and ``-(1 - self.alpha)`` everywhere else.

        Parameters
        ----------
        y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The target matrix, where each row corresponds to an example and the
            column(s) correspond to the single-target(s).

        raw_predictions : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
            The estimate matrix, where each row corresponds to an example and the
            column(s) correspond to the prediction(s) for the single-target(s).

        Returns
        -------
        residual : DataFrame, Series, or ndarray

        Examples
        --------
        >>> import pandas as pd
        >>> from sklearn.datasets import load_linnerud
        >>> from physlearn import QuantileLossFunction
        >>> X, y = load_linnerud(return_X_y=True)
        >>> quantile = QuantileLossFunction()
        >>> quantile.negative_gradient(y=pd.DataFrame(y), raw_predictions=X).iloc[:2]
             0    1    2
        0  0.9 -0.1 -0.1
        1  0.9 -0.1 -0.1
        """

        # ``y.gt`` only exists on pandas objects; going through the shared
        # difference helper makes ndarray targets work as well.
        above = _difference(y=y, raw_predictions=raw_predictions) > 0
        return self.alpha * above - (1 - self.alpha) * ~above


# Registry mapping the short loss names to their loss function classes.
LOSS_FUNCTIONS = {
    'ls': LeastSquaresError,
    'lad': LeastAbsoluteError,
    'huber': HuberLossFunction,
    'quantile': QuantileLossFunction,
}
# Table Of Contents
#
# Source code for physlearn.loss