Source code for physlearn.supervised.utils._data_checks

"""
The :mod:`physlearn.supervised.utils._data_checks` module provides basic
utilities for automated data checking.
"""

# Author: Alex Wozniakowski
# License: MIT

import os
import re
import typing

import numpy as np
import pandas as pd

import sklearn.utils.multiclass

DataFrame_or_Series = typing.Union[pd.DataFrame, pd.Series]


def _check_X(X: DataFrame_or_Series) -> DataFrame_or_Series:
    """Checks if the design matrix uses a pandas data representation.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The design matrix, where each row corresponds to an example and the
        column(s) correspond to the feature(s).

    Returns
    -------
    X : pd.Series or pd.DataFrame
        The object that was passed in, returned unchanged.

    Raises
    ------
    AssertionError
        If ``X`` is neither a pd.Series nor a pd.DataFrame.
    """

    # Raise explicitly instead of using a bare ``assert`` statement: the
    # check then survives ``python -O`` and carries a readable message,
    # while callers still observe the same exception type as before.
    if not isinstance(X, (pd.Series, pd.DataFrame)):
        raise AssertionError('The design matrix X must be a pd.Series or a '
                             'pd.DataFrame, but {} was given.'
                             .format(type(X).__name__))
    return X
def _check_y(y: DataFrame_or_Series) -> DataFrame_or_Series:
    """Checks if the target matrix uses a pandas data representation.

    Parameters
    ----------
    y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
        The target matrix, where each row corresponds to an example and the
        column(s) correspond to the single-target(s).

    Returns
    -------
    y : pd.Series or pd.DataFrame
        The object that was passed in, returned unchanged.

    Raises
    ------
    AssertionError
        If ``y`` is neither a pd.Series nor a pd.DataFrame.
    """

    # Raise explicitly instead of using a bare ``assert`` statement: the
    # check then survives ``python -O`` and carries a readable message,
    # while callers still observe the same exception type as before.
    if not isinstance(y, (pd.Series, pd.DataFrame)):
        raise AssertionError('The target matrix y must be a pd.Series or a '
                             'pd.DataFrame, but {} was given.'
                             .format(type(y).__name__))
    return y
def _check_X_y(
    X: DataFrame_or_Series,
    y: DataFrame_or_Series,
) -> typing.Tuple[DataFrame_or_Series, DataFrame_or_Series]:
    """Checks if the design and target matrices use pandas data representations.

    Both inputs are validated, their indices are required to be equal, and
    every row that has a missing value in either ``X`` or ``y`` is dropped
    from both.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The design matrix, where each row corresponds to an example and the
        column(s) correspond to the feature(s).

    y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
        The target matrix, where each row corresponds to an example and the
        column(s) correspond to the single-target(s).

    Returns
    -------
    data : tuple
        The pair ``(X, y)`` restricted to the rows without missing values.
        If scikit-learn labels the target as ``'continuous-multioutput'``
        both entries are returned as column slices of the joined frame.
        Otherwise a single-column ``y`` is reduced to a pd.Series, and so
        is ``X`` when it has exactly one feature.

    Raises
    ------
    AssertionError
        If ``X`` or ``y`` fails the pandas check, or if their indices differ.
    """

    X = _check_X(X=X)
    y = _check_y(y=y)

    # Explicit raise (rather than a bare ``assert``) so that the check is
    # not stripped under ``python -O``; the exception type is unchanged.
    if not X.index.equals(y.index):
        raise AssertionError('The design matrix X and the target matrix y '
                             'must share the same index.')

    target_type = sklearn.utils.multiclass.type_of_target(y)

    # NOTE(review): an assertion used to follow here, namely
    # ``any(target_type != t for t in ['continous', 'continuous-multioutput'])``.
    # It could never fail, because no value equals both labels, and the
    # first label was misspelled. It is dropped as dead code. Enforcing a
    # continuous target instead would change behavior, since type_of_target
    # labels integer-valued targets as 'binary' or 'multiclass'. Confirm the
    # intended contract before adding such a check.

    n_features = _n_features(X=X)

    # Join on the shared index so that a missing value in either matrix
    # removes the corresponding row from both.
    data = pd.concat([X, y], axis=1).dropna(how='any', axis=0)
    features = data.iloc[:, :n_features]
    targets = data.iloc[:, n_features:]

    if target_type == 'continuous-multioutput':
        return features, targets

    # Squeeze along the column axis only. An axis-less squeeze would also
    # collapse the row axis when a single sample remains, which turns a
    # one-row column into a scalar and a one-row frame into a Series.
    if n_features == 1:
        features = features.squeeze(axis='columns')
    return features, targets.squeeze(axis='columns')
def _validate_data(X=None, y=None) -> DataFrame_or_Series:
    """Bundles the pandas data checks together.

    Dispatches to the joint check when both matrices are supplied, and to
    the corresponding single-matrix check when only one of them is.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features] or None, optional (default=None)
        The design matrix, where each row corresponds to an example and the
        column(s) correspond to the feature(s).

    y : array-like of shape = [n_samples] or shape = [n_samples, n_targets] or None, optional (default=None)
        The target matrix, where each row corresponds to an example and the
        column(s) correspond to the single-target(s).

    Returns
    -------
    data : tuple, pd.Series, or pd.DataFrame
        The result of ``_check_X_y`` if both arguments are given, otherwise
        the result of ``_check_X`` or ``_check_y``.

    Raises
    ------
    ValueError
        If both ``X`` and ``y`` are None.
    """

    has_X = X is not None
    has_y = y is not None

    # Nothing was supplied, so there is nothing to dispatch on.
    if not (has_X or has_y):
        raise ValueError('Both the design matrix X and the target matrix y are None. '
                         'Thus, there is no data to validate.')

    if has_X and has_y:
        return _check_X_y(X=X, y=y)
    if has_X:
        return _check_X(X=X)
    return _check_y(y=y)
def _n_features(X: DataFrame_or_Series) -> int:
    """Counts the number of features in the design matrix.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The design matrix, where each row corresponds to an example and the
        column(s) correspond to the feature(s).

    Returns
    -------
    dim : int
        The size of the last axis of ``X`` when ``X`` has more than one
        dimension, and 1 otherwise.
    """

    # A one-dimensional input is counted as a single feature.
    is_one_dimensional = X.ndim <= 1
    return 1 if is_one_dimensional else X.shape[-1]
def _n_targets(y: DataFrame_or_Series) -> int:
    """Counts the number of targets in the target matrix.

    Parameters
    ----------
    y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
        The target matrix, where each row corresponds to an example and the
        column(s) correspond to the single-target(s).

    Returns
    -------
    dim : int
        The size of the last axis of ``y`` when ``y`` has more than one
        dimension, and 1 otherwise.
    """

    # A one-dimensional input is counted as a single target.
    has_columns = y.ndim > 1
    return y.shape[-1] if has_columns else 1
def _n_samples(y: DataFrame_or_Series) -> int:
    """Counts the number of observations in the target matrix.

    Parameters
    ----------
    y : array-like of shape = [n_samples] or shape = [n_samples, n_targets]
        The target matrix, where each row corresponds to an example and the
        column(s) correspond to the single-target(s).

    Returns
    -------
    samples : int
        The size of the first axis of ``y``.
    """

    shape = y.shape
    samples = shape[0]
    return samples