# Source code for physlearn.datasets.google.utils._helper_functions

"""
The :mod:`physlearn.datasets.google.utils._helper_functions` module
provides basic utilities for wrangling, serializing, and deserializing
superconducting quantum computing calibration data.
"""

# Author: Alex Wozniakowski
# License: MIT

import io
import json
import os
import re
import typing

import pandas as pd

import sklearn.model_selection
import sklearn.utils

DataFrame_or_Series = typing.Union[pd.DataFrame, pd.Series]


def _json_dump(train_test_data: dict, folder: str, n_qubits=None) -> None:
    """Serializes the training and test data dictionary as a JSON formatted stream.

    Each pandas object is first converted with its own ``to_json`` method, and
    the resulting dictionary of JSON strings is written to a single file named
    ``_<n_qubits>q_<timestamp>.json`` (or ``_<timestamp>.json`` when ``n_qubits``
    is None) inside ``folder``.

    Parameters
    ----------
    train_test_data : dict
        A dictionary with keys: 'X_train', 'X_test', 'y_train', and 'y_test'.
        The X entries must be DataFrames; the y entries may be Series or
        DataFrames.

    folder : str
        Directory in which the training and test data is dumped. It is
        created if it does not exist yet.

    n_qubits : int or None, optional (default=None)
        Number of qubits. If specified, then this value
        is utilized in the file name.

    Raises
    ------
    AssertionError
        If any argument does not have the expected type.
    KeyError
        If one of the four required keys is missing from ``train_test_data``.
    """

    assert isinstance(train_test_data, dict)
    assert isinstance(train_test_data['X_train'], pd.DataFrame)
    assert isinstance(train_test_data['X_test'], pd.DataFrame)
    assert isinstance(train_test_data['y_train'], (pd.Series, pd.DataFrame))
    assert isinstance(train_test_data['y_test'], (pd.Series, pd.DataFrame))
    assert isinstance(folder, str)

    train_test_data_json = {key: train_test_data[key].to_json()
                            for key in ('X_train', 'X_test', 'y_train', 'y_test')}

    # NOTE(review): the ISO timestamp contains ':' characters, which some
    # file systems reject in file names -- confirm the target platforms.
    timestamp = pd.Timestamp.now().isoformat()
    if n_qubits is not None:
        assert isinstance(n_qubits, int)
        basename = '_{}q_{}'.format(n_qubits, timestamp)
    else:
        basename = '_{}'.format(timestamp)

    # Join, rather than concatenate, so that the file is written inside
    # ``folder`` instead of next to it with the folder name as a prefix.
    os.makedirs(folder, exist_ok=True)
    with open(os.path.join(folder, basename + '.json'), 'w') as outfile:
        json.dump(train_test_data_json, outfile)


def _json_load(filename: str) -> dict:
    """Deserializes the training and test data dictionary.

    The file is expected to hold a JSON object whose values are themselves
    JSON strings produced by the pandas ``to_json`` method, one for each of
    the keys 'X_train', 'X_test', 'y_train', and 'y_test'.

    Parameters
    ----------
    filename : str
        Name of the file in which the training and test data dictionary has been dumped.

    Returns
    -------
    train_test_data : dict
        A dictionary with keys: 'X_train', 'X_test', 'y_train', and 'y_test'.
        The X entries are DataFrames. The y entries are DataFrames, or Series
        when the stored JSON cannot be parsed as a DataFrame.

    Raises
    ------
    KeyError
        If one of the four required keys is missing from the file.
    ValueError
        If a stored entry cannot be parsed by ``pandas.read_json``.
    """

    def _deserialize(payload, allow_series=False):
        # Wrap the JSON text in a buffer: recent pandas versions deprecate
        # passing a literal JSON string to read_json.
        try:
            return pd.read_json(io.StringIO(payload))
        except ValueError:
            if not allow_series:
                raise
            # A serialized Series maps labels to scalars, which the
            # DataFrame parser rejects; retry with the Series parser.
            return pd.read_json(io.StringIO(payload), typ='series')

    with open(filename, 'r') as json_file:
        serialized = json.load(json_file)

    return {'X_train': _deserialize(serialized['X_train']),
            'X_test': _deserialize(serialized['X_test']),
            'y_train': _deserialize(serialized['y_train'], allow_series=True),
            'y_test': _deserialize(serialized['y_test'], allow_series=True)}


def _train_test_split(X: DataFrame_or_Series, y: DataFrame_or_Series, test_size: float,
                      random_state: int) -> dict:
    """Splits the X and y data into training and test data.

    The split is delegated to sklearn.model_selection.train_test_split with
    shuffling disabled, and the four resulting pieces are returned in a
    dictionary.

    Parameters
    ----------
    X : DataFrame or Series
        The design matrix, where each row corresponds to an example and the
        column(s) correspond to the feature(s).

    y : DataFrame or Series
        The target matrix, where each row corresponds to an example and the
        column(s) correspond to the single-target(s).

    test_size : float
        The decimal amount of test data.

    random_state : int, RandomState instance or None.
        Forwarded to sklearn.model_selection.train_test_split.
        NOTE(review): shuffling is disabled in this call, so this value
        presumably has no effect on the split -- confirm against sklearn.

    Returns
    -------
    train_test_data : dict
        A dictionary with keys: 'X_train', 'X_test', 'y_train', and 'y_test'.

    Notes
    -----
    As shuffling is handled by sklearn.utils.shuffle, there is no shuffling parameter.
    """

    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=False)

    return {'X_train': X_train, 'X_test': X_test,
            'y_train': y_train, 'y_test': y_test}


def _shuffle(data: DataFrame_or_Series, drop=True, random_state=None) -> DataFrame_or_Series:
    """Shuffles the pandas data object.

    The rows are permuted with sklearn.utils.shuffle and the index of the
    result is then reset.

    Parameters
    ----------
    data : DataFrame or Series
        The pandas data that is to be shuffled.

    drop : bool, optional (default=True)
        Forwarded to ``reset_index``. The index is always reset; if True the
        previous index is discarded, otherwise it is kept as a column.

    random_state : int, RandomState instance or None, optional (default=None)
        Forwarded to sklearn.utils.shuffle. Pass an int to obtain
        a reproducible permutation.

    Returns
    -------
    pandas : DataFrame or Series
    """

    shuffled = sklearn.utils.shuffle(data, random_state=random_state)
    return shuffled.reset_index(drop=drop)
    

def _iqr_outlier_mask(data: DataFrame_or_Series) -> DataFrame_or_Series:
    """Computes the interquartile range, then it masks the outliers.

    A value is flagged as an outlier when it lies more than 1.5 times the
    interquartile range below the first quartile or above the third quartile.

    Parameters
    ----------
    data : DataFrame or Series
        The pandas data that is to be masked.

    Returns
    -------
    pandas : Series
        A boolean mask that is True for outliers. For a DataFrame there is
        one entry per row, which is True if any column of that row holds an
        outlier. For a Series there is one entry per element.
    """

    first = data.quantile(0.25)
    third = data.quantile(0.75)
    iqr = third - first
    outliers = (data < (first - 1.5*iqr)) | (data > (third + 1.5*iqr))

    # A Series has no second axis to reduce over, so the element-wise
    # mask is already the answer.
    if isinstance(data, pd.Series):
        return outliers
    return outliers.any(axis=1)


def _path_to_google_data() -> str:
    """Finds the path to the Google quantum computer calibration data.

    The path is built relative to the parent of the directory that contains
    this module: ``<parent>/data/google_5q_random.csv``.

    Returns
    -------
    path : str
    """

    # Step up one directory instead of deleting the substring 'utils',
    # which would also alter any other path component that contains it.
    root = os.path.dirname(os.path.dirname(__file__))
    return os.path.join(root, 'data', 'google_5q_random.csv')


def _path_to_google_json_folder() -> str:
    """Finds the path to the folder with the serialized Google data.

    The path is built relative to the parent of the directory that contains
    this module: ``<parent>/google_json``.

    Returns
    -------
    path : str
    """

    # Step up one directory instead of deleting the substring 'utils',
    # which would also alter any other path component that contains it.
    root = os.path.dirname(os.path.dirname(__file__))
    return os.path.join(root, 'google_json')
# Table Of Contents
#
# Source code for physlearn.datasets.google.utils._helper_functions