"""
The :mod:`physlearn.datasets.google._google` module provides utilities
for wrangling, serializing, and deserializing superconducting quantum
computing calibration data.
Notes
-----
The calibration data was collected by Benjamin Chiaro during his
time as a graduate student at UC Santa Barbara. The Google quantum computer
contains 9 qubits, wherein the 5 rightmost qubits and 4 interleaving
couplers were utilized during experimentation. The 4 leftmost qubits
and couplers were left idle during experimentation.
"""
# Author: Alex Wozniakowski
# License: MIT
import os
import typing
import pandas as pd
import sklearn.model_selection
from dataclasses import dataclass, field
from physlearn.datasets.google.base import BaseDataFrame
from physlearn.datasets.google.utils._helper_functions import (_shuffle,
_iqr_outlier_mask,
_train_test_split,
_json_dump,
_json_load,
_path_to_google_data,
_path_to_google_json_folder)
# Return-type alias for ``load_benchmark``: either the 4-tuple
# (X_train, X_test, y_train, y_test) or the dict of benchmark data.
# ``sklearn.model_selection.train_test_split`` is a function, not a type,
# so it cannot describe the tuple form inside a ``Union``.
sklearn_train_test_split_or_dict = typing.Union[tuple, dict]
@dataclass
class GoogleDataFrame(BaseDataFrame):
    """Represents the Google quantum computer calibration data with a DataFrame.

    Parameters
    ----------
    path : str
        Path to the csv file with calibration data.

    n_qubits : int
        Number of qubits in the experiment. Must be a positive integer.

    See Also
    --------
    :class:`physlearn.datasets.GoogleData` : Class for wrangling the calibration data.

    Examples
    --------
    >>> from physlearn.datasets import GoogleDataFrame
    >>> from physlearn.datasets.google.utils._helper_functions import _path_to_google_data
    >>> df = GoogleDataFrame(path=_path_to_google_data(), n_qubits=5)
    >>> df.get_df_with_correct_columns.head().iloc[0, :3]
    qvolt5 -0.008238
    qvolt6 -0.006896
    qvolt7 -0.026120
    Name: 1, dtype: float64
    """

    n_qubits: int

    def __post_init__(self):
        # Check the options as soon as the dataclass has been populated.
        self._validate_dataframe_options()

    def _validate_dataframe_options(self):
        """Assert that ``path`` is a string and ``n_qubits`` a positive int."""
        assert isinstance(self.path, str)
        assert isinstance(self.n_qubits, int)
        assert self.n_qubits > 0

    @property
    def get_df_with_correct_columns(self) -> pd.DataFrame:
        """Drops the undesired columns from the raw calibration data.

        The raw frame is taken from ``self.get_df`` (provided by the base
        class; presumably it reads the csv at ``path`` -- confirm there).
        Only the 5 qubit layout is reshaped; for any other ``n_qubits``
        the raw frame is returned unchanged.

        Returns
        -------
        df : DataFrame
        """

        raw = self.get_df
        if self.n_qubits != 5:
            return raw

        # Column labels, grouped by quantity and listed in the order
        # in which they appear in the selected slice of the raw data.
        qubit_voltages = ['qvolt5', 'qvolt6', 'qvolt7', 'qvolt8', 'qvolt9']
        coupler_voltages = ['cvolt4', 'cvolt5', 'cvolt6', 'cvolt7', 'cvolt8']
        pre_frequencies = ['pref105', 'pref106', 'pref107', 'pref108', 'pref109']
        pre_couplings = ['precoup5', 'precoup6', 'precoup7', 'precoup8']
        post_frequencies = ['postf105', 'post106', 'postf107', 'postf108', 'postf109']
        post_couplings = ['postcoup5', 'postcoup6', 'postcoup7', 'postcoup8']
        predicted_eigenvalues = ['peig1', 'peig2', 'peig3', 'peig4', 'peig5']
        measured_eigenvalues = ['eeig1', 'eeig2', 'eeig3', 'eeig4', 'eeig5']

        # Keep every fifth row, starting from the second one, and
        # restrict the frame to the relevant (label-sliced) columns.
        every_fifth_row = raw.iloc[1::5, :]
        trimmed = every_fifth_row.loc[:, 'qubit_voltages':' .29']
        trimmed.columns = (qubit_voltages + coupler_voltages
                           + pre_frequencies + pre_couplings
                           + post_frequencies + post_couplings
                           + predicted_eigenvalues + measured_eigenvalues)

        # This coupler was idle during the experiment.
        return trimmed.drop(['cvolt4'], axis=1)
@dataclass
class GoogleData(GoogleDataFrame):
    """Wrangles the calibration data for multi-target regression.

    Parameters
    ----------
    path : str, optional (default=None)
        Path to the csv file with calibration data. If None, then
        the path returned by ``_path_to_google_data()`` is used.

    n_qubits : int, optional (default=5)
        Number of qubits in the experiment. Currently, supports 5 qubits.

    test_split : float, optional (default=0.3)
        The proportion of labeled examples withheld from training.
        It must lie strictly between 0 and 1.

    random_state : int, optional (default=0)
        Determines the random number generation in the training and test
        examples split. It must be an int.

    remove_outliers : bool, optional (default=False)
        If True, then it removes labeled examples that are not
        within the interquartile range of the DataFrame.

    shuffle : bool, optional (default=True)
        If True, then it shuffles the DataFrame rows prior
        to splitting the DataFrame into training and test
        examples.

    Raises
    ------
    ValueError
        If ``n_qubits`` is not 5.
    AssertionError
        If ``test_split``, ``random_state``, ``remove_outliers``, or
        ``shuffle`` fails its validation check.

    See Also
    --------
    :class:`physlearn.datasets.GoogleDataFrame` : Class for representing the calibration data.

    Examples
    --------
    >>> from physlearn.datasets import GoogleData
    >>> data = GoogleData()
    >>> data.load_benchmark['X_train'].iloc[0, :3]
    qvolt5 0.003398
    qvolt6 -0.018080
    qvolt7 -0.009895
    Name: 0, dtype: float64

    References
    ----------
    - Alex Wozniakowski, Jayne Thompson, Mile Gu, and Felix C. Binder.
      "A new formulation of gradient boosting",
      Machine Learning: Science and Technology, 2 045022 (2021).
    """

    path: typing.Optional[str] = field(default=None)
    n_qubits: int = field(default=5)
    test_split: float = field(default=0.3)
    random_state: int = field(default=0)
    remove_outliers: bool = field(default=False)
    shuffle: bool = field(default=True)

    def __post_init__(self):
        # Fall back to the packaged calibration data when no path is given.
        if self.path is None:
            self.path = _path_to_google_data()
        self._validate_data_options()

    def _validate_data_options(self):
        """Check the wrangling options supplied to the dataclass."""
        if self.n_qubits != 5:
            raise ValueError('This object only supports 5 qubits, '
                             'but %s qubits were specified.'
                             % (self.n_qubits))
        assert 0.0 < self.test_split < 1.0
        assert isinstance(self.random_state, int)
        assert isinstance(self.remove_outliers, bool)
        assert isinstance(self.shuffle, bool)

    def _train_test_split(self) -> dict:
        """Get the DataFrame, then split it into training and test data.

        The features are the qubit and coupler voltages followed by the
        Google predicted eigenvalues. The targets are the measured
        eigenvalues followed by the post-calibration matrix entries.

        Returns
        -------
        X_train, X_test, y_train, and y_test : DataFrame(s)
            As produced by the ``_train_test_split`` helper function.
        """

        df = self.get_df_with_correct_columns
        if self.shuffle:
            df = _shuffle(data=df)

        # Compute interquartile range for outlier removal.
        if self.remove_outliers:
            df = df[~_iqr_outlier_mask(data=df)]

        qubit_coupler_voltages = ['qvolt5', 'qvolt6', 'qvolt7', 'qvolt8', 'qvolt9',
                                  'cvolt5', 'cvolt6', 'cvolt7', 'cvolt8']
        google_predictions = ['peig1', 'peig2', 'peig3', 'peig4', 'peig5']
        measured_eigenvalues = ['eeig1', 'eeig2', 'eeig3', 'eeig4', 'eeig5']
        mat_entries = ['postf105', 'post106', 'postf107', 'postf108', 'postf109',
                       'postcoup5', 'postcoup6', 'postcoup7', 'postcoup8']

        # The lists above must be referenced by their exact names; the
        # singular spellings are undefined and raise a NameError.
        return _train_test_split(df[qubit_coupler_voltages + google_predictions],
                                 df[measured_eigenvalues + mat_entries],
                                 test_size=self.test_split,
                                 random_state=self.random_state)

    def save_train_test_split_to_json(self) -> None:
        """Serializes the training and test data as a JSON formatted stream.

        It automatically dumps the data into the Google JSON folder.
        """

        _json_dump(train_test_data=self._train_test_split(),
                   folder=_path_to_google_json_folder(),
                   n_qubits=self.n_qubits)

    @property
    def load_benchmark(self) -> dict:
        """Deserializes the benchmark dataset.

        The dataset is read from the file ``_<n_qubits>q.json``
        inside the Google JSON folder.

        Returns
        -------
        data : dict
        """

        folder = _path_to_google_json_folder()
        return _json_load(filename=os.path.join(folder, '_{}'.format(self.n_qubits) + 'q.json'))
def load_benchmark(return_split=False) -> sklearn_train_test_split_or_dict:
    """Deserializes the benchmark dataset for the multi-target regression task.

    By default the deserialized benchmark data is returned as is. If the
    return split parameter is true, then it is unpacked into the familiar
    X_train, X_test, y_train, and y_test format, wherein the features are
    restricted to their last five columns and the targets are restricted
    to their first five columns.

    Parameters
    ----------
    return_split : bool
        If True, then the benchmark dataset is returned in the form of
        X_train, X_test, y_train, and y_test.

    Returns
    -------
    X_train, X_test, y_train, and y_test or data : DataFrame(s) or dict

    References
    ----------
    - Alex Wozniakowski, Jayne Thompson, Mile Gu, and Felix C. Binder.
      "A new formulation of gradient boosting",
      Machine Learning: Science and Technology, 2 045022 (2021).
    """

    benchmark = GoogleData(n_qubits=5).load_benchmark
    if not return_split:
        return benchmark

    # NOTE(review): presumably the last five feature columns are the
    # predicted eigenvalues and the first five target columns are the
    # measured eigenvalues -- confirm against the serialized benchmark.
    X_train = benchmark['X_train'].iloc[:, -5:]
    X_test = benchmark['X_test'].iloc[:, -5:]
    y_train = benchmark['y_train'].iloc[:, :5]
    y_test = benchmark['y_test'].iloc[:, :5]
    return X_train, X_test, y_train, y_test