diff --git a/.github/workflows/run_benchmark_results_file_sanity_checks.yaml b/.github/workflows/run_benchmark_results_file_sanity_checks.yaml index a9ce633..c0e9462 100644 --- a/.github/workflows/run_benchmark_results_file_sanity_checks.yaml +++ b/.github/workflows/run_benchmark_results_file_sanity_checks.yaml @@ -19,3 +19,4 @@ jobs: - name: Check sanity of benchmark files run: | python ./benchmarks/kmeans/consolidate_result_csv.py ./benchmarks/kmeans/results.csv --check-csv + python ./benchmarks/pca/consolidate_result_csv.py ./benchmarks/pca/results.csv --check-csv diff --git a/.github/workflows/sync_benchmark_files_to_gsheet.yaml b/.github/workflows/sync_benchmark_files_to_gsheet.yaml index 9030738..de39077 100644 --- a/.github/workflows/sync_benchmark_files_to_gsheet.yaml +++ b/.github/workflows/sync_benchmark_files_to_gsheet.yaml @@ -26,3 +26,5 @@ jobs: echo "$GSPREAD_SERVICE_ACCOUNT_AUTH_KEY" > service_account.json python ./benchmarks/kmeans/consolidate_result_csv.py ./benchmarks/kmeans/results.csv \ --sync-to-gspread --gspread-url $GSPREAD_URL --gspread-auth-key ./service_account.json + python ./benchmarks/pca/consolidate_result_csv.py ./benchmarks/pca/results.csv \ + --sync-to-gspread --gspread-url $GSPREAD_URL --gspread-auth-key ./service_account.json diff --git a/.github/workflows/test_cpu_benchmarks.yaml b/.github/workflows/test_cpu_benchmarks.yaml index 8d7fd6a..b566564 100644 --- a/.github/workflows/test_cpu_benchmarks.yaml +++ b/.github/workflows/test_cpu_benchmarks.yaml @@ -141,3 +141,5 @@ jobs: run: | cd benchmarks/kmeans PYTHONPATH=$PYTHONPATH:$(realpath ./kmeans_dpcpp/) benchopt run --no-plot -l -d Simulated_correlated_data[n_samples=1000,n_features=14] + cd ../pca + benchopt run --no-plot -l -d Simulated_correlated_data[n_samples=100,n_features=100] diff --git a/README.md b/README.md index 29fdcdd..fc78fcb 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ hardware. 
Benchmarks are currently available for the following algorithms: - [k-means](https://github.com/soda-inria/sklearn-engine-benchmarks/tree/main/benchmarks/kmeans) +- [PCA](https://github.com/soda-inria/sklearn-engine-benchmarks/tree/main/benchmarks/pca) Here is a (non-exhaustive) list of libraries that are compared in the benchmarks: - [scikit-learn](https://scikit-learn.org/stable/index.html) diff --git a/benchmarks/pca/consolidate_result_csv.py b/benchmarks/pca/consolidate_result_csv.py new file mode 100644 index 0000000..6982393 --- /dev/null +++ b/benchmarks/pca/consolidate_result_csv.py @@ -0,0 +1,574 @@ +import hashlib +from io import BytesIO +from operator import attrgetter + +import numpy as np +import pandas as pd +from pandas.io.parsers.readers import STR_NA_VALUES + +GOOGLE_WORKSHEET_NAME = "PCA" + +DATES_FORMAT = "%Y-%m-%d" + +BENCHMARK_DEFINING_COLUMNS = [ + "objective_objective_param___name", + "objective_dataset_param___name", + "objective_dataset_param_n_samples", + "objective_dataset_param_n_features", + "objective_dataset_param_dtype", + "objective_dataset_param_random_state", + "objective_objective_param_n_components", + "objective_objective_param_tol", + "objective_objective_param_iterated_power", + "objective_objective_param_n_oversamples", + "objective_objective_param_random_state", + "objective_objective_param_verbose", +] + +BENCHMARK_DEFINING_COLUMNS = sorted(BENCHMARK_DEFINING_COLUMNS) +_benchmark_defining_columns_identifier = "".join(sorted(BENCHMARK_DEFINING_COLUMNS)) + +BACKEND_PROVIDER = "Backend provider" +COMMENT = "Comment" +COMPUTE_DEVICE = "Compute device" +COMPUTE_RUNTIME = "Compute runtime" +DATA_RANDOM_STATE = "Data random state" +DTYPE = "Dtype" +NB_COMPONENTS = "Nb components" +NB_DATA_FEATURES = "Nb data features" +NB_DATA_SAMPLES = "Nb data samples" +SVD_SOLVER = "SVD solver" +POWER_ITERATION_NORMALIZER = "Power iteration normalizer" +PLATFORM = "Platform" +PLATFORM_ARCHITECTURE = "Platform architecture" +PLATFORM_RELEASE = "Platform release" +SYSTEM_CPUS = "Nb cpus" +SYSTEM_PROCESSOR = "Cpu name" +SYSTEM_RAM = "RAM (GB)" +SYSTEM_GPU = "Gpu name" +SUM_OF_EXPLAINED_VARIANCE_RATIO = "Sum of explained variance ratio" +VERSION_INFO = "Version info" +RUN_DATE = "Run date" +SOLVER_RANDOM_STATE = "Solver random state" +WALLTIME = "Walltime" + +BENCHMARK_ID_NAME = "Benchmark id" + +TABLE_DISPLAY_ORDER = [ + BENCHMARK_ID_NAME, + DTYPE, + NB_DATA_SAMPLES, + NB_DATA_FEATURES, + NB_COMPONENTS, + WALLTIME, + BACKEND_PROVIDER, + COMPUTE_DEVICE, + COMPUTE_RUNTIME, + SVD_SOLVER, + POWER_ITERATION_NORMALIZER, + SYSTEM_CPUS, + SYSTEM_PROCESSOR, + SYSTEM_GPU, + SYSTEM_RAM, + PLATFORM, + PLATFORM_ARCHITECTURE, + PLATFORM_RELEASE, + RUN_DATE, + VERSION_INFO, + COMMENT, + SUM_OF_EXPLAINED_VARIANCE_RATIO, + DATA_RANDOM_STATE, + SOLVER_RANDOM_STATE, +] + +COLUMNS_DTYPES = { + BENCHMARK_ID_NAME: str, + DTYPE: str, + NB_DATA_SAMPLES: np.int64, + NB_DATA_FEATURES: np.int64, + NB_COMPONENTS: np.int64, + WALLTIME: np.float64, + BACKEND_PROVIDER: str, + COMPUTE_DEVICE: str, + COMPUTE_RUNTIME: str, + SUM_OF_EXPLAINED_VARIANCE_RATIO: np.float64, + SVD_SOLVER: str, + POWER_ITERATION_NORMALIZER: str, + PLATFORM: str, + PLATFORM_ARCHITECTURE: str, + PLATFORM_RELEASE: str, + SYSTEM_CPUS: np.int64, + SYSTEM_PROCESSOR: str, + SYSTEM_GPU: str, + SYSTEM_RAM: np.int64, + DATA_RANDOM_STATE: np.int64, + SOLVER_RANDOM_STATE: np.int64, + VERSION_INFO: str, + RUN_DATE: str, + COMMENT: str, +} + +COLUMNS_WITH_NONE_STRING = [] + +# If all those fields have equal values for two given benchmarks, 
then the oldest +# benchmark (given by RUN_DATE) will be discarded +UNIQUE_BENCHMARK_KEY = [ + BENCHMARK_ID_NAME, + DTYPE, + NB_DATA_SAMPLES, + NB_DATA_FEATURES, + NB_COMPONENTS, + BACKEND_PROVIDER, + SVD_SOLVER, + POWER_ITERATION_NORMALIZER, + COMPUTE_DEVICE, + COMPUTE_RUNTIME, + PLATFORM, + PLATFORM_ARCHITECTURE, + SYSTEM_PROCESSOR, + SYSTEM_CPUS, + SYSTEM_GPU, + DATA_RANDOM_STATE, + SOLVER_RANDOM_STATE, +] + +# Importance and say if ascending / descending +ROW_SORT_ORDER = [ + (DTYPE, True), + (NB_DATA_SAMPLES, False), + (NB_DATA_FEATURES, False), + (NB_COMPONENTS, False), + (WALLTIME, True), + (BACKEND_PROVIDER, True), + (COMPUTE_DEVICE, True), + (COMPUTE_RUNTIME, True), + (SVD_SOLVER, True), + (POWER_ITERATION_NORMALIZER, True), + (SUM_OF_EXPLAINED_VARIANCE_RATIO, False), + (SYSTEM_GPU, True), + (SYSTEM_CPUS, True), + (PLATFORM, True), + (PLATFORM_ARCHITECTURE, True), + (PLATFORM_RELEASE, True), + (SYSTEM_PROCESSOR, True), + (SYSTEM_RAM, True), + (DATA_RANDOM_STATE, True), + (SOLVER_RANDOM_STATE, True), + (RUN_DATE, False), + (VERSION_INFO, False), + (COMMENT, True), + (BENCHMARK_ID_NAME, True), +] +_row_sort_by, _row_sort_ascending = map(list, zip(*ROW_SORT_ORDER)) + +PARQUET_TABLE_DISPLAY_MAPPING = dict( + time=WALLTIME, + objective_value=SUM_OF_EXPLAINED_VARIANCE_RATIO, + objective_dataset_param_n_samples=NB_DATA_SAMPLES, + objective_dataset_param_n_features=NB_DATA_FEATURES, + objective_dataset_param_dtype=DTYPE, + objective_dataset_param_random_state=DATA_RANDOM_STATE, + objective_objective_param_n_components=NB_COMPONENTS, + objective_objective_param_random_state=SOLVER_RANDOM_STATE, + objective_solver_param___name=BACKEND_PROVIDER, + objective_solver_param_device=COMPUTE_DEVICE, + objective_solver_param_runtime=COMPUTE_RUNTIME, + objective_objective_param_power_iteration_normalizer=POWER_ITERATION_NORMALIZER, + objective_objective_param_power_svd_solver=SVD_SOLVER, + objective_solver_param_comment=COMMENT, + objective_solver_param_version_info=VERSION_INFO, + objective_solver_param_run_date=RUN_DATE, + platform=PLATFORM, +) + +PARQUET_TABLE_DISPLAY_MAPPING.update( + { + "platform-architecture": PLATFORM_ARCHITECTURE, + "platform-release": PLATFORM_RELEASE, + "system-cpus": SYSTEM_CPUS, + "system-processor": SYSTEM_PROCESSOR, + "system-ram (GB)": SYSTEM_RAM, + } +) +_all_table_columns = list(PARQUET_TABLE_DISPLAY_MAPPING) + [BENCHMARK_ID_NAME] + +ALL_EXPECTED_COLUMNS = set(BENCHMARK_DEFINING_COLUMNS + _all_table_columns) + +IDS_LENGTH = 8 + + +def _get_id_from_str(s): + return hashlib.sha256(s.encode("utf8"), usedforsecurity=False).hexdigest()[ + :IDS_LENGTH + ] + + +def _get_sample_id_for_columns(row, defining_colums, constant_identifier): + return _get_id_from_str( + "".join(row[defining_colums].astype(str)) + constant_identifier + ) + + +def _validate_one_parquet_table(source): + df = pd.read_parquet(source) + + # NB: we're lenient on the columns + for col in ALL_EXPECTED_COLUMNS - set(df.columns): + df[col] = None + + df[BENCHMARK_ID_NAME] = df.apply( + lambda row: _get_sample_id_for_columns( + row, BENCHMARK_DEFINING_COLUMNS, _benchmark_defining_columns_identifier + ), + axis=1, + ) + + df = df[_all_table_columns] + df.rename(columns=PARQUET_TABLE_DISPLAY_MAPPING, inplace=True, errors="raise") + + df[RUN_DATE] = df[RUN_DATE].astype("datetime64[ns]") + + return df + + +def _validate_one_csv_table(source, parse_dates=True, order_columns=True): + NA_VALUES = set(STR_NA_VALUES) + NA_VALUES.discard("None") + + df = pd.read_csv( + source, + usecols=TABLE_DISPLAY_ORDER, + 
+        dtype=COLUMNS_DTYPES,
+        index_col=False,
+        na_values={col: NA_VALUES for col in COLUMNS_WITH_NONE_STRING},
+        keep_default_na=False,
+    )
+
+    if order_columns:
+        df = df[TABLE_DISPLAY_ORDER]
+
+    if parse_dates:
+        df[RUN_DATE] = pd.to_datetime(df[RUN_DATE], format=DATES_FORMAT).astype(
+            "datetime64[ns]"
+        )
+
+    return df
+
+
+def _assemble_output_table(
+    dfs_from_csv, dfs_from_parquet, parquet_gpu_name, create_gpu_entry, list_known_gpus
+):
+
+    if not list_known_gpus and (len(dfs_from_parquet) == 0):
+        if parquet_gpu_name is not None:
+            parameter_name = (
+                "--parquet-gpu-name" if parquet_gpu_name else "--no-parquet-gpu-name"
+            )
+            raise ValueError(
+                f"The parameter {parameter_name} should only be used if at least one "
+                "benchopt parquet table is being consolidated, but only got csv tables."
+            )
+        if create_gpu_entry is not False:
+            raise ValueError(
+                "The parameter --new-gpu-entry should only be used if at least one "
+                "benchopt parquet table is being consolidated, but only got csv tables."
+            )
+    elif not list_known_gpus and parquet_gpu_name is None:
+        raise ValueError(
+            "Please use the --parquet-gpu-name parameter to provide a gpu name that "
+            "will be added to the metadata of the samples in the input parquet tables, "
+            "or use the --no-parquet-gpu-name parameter if you intend to leave the "
+            "corresponding field empty."
+        )
+
+    else:
+        gpu_names_from_csv = set(
+            gpu_name
+            for df in dfs_from_csv
+            for gpu_name in df[SYSTEM_GPU]
+            if (len(gpu_name) > 0)
+        )
+
+        if list_known_gpus:
+            print("\n".join(gpu_names_from_csv))
+            return False
+
+        if (
+            (len(parquet_gpu_name) > 0)
+            and (parquet_gpu_name not in gpu_names_from_csv)
+            and not create_gpu_entry
+        ):
+            raise IndexError(
+                f"The gpu name {parquet_gpu_name} is unknown. Please use the "
+                "--new-gpu-entry parameter to confirm the addition of the new gpu "
+                "entry in the output csv table, or use the --list-known-gpus parameter "
+                "to print a list of GPU names that have already been registered and "
+                "use one of those to bypass this error."
+            )
+
+        for df in dfs_from_parquet:
+            df[SYSTEM_GPU] = parquet_gpu_name
+
+    df_list = dfs_from_csv + dfs_from_parquet
+
+    if len(df_list) > 1:
+        df = pd.concat(df_list, ignore_index=True, copy=False)
+    else:
+        df = df_list[0]
+
+    df = df[TABLE_DISPLAY_ORDER]
+    df.sort_values(
+        by=_row_sort_by, ascending=_row_sort_ascending, inplace=True, kind="stable"
+    )
+    # HACK: sanitize the mix of None values and empty strings that can happen when
+    # some columns are missing in the parquet input files (because they are optional
+    # and no solver returns them in the batch) by dumping the data to CSV and loading
+    # it again from CSV
+    df = _sanitize_df_with_tocsv(df)
+
+    df.drop_duplicates(subset=UNIQUE_BENCHMARK_KEY, inplace=True, ignore_index=True)
+
+    return df
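The intent documented next to UNIQUE_BENCHMARK_KEY is that, among rows sharing the same key, the entries with older `Run date` values are the ones dropped. A minimal, self-contained sketch of the sort-then-drop_duplicates pattern used above (toy column names, not the project's schema):

import pandas as pd

toy = pd.DataFrame(
    {
        "key": ["a", "a", "b"],
        "Run date": ["2024-01-02", "2024-01-01", "2024-01-01"],
        "Walltime": [1.0, 2.0, 3.0],
    }
)
# Sort so that the row to keep comes first within each group of duplicates...
toy = toy.sort_values(by="Run date", ascending=False, kind="stable")
# ...then drop_duplicates keeps the first row it sees for each key.
toy = toy.drop_duplicates(subset=["key"], ignore_index=True)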
+def _sanitize_df_with_tocsv(df):
+    in_memory_buffer = BytesIO()
+    _df_to_csv(df, in_memory_buffer)
+    in_memory_buffer.seek(0)
+    return _validate_one_csv_table(in_memory_buffer, order_columns=False)
+
+
+def _df_to_csv(df, target):
+    df.to_csv(target, index=False, mode="a", date_format=DATES_FORMAT)
+
+
+def _gspread_sync(source, gspread_url, gspread_auth_key):
+    import gspread
+
+    df = _validate_one_csv_table(source, parse_dates=False)
+
+    n_rows, n_cols = df.shape
+    walltime_worksheet_col = df.columns.get_loc(WALLTIME) + 1
+
+    gs = gspread.service_account(gspread_auth_key)
+    sheet = gs.open_by_url(gspread_url)
+
+    try:
+        worksheet = sheet.worksheet(GOOGLE_WORKSHEET_NAME)
+        worksheet.clear()
+        worksheet.clear_basic_filter()
+        worksheet.freeze(0, 0)
+        worksheet.resize(rows=n_rows + 1, cols=n_cols)
+    except gspread.WorksheetNotFound:
+        worksheet = sheet.add_worksheet(
+            GOOGLE_WORKSHEET_NAME, rows=n_rows + 1, cols=n_cols
+        )
+        # ensure worksheets are sorted alphabetically
+        sheet.reorder_worksheets(sorted(sheet.worksheets(), key=attrgetter("title")))
+
+    # upload all values
+    worksheet.update(
+        values=[df.columns.values.tolist()] + df.values.tolist(), range_name="A1"
+    )
+
+    # set filter
+    worksheet.set_basic_filter(1, 1, n_rows + 1, n_cols)
+
+    # freeze filter rows and benchmark-defining cols
+    worksheet.freeze(rows=1, cols=walltime_worksheet_col)
+
+    # Text is centered and wrapped in all cells
+    global_format = dict(
+        horizontalAlignment="CENTER",
+        verticalAlignment="MIDDLE",
+        wrapStrategy="WRAP",
+    )
+    global_range = (
+        f"{gspread.utils.rowcol_to_a1(1, 1)}:"
+        f"{gspread.utils.rowcol_to_a1(n_rows + 1, n_cols)}"
+    )
+    worksheet.format(global_range, global_format)
+
+    # benchmark_id and walltime columns are bold
+    bold_format = dict(textFormat=dict(bold=True))
+    benchmark_id_col_range = (
+        f"{gspread.utils.rowcol_to_a1(2, 1)}:"
+        f"{gspread.utils.rowcol_to_a1(n_rows + 1, 1)}"
+    )
+    walltime_col_range = (
+        f"{gspread.utils.rowcol_to_a1(2, walltime_worksheet_col)}:"
+        f"{gspread.utils.rowcol_to_a1(n_rows + 1, walltime_worksheet_col)}"
+    )
+    worksheet.batch_format(
+        [
+            dict(range=benchmark_id_col_range, format=bold_format),
+            dict(range=walltime_col_range, format=bold_format),
+        ]
+    )
+
+    # auto-resize rows and cols
+    worksheet.columns_auto_resize(0, n_cols - 1)
+    worksheet.rows_auto_resize(0, n_rows)
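For reference, the CI workflows above drive this command-line interface in two modes, and a third consolidation mode merges new benchopt results into the CSV database; the parquet path and the output redirection in the last line are illustrative, not taken from the patch:

python ./benchmarks/pca/consolidate_result_csv.py ./benchmarks/pca/results.csv --check-csv
python ./benchmarks/pca/consolidate_result_csv.py ./benchmarks/pca/results.csv \
    --sync-to-gspread --gspread-url $GSPREAD_URL --gspread-auth-key ./service_account.json
python ./benchmarks/pca/consolidate_result_csv.py ./benchmarks/pca/results.csv \
    ./outputs/benchopt_run.parquet --parquet-gpu-name "<gpu name>" > consolidated.csv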
+if __name__ == "__main__":
+    import os
+    import sys
+    from argparse import ArgumentParser
+
+    argparser = ArgumentParser(
+        description=(
+            "Print an aggregated CSV-formatted database of PCA benchmark results "
+            "for the sklearn-engine-benchmarks project hosted at "
+            "https://github.com/soda-inria/sklearn-engine-benchmarks.\n\n"
+            "The inputs are assumed to be a collection of benchopt parquet files and "
+            "CSV files, well formatted according to the project's current specs. This "
+            "command assumes that the inputs are valid and is lenient when checking "
+            "types, null values, or missing columns, so the user is advised to check "
+            "outputs carefully before using them.\n\n"
+            "If several results are found for identical benchmarks, only the most "
+            "recent `Run date` value is retained and all older entries are discarded "
+            "from the output CSV."
+        )
+    )
+
+    argparser.add_argument(
+        "benchmark_files",
+        nargs="+",
+        help="benchopt parquet files or sklearn-engine-benchmarks csv files",
+    )
+
+    argparser.add_argument(
+        "--check-csv",
+        action="store_true",
+        help="Perform a few sanity checks on a CSV database of PCA benchmark "
+        "results. If this option is passed, then the command only expects a single "
+        "input path to a csv file.",
+    )
+
+    argparser.add_argument(
+        "--sync-to-gspread",
+        action="store_true",
+        help="Synchronize a CSV database of PCA benchmark results to a google "
+        "spreadsheet and format it nicely. If this option is passed, then the command "
+        "only expects a single input path to a csv file, and also requires "
+        "--gspread-url and --gspread-auth-key.",
+    )
+
+    argparser.add_argument(
+        "--gspread-url",
+        help="URL to a google spreadsheet. Expected if and only if --sync-to-gspread "
+        "is passed.",
+    )
+
+    argparser.add_argument(
+        "--gspread-auth-key",
+        help="Path to a json authentication key for a gspread service account. "
+        "Expected if and only if --sync-to-gspread is passed.",
+    )
+
+    argparser.add_argument(
+        "--parquet-gpu-name",
+        help="Name of the GPU on the host that runs the benchmarks that are recorded "
+        "in the input parquet files.",
+    )
+
+    argparser.add_argument(
+        "--no-parquet-gpu-name",
+        action="store_true",
+        help="Do not insert a GPU name in the metadata of the benchmark samples that "
+        "were recorded in the input parquet files (and leave it blank).",
+    )
+
+    argparser.add_argument(
+        "--new-gpu-entry",
+        action="store_true",
+        help="Use this parameter along with --parquet-gpu-name to confirm that if the "
+        "GPU name is not yet known in the existing databases, it will be added to the "
+        "list of known GPU names. Otherwise the command will throw an error.",
+    )
+
+    argparser.add_argument(
+        "--list-known-gpus",
+        action="store_true",
+        help="Print a list of the GPU names that are used in CSV benchmark files.",
+    )
+
+    args = argparser.parse_args()
+
+    if (parquet_gpu_name := args.parquet_gpu_name) is None and args.no_parquet_gpu_name:
+        parquet_gpu_name = ""
+
+    create_gpu_entry = args.new_gpu_entry
+    list_known_gpus = args.list_known_gpus
+
+    paths = args.benchmark_files
+    if (check_csv := args.check_csv) or args.sync_to_gspread:
+        if (n_paths := len(paths)) > 1:
+            command = "--check-csv" if check_csv else "--sync-to-gspread"
+            raise ValueError(
+                f"A single input path to a csv file is expected when the {command} "
+                f"parameter is passed, but you passed {n_paths - 1} additional "
+                "arguments."
+            )
+        path = paths[0]
+        _, file_extension = os.path.splitext(path)
+        if file_extension != ".csv":
+            raise ValueError(
+                "Expecting a '.csv' file extension, but got "
+                f"{file_extension} instead!"
+ ) + + if check_csv: + df_loaded = _validate_one_csv_table(path) + df_clean = _assemble_output_table( + dfs_from_csv=[df_loaded], + dfs_from_parquet=[], + parquet_gpu_name=None, + create_gpu_entry=False, + list_known_gpus=list_known_gpus, + ) + + pd.testing.assert_frame_equal(df_loaded, df_clean) + + if gspread_sync := args.sync_to_gspread: + if (gspread_url := args.gspread_url) is None: + raise ValueError( + "Please provide a URL to a google spreadsheet using the " + "--gspread-url parameter." + ) + + if (gspread_auth_key := args.gspread_auth_key) is None: + raise ValueError( + "Please use the --gspread-auth-key parameter to pass a json " + "authentication key for a service account from the google developer " + "console." + ) + _gspread_sync(path, gspread_url, gspread_auth_key) + + if not check_csv and not gspread_sync: + dfs_from_parquet, dfs_from_csv = [], [] + for path in paths: + _, file_extension = os.path.splitext(path) + if file_extension == ".parquet": + if list_known_gpus: + continue + dfs_from_parquet.append(_validate_one_parquet_table(path)) + elif file_extension == ".csv": + dfs_from_csv.append(_validate_one_csv_table(path, order_columns=False)) + else: + raise ValueError( + "Expecting '.csv' or '.parquet' file extensions, but got " + f"{file_extension} instead !" + ) + + df = _assemble_output_table( + dfs_from_csv=dfs_from_csv, + dfs_from_parquet=dfs_from_parquet, + parquet_gpu_name=parquet_gpu_name, + create_gpu_entry=create_gpu_entry, + list_known_gpus=list_known_gpus, + ) + + if df is not False: + _df_to_csv(df, sys.stdout) diff --git a/benchmarks/pca/datasets/simulated_blobs.py b/benchmarks/pca/datasets/simulated_blobs.py new file mode 100644 index 0000000..a0f3cba --- /dev/null +++ b/benchmarks/pca/datasets/simulated_blobs.py @@ -0,0 +1,30 @@ +from benchopt import BaseDataset, safe_import_context +from benchopt.datasets import make_correlated_data + +with safe_import_context() as import_ctx: + import numpy as np + + +class Dataset(BaseDataset): + name = "Simulated_correlated_data" + + parameters = { + "n_samples, n_features": [(1_000_000, 100), (10_000, 10_000)], + "dtype": ["float32"], + "random_state": [123], + } + + def __init__(self, n_samples, n_features, dtype, random_state): + self.n_samples = n_samples + self.n_features = n_features + self.random_state = random_state + self.dtype = dtype + + def get_data(self): + rng = np.random.RandomState(self.random_state) + + X, *_ = make_correlated_data(self.n_samples, self.n_features, random_state=rng) + + return dict( + X=X.astype(getattr(np, self.dtype)), __name=self.name, **self._parameters + ) diff --git a/benchmarks/pca/objective.py b/benchmarks/pca/objective.py new file mode 100644 index 0000000..6c24d6e --- /dev/null +++ b/benchmarks/pca/objective.py @@ -0,0 +1,64 @@ +from datetime import datetime + +from benchopt import BaseObjective + + +class Objective(BaseObjective): + name = "PCA walltime" + url = "https://github.com/soda-inria/sklearn-engine-benchmarks" + + requirements = ["numpy"] + + # Since our goal is to measure walltime for solvers that perform exact same + # computations, the solver parameters are part of the objective and must be set + # for all solvers, rather than being an independent benchmark space for each + # solver. 
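    # (In benchopt, each key of `parameters` becomes an attribute of the objective and
    # the value lists are expanded as a grid; with the single-element lists below,
    # every solver therefore receives the exact same n_components, tol, iterated_power,
    # n_oversamples and random_state through get_objective().)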
+ parameters = dict( + n_components=[10], + tol=[0.0], + iterated_power=[15], + n_oversamples=[10], + random_state=[123], + verbose=[False], + ) + + def set_data(self, X, **dataset_parameters): + self.X = X + self.dataset_parameters = dataset_parameters + + def evaluate_result(self, explained_variance_ratio_sum, **solver_parameters): + all_parameters = dict(solver_param_run_date=datetime.today()) + all_parameters.update( + { + ("dataset_param_" + key): value + for key, value in self.dataset_parameters.items() + } + ) + all_parameters.update( + { + ("objective_param_" + key): value + for key, value in self._parameters.items() + } + ) + all_parameters.update( + {("solver_param_" + key): value for key, value in solver_parameters.items()} + ) + return dict( + value=explained_variance_ratio_sum, + objective_param___name=self.name, + **all_parameters, + ) + + def get_one_result(self): + return dict(explained_variance_ratio_sum=1) + + def get_objective(self): + return dict( + X=self.X, + n_components=self.n_components, + tol=self.tol, + iterated_power=self.iterated_power, + n_oversamples=self.n_oversamples, + random_state=self.random_state, + verbose=self.verbose, + ) diff --git a/benchmarks/pca/results.csv b/benchmarks/pca/results.csv new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/pca/solvers/cuml.py b/benchmarks/pca/solvers/cuml.py new file mode 100644 index 0000000..1a63bbc --- /dev/null +++ b/benchmarks/pca/solvers/cuml.py @@ -0,0 +1,87 @@ +from importlib.metadata import version + +from benchopt import BaseSolver, safe_import_context +from benchopt.stopping_criterion import SingleRunCriterion + +with safe_import_context() as import_ctx: + import cuml + import cupy + import numpy as np + + +class Solver(BaseSolver): + name = "cuml" + requirements = ["cuml"] + + parameters = dict( + device=["gpu"], + svd_solver=["full", "jacobi"], + ) + + stopping_criterion = SingleRunCriterion(1) + + def skip(self, **objective_dict): + + X = objective_dict["X"] + if X.dtype == np.float64: + # We haven't came accross cuda devices that doesn't support float64 yet, + # can it happen ? If it happens, the following instruction will fail, + # please enclose it with the appropriate Try/Except to return the + # appropriate skip decision. 
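            # One possible shape for that guard (a hedged sketch, untested on such a
            # device):
            #
            #     try:
            #         cupy.zeros(1, dtype=cupy.float64)
            #     except Exception:
            #         return True, (
            #             f"This {self.device} device has no support for float64 "
            #             "compute"
            #         )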
+            cupy.zeros(1, dtype=cupy.float64)
+            # return True, (
+            #     f"This {self.device} device has no support for float64 compute"
+            # )
+
+        return False, None
+
+    def set_objective(
+        self,
+        X,
+        n_components,
+        tol,
+        iterated_power,
+        n_oversamples,
+        random_state,
+        verbose,
+    ):
+        if self.device == "cpu":
+            # Copy the data before running the benchmark to ensure that no unfortunate
+            # side effects can happen
+            self.X = X.copy()
+
+        else:
+            self.X = cupy.asarray(X)
+
+        self.n_components = n_components
+        self.tol = tol
+
+        # if tol == 0:
+        #     tol = 1e-16
+        # self.tol = tol
+
+        self.iterated_power = iterated_power
+        self.n_oversamples = n_oversamples
+        self.random_state = random_state
+        self.verbose = verbose
+
+    def run(self, _):
+        estimator = cuml.PCA(
+            copy=True,
+            iterated_power=self.iterated_power,
+            n_components=self.n_components,
+            random_state=self.random_state,
+            svd_solver=self.svd_solver,
+            tol=self.tol,
+            whiten=False,  # not exposed as a solver parameter; matches the scikit-learn solver
+        ).fit(self.X, y=None)
+
+        self.explained_variance_ratio_ = estimator.explained_variance_ratio_
+
+    def get_result(self):
+        return dict(
+            explained_variance_ratio_sum=self.explained_variance_ratio_.sum().item(),
+            version_info=f"cuml {version('cuml')}",
+            __name=self.name,
+            **self._parameters,
+        )
diff --git a/benchmarks/pca/solvers/scikit_learn.py b/benchmarks/pca/solvers/scikit_learn.py
new file mode 100644
index 0000000..e6b0f9b
--- /dev/null
+++ b/benchmarks/pca/solvers/scikit_learn.py
@@ -0,0 +1,67 @@
+from importlib.metadata import version
+
+from benchopt import BaseSolver, safe_import_context
+from benchopt.stopping_criterion import SingleRunCriterion
+
+with safe_import_context() as import_ctx:
+    from sklearn.decomposition import PCA
+
+
+class Solver(BaseSolver):
+    name = "scikit-learn"
+    requirements = ["scikit-learn"]
+
+    parameters = {
+        "svd_solver, power_iteration_normalizer": [
+            (svd_solver, power_iteration_normalizer)
+            for svd_solver in ["full", "randomized"]
+            for power_iteration_normalizer in ["LU"]
+        ]
+        + [("arpack", "none")]
+    }
+
+    stopping_criterion = SingleRunCriterion(1)
+
+    def set_objective(
+        self,
+        X,
+        n_components,
+        tol,
+        iterated_power,
+        n_oversamples,
+        random_state,
+        verbose,
+    ):
+        # Copy the data before running the benchmark to ensure that no unfortunate side
+        # effects can happen
+        self.X = X.copy()
+
+        self.n_components = n_components
+        self.tol = tol
+        self.iterated_power = iterated_power
+        self.n_oversamples = n_oversamples
+        self.random_state = random_state
+        self.verbose = verbose
+
+    def run(self, _):
+        estimator = PCA(
+            n_components=self.n_components,
+            copy=True,
+            whiten=False,
+            svd_solver=self.svd_solver,
+            tol=self.tol,
+            iterated_power=self.iterated_power,
+            n_oversamples=self.n_oversamples,
+            power_iteration_normalizer=self.power_iteration_normalizer,
+            random_state=self.random_state,
+        ).fit(self.X, y=None)
+
+        self.explained_variance_ratio_ = estimator.explained_variance_ratio_
+
+    def get_result(self):
+        return dict(
+            explained_variance_ratio_sum=self.explained_variance_ratio_.sum(),
+            version_info=f"scikit-learn {version('scikit-learn')}",
+            __name=self.name,
+            **self._parameters,
+        )
diff --git a/benchmarks/pca/solvers/scikit_learn_intelex.py b/benchmarks/pca/solvers/scikit_learn_intelex.py
new file mode 100644
index 0000000..a485771
--- /dev/null
+++ b/benchmarks/pca/solvers/scikit_learn_intelex.py
@@ -0,0 +1,105 @@
+from importlib.metadata import version
+
+from benchopt import BaseSolver, safe_import_context
+from benchopt.stopping_criterion import SingleRunCriterion
+
+with safe_import_context() as import_ctx:
+    # isort: off
+    import dpctl
+    import dpctl.tensor as dpt
+    import numpy as np
+    from sklearnex.decomposition import PCA
+
+    # isort: on
+
+
+class Solver(BaseSolver):
+    name = "scikit-learn-intelex"
+
+    requirements = [
+        "scikit-learn-intelex",
+        "dpcpp-cpp-rt",
+    ]
+
+    parameters = {
+        "device, runtime": [
+            ("cpu", "numpy"),
+            ("gpu", "level_zero"),
+        ],
+        "svd_solver, power_iteration_normalizer": [
+            (svd_solver, power_iteration_normalizer)
+            for svd_solver in ["full", "randomized"]
+            for power_iteration_normalizer in ["LU"]
+        ]
+        + [("arpack", "none")],
+    }
+
+    stopping_criterion = SingleRunCriterion(1)
+
+    def skip(self, **objective_dict):
+        if self.runtime != "numpy":
+            try:
+                device = dpctl.SyclDevice(f"{self.runtime}:{self.device}")
+            except Exception:
+                return (
+                    True,
+                    f"{self.runtime} runtime not found for device {self.device}",
+                )
+
+            X = objective_dict["X"]
+            if (X.dtype == np.float64) and not device.has_aspect_fp64:
+                return True, (
+                    f"This {self.device} device has no support for float64 compute"
+                )
+
+        return False, None
+
+    def set_objective(
+        self,
+        X,
+        n_components,
+        tol,
+        iterated_power,
+        n_oversamples,
+        random_state,
+        verbose,
+    ):
+
+        # Copy the data before running the benchmark to ensure that no unfortunate
+        # side effects can happen
+        if self.runtime != "numpy":
+            device = dpctl.SyclDevice(f"{self.runtime}:{self.device}")
+            self.X = dpt.asarray(X, copy=True, device=device)
+
+        else:
+            self.X = X.copy()
+
+        self.n_components = n_components
+        self.tol = tol
+        self.iterated_power = iterated_power
+        self.n_oversamples = n_oversamples
+        self.random_state = random_state
+        self.verbose = verbose
+
+    def run(self, _):
+        estimator = PCA(
+            n_components=self.n_components,
+            copy=True,
+            whiten=False,
+            svd_solver=self.svd_solver,
+            tol=self.tol,
+            iterated_power=self.iterated_power,
+            n_oversamples=self.n_oversamples,
+            power_iteration_normalizer=self.power_iteration_normalizer,
+            random_state=self.random_state,
+        ).fit(self.X, y=None)
+
+        self.explained_variance_ratio_ = estimator.explained_variance_ratio_
+
+    def get_result(self):
+        return dict(
+            explained_variance_ratio_sum=self.explained_variance_ratio_.sum(),
+            version_info=f"scikit-learn-intelex {version('scikit-learn-intelex')}",
+            __name=self.name,
+            **self._parameters,
+        )
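A quick way to sanity-check the reported metric locally, mirroring the small configuration exercised by the CPU workflow; this is a hedged sketch of what the scikit-learn solver measures, not part of the patch:

import numpy as np
from benchopt.datasets import make_correlated_data
from sklearn.decomposition import PCA

# Data similar to Simulated_correlated_data[n_samples=100,n_features=100]
rng = np.random.RandomState(123)
X, *_ = make_correlated_data(100, 100, random_state=rng)
X = X.astype(np.float32)

# The objective reports the sum of the explained variance ratio over 10 components
pca = PCA(n_components=10, svd_solver="full", whiten=False, random_state=123).fit(X)
print("Sum of explained variance ratio:", pca.explained_variance_ratio_.sum())

Running the full benchmark and consolidating its output then follows the same steps as the CI jobs, e.g. `cd benchmarks/pca && benchopt run --no-plot -l -d Simulated_correlated_data[n_samples=100,n_features=100]`.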