From 77b470cd6cd6840797faaa9192285d41d7b02d68 Mon Sep 17 00:00:00 2001 From: Eunwoo Shin Date: Sat, 23 Mar 2024 00:36:29 +0900 Subject: [PATCH] Change code using fixed subset name in HPO (#3101) * change code using fixed subset name in hpo * apply comment * remove non_pure_train_ratio from HPO * raise error once HPO trial exits abnormally * update unit test * align with pre-commit --- src/otx/engine/hpo/hpo_api.py | 4 +--- src/otx/hpo/hpo_base.py | 9 --------- src/otx/hpo/hpo_runner.py | 10 ++++------ tests/unit/hpo/test_hyperband.py | 1 - 4 files changed, 5 insertions(+), 19 deletions(-) diff --git a/src/otx/engine/hpo/hpo_api.py b/src/otx/engine/hpo/hpo_api.py index bcafb6039ae..e45b88e06af 100644 --- a/src/otx/engine/hpo/hpo_api.py +++ b/src/otx/engine/hpo/hpo_api.py @@ -133,14 +133,12 @@ def hpo_config(self) -> dict[str, Any]: @hpo_config.setter def hpo_config(self, hpo_config: HpoConfig | None) -> None: - train_dataset_size = len(self._engine.datamodule.subsets["train"]) - val_dataset_size = len(self._engine.datamodule.subsets["val"]) + train_dataset_size = len(self._engine.datamodule.train_dataloader()) self._hpo_config: dict[str, Any] = { # default setting "save_path": str(self._hpo_workdir), "num_full_iterations": self._max_epoch, "full_dataset_size": train_dataset_size, - "non_pure_train_ratio": val_dataset_size / (train_dataset_size + val_dataset_size), } if hpo_config is not None: diff --git a/src/otx/hpo/hpo_base.py b/src/otx/hpo/hpo_base.py index f0e03452a57..dfb0e412b62 100644 --- a/src/otx/hpo/hpo_base.py +++ b/src/otx/hpo/hpo_base.py @@ -36,7 +36,6 @@ class HpoBase(ABC): num_trials (int | None, optional): How many training to conduct for HPO. num_workers (int, optional): How many trains are executed in parallel. num_full_iterations (int, optional): epoch for traninig after HPO. - non_pure_train_ratio (float, optional): ratio of validation time to (train time + validation time) full_dataset_size (int, optional): train dataset size expected_time_ratio (int | float | None, optional): Time to use for HPO. If HPO is configured automatically, @@ -64,7 +63,6 @@ def __init__( num_trials: int | None = None, num_workers: int = 1, num_full_iterations: int | float = 1, - non_pure_train_ratio: float = 0.2, full_dataset_size: int = 0, expected_time_ratio: int | float | None = None, maximum_resource: int | float | None = None, @@ -78,12 +76,6 @@ def __init__( check_mode_input(mode) check_positive(full_dataset_size, "full_dataset_size") check_positive(num_full_iterations, "num_full_iterations") - if not 0 < non_pure_train_ratio <= 1: - error_msg = ( - "non_pure_train_ratio should be greater than 0 and lesser than or equal to 1." - f"Your value is {subset_ratio}" - ) - raise ValueError(error_msg) if maximum_resource is not None: check_positive(maximum_resource, "maximum_resource") if num_trials is not None: @@ -103,7 +95,6 @@ def __init__( self.num_trials = num_trials self.num_workers = num_workers self.num_full_iterations = num_full_iterations - self.non_pure_train_ratio = non_pure_train_ratio self.full_dataset_size = full_dataset_size self.expected_time_ratio = expected_time_ratio self.maximum_resource: int | float | None = maximum_resource diff --git a/src/otx/hpo/hpo_runner.py b/src/otx/hpo/hpo_runner.py index 2a936ff1a49..f43064aeeed 100644 --- a/src/otx/hpo/hpo_runner.py +++ b/src/otx/hpo/hpo_runner.py @@ -67,7 +67,6 @@ def __init__( self._mp = multiprocessing.get_context("spawn") self._report_queue = self._mp.Queue() self._uid_index = 0 - self._trial_fault_count = 0 self._resource_manager = get_resource_manager( resource_type, num_parallel_trial, @@ -83,7 +82,7 @@ def run(self) -> None: """Run a HPO loop.""" logger.info("HPO loop starts.") try: - while not self._hpo_algo.is_done() and self._trial_fault_count < 3: + while not self._hpo_algo.is_done(): if self._resource_manager.have_available_resource(): trial = self._hpo_algo.get_next_sample() if trial is not None: @@ -98,9 +97,6 @@ def run(self) -> None: raise e # noqa: TRY201 logger.info("HPO loop is done.") - if self._trial_fault_count >= 3: - logger.warning("HPO trials exited abnormally more than three times. HPO is suspended.") - self._get_reports() self._join_all_processes() @@ -143,7 +139,9 @@ def _remove_finished_process(self) -> None: for uid, trial in self._running_trials.items(): if not trial.process.is_alive(): if trial.process.exitcode != 0: - self._trial_fault_count += 1 + self._terminate_all_running_processes() + msg = "One of HPO trials exit abnormally." + raise RuntimeError(msg) trial.queue.close() trial.process.join() trial_to_remove.append(uid) diff --git a/tests/unit/hpo/test_hyperband.py b/tests/unit/hpo/test_hyperband.py index 00dad36d58c..971f319005b 100644 --- a/tests/unit/hpo/test_hyperband.py +++ b/tests/unit/hpo/test_hyperband.py @@ -74,7 +74,6 @@ def good_hyperband_args(): "mode": "max", "num_workers": 1, "num_full_iterations": 64, - "non_pure_train_ratio": 0.2, "full_dataset_size": 100, "maximum_resource": 64, "minimum_resource": 1,