From 77b470cd6cd6840797faaa9192285d41d7b02d68 Mon Sep 17 00:00:00 2001
From: Eunwoo Shin <eunwoo.shin@intel.com>
Date: Sat, 23 Mar 2024 00:36:29 +0900
Subject: [PATCH] Change code using fixed subset name in HPO (#3101)

* change code using fixed subset name in hpo

* apply review comments

* remove non_pure_train_ratio from HPO

* raise an error as soon as an HPO trial exits abnormally

* update unit test

* align with pre-commit
---
 src/otx/engine/hpo/hpo_api.py    |  4 +---
 src/otx/hpo/hpo_base.py          |  9 ---------
 src/otx/hpo/hpo_runner.py        | 10 ++++------
 tests/unit/hpo/test_hyperband.py |  1 -
 4 files changed, 5 insertions(+), 19 deletions(-)

diff --git a/src/otx/engine/hpo/hpo_api.py b/src/otx/engine/hpo/hpo_api.py
index bcafb6039ae..e45b88e06af 100644
--- a/src/otx/engine/hpo/hpo_api.py
+++ b/src/otx/engine/hpo/hpo_api.py
@@ -133,14 +133,12 @@ def hpo_config(self) -> dict[str, Any]:
 
     @hpo_config.setter
     def hpo_config(self, hpo_config: HpoConfig | None) -> None:
-        train_dataset_size = len(self._engine.datamodule.subsets["train"])
-        val_dataset_size = len(self._engine.datamodule.subsets["val"])
+        train_dataset_size = len(self._engine.datamodule.train_dataloader())
 
         self._hpo_config: dict[str, Any] = {  # default setting
             "save_path": str(self._hpo_workdir),
             "num_full_iterations": self._max_epoch,
             "full_dataset_size": train_dataset_size,
-            "non_pure_train_ratio": val_dataset_size / (train_dataset_size + val_dataset_size),
         }
 
         if hpo_config is not None:
diff --git a/src/otx/hpo/hpo_base.py b/src/otx/hpo/hpo_base.py
index f0e03452a57..dfb0e412b62 100644
--- a/src/otx/hpo/hpo_base.py
+++ b/src/otx/hpo/hpo_base.py
@@ -36,7 +36,6 @@ class HpoBase(ABC):
         num_trials (int | None, optional): How many training to conduct for HPO.
         num_workers (int, optional): How many trains are executed in parallel.
         num_full_iterations (int, optional): epoch for traninig after HPO.
-        non_pure_train_ratio (float, optional): ratio of validation time to (train time + validation time)
         full_dataset_size (int, optional): train dataset size
         expected_time_ratio (int | float | None, optional): Time to use for HPO.
                                                             If HPO is configured automatically,
@@ -64,7 +63,6 @@ def __init__(
         num_trials: int | None = None,
         num_workers: int = 1,
         num_full_iterations: int | float = 1,
-        non_pure_train_ratio: float = 0.2,
         full_dataset_size: int = 0,
         expected_time_ratio: int | float | None = None,
         maximum_resource: int | float | None = None,
@@ -78,12 +76,6 @@ def __init__(
         check_mode_input(mode)
         check_positive(full_dataset_size, "full_dataset_size")
         check_positive(num_full_iterations, "num_full_iterations")
-        if not 0 < non_pure_train_ratio <= 1:
-            error_msg = (
-                "non_pure_train_ratio should be greater than 0 and lesser than or equal to 1."
-                f"Your value is {subset_ratio}"
-            )
-            raise ValueError(error_msg)
         if maximum_resource is not None:
             check_positive(maximum_resource, "maximum_resource")
         if num_trials is not None:
@@ -103,7 +95,6 @@ def __init__(
         self.num_trials = num_trials
         self.num_workers = num_workers
         self.num_full_iterations = num_full_iterations
-        self.non_pure_train_ratio = non_pure_train_ratio
         self.full_dataset_size = full_dataset_size
         self.expected_time_ratio = expected_time_ratio
         self.maximum_resource: int | float | None = maximum_resource
diff --git a/src/otx/hpo/hpo_runner.py b/src/otx/hpo/hpo_runner.py
index 2a936ff1a49..f43064aeeed 100644
--- a/src/otx/hpo/hpo_runner.py
+++ b/src/otx/hpo/hpo_runner.py
@@ -67,7 +67,6 @@ def __init__(
         self._mp = multiprocessing.get_context("spawn")
         self._report_queue = self._mp.Queue()
         self._uid_index = 0
-        self._trial_fault_count = 0
         self._resource_manager = get_resource_manager(
             resource_type,
             num_parallel_trial,
@@ -83,7 +82,7 @@ def run(self) -> None:
         """Run a HPO loop."""
         logger.info("HPO loop starts.")
         try:
-            while not self._hpo_algo.is_done() and self._trial_fault_count < 3:
+            while not self._hpo_algo.is_done():
                 if self._resource_manager.have_available_resource():
                     trial = self._hpo_algo.get_next_sample()
                     if trial is not None:
@@ -98,9 +97,6 @@ def run(self) -> None:
             raise e  # noqa: TRY201
         logger.info("HPO loop is done.")
 
-        if self._trial_fault_count >= 3:
-            logger.warning("HPO trials exited abnormally more than three times. HPO is suspended.")
-
         self._get_reports()
         self._join_all_processes()
 
@@ -143,7 +139,9 @@ def _remove_finished_process(self) -> None:
         for uid, trial in self._running_trials.items():
             if not trial.process.is_alive():
                 if trial.process.exitcode != 0:
-                    self._trial_fault_count += 1
+                    self._terminate_all_running_processes()
+                    msg = "One of HPO trials exit abnormally."
+                    raise RuntimeError(msg)
                 trial.queue.close()
                 trial.process.join()
                 trial_to_remove.append(uid)
diff --git a/tests/unit/hpo/test_hyperband.py b/tests/unit/hpo/test_hyperband.py
index 00dad36d58c..971f319005b 100644
--- a/tests/unit/hpo/test_hyperband.py
+++ b/tests/unit/hpo/test_hyperband.py
@@ -74,7 +74,6 @@ def good_hyperband_args():
             "mode": "max",
             "num_workers": 1,
             "num_full_iterations": 64,
-            "non_pure_train_ratio": 0.2,
             "full_dataset_size": 100,
             "maximum_resource": 64,
             "minimum_resource": 1,