add UCB bonus to planning progress explorer (#1602)

Learning-and-Intelligent-Systems · Dec 13, 2023 · 33b1d21 · 33b1d21
1 parent b6227fb
commit 33b1d21
Showing 1 changed file with 9 additions and 12 deletions.
diff --git a/predicators/explorers/active_sampler_explorer.py b/predicators/explorers/active_sampler_explorer.py
@@ -322,10 +322,6 @@ def _update_ground_op_hist(self, state: State) -> None:
         if nsrt is None:
             return
         assert exploration_indicator is not None
-        # NOTE: checking just the add effects doesn't work in general, but
-        # is probably fine for now. The right thing to do here is check
-        # the necessary atoms, which we will compute with a utility function
-        # and then use in a forthcoming PR.
         success = all(a.holds(state) for a in nsrt.add_effects)
         logging.info(f"[Explorer] Last NSRT: {nsrt.name}{nsrt.objects}")
         logging.info(f"[Explorer]   outcome: {success}")
@@ -371,19 +367,20 @@ def _get_option_policy_for_task(self,
 
     def _score_ground_op(
             self, ground_op: _GroundSTRIPSOperator) -> Tuple[float, ...]:
+        history = self._ground_op_hist[ground_op]
+        num_tries = len(history)
+        success_rate = sum(history) / num_tries
+        total_trials = sum(len(h) for h in self._ground_op_hist.values())
+        # UCB-like bonus.
+        c = CFG.active_sampler_explore_bonus
+        bonus = c * np.sqrt(np.log(total_trials) / num_tries)
         if CFG.active_sampler_explore_task_strategy == "planning_progress":
             score = self._score_ground_op_planning_progress(ground_op)
+            if CFG.active_sampler_explore_use_ucb_bonus:
+                score += bonus
         elif CFG.active_sampler_explore_task_strategy == "success_rate":
-            history = self._ground_op_hist[ground_op]
-            num_tries = len(history)
-            success_rate = sum(history) / num_tries
-            total_trials = sum(len(h) for h in self._ground_op_hist.values())
             score = (1.0 - success_rate)
             if CFG.active_sampler_explore_use_ucb_bonus:
-                # Try less successful operators more often.
-                # UCB-like bonus.
-                c = CFG.active_sampler_explore_bonus
-                bonus = c * np.sqrt(np.log(total_trials) / num_tries)
                 score += bonus
         elif CFG.active_sampler_explore_task_strategy == "random":
             # Random scores baseline.