refactor: requested changes to rai_bench #438

Closed · wants to merge 7 commits

Changes from all commits
1 change: 0 additions & 1 deletion examples/manipulation-demo.launch.py
@@ -51,7 +51,6 @@ def generate_launch_description():
launch_robotic_manipulation = Node(
package="robotic_manipulation",
executable="robotic_manipulation",
# name="robotic_manipulation_node",
output="screen",
parameters=[
{"use_sim_time": True},
484 changes: 408 additions & 76 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion pyproject.toml
@@ -62,7 +62,6 @@ pytest-timeout = "^2.3.1"
tomli-w = "^1.1.0"
faster-whisper = "^1.1.1"
pydub = "^0.25.1"
opencv-python = "^4.11.0.86"
[tool.poetry.group.dev.dependencies]
ipykernel = "^6.29.4"

8 changes: 1 addition & 7 deletions src/rai_bench/README.md
@@ -9,13 +9,7 @@ The RAI Bench is a package including benchmarks and providing frame for creating
Frame components can be found in `src/rai_bench/rai_bench/benchmark_model.py`

- `Task` - abstract class for creating specific task. It introduces helper funtions that make it easier to calculate metrics/scores. Your custom tasks must implement a prompt got agent to do, a way to calculate a result and a validation if given scene config suits the task.
-
- `Scenario` - class defined by a Scene and Task. Can be created manually like:

```python

```

- `Scenario` - class defined by a Scene and Task.
- `Benchmark` - class responsible for running and logging scenarios.

### O3DE TEST BENCHMARK
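For reference, the manual-construction example that the README previously left empty could look roughly like the sketch below. The `Scenario` and `GrabCarrotTask` names come from this PR's code, but the constructor arguments and the config path are assumptions, not a confirmed API:

```python
# Rough sketch only - the GrabCarrotTask and Scenario constructor arguments
# and the config path below are assumptions, not the confirmed rai_bench API.
import logging

from rai_bench.benchmark_model import Scenario
from rai_bench.o3de_test_bench.tasks import GrabCarrotTask

logger = logging.getLogger(__name__)

task = GrabCarrotTask(logger=logger)  # assumed to accept a logger
scenario = Scenario(
    task=task,
    simulation_config_path="path/to/scene_config.yaml",  # placeholder path
)
```

The diff also shows a `Benchmark.create_scenarios(...)` classmethod, which appears to be the intended route for building scenarios in bulk from lists of tasks and configs.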
2 changes: 1 addition & 1 deletion src/rai_bench/pyproject.toml
@@ -1,7 +1,7 @@
[tool.poetry]
name = "rai-bench"
version = "0.1.0"
description = ""
description = "Package for running and creating benchmarks."
authors = ["jmatejcz <[email protected]>"]
readme = "README.md"

92 changes: 57 additions & 35 deletions src/rai_bench/rai_bench/benchmark_model.py
@@ -19,9 +19,9 @@
from typing import Any, Dict, Generic, List, Union

from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
from rai.messages import HumanMultimodalMessage
from rclpy.impl.rcutils_logger import RcutilsLogger

from rai.messages import HumanMultimodalMessage
from rai_sim.simulation_bridge import (
PoseModel,
SimulationBridge,
@@ -34,16 +34,17 @@


class EntitiesMismatchException(Exception):
def __init__(self, message: str) -> None:
super().__init__(message)
pass


class Task(ABC):
"""
Task to perform.
Specyfic implementation should implement a way to calculate results.
Abstract provides utility functions for common calculations, that can be usefull when
creating metrics
Abstract base for a Task. Provides utility functions for common calculations
that can be helpful when creating metrics.
Specific child classes should implement:
- get_prompt method
- validate_config
- calculate_result
"""

def __init__(
Expand All @@ -57,6 +58,7 @@ def __init__(

@abstractmethod
def get_prompt(self) -> str:
"""Returns the task instruction - the prompt that will be passed to agent"""
pass

@abstractmethod
@@ -75,7 +77,8 @@ def calculate_result(
self, simulation_bridge: SimulationBridge[SimulationConfigT]
) -> float:
"""
Calculate result of the task
Calculates the result of the task based on info retrieved from the simulation.
Should return a score between 0.0 and 1.0.
"""
pass

@@ -135,7 +138,10 @@ def count_adjacent(


class Scenario(Generic[SimulationConfigT]):
"""Single instances are run separatly by benchmark"""
"""
A Scenarios are defined by a pair of Task and Simlation Config.
Each Scenario is executed separatly by a Benchmark.
"""

def __init__(
self,
@@ -154,24 +160,30 @@ def __init__(

class Benchmark:
"""
Defined by a set of scenarios to be done
Benchmark represents a set of Scenarios to be executed and evaluated.
It manages the execution, logs results, and provides functionality
for tracking and exporting performance metrics.
"""

def __init__(
self,
simulation_bridge: SimulationBridge[SimulationConfigT],
scenarios: List[Scenario[SimulationConfigT]],
logger: loggers_type | None = None,
results_filename: str = "benchmark_results.csv",
) -> None:
self.simulation_bridge = simulation_bridge
self.num_of_scenarios = len(scenarios)
self.scenarios = enumerate(iter(scenarios))
self.results: List[Dict[str, Any]] = []
self.results_filename = results_filename
if logger:
self._logger = logger
else:
self._logger = logging.getLogger(__name__)

self._initialize_results_file()

@classmethod
def create_scenarios(
cls,
@@ -198,6 +210,23 @@ def create_scenarios(
)
return scenarios

def _initialize_results_file(self):
"""Initialize the CSV file with headers."""
fieldnames = [
"task",
"simulation_config",
"initial_score",
"final_score",
"total_time",
"number_of_tool_calls",
]

with open(
self.results_filename, mode="w", newline="", encoding="utf-8"
) as file:
writer = csv.DictWriter(file, fieldnames=fieldnames)
writer.writeheader()

def run_next(self, agent) -> None:
"""
Runs the next scenario
@@ -250,41 +279,34 @@ def run_next(self, agent) -> None:
self._logger.info( # type: ignore
f"TASK SCORE: {result}, TOTAL TIME: {total_time:.3f}, NUM_OF_TOOL_CALLS: {tool_calls_num}"
)

self.results.append(
{
"task": scenario.task.get_prompt(),
"simulation_config": scenario.simulation_config_path,
"initial_score": initial_result,
"final_score": result,
"total_time": f"{total_time:.3f}",
"number_of_tool_calls": tool_calls_num,
}
)
# TODO (jm) not sure whether keeping all results here,
# besides saving them to file, is overkill
scenario_result: Dict[str, Any] = {
"task": scenario.task.get_prompt(),
"simulation_config": scenario.simulation_config_path,
"initial_score": initial_result,
"final_score": result,
"total_time": f"{total_time:.3f}",
"number_of_tool_calls": tool_calls_num,
}
self._save_scenario_result_to_csv(scenario_result)

except StopIteration:
print("No more scenarios left to run.")

def get_results(self) -> List[Dict[str, Any]]:
return self.results

def dump_results_to_csv(self, filename: str) -> None:
if not self.results:
self._logger.warning("No results to save.") # type: ignore
return

def _save_scenario_result_to_csv(self, result: Dict[str, Any]) -> None:
"""Save a single scenario result to the CSV file."""
fieldnames = [
"task",
"initial_score",
"simulation_config",
"initial_score",
"final_score",
"total_time",
"number_of_tool_calls",
]

with open(filename, mode="w", newline="", encoding="utf-8") as file:
with open(
self.results_filename, mode="a", newline="", encoding="utf-8"
) as file:
writer = csv.DictWriter(file, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(self.results)

self._logger.info(f"Results saved to {filename}") # type: ignore
writer.writerow(result)
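For context, the edit above replaces the old end-of-run `dump_results_to_csv` with a header written once in `_initialize_results_file` and one row appended per scenario in `_save_scenario_result_to_csv`. A standalone sketch of that write-once/append-per-row pattern follows; the field names mirror the diff, while the file name and sample values are purely illustrative:

```python
# Standalone illustration of the write-header-once / append-per-row pattern;
# field names mirror the benchmark diff, file name and values are examples.
import csv

FIELDNAMES = [
    "task",
    "simulation_config",
    "initial_score",
    "final_score",
    "total_time",
    "number_of_tool_calls",
]


def init_results_file(path: str) -> None:
    # "w" truncates any previous run and writes the header exactly once
    with open(path, mode="w", newline="", encoding="utf-8") as f:
        csv.DictWriter(f, fieldnames=FIELDNAMES).writeheader()


def append_result(path: str, row: dict) -> None:
    # "a" appends one finished scenario, so partial runs still leave data on disk
    with open(path, mode="a", newline="", encoding="utf-8") as f:
        csv.DictWriter(f, fieldnames=FIELDNAMES).writerow(row)


init_results_file("benchmark_results.csv")
append_result(
    "benchmark_results.csv",
    {
        "task": "example prompt",
        "simulation_config": "config.yaml",
        "initial_score": 0.0,
        "final_score": 1.0,
        "total_time": "12.345",
        "number_of_tool_calls": 4,
    },
)
```

Appending as each scenario finishes means partial results survive an interrupted run, at the cost of reopening the file once per scenario.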
@@ -20,13 +20,13 @@

import rclpy
from langchain.tools import BaseTool
from rai_open_set_vision.tools import GetGrabbingPointTool

from rai.agents.conversational_agent import create_conversational_agent
from rai.communication.ros2.connectors import ROS2ARIConnector
from rai.tools.ros.manipulation import GetObjectPositionsTool, MoveToPointTool
from rai.tools.ros2.topics import GetROS2ImageTool, GetROS2TopicsNamesAndTypesTool
from rai.utils.model_initialization import get_llm_model
from rai_open_set_vision.tools import GetGrabbingPointTool

from rai_bench.benchmark_model import Benchmark, Task
from rai_bench.o3de_test_bench.tasks import GrabCarrotTask, PlaceCubesTask
from rai_sim.o3de.o3de_bridge import (
@@ -163,6 +163,7 @@
simulation_bridge=o3de,
scenarios=scenarios,
logger=bench_logger,
results_filename="src/rai_bench/rai_bench/results.csv",
)
for i, s in enumerate(scenarios):
agent = create_conversational_agent(
@@ -180,7 +181,6 @@
bench_logger.info("===============================================================")
bench_logger.info("ALL SCENARIOS DONE. BENCHMARK COMPLETED!")
bench_logger.info("===============================================================")
benchmark.dump_results_to_csv(filename="src/rai_bench/rai_bench/results.csv")

connector.shutdown()
o3de.shutdown()
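With results now streamed to `results_filename` during the run, the final `dump_results_to_csv` call removed above is no longer needed. A quick stdlib-only way to inspect the finished CSV afterwards could look like this sketch; only the path comes from the diff, the rest is illustrative:

```python
# Optional post-run inspection of the results CSV; the path mirrors the
# results_filename passed to Benchmark above, everything else is stdlib.
import csv

with open("src/rai_bench/rai_bench/results.csv", newline="", encoding="utf-8") as f:
    rows = list(csv.DictReader(f))

for row in rows:
    print(row["final_score"], row["total_time"], row["number_of_tool_calls"])

if rows:
    avg = sum(float(r["final_score"]) for r in rows) / len(rows)
    print(f"average final score: {avg:.2f}")
```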
@@ -61,7 +61,7 @@ def calculate_result(
)

else:
ini_poses = [cube.pose for cube in initial_cubes]
initial_poses = [cube.pose for cube in initial_cubes]
final_poses = [cube.pose for cube in final_cubes]
# NOTE the specific coords that refer to for example
# middle of the table can differ across simulations,
@@ -73,7 +73,7 @@
if ini_cube.name == final_cube.name:
was_adjacent_initially = self.is_adjacent_to_any(
ini_cube.pose,
[p for p in ini_poses if p != ini_cube.pose],
[p for p in initial_poses if p != ini_cube.pose],
0.15,
)
is_adjacent_finally = self.is_adjacent_to_any(
7 changes: 1 addition & 6 deletions src/rai_core/rai/agents/tool_runner.py
@@ -69,13 +69,8 @@ def run_one(call: ToolCall):
ts = time.perf_counter()
output = self.tools_by_name[call["name"]].invoke(call, config) # type: ignore
te = time.perf_counter() - ts
tool_output_log = (
str(output.content)[:1000] + "..."
if len(str(output.content)) > 1000
else ""
)
self.logger.info(
f"Tool {call['name']} completed in {te:.2f} seconds. Tool output: {tool_output_log}"
f"Tool {call['name']} completed in {te:.2f} seconds. Tool output: {output.content}"
)
self.logger.debug(
f"Tool {call['name']} output: \n\n{str(output.content)}"