refactor: requested changes to rai_bench #438

Closed · wants to merge 7 commits

Changes from all commits
1 change: 0 additions & 1 deletion examples/manipulation-demo.launch.py
@@ -51,7 +51,6 @@ def generate_launch_description():
launch_robotic_manipulation = Node(
package="robotic_manipulation",
executable="robotic_manipulation",
# name="robotic_manipulation_node",
output="screen",
parameters=[
{"use_sim_time": True},
484 changes: 408 additions & 76 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion pyproject.toml
@@ -62,7 +62,6 @@ pytest-timeout = "^2.3.1"
tomli-w = "^1.1.0"
faster-whisper = "^1.1.1"
pydub = "^0.25.1"
opencv-python = "^4.11.0.86"
[tool.poetry.group.dev.dependencies]
ipykernel = "^6.29.4"

8 changes: 1 addition & 7 deletions src/rai_bench/README.md
@@ -9,13 +9,7 @@ The RAI Bench is a package including benchmarks and providing frame for creating
Frame components can be found in `src/rai_bench/rai_bench/benchmark_model.py`

- `Task` - abstract class for creating specific task. It introduces helper funtions that make it easier to calculate metrics/scores. Your custom tasks must implement a prompt got agent to do, a way to calculate a result and a validation if given scene config suits the task.
-
- `Scenario` - class defined by a Scene and Task. Can be created manually like:

```python

```

- `Scenario` - class defined by a Scene and Task.
- `Benchmark` - class responsible for running and logging scenarios.

### O3DE TEST BENCHMARK
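For reference, the manual-construction example that the README previously left empty could look roughly like the sketch below. The `Scenario` and `GrabCarrotTask` names come from this PR's code, but the constructor arguments and the config path are assumptions, not a confirmed API:

```python
# Rough sketch only - the GrabCarrotTask and Scenario constructor arguments
# and the config path below are assumptions, not the confirmed rai_bench API.
import logging

from rai_bench.benchmark_model import Scenario
from rai_bench.o3de_test_bench.tasks import GrabCarrotTask

logger = logging.getLogger(__name__)

task = GrabCarrotTask(logger=logger)  # assumed to accept a logger
scenario = Scenario(
    task=task,
    simulation_config_path="path/to/scene_config.yaml",  # placeholder path
)
```

The diff also shows a `Benchmark.create_scenarios(...)` classmethod, which appears to be the intended route for building scenarios in bulk from lists of tasks and configs.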
2 changes: 1 addition & 1 deletion src/rai_bench/pyproject.toml
@@ -1,7 +1,7 @@
[tool.poetry]
name = "rai-bench"
version = "0.1.0"
description = ""
description = "Package for running and creating benchmarks."
authors = ["jmatejcz <[email protected]>"]
readme = "README.md"

92 changes: 57 additions & 35 deletions src/rai_bench/rai_bench/benchmark_model.py
@@ -19,9 +19,9 @@
from typing import Any, Dict, Generic, List, Union

from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
from rai.messages import HumanMultimodalMessage
from rclpy.impl.rcutils_logger import RcutilsLogger

from rai.messages import HumanMultimodalMessage
from rai_sim.simulation_bridge import (
PoseModel,
SimulationBridge,
@@ -34,16 +34,17 @@


class EntitiesMismatchException(Exception):
def __init__(self, message: str) -> None:
super().__init__(message)
pass


class Task(ABC):
"""
Task to perform.
Specyfic implementation should implement a way to calculate results.
Abstract provides utility functions for common calculations, that can be usefull when
creating metrics
Abstract base for a Task. Provides utility functions for common calculations
that can be helpful when creating metrics.
Specific child classes should implement:
- get_prompt method
- validate_config
- calculate_result
"""

def __init__(
Expand All @@ -57,6 +58,7 @@ def __init__(

@abstractmethod
def get_prompt(self) -> str:
"""Returns the task instruction - the prompt that will be passed to agent"""
pass

@abstractmethod
@@ -75,7 +77,8 @@ def calculate_result(
self, simulation_bridge: SimulationBridge[SimulationConfigT]
) -> float:
"""
Calculate result of the task
Calculates the result of the task based on info retrieved from the simulation.
Should return a score between 0.0 and 1.0.
"""
pass

@@ -135,7 +138,10 @@ def count_adjacent(


class Scenario(Generic[SimulationConfigT]):
"""Single instances are run separatly by benchmark"""
"""
A Scenarios are defined by a pair of Task and Simlation Config.
Each Scenario is executed separatly by a Benchmark.
"""

def __init__(
self,
@@ -154,24 +160,30 @@ def __init__(

class Benchmark:
"""
Defined by a set of scenarios to be done
Benchmark represents a set of Scenarios to be executed and evaluated.
It manages the execution, logs results, and provides functionality
for tracking and exporting performance metrics.
"""

def __init__(
self,
simulation_bridge: SimulationBridge[SimulationConfigT],
scenarios: List[Scenario[SimulationConfigT]],
logger: loggers_type | None = None,
results_filename: str = "benchmark_results.csv",
) -> None:
self.simulation_bridge = simulation_bridge
self.num_of_scenarios = len(scenarios)
self.scenarios = enumerate(iter(scenarios))
self.results: List[Dict[str, Any]] = []
self.results_filename = results_filename
if logger:
self._logger = logger
else:
self._logger = logging.getLogger(__name__)

self._initialize_results_file()

@classmethod
def create_scenarios(
cls,
@@ -198,6 +210,23 @@ def create_scenarios(
)
return scenarios

def _initialize_results_file(self):
"""Initialize the CSV file with headers."""
fieldnames = [
"task",
"simulation_config",
"initial_score",
"final_score",
"total_time",
"number_of_tool_calls",
]

with open(
self.results_filename, mode="w", newline="", encoding="utf-8"
) as file:
writer = csv.DictWriter(file, fieldnames=fieldnames)
writer.writeheader()

def run_next(self, agent) -> None:
"""
Runs the next scenario
@@ -250,41 +279,34 @@ def run_next(self, agent) -> None:
self._logger.info( # type: ignore
f"TASK SCORE: {result}, TOTAL TIME: {total_time:.3f}, NUM_OF_TOOL_CALLS: {tool_calls_num}"
)

self.results.append(
{
"task": scenario.task.get_prompt(),
"simulation_config": scenario.simulation_config_path,
"initial_score": initial_result,
"final_score": result,
"total_time": f"{total_time:.3f}",
"number_of_tool_calls": tool_calls_num,
}
)
# TODO (jm) not sure whether keeping all results here,
# besides saving them to file, is overkill
scenario_result: Dict[str, Any] = {
"task": scenario.task.get_prompt(),
"simulation_config": scenario.simulation_config_path,
"initial_score": initial_result,
"final_score": result,
"total_time": f"{total_time:.3f}",
"number_of_tool_calls": tool_calls_num,
}
self._save_scenario_result_to_csv(scenario_result)

except StopIteration:
print("No more scenarios left to run.")

def get_results(self) -> List[Dict[str, Any]]:
return self.results

def dump_results_to_csv(self, filename: str) -> None:
if not self.results:
self._logger.warning("No results to save.") # type: ignore
return

def _save_scenario_result_to_csv(self, result: Dict[str, Any]) -> None:
"""Save a single scenario result to the CSV file."""
fieldnames = [
"task",
"initial_score",
"simulation_config",
"initial_score",
"final_score",
"total_time",
"number_of_tool_calls",
]

with open(filename, mode="w", newline="", encoding="utf-8") as file:
with open(
self.results_filename, mode="a", newline="", encoding="utf-8"
) as file:
writer = csv.DictWriter(file, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(self.results)

self._logger.info(f"Results saved to {filename}") # type: ignore
writer.writerow(result)
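For context, the edit above replaces the old end-of-run `dump_results_to_csv` with a header written once in `_initialize_results_file` and one row appended per scenario in `_save_scenario_result_to_csv`. A standalone sketch of that write-once/append-per-row pattern follows; the field names mirror the diff, while the file name and sample values are purely illustrative:

```python
# Standalone illustration of the write-header-once / append-per-row pattern;
# field names mirror the benchmark diff, file name and values are examples.
import csv

FIELDNAMES = [
    "task",
    "simulation_config",
    "initial_score",
    "final_score",
    "total_time",
    "number_of_tool_calls",
]


def init_results_file(path: str) -> None:
    # "w" truncates any previous run and writes the header exactly once
    with open(path, mode="w", newline="", encoding="utf-8") as f:
        csv.DictWriter(f, fieldnames=FIELDNAMES).writeheader()


def append_result(path: str, row: dict) -> None:
    # "a" appends one finished scenario, so partial runs still leave data on disk
    with open(path, mode="a", newline="", encoding="utf-8") as f:
        csv.DictWriter(f, fieldnames=FIELDNAMES).writerow(row)


init_results_file("benchmark_results.csv")
append_result(
    "benchmark_results.csv",
    {
        "task": "example prompt",
        "simulation_config": "config.yaml",
        "initial_score": 0.0,
        "final_score": 1.0,
        "total_time": "12.345",
        "number_of_tool_calls": 4,
    },
)
```

Appending as each scenario finishes means partial results survive an interrupted run, at the cost of reopening the file once per scenario.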
@@ -20,13 +20,13 @@

import rclpy
from langchain.tools import BaseTool
from rai_open_set_vision.tools import GetGrabbingPointTool

from rai.agents.conversational_agent import create_conversational_agent
from rai.communication.ros2.connectors import ROS2ARIConnector
from rai.tools.ros.manipulation import GetObjectPositionsTool, MoveToPointTool
from rai.tools.ros2.topics import GetROS2ImageTool, GetROS2TopicsNamesAndTypesTool
from rai.utils.model_initialization import get_llm_model
from rai_open_set_vision.tools import GetGrabbingPointTool

from rai_bench.benchmark_model import Benchmark, Task
from rai_bench.o3de_test_bench.tasks import GrabCarrotTask, PlaceCubesTask
from rai_sim.o3de.o3de_bridge import (
@@ -163,6 +163,7 @@
simulation_bridge=o3de,
scenarios=scenarios,
logger=bench_logger,
results_filename="src/rai_bench/rai_bench/results.csv",
)
for i, s in enumerate(scenarios):
agent = create_conversational_agent(
@@ -180,7 +181,6 @@
bench_logger.info("===============================================================")
bench_logger.info("ALL SCENARIOS DONE. BENCHMARK COMPLETED!")
bench_logger.info("===============================================================")
benchmark.dump_results_to_csv(filename="src/rai_bench/rai_bench/results.csv")

connector.shutdown()
o3de.shutdown()
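With results now streamed to `results_filename` during the run, the final `dump_results_to_csv` call removed above is no longer needed. A quick stdlib-only way to inspect the finished CSV afterwards could look like this sketch; only the path comes from the diff, the rest is illustrative:

```python
# Optional post-run inspection of the results CSV; the path mirrors the
# results_filename passed to Benchmark above, everything else is stdlib.
import csv

with open("src/rai_bench/rai_bench/results.csv", newline="", encoding="utf-8") as f:
    rows = list(csv.DictReader(f))

for row in rows:
    print(row["final_score"], row["total_time"], row["number_of_tool_calls"])

if rows:
    avg = sum(float(r["final_score"]) for r in rows) / len(rows)
    print(f"average final score: {avg:.2f}")
```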
@@ -61,7 +61,7 @@ def calculate_result(
)

else:
ini_poses = [cube.pose for cube in initial_cubes]
initial_poses = [cube.pose for cube in initial_cubes]
final_poses = [cube.pose for cube in final_cubes]
# NOTE the specific coords that refer to for example
# middle of the table can differ across simulations,
@@ -73,7 +73,7 @@
if ini_cube.name == final_cube.name:
was_adjacent_initially = self.is_adjacent_to_any(
ini_cube.pose,
[p for p in ini_poses if p != ini_cube.pose],
[p for p in initial_poses if p != ini_cube.pose],
0.15,
)
is_adjacent_finally = self.is_adjacent_to_any(
7 changes: 1 addition & 6 deletions src/rai_core/rai/agents/tool_runner.py
@@ -69,13 +69,8 @@ def run_one(call: ToolCall):
ts = time.perf_counter()
output = self.tools_by_name[call["name"]].invoke(call, config) # type: ignore
te = time.perf_counter() - ts
tool_output_log = (
str(output.content)[:1000] + "..."
if len(str(output.content)) > 1000
else ""
)
self.logger.info(
f"Tool {call['name']} completed in {te:.2f} seconds. Tool output: {tool_output_log}"
f"Tool {call['name']} completed in {te:.2f} seconds. Tool output: {output.content}"
)
self.logger.debug(
f"Tool {call['name']} output: \n\n{str(output.content)}"