replayio · Domiii · Jan 24, 2025 · Jan 21, 2025 · Jan 21, 2025 · Jan 21, 2025
diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -1,3 +1,4 @@
+import json
 import os
 from collections import deque
 
@@ -9,7 +10,7 @@
 from openhands.core.config import AgentConfig
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import ImageContent, Message, TextContent
-from openhands.core.schema.replay import ReplayDebuggingPhase
+from openhands.core.schema.replay import ReplayPhase
 from openhands.events.action import (
     Action,
     AgentDelegateAction,
@@ -36,12 +37,14 @@
 from openhands.events.observation.error import ErrorObservation
 from openhands.events.observation.observation import Observation
 from openhands.events.observation.replay import (
-    ReplayPhaseUpdateObservation,
-    ReplayToolCmdOutputObservation,
+    ReplayObservation,
 )
-from openhands.events.replay import replay_enhance_action
 from openhands.events.serialization.event import truncate_content
 from openhands.llm.llm import LLM
+from openhands.replay.replay_initial_analysis import replay_enhance_action
+from openhands.replay.replay_phases import (
+    get_replay_observation_prompt,
+)
 from openhands.runtime.plugins import (
     AgentSkillsRequirement,
     JupyterRequirement,
@@ -102,7 +105,7 @@ def __init__(
 
         # We're in normal mode by default (even if replay is not enabled).
         # This will initialize the set of tools the agent has access to.
-        self.replay_phase_changed(ReplayDebuggingPhase.Normal)
+        self.update_tools(ReplayPhase.Normal)
 
         self.prompt_manager = PromptManager(
             microagent_dir=os.path.join(os.path.dirname(__file__), 'micro')
@@ -253,38 +256,8 @@ def get_observation_message(
                 )
             text += f'\n[Command finished with exit code {obs.exit_code}]'
             message = Message(role='user', content=[TextContent(text=text)])
-        elif isinstance(obs, ReplayToolCmdOutputObservation):
-            # if it doesn't have tool call metadata, it was triggered by a user action
-            if obs.tool_call_metadata is None:
-                text = truncate_content(
-                    f'\nObserved result of replay command executed by user:\n{obs.content}',
-                    max_message_chars,
-                )
-            else:
-                text = obs.content
-            message = Message(role='user', content=[TextContent(text=text)])
-        elif isinstance(obs, ReplayPhaseUpdateObservation):
-            # NOTE: The phase change itself is handled in AgentController.
-            new_phase = obs.new_phase
-            if new_phase == ReplayDebuggingPhase.Edit:
-                # Tell the agent to stop analyzing and start editing:
-                text = """
-You have concluded the analysis.
-
-IMPORTANT: NOW review, then implement the hypothesized changes using tools. The code is available in the workspace. Start by answering these questions:
-  1. What is the goal of the investigation according to the initial prompt and initial analysis? IMPORTANT. PAY ATTENTION TO THIS. THIS IS THE ENTRY POINT OF EVERYTHING.
-  2. Given (1), is the hypothesis's `problem` description correct? Does it match the goal of the investigation?
-  3. Do the `editSuggestions` actually address the issue?
-  4. Rephrase the hypothesis so that it is consistent and correct.
-
-IMPORTANT: Don't stop. Keep working.
-IMPORTANT: Don't stop. Keep working.
-"""
-                message = Message(role='user', content=[TextContent(text=text)])
-            else:
-                raise NotImplementedError(
-                    f'Unhandled ReplayPhaseUpdateAction: {new_phase}'
-                )
+        elif isinstance(obs, ReplayObservation):
+            message = get_replay_observation_prompt(obs, max_message_chars)
         elif isinstance(obs, IPythonRunCellObservation):
             text = obs.content
             # replace base64 images with a placeholder
@@ -344,22 +317,16 @@ def reset(self) -> None:
         """Resets the CodeAct Agent."""
         super().reset()
 
-    def replay_phase_changed(self, phase: ReplayDebuggingPhase) -> None:
-        """Called whenenever the phase of the replay debugging process changes.
-
-        We currently use this to give the agent access to different tools for the
-        different phases.
-        """
+    def update_tools(self, phase: ReplayPhase) -> None:
         self.tools = codeact_function_calling.get_tools(
             codeact_enable_browsing=self.config.codeact_enable_browsing,
             codeact_enable_jupyter=self.config.codeact_enable_jupyter,
             codeact_enable_llm_editor=self.config.codeact_enable_llm_editor,
             codeact_enable_replay=self.config.codeact_enable_replay,
-            codeact_replay_phase=phase,
+            replay_phase=phase,
         )
-        logger.debug(
-            f'[REPLAY] CodeActAgent.replay_phase_changed({phase}).'
-            # f'New tools: {json.dumps(self.tools, indent=2)}'
+        logger.info(
+            f'[REPLAY] update_tools for phase {phase}: {json.dumps([t["function"]["name"] for t in self.tools], indent=2)}'
         )
 
     def step(self, state: State) -> Action:
@@ -388,7 +355,7 @@ def step(self, state: State) -> Action:
             return AgentFinishAction()
 
         if self.config.codeact_enable_replay:
-            # Replay enhancement.
+            # Check whether we should enhance the prompt.
             enhance_action = replay_enhance_action(state, self.config.is_workspace_repo)
             if enhance_action:
                 logger.info('[REPLAY] Enhancing prompt for Replay recording...')
@@ -402,7 +369,10 @@ def step(self, state: State) -> Action:
         params['tools'] = self.tools
         if self.mock_function_calling:
             params['mock_function_calling'] = True
-        # logger.debug(f'#######\nCodeActAgent.step: messages:\n{json.dumps(params)}\n\n#######\n')
+
+        # # Debug log the raw input to the LLM:
+        # logger.debug(f'#######\nCodeActAgent.step: RAW LLM INPUT:\n{repr(params)}\n\n#######\n')
+
         response = self.llm.completion(**params)
         actions = codeact_function_calling.response_to_actions(response, state)
         for action in actions:

diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py
@@ -15,7 +15,7 @@
 from openhands.controller.state.state import State
 from openhands.core.exceptions import FunctionCallNotExistsError
 from openhands.core.logger import openhands_logger as logger
-from openhands.core.schema import ReplayDebuggingPhase
+from openhands.core.schema import ReplayPhase
 from openhands.events.action import (
     Action,
     AgentDelegateAction,
@@ -26,171 +26,13 @@
     IPythonRunCellAction,
     MessageAction,
 )
-from openhands.events.action.replay import (
-    ReplayPhaseUpdateAction,
-    ReplayToolCmdRunAction,
-)
 from openhands.events.tool import ToolCallMetadata
-
-# ---------------------------------------------------------
-# Tool: inspect-data
-# ---------------------------------------------------------
-_REPLAY_INSPECT_DATA_DESCRIPTION = """
-Explains value, data flow and origin information for `expression` at `point`.
-IMPORTANT: Prefer using inspect-data over inspect-point.
-"""
-
-ReplayInspectDataTool = ChatCompletionToolParam(
-    type='function',
-    function=ChatCompletionToolParamFunctionChunk(
-        name='inspect-data',
-        description=_REPLAY_INSPECT_DATA_DESCRIPTION.strip(),
-        parameters={
-            'type': 'object',
-            'properties': {
-                'expression': {
-                    'type': 'string',
-                    'description': 'A valid JS expression. IMPORTANT: First pick the best expression. If the expression is an object: Prefer "array[0]" over "array" and "o.x" over "o" to get closer to the origin and creation site of important data points. Prefer nested object over primitive expressions.',
-                },
-                'point': {
-                    'type': 'string',
-                    'description': 'The point at which to inspect the runtime. The first point comes from the `thisPoint` in the Initial analysis.',
-                },
-                'explanation': {
-                    'type': 'string',
-                    'description': 'Give a concise explanation as to why you take this investigative step.',
-                },
-                'explanation_source': {
-                    'type': 'string',
-                    'description': 'Explain which data you saw in the previous analysis results that informs this step.',
-                },
-            },
-            'required': ['expression', 'point', 'explanation', 'explanation_source'],
-        },
-    ),
-)
-
-# ---------------------------------------------------------
-# Tool: inspect-point
-# ---------------------------------------------------------
-_REPLAY_INSPECT_POINT_DESCRIPTION = """
-Explains dynamic control flow and data flow dependencies of the code at `point`.
-Use this tool instead of `inspect-data` only when you don't have a specific data point to investigate.
-"""
-
-ReplayInspectPointTool = ChatCompletionToolParam(
-    type='function',
-    function=ChatCompletionToolParamFunctionChunk(
-        name='inspect-point',
-        description=_REPLAY_INSPECT_POINT_DESCRIPTION.strip(),
-        parameters={
-            'type': 'object',
-            'properties': {
-                'point': {'type': 'string'},
-            },
-            'required': ['point'],
-        },
-    ),
+from openhands.replay.replay_tools import (
+    get_replay_tools,
+    handle_replay_tool_call,
+    is_replay_tool,
 )
 
-# ---------------------------------------------------------
-# Tool: SubmitHypothesis
-# TODO: Divide this into multiple steps -
-#   1. The first submission must be as simple as possible to take little computational effort from the analysis steps.
-#   2. The second submission, after analysis has already concluded, must be as complete as possible.
-# ---------------------------------------------------------
-# _REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION = """
-# Your investigation has yielded a complete thin slice from symptom to root cause,
-# enough proof to let the `CodeEdit` agent take over to fix the bug.
-# DO NOT GUESS. You must provide exact code in the exact right location to fix this bug,
-# based on evidence you have gathered.
-# """
-
-# ReplaySubmitHypothesisTool = ChatCompletionToolParam(
-#     type='function',
-#     function=ChatCompletionToolParamFunctionChunk(
-#         name='submit-hypothesis',
-#         description=_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION.strip(),
-#         parameters={
-#             'type': 'object',
-#             'properties': {
-#                 'rootCauseHypothesis': {'type': 'string'},
-#                 'thinSlice': {
-#                     'type': 'array',
-#                     'items': {
-#                         'type': 'object',
-#                         'properties': {
-#                             'point': {'type': 'string'},
-#                             'code': {'type': 'string'},
-#                             'role': {'type': 'string'},
-#                         },
-#                         'required': ['point', 'code', 'role'],
-#                     },
-#                 },
-#                 'modifications': {
-#                     'type': 'array',
-#                     'items': {
-#                         'type': 'object',
-#                         'properties': {
-#                             'kind': {
-#                                 'type': 'string',
-#                                 'enum': ['add', 'remove', 'modify'],
-#                             },
-#                             'newCode': {'type': 'string'},
-#                             'oldCode': {'type': 'string'},
-#                             'location': {'type': 'string'},
-#                             'point': {'type': 'string'},
-#                             # NOTE: Even though, we really want the `line` here, it will lead to much worse performance because the agent has a hard time computing correct line numbers from its point-based investigation.
-#                             # Instead of requiring a line number, the final fix will be more involved, as explained in the issue.
-#                             # see: https://linear.app/replay/issue/PRO-939/use-tools-data-flow-analysis-for-10608#comment-3b7ae176
-#                             # 'line': {'type': 'number'},
-#                             'briefExplanation': {'type': 'string'},
-#                             'verificationProof': {'type': 'string'},
-#                         },
-#                         'required': [
-#                             'kind',
-#                             'location',
-#                             'briefExplanation',
-#                             # 'line',
-#                             'verificationProof',
-#                         ],
-#                     },
-#                 },
-#             },
-#             'required': ['rootCauseHypothesis', 'thinSlice', 'modifications'],
-#         },
-#     ),
-# )
-_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION = """
-# Use this tool to conclude your analysis and move on to code editing.
-# """
-
-ReplaySubmitHypothesisTool = ChatCompletionToolParam(
-    type='function',
-    function=ChatCompletionToolParamFunctionChunk(
-        name='submit-hypothesis',
-        description=_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION.strip(),
-        parameters={
-            'type': 'object',
-            'properties': {
-                'problem': {
-                    'type': 'string',
-                    'description': 'One-sentence explanation of the core problem that this will solve.',
-                },
-                'rootCauseHypothesis': {'type': 'string'},
-                'editSuggestions': {
-                    'type': 'string',
-                    'description': 'Provide suggestions to fix the bug, if you know enough about the code that requires modification.',
-                },
-            },
-            'required': ['rootCauseHypothesis'],
-        },
-    ),
-)
-
-REPLAY_TOOLS = ['inspect-data', 'inspect-point', 'submit-hypothesis']
-
-
 # ---------------------------------------------------------
 # OH default tools.
 # ---------------------------------------------------------
@@ -631,36 +473,8 @@ def response_to_actions(response: ModelResponse, state: State) -> list[Action]:
                 ) from e
             if tool_call.function.name == 'execute_bash':
                 action = CmdRunAction(**arguments)
-            elif tool_call.function.name in REPLAY_TOOLS:
-                logger.info(
-                    f'[REPLAY] TOOL_CALL {tool_call.function.name} - arguments: {json.dumps(arguments, indent=2)}'
-                )
-                if tool_call.function.name == 'inspect-data':
-                    # Remove explanation props.
-                    arguments = {
-                        k: v for k, v in arguments.items() if 'explanation' not in k
-                    }
-                    action = ReplayToolCmdRunAction(
-                        command_name='inspect-data',
-                        command_args=arguments
-                        | {'recordingId': state.replay_recording_id},
-                    )
-                elif tool_call.function.name == 'inspect-point':
-                    # if arguments['expression'] == 'wiredRules':   # hackfix for 10608 experiment
-                    #     raise FunctionCallValidationError(f'wiredRules is irrelevant to the problem. Try something else.')
-                    action = ReplayToolCmdRunAction(
-                        command_name='inspect-point',
-                        command_args=arguments
-                        | {'recordingId': state.replay_recording_id},
-                    )
-                elif tool_call.function.name == 'submit-hypothesis':
-                    action = ReplayPhaseUpdateAction(
-                        new_phase=ReplayDebuggingPhase.Edit, info=json.dumps(arguments)
-                    )
-                else:
-                    raise ValueError(
-                        f'Unknown Replay tool. Make sure to add them all to REPLAY_TOOLS: {tool_call.function.name}'
-                    )
+            elif is_replay_tool(tool_call.function.name):
+                action = handle_replay_tool_call(tool_call, arguments, state)
             elif tool_call.function.name == 'execute_ipython_cell':
                 action = IPythonRunCellAction(**arguments)
             elif tool_call.function.name == 'delegate_to_browsing_agent':
@@ -686,6 +500,7 @@ def response_to_actions(response: ModelResponse, state: State) -> list[Action]:
                 raise FunctionCallNotExistsError(
                     f'Tool {tool_call.function.name} is not registered. (arguments: {arguments}). Please check the tool name and retry with an existing tool.'
                 )
+            assert action
 
             # We only add thought to the first action
             if i == 0:
@@ -727,31 +542,16 @@ def get_tools(
     codeact_enable_llm_editor: bool = False,
     codeact_enable_jupyter: bool = False,
     codeact_enable_replay: bool = False,
-    codeact_replay_phase: ReplayDebuggingPhase = ReplayDebuggingPhase.Normal,
+    replay_phase: ReplayPhase = ReplayPhase.Normal,
 ) -> list[ChatCompletionToolParam]:
     default_tools = get_default_tools(
         codeact_enable_browsing,
         codeact_enable_llm_editor,
         codeact_enable_jupyter,
     )
-    if not codeact_enable_replay or codeact_replay_phase == ReplayDebuggingPhase.Normal:
-        # Use the default tools when not in a Replay-specific phase.
-        return default_tools
-
     if codeact_enable_replay:
-        analysis_tools = [
-            ReplayInspectDataTool,
-            ReplayInspectPointTool,
-        ]
-        if codeact_replay_phase == ReplayDebuggingPhase.Analysis:
-            # Analysis tools only. This phase is concluded upon submit-hypothesis.
-            tools = analysis_tools + [ReplaySubmitHypothesisTool]
-        elif codeact_replay_phase == ReplayDebuggingPhase.Edit:
-            # Combine default and analysis tools.
-            tools = default_tools + analysis_tools
-        else:
-            raise ValueError(
-                f'Unhandled ReplayDebuggingPhase in get_tools: {codeact_replay_phase}'
-            )
+        # Handle Replay tool updates.
+        return get_replay_tools(replay_phase, default_tools)
 
-    return tools
+    # Just the default tools.
+    return default_tools