From f7e3d8c3ced71250e9cec85aa34b54bfd100794e Mon Sep 17 00:00:00 2001
From: "D. Seifert" <Domiii@users.noreply.github.com>
Date: Tue, 21 Jan 2025 20:37:18 +0800
Subject: [PATCH 01/17] WIP: refactoring replay code into its own set of files

---
 .../agenthub/codeact_agent/codeact_agent.py   | 44 ++-------
 openhands/controller/agent_controller.py      | 28 +-----
 openhands/events/observation/replay.py        | 10 +-
 .../replay.py => replay/replay_commands.py}   | 91 +++----------------
 openhands/replay/replay_prompts.py            | 77 ++++++++++++++++
 openhands/replay/replay_state_machine.py      | 63 +++++++++++++
 openhands/replay/replay_types.py              | 19 ++++
 7 files changed, 191 insertions(+), 141 deletions(-)
 rename openhands/{events/replay.py => replay/replay_commands.py} (51%)
 create mode 100644 openhands/replay/replay_prompts.py
 create mode 100644 openhands/replay/replay_state_machine.py
 create mode 100644 openhands/replay/replay_types.py

diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index e61e57da5a79..4e04f87443f1 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -36,12 +36,14 @@
 from openhands.events.observation.error import ErrorObservation
 from openhands.events.observation.observation import Observation
 from openhands.events.observation.replay import (
-    ReplayPhaseUpdateObservation,
-    ReplayToolCmdOutputObservation,
+    ReplayObservation,
 )
-from openhands.events.replay import replay_enhance_action
 from openhands.events.serialization.event import truncate_content
 from openhands.llm.llm import LLM
+from openhands.replay.replay_commands import replay_enhance_action
+from openhands.replay.replay_state_machine import (
+    get_replay_observation_message,
+)
 from openhands.runtime.plugins import (
     AgentSkillsRequirement,
     JupyterRequirement,
@@ -253,38 +255,8 @@ def get_observation_message(
                 )
             text += f'\n[Command finished with exit code {obs.exit_code}]'
             message = Message(role='user', content=[TextContent(text=text)])
-        elif isinstance(obs, ReplayToolCmdOutputObservation):
-            # if it doesn't have tool call metadata, it was triggered by a user action
-            if obs.tool_call_metadata is None:
-                text = truncate_content(
-                    f'\nObserved result of replay command executed by user:\n{obs.content}',
-                    max_message_chars,
-                )
-            else:
-                text = obs.content
-            message = Message(role='user', content=[TextContent(text=text)])
-        elif isinstance(obs, ReplayPhaseUpdateObservation):
-            # NOTE: The phase change itself is handled in AgentController.
-            new_phase = obs.new_phase
-            if new_phase == ReplayDebuggingPhase.Edit:
-                # Tell the agent to stop analyzing and start editing:
-                text = """
-You have concluded the analysis.
-
-IMPORTANT: NOW review, then implement the hypothesized changes using tools. The code is available in the workspace. Start by answering these questions:
-  1. What is the goal of the investigation according to the initial prompt and initial analysis? IMPORTANT. PAY ATTENTION TO THIS. THIS IS THE ENTRY POINT OF EVERYTHING.
-  2. Given (1), is the hypothesis's `problem` description correct? Does it match the goal of the investigation?
-  3. Do the `editSuggestions` actually address the issue?
-  4. Rephrase the hypothesis so that it is consistent and correct.
-
-IMPORTANT: Don't stop. Keep working.
-IMPORTANT: Don't stop. Keep working.
-"""
-                message = Message(role='user', content=[TextContent(text=text)])
-            else:
-                raise NotImplementedError(
-                    f'Unhandled ReplayPhaseUpdateAction: {new_phase}'
-                )
+        elif isinstance(obs, ReplayObservation):
+            message = get_replay_observation_message(obs, max_message_chars)
         elif isinstance(obs, IPythonRunCellObservation):
             text = obs.content
             # replace base64 images with a placeholder
@@ -388,7 +360,7 @@ def step(self, state: State) -> Action:
             return AgentFinishAction()
 
         if self.config.codeact_enable_replay:
-            # Replay enhancement.
+            # Check for whether we should enhance the prompt.
             enhance_action = replay_enhance_action(state, self.config.is_workspace_repo)
             if enhance_action:
                 logger.info('[REPLAY] Enhancing prompt for Replay recording...')
diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index 5462da7804da..08d1521a8787 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -49,11 +49,11 @@
 )
 from openhands.events.observation.replay import (
     ReplayInternalCmdOutputObservation,
-    ReplayPhaseUpdateObservation,
+    ReplayObservation,
 )
-from openhands.events.replay import handle_replay_internal_observation
 from openhands.events.serialization.event import truncate_content
 from openhands.llm.llm import LLM
+from openhands.replay.replay_state_machine import on_replay_observation
 from openhands.utils.shutdown_listener import should_continue
 
 # note: RESUME is only available on web GUI
@@ -297,28 +297,8 @@ async def _handle_observation(self, observation: Observation) -> None:
 
         if self._pending_action and self._pending_action.id == observation.cause:
             self._pending_action = None
-            if isinstance(observation, ReplayInternalCmdOutputObservation):
-                # NOTE: Currently, the only internal command is the initial-analysis command.
-                analysis_tool_metadata = handle_replay_internal_observation(
-                    self.state, observation
-                )
-                if analysis_tool_metadata:
-                    # Start analysis phase
-                    self.state.replay_recording_id = analysis_tool_metadata[
-                        'recordingId'
-                    ]
-                    self.state.replay_phase = ReplayDebuggingPhase.Analysis
-                    self.agent.replay_phase_changed(ReplayDebuggingPhase.Analysis)
-            elif isinstance(observation, ReplayPhaseUpdateObservation):
-                new_phase = observation.new_phase
-                if self.state.replay_phase == new_phase:
-                    self.log(
-                        'warning',
-                        f'Unexpected ReplayPhaseUpdateAction. Already in phase. Observation:\n {repr(observation)}',
-                    )
-                else:
-                    self.state.replay_phase = new_phase
-                    self.agent.replay_phase_changed(new_phase)
+            if isinstance(observation, ReplayObservation):
+                on_replay_observation(observation, self.state, self.agent)
 
             if self.state.agent_state == AgentState.USER_CONFIRMED:
                 await self.set_agent_state_to(AgentState.RUNNING)
diff --git a/openhands/events/observation/replay.py b/openhands/events/observation/replay.py
index 05d7299d83f9..9d0425db40e6 100644
--- a/openhands/events/observation/replay.py
+++ b/openhands/events/observation/replay.py
@@ -1,3 +1,4 @@
+from abc import ABC
 from dataclasses import dataclass
 
 from openhands.core.schema import ObservationType
@@ -6,7 +7,12 @@
 
 
 @dataclass
-class ReplayCmdOutputObservationBase(Observation):
+class ReplayObservation(Observation, ABC):
+    pass
+
+
+@dataclass
+class ReplayCmdOutputObservationBase(ReplayObservation, ABC):
     """This data class represents the output of a replay command."""
 
     command_id: int
@@ -38,7 +44,7 @@ class ReplayToolCmdOutputObservation(ReplayCmdOutputObservationBase):
 
 
 @dataclass
-class ReplayPhaseUpdateObservation(Observation):
+class ReplayPhaseUpdateObservation(ReplayObservation):
     new_phase: ReplayDebuggingPhase
     observation: str = ObservationType.REPLAY_UPDATE_PHASE
 
diff --git a/openhands/events/replay.py b/openhands/replay/replay_commands.py
similarity index 51%
rename from openhands/events/replay.py
rename to openhands/replay/replay_commands.py
index 4b9f4fb75248..a5c037a69ff1 100644
--- a/openhands/events/replay.py
+++ b/openhands/replay/replay_commands.py
@@ -1,6 +1,6 @@
 import json
 import re
-from typing import Any, TypedDict, cast
+from typing import Any, cast
 
 from openhands.controller.state.state import State
 from openhands.core.logger import openhands_logger as logger
@@ -8,6 +8,8 @@
 from openhands.events.action.message import MessageAction
 from openhands.events.action.replay import ReplayInternalCmdRunAction
 from openhands.events.observation.replay import ReplayInternalCmdOutputObservation
+from openhands.replay.replay_prompts import replay_prompt_phase_analysis
+from openhands.replay.replay_types import AnalysisToolMetadata, AnnotateResult
 
 
 def scan_recording_id(issue: str) -> str | None:
@@ -24,7 +26,7 @@ def scan_recording_id(issue: str) -> str | None:
 
 
 # Produce the command string for the `annotate-execution-points` command.
-def command_annotate_execution_points(
+def start_initial_analysis(
     thought: str, is_workspace_repo: bool
 ) -> ReplayInternalCmdRunAction:
     command_input: dict[str, Any] = dict()
@@ -57,30 +59,12 @@ def replay_enhance_action(state: State, is_workspace_repo: bool) -> Action | Non
                 )
                 state.extra_data['replay_enhance_prompt_id'] = latest_user_message.id
                 logger.info('[REPLAY] stored latest_user_message id in state')
-                return command_annotate_execution_points(
+                return start_initial_analysis(
                     latest_user_message.content, is_workspace_repo
                 )
     return None
 
 
-class AnnotatedLocation(TypedDict, total=False):
-    filePath: str
-    line: int
-
-
-class AnalysisToolMetadata(TypedDict, total=False):
-    recordingId: str
-
-
-class AnnotateResult(TypedDict, total=False):
-    point: str
-    commentText: str | None
-    annotatedRepo: str | None
-    annotatedLocations: list[AnnotatedLocation] | None
-    pointLocation: str | None
-    metadata: AnalysisToolMetadata | None
-
-
 def safe_parse_json(text: str) -> dict[str, Any] | None:
     try:
         return json.loads(text)
@@ -97,15 +81,7 @@ def split_metadata(result):
     return metadata, data
 
 
-def enhance_prompt(user_message: MessageAction, prefix: str, suffix: str | None = None):
-    if prefix != '':
-        user_message.content = f'{prefix}\n\n{user_message.content}'
-    if suffix is not None:
-        user_message.content = f'{user_message.content}\n\n{suffix}'
-    logger.info(f'[REPLAY] Enhanced user prompt:\n{user_message.content}')
-
-
-def handle_replay_internal_observation(
+def handle_replay_internal_command_observation(
     state: State, observation: ReplayInternalCmdOutputObservation
 ) -> AnalysisToolMetadata | None:
     """
@@ -126,63 +102,20 @@ def handle_replay_internal_observation(
         assert user_message
         state.extra_data['replay_enhance_observed'] = True
 
+        # Deserialize stringified result.
         result: AnnotateResult = cast(
             AnnotateResult, safe_parse_json(observation.content)
         )
 
-        # Determine what initial-analysis did:
+        # Get metadata and enhance prompt.
         if result and 'metadata' in result:
-            # New workflow: initial-analysis provided the metadata to allow tool use.
-            metadata, data = split_metadata(result)
-            prefix = ''
-            suffix = """
-# Instructions
-0. Take a look at below `Initial Analysis`, based on a recorded trace of the bug. Pay special attention to `IMPORTANT_NOTES`.
-1. State the main problem statement. It MUST address `IMPORTANT_NOTES`. It must make sure that the application won't crash. It must fix the issue.
-2. Propose a plan to fix or investigate with multiple options in order of priority.
-3. Then use the `inspect-*` tools to investigate.
-4. Once found, `submit-hypothesis`.
-
-
-# Initial Analysis
-""" + json.dumps(data, indent=2)
-            enhance_prompt(
-                user_message,
-                prefix,
-                suffix,
-            )
+            # initial-analysis provides metadata needed for tool use.
+            metadata, command_result = split_metadata(result)
+            replay_prompt_phase_analysis(command_result, user_message)
             return metadata
-        elif result and result.get('annotatedRepo'):
-            # Old workflow: initial-analysis left hints in form of source code annotations.
-            annotated_repo_path = result.get('annotatedRepo', '')
-            comment_text = result.get('commentText', '')
-            react_component_name = result.get('reactComponentName', '')
-            console_error = result.get('consoleError', '')
-            # start_location = result.get('startLocation', '')
-            start_name = result.get('startName', '')
-
-            # TODO: Move this to a prompt template file.
-            if comment_text:
-                if react_component_name:
-                    prefix = f'There is a change needed to the {react_component_name} component.\n'
-                else:
-                    prefix = f'There is a change needed in {annotated_repo_path}:\n'
-                prefix += f'{comment_text}\n\n'
-            elif console_error:
-                prefix = f'There is a change needed in {annotated_repo_path} to fix a console error that has appeared unexpectedly:\n'
-                prefix += f'{console_error}\n\n'
-
-            prefix += '<IMPORTANT>\n'
-            prefix += 'Information about a reproduction of the problem is available in source comments.\n'
-            prefix += 'You must search for these comments and use them to get a better understanding of the problem.\n'
-            prefix += f'The first reproduction comment to search for is named {start_name}. Start your investigation there.\n'
-            prefix += '</IMPORTANT>\n'
-
-            enhance_prompt(user_message, prefix)
-            return None
         else:
             logger.warning(
-                f'[REPLAY] Replay observation cannot be interpreted. Observed content: {str(observation.content)}'
+                f'[REPLAY] Replay command result cannot be interpreted. Observed content: {str(observation.content)}'
             )
 
     return None
diff --git a/openhands/replay/replay_prompts.py b/openhands/replay/replay_prompts.py
new file mode 100644
index 000000000000..c022249971b4
--- /dev/null
+++ b/openhands/replay/replay_prompts.py
@@ -0,0 +1,77 @@
+import json
+
+from openhands.core.logger import openhands_logger as logger
+from openhands.events.action.message import MessageAction
+from openhands.replay.replay_types import AnnotateResult
+
+
+def enhance_prompt(user_message: MessageAction, prefix: str, suffix: str):
+    if prefix != '':
+        user_message.content = f'{prefix}\n\n{user_message.content}'
+    if suffix != '':
+        user_message.content = f'{user_message.content}\n\n{suffix}'
+    logger.info(f'[REPLAY] Enhanced user prompt:\n{user_message.content}')
+
+
+def replay_prompt_phase_analysis(command_result: dict, user_message: MessageAction):
+    prefix = ''
+    suffix = """
+# Instructions
+0. Take a look at below `Initial Analysis`, based on a recorded trace of the bug. Pay special attention to `IMPORTANT_NOTES`.
+1. State the main problem statement. It MUST address `IMPORTANT_NOTES`. It must make sure that the application won't crash. It must fix the issue.
+2. Propose a plan to fix or investigate with multiple options in order of priority.
+3. Then use the `inspect-*` tools to investigate.
+4. Once found, `submit-hypothesis`.
+
+
+# Initial Analysis
+""" + json.dumps(command_result, indent=2)
+    return enhance_prompt(user_message, prefix, suffix)
+
+
+def replay_prompt_phase_analysis_legacy(
+    command_result: AnnotateResult, user_message: MessageAction
+):
+    # Old workflow: initial-analysis left hints in form of source code annotations.
+    annotated_repo_path = command_result.get('annotatedRepo', '')
+    comment_text = command_result.get('commentText', '')
+    react_component_name = command_result.get('reactComponentName', '')
+    console_error = command_result.get('consoleError', '')
+    # start_location = result.get('startLocation', '')
+    start_name = command_result.get('startName', '')
+
+    # TODO: Move this to a prompt template file.
+    if comment_text:
+        if react_component_name:
+            prefix = (
+                f'There is a change needed to the {react_component_name} component.\n'
+            )
+        else:
+            prefix = f'There is a change needed in {annotated_repo_path}:\n'
+        prefix += f'{comment_text}\n\n'
+    elif console_error:
+        prefix = f'There is a change needed in {annotated_repo_path} to fix a console error that has appeared unexpectedly:\n'
+        prefix += f'{console_error}\n\n'
+
+    prefix += '<IMPORTANT>\n'
+    prefix += 'Information about a reproduction of the problem is available in source comments.\n'
+    prefix += 'You must search for these comments and use them to get a better understanding of the problem.\n'
+    prefix += f'The first reproduction comment to search for is named {start_name}. Start your investigation there.\n'
+    prefix += '</IMPORTANT>\n'
+
+    suffix = ''
+
+    return enhance_prompt(user_message, prefix, suffix)
+
+
+def replay_prompt_phase_edit():
+    # Tell the agent to stop analyzing and start editing:
+    return """
+You have concluded the analysis.
+
+IMPORTANT: NOW review, then implement the hypothesized changes using tools. The code is available in the workspace. Start by answering these questions:
+  1. What is the goal of the investigation according to the initial prompt and initial analysis? IMPORTANT. PAY ATTENTION TO THIS. THIS IS THE ENTRY POINT OF EVERYTHING.
+  2. Given (1), is the hypothesis's `problem` description correct? Does it match the goal of the investigation?
+  3. Do the `editSuggestions` actually address the issue?
+  4. Rephrase the hypothesis so that it is consistent and correct.
+"""
diff --git a/openhands/replay/replay_state_machine.py b/openhands/replay/replay_state_machine.py
new file mode 100644
index 000000000000..68f45b93be25
--- /dev/null
+++ b/openhands/replay/replay_state_machine.py
@@ -0,0 +1,63 @@
+from openhands.controller.agent import Agent
+from openhands.controller.state.state import State
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.message import Message, TextContent
+from openhands.core.schema.replay import ReplayDebuggingPhase
+from openhands.events.observation.replay import (
+    ReplayInternalCmdOutputObservation,
+    ReplayObservation,
+    ReplayPhaseUpdateObservation,
+    ReplayToolCmdOutputObservation,
+)
+from openhands.events.serialization.event import truncate_content
+from openhands.replay.replay_commands import handle_replay_internal_command_observation
+from openhands.replay.replay_prompts import replay_prompt_phase_edit
+
+
+def on_replay_observation(obs: ReplayObservation, state: State, agent: Agent) -> None:
+    """Handle the observation."""
+    if isinstance(obs, ReplayInternalCmdOutputObservation):
+        # NOTE: Currently, the only internal command is the initial-analysis command.
+        analysis_tool_metadata = handle_replay_internal_command_observation(state, obs)
+        if analysis_tool_metadata:
+            # Start analysis phase
+            state.replay_recording_id = analysis_tool_metadata['recordingId']
+            state.replay_phase = ReplayDebuggingPhase.Analysis
+            agent.replay_phase_changed(ReplayDebuggingPhase.Analysis)
+    elif isinstance(obs, ReplayPhaseUpdateObservation):
+        new_phase = obs.new_phase
+        if state.replay_phase == new_phase:
+            logger.warning(
+                f'Unexpected ReplayPhaseUpdateAction. Already in phase. Observation:\n {repr(obs)}',
+            )
+        else:
+            state.replay_phase = new_phase
+            agent.replay_phase_changed(new_phase)
+
+
+def get_replay_observation_message(
+    obs: ReplayObservation, max_message_chars: int
+) -> Message:
+    """Create a message to explain the observation."""
+    if isinstance(obs, ReplayToolCmdOutputObservation):
+        # if it doesn't have tool call metadata, it was triggered by a user action
+        if obs.tool_call_metadata is None:
+            text = truncate_content(
+                f'\nObserved result of replay command executed by user:\n{obs.content}',
+                max_message_chars,
+            )
+        else:
+            text = obs.content
+        message = Message(role='user', content=[TextContent(text=text)])
+    elif isinstance(obs, ReplayPhaseUpdateObservation):
+        new_phase = obs.new_phase
+        if new_phase == ReplayDebuggingPhase.Edit:
+            text = replay_prompt_phase_edit()
+            message = Message(role='user', content=[TextContent(text=text)])
+        else:
+            raise NotImplementedError(f'Unhandled ReplayPhaseUpdateAction: {new_phase}')
+    else:
+        raise NotImplementedError(
+            f"Unhandled observation type: {obs.__class__.__name__} ({getattr(obs, 'observation', None)})"
+        )
+    return message
diff --git a/openhands/replay/replay_types.py b/openhands/replay/replay_types.py
new file mode 100644
index 000000000000..2628da9e414f
--- /dev/null
+++ b/openhands/replay/replay_types.py
@@ -0,0 +1,19 @@
+from typing import TypedDict
+
+
+class AnnotatedLocation(TypedDict, total=False):
+    filePath: str
+    line: int
+
+
+class AnalysisToolMetadata(TypedDict, total=False):
+    recordingId: str
+
+
+class AnnotateResult(TypedDict, total=False):
+    point: str
+    commentText: str | None
+    annotatedRepo: str | None
+    annotatedLocations: list[AnnotatedLocation] | None
+    pointLocation: str | None
+    metadata: AnalysisToolMetadata | None

From 8d6649c7e71e6e1ba049395724f6ce2e8aab2aaa Mon Sep 17 00:00:00 2001
From: "D. Seifert" <Domiii@users.noreply.github.com>
Date: Tue, 21 Jan 2025 21:48:30 +0800
Subject: [PATCH 02/17] WIP: tool refactoring

---
 .../agenthub/codeact_agent/codeact_agent.py   |   4 +-
 .../codeact_agent/function_calling.py         | 217 +---------------
 openhands/events/action/replay.py             |   9 +-
 ...commands.py => replay_initial_analysis.py} |  25 +-
 openhands/replay/replay_prompts.py            |  24 +-
 openhands/replay/replay_state_machine.py      |  10 +-
 openhands/replay/replay_tools.py              | 236 ++++++++++++++++++
 openhands/replay/replay_types.py              |   9 -
 8 files changed, 285 insertions(+), 249 deletions(-)
 rename openhands/replay/{replay_commands.py => replay_initial_analysis.py} (82%)
 create mode 100644 openhands/replay/replay_tools.py

diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index 4e04f87443f1..7453c9df2d41 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -40,7 +40,7 @@
 )
 from openhands.events.serialization.event import truncate_content
 from openhands.llm.llm import LLM
-from openhands.replay.replay_commands import replay_enhance_action
+from openhands.replay.replay_initial_analysis import replay_enhance_action
 from openhands.replay.replay_state_machine import (
     get_replay_observation_message,
 )
@@ -327,7 +327,7 @@ def replay_phase_changed(self, phase: ReplayDebuggingPhase) -> None:
             codeact_enable_jupyter=self.config.codeact_enable_jupyter,
             codeact_enable_llm_editor=self.config.codeact_enable_llm_editor,
             codeact_enable_replay=self.config.codeact_enable_replay,
-            codeact_replay_phase=phase,
+            replay_phase=phase,
         )
         logger.debug(
             f'[REPLAY] CodeActAgent.replay_phase_changed({phase}).'
diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py
index 2fbd34f71282..16a7d1af848d 100644
--- a/openhands/agenthub/codeact_agent/function_calling.py
+++ b/openhands/agenthub/codeact_agent/function_calling.py
@@ -26,171 +26,13 @@
     IPythonRunCellAction,
     MessageAction,
 )
-from openhands.events.action.replay import (
-    ReplayPhaseUpdateAction,
-    ReplayToolCmdRunAction,
-)
 from openhands.events.tool import ToolCallMetadata
-
-# ---------------------------------------------------------
-# Tool: inspect-data
-# ---------------------------------------------------------
-_REPLAY_INSPECT_DATA_DESCRIPTION = """
-Explains value, data flow and origin information for `expression` at `point`.
-IMPORTANT: Prefer using inspect-data over inspect-point.
-"""
-
-ReplayInspectDataTool = ChatCompletionToolParam(
-    type='function',
-    function=ChatCompletionToolParamFunctionChunk(
-        name='inspect-data',
-        description=_REPLAY_INSPECT_DATA_DESCRIPTION.strip(),
-        parameters={
-            'type': 'object',
-            'properties': {
-                'expression': {
-                    'type': 'string',
-                    'description': 'A valid JS expression. IMPORTANT: First pick the best expression. If the expression is an object: Prefer "array[0]" over "array" and "o.x" over "o" to get closer to the origin and creation site of important data points. Prefer nested object over primitive expressions.',
-                },
-                'point': {
-                    'type': 'string',
-                    'description': 'The point at which to inspect the runtime. The first point comes from the `thisPoint` in the Initial analysis.',
-                },
-                'explanation': {
-                    'type': 'string',
-                    'description': 'Give a concise explanation as to why you take this investigative step.',
-                },
-                'explanation_source': {
-                    'type': 'string',
-                    'description': 'Explain which data you saw in the previous analysis results that informs this step.',
-                },
-            },
-            'required': ['expression', 'point', 'explanation', 'explanation_source'],
-        },
-    ),
-)
-
-# ---------------------------------------------------------
-# Tool: inspect-point
-# ---------------------------------------------------------
-_REPLAY_INSPECT_POINT_DESCRIPTION = """
-Explains dynamic control flow and data flow dependencies of the code at `point`.
-Use this tool instead of `inspect-data` only when you don't have a specific data point to investigate.
-"""
-
-ReplayInspectPointTool = ChatCompletionToolParam(
-    type='function',
-    function=ChatCompletionToolParamFunctionChunk(
-        name='inspect-point',
-        description=_REPLAY_INSPECT_POINT_DESCRIPTION.strip(),
-        parameters={
-            'type': 'object',
-            'properties': {
-                'point': {'type': 'string'},
-            },
-            'required': ['point'],
-        },
-    ),
-)
-
-# ---------------------------------------------------------
-# Tool: SubmitHypothesis
-# TODO: Divide this into multiple steps -
-#   1. The first submission must be as simple as possible to take little computational effort from the analysis steps.
-#   2. The second submission, after analysis has already concluded, must be as complete as possible.
-# ---------------------------------------------------------
-# _REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION = """
-# Your investigation has yielded a complete thin slice from symptom to root cause,
-# enough proof to let the `CodeEdit` agent take over to fix the bug.
-# DO NOT GUESS. You must provide exact code in the exact right location to fix this bug,
-# based on evidence you have gathered.
-# """
-
-# ReplaySubmitHypothesisTool = ChatCompletionToolParam(
-#     type='function',
-#     function=ChatCompletionToolParamFunctionChunk(
-#         name='submit-hypothesis',
-#         description=_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION.strip(),
-#         parameters={
-#             'type': 'object',
-#             'properties': {
-#                 'rootCauseHypothesis': {'type': 'string'},
-#                 'thinSlice': {
-#                     'type': 'array',
-#                     'items': {
-#                         'type': 'object',
-#                         'properties': {
-#                             'point': {'type': 'string'},
-#                             'code': {'type': 'string'},
-#                             'role': {'type': 'string'},
-#                         },
-#                         'required': ['point', 'code', 'role'],
-#                     },
-#                 },
-#                 'modifications': {
-#                     'type': 'array',
-#                     'items': {
-#                         'type': 'object',
-#                         'properties': {
-#                             'kind': {
-#                                 'type': 'string',
-#                                 'enum': ['add', 'remove', 'modify'],
-#                             },
-#                             'newCode': {'type': 'string'},
-#                             'oldCode': {'type': 'string'},
-#                             'location': {'type': 'string'},
-#                             'point': {'type': 'string'},
-#                             # NOTE: Even though, we really want the `line` here, it will lead to much worse performance because the agent has a hard time computing correct line numbers from its point-based investigation.
-#                             # Instead of requiring a line number, the final fix will be more involved, as explained in the issue.
-#                             # see: https://linear.app/replay/issue/PRO-939/use-tools-data-flow-analysis-for-10608#comment-3b7ae176
-#                             # 'line': {'type': 'number'},
-#                             'briefExplanation': {'type': 'string'},
-#                             'verificationProof': {'type': 'string'},
-#                         },
-#                         'required': [
-#                             'kind',
-#                             'location',
-#                             'briefExplanation',
-#                             # 'line',
-#                             'verificationProof',
-#                         ],
-#                     },
-#                 },
-#             },
-#             'required': ['rootCauseHypothesis', 'thinSlice', 'modifications'],
-#         },
-#     ),
-# )
-_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION = """
-# Use this tool to conclude your analysis and move on to code editing.
-# """
-
-ReplaySubmitHypothesisTool = ChatCompletionToolParam(
-    type='function',
-    function=ChatCompletionToolParamFunctionChunk(
-        name='submit-hypothesis',
-        description=_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION.strip(),
-        parameters={
-            'type': 'object',
-            'properties': {
-                'problem': {
-                    'type': 'string',
-                    'description': 'One-sentence explanation of the core problem that this will solve.',
-                },
-                'rootCauseHypothesis': {'type': 'string'},
-                'editSuggestions': {
-                    'type': 'string',
-                    'description': 'Provide suggestions to fix the bug, if you know enough about the code that requires modification.',
-                },
-            },
-            'required': ['rootCauseHypothesis'],
-        },
-    ),
+from openhands.replay.replay_tools import (
+    get_replay_tools,
+    handle_replay_tool_call,
+    is_replay_tool,
 )
 
-REPLAY_TOOLS = ['inspect-data', 'inspect-point', 'submit-hypothesis']
-
-
 # ---------------------------------------------------------
 # OH default tools.
 # ---------------------------------------------------------
@@ -631,36 +473,8 @@ def response_to_actions(response: ModelResponse, state: State) -> list[Action]:
                 ) from e
             if tool_call.function.name == 'execute_bash':
                 action = CmdRunAction(**arguments)
-            elif tool_call.function.name in REPLAY_TOOLS:
-                logger.info(
-                    f'[REPLAY] TOOL_CALL {tool_call.function.name} - arguments: {json.dumps(arguments, indent=2)}'
-                )
-                if tool_call.function.name == 'inspect-data':
-                    # Remove explanation props.
-                    arguments = {
-                        k: v for k, v in arguments.items() if 'explanation' not in k
-                    }
-                    action = ReplayToolCmdRunAction(
-                        command_name='inspect-data',
-                        command_args=arguments
-                        | {'recordingId': state.replay_recording_id},
-                    )
-                elif tool_call.function.name == 'inspect-point':
-                    # if arguments['expression'] == 'wiredRules':   # hackfix for 10608 experiment
-                    #     raise FunctionCallValidationError(f'wiredRules is irrelevant to the problem. Try something else.')
-                    action = ReplayToolCmdRunAction(
-                        command_name='inspect-point',
-                        command_args=arguments
-                        | {'recordingId': state.replay_recording_id},
-                    )
-                elif tool_call.function.name == 'submit-hypothesis':
-                    action = ReplayPhaseUpdateAction(
-                        new_phase=ReplayDebuggingPhase.Edit, info=json.dumps(arguments)
-                    )
-                else:
-                    raise ValueError(
-                        f'Unknown Replay tool. Make sure to add them all to REPLAY_TOOLS: {tool_call.function.name}'
-                    )
+            elif is_replay_tool(tool_call.function.name):
+                handle_replay_tool_call(tool_call, arguments, state)
             elif tool_call.function.name == 'execute_ipython_cell':
                 action = IPythonRunCellAction(**arguments)
             elif tool_call.function.name == 'delegate_to_browsing_agent':
@@ -727,31 +541,18 @@ def get_tools(
     codeact_enable_llm_editor: bool = False,
     codeact_enable_jupyter: bool = False,
     codeact_enable_replay: bool = False,
-    codeact_replay_phase: ReplayDebuggingPhase = ReplayDebuggingPhase.Normal,
+    replay_phase: ReplayDebuggingPhase = ReplayDebuggingPhase.Normal,
 ) -> list[ChatCompletionToolParam]:
     default_tools = get_default_tools(
         codeact_enable_browsing,
         codeact_enable_llm_editor,
         codeact_enable_jupyter,
     )
-    if not codeact_enable_replay or codeact_replay_phase == ReplayDebuggingPhase.Normal:
+    if not codeact_enable_replay or replay_phase == ReplayDebuggingPhase.Normal:
         # Use the default tools when not in a Replay-specific phase.
         return default_tools
 
     if codeact_enable_replay:
-        analysis_tools = [
-            ReplayInspectDataTool,
-            ReplayInspectPointTool,
-        ]
-        if codeact_replay_phase == ReplayDebuggingPhase.Analysis:
-            # Analysis tools only. This phase is concluded upon submit-hypothesis.
-            tools = analysis_tools + [ReplaySubmitHypothesisTool]
-        elif codeact_replay_phase == ReplayDebuggingPhase.Edit:
-            # Combine default and analysis tools.
-            tools = default_tools + analysis_tools
-        else:
-            raise ValueError(
-                f'Unhandled ReplayDebuggingPhase in get_tools: {codeact_replay_phase}'
-            )
+        tools = get_replay_tools(replay_phase, default_tools)
 
     return tools
diff --git a/openhands/events/action/replay.py b/openhands/events/action/replay.py
index 0d0c2d0983cc..885b8f8eab1f 100644
--- a/openhands/events/action/replay.py
+++ b/openhands/events/action/replay.py
@@ -11,9 +11,14 @@
 )
 
 
+@dataclass
+class ReplayAction(Action):
+    pass
+
+
 # NOTE: We need the same class twice because a lot of the agent logic is based on isinstance checks.
 @dataclass
-class ReplayCmdRunActionBase(Action):
+class ReplayCmdRunActionBase(ReplayAction):
     # Name of the command in @replayapi/cli.
     command_name: str
 
@@ -62,7 +67,7 @@ class ReplayToolCmdRunAction(ReplayCmdRunActionBase):
 
 
 @dataclass
-class ReplayPhaseUpdateAction(Action):
+class ReplayPhaseUpdateAction(ReplayAction):
     new_phase: ReplayDebuggingPhase
 
     thought: str = ''
diff --git a/openhands/replay/replay_commands.py b/openhands/replay/replay_initial_analysis.py
similarity index 82%
rename from openhands/replay/replay_commands.py
rename to openhands/replay/replay_initial_analysis.py
index a5c037a69ff1..fe6fa1c6fa40 100644
--- a/openhands/replay/replay_commands.py
+++ b/openhands/replay/replay_initial_analysis.py
@@ -1,6 +1,6 @@
 import json
 import re
-from typing import Any, cast
+from typing import Any, Tuple, cast
 
 from openhands.controller.state.state import State
 from openhands.core.logger import openhands_logger as logger
@@ -9,7 +9,7 @@
 from openhands.events.action.replay import ReplayInternalCmdRunAction
 from openhands.events.observation.replay import ReplayInternalCmdOutputObservation
 from openhands.replay.replay_prompts import replay_prompt_phase_analysis
-from openhands.replay.replay_types import AnalysisToolMetadata, AnnotateResult
+from openhands.replay.replay_types import AnalysisToolMetadata
 
 
 def scan_recording_id(issue: str) -> str | None:
@@ -72,20 +72,23 @@ def safe_parse_json(text: str) -> dict[str, Any] | None:
         return None
 
 
-def split_metadata(result):
+def split_metadata(result: dict) -> Tuple[AnalysisToolMetadata, dict]:
     if 'metadata' not in result:
         return {}, result
-    metadata = result['metadata']
+    metadata = cast(AnalysisToolMetadata, result['metadata'])
     data = dict(result)
     del data['metadata']
     return metadata, data
 
 
-def handle_replay_internal_command_observation(
+def on_replay_internal_command_observation(
     state: State, observation: ReplayInternalCmdOutputObservation
 ) -> AnalysisToolMetadata | None:
     """
-    Enhance the user prompt with the results of the replay analysis.
+    Handle result for an internally sent command (not agent tool use or user action).
+
+    NOTE: Currently, the only internal command is the initial-analysis command.
+    Enhance the user prompt with the results of the initial analysis.
     Returns the metadata needed for the agent to switch to analysis tools.
     """
     enhance_action_id = state.extra_data.get('replay_enhance_prompt_id')
@@ -103,19 +106,19 @@ def handle_replay_internal_command_observation(
         state.extra_data['replay_enhance_observed'] = True
 
         # Deserialize stringified result.
-        result: AnnotateResult = cast(
-            AnnotateResult, safe_parse_json(observation.content)
-        )
+        result = safe_parse_json(observation.content)
 
         # Get metadata and enhance prompt.
         if result and 'metadata' in result:
             # initial-analysis provides metadata needed for tool use.
             metadata, command_result = split_metadata(result)
-            replay_prompt_phase_analysis(command_result, user_message)
+            user_message.content = replay_prompt_phase_analysis(
+                command_result, user_message.content
+            )
             return metadata
         else:
             logger.warning(
-                f'[REPLAY] Replay command result cannot be interpreted. Observed content: {str(observation.content)}'
+                f'[REPLAY] Replay command result missing metadata. Observed content: {str(observation.content)}'
             )
 
     return None
diff --git a/openhands/replay/replay_prompts.py b/openhands/replay/replay_prompts.py
index c022249971b4..860c3ed3b155 100644
--- a/openhands/replay/replay_prompts.py
+++ b/openhands/replay/replay_prompts.py
@@ -1,19 +1,19 @@
 import json
 
 from openhands.core.logger import openhands_logger as logger
-from openhands.events.action.message import MessageAction
-from openhands.replay.replay_types import AnnotateResult
+from openhands.events.observation.replay import ReplayPhaseUpdateObservation
 
 
-def enhance_prompt(user_message: MessageAction, prefix: str, suffix: str):
+def enhance_prompt(prompt: str, prefix: str, suffix: str):
     if prefix != '':
-        user_message.content = f'{prefix}\n\n{user_message.content}'
+        prompt = f'{prefix}\n\n{prompt}'
     if suffix != '':
-        user_message.content = f'{user_message.content}\n\n{suffix}'
-    logger.info(f'[REPLAY] Enhanced user prompt:\n{user_message.content}')
+        prompt = f'{prompt}\n\n{suffix}'
+    logger.info(f'[REPLAY] Enhanced prompt:\n{prompt}')
+    return prompt
 
 
-def replay_prompt_phase_analysis(command_result: dict, user_message: MessageAction):
+def replay_prompt_phase_analysis(command_result: dict, prompt: str) -> str:
     prefix = ''
     suffix = """
 # Instructions
@@ -26,12 +26,10 @@ def replay_prompt_phase_analysis(command_result: dict, user_message: MessageActi
 
 # Initial Analysis
 """ + json.dumps(command_result, indent=2)
-    return enhance_prompt(user_message, prefix, suffix)
+    return enhance_prompt(prompt, prefix, suffix)
 
 
-def replay_prompt_phase_analysis_legacy(
-    command_result: AnnotateResult, user_message: MessageAction
-):
+def replay_prompt_phase_analysis_legacy(command_result: dict, prompt: str) -> str:
     # Old workflow: initial-analysis left hints in form of source code annotations.
     annotated_repo_path = command_result.get('annotatedRepo', '')
     comment_text = command_result.get('commentText', '')
@@ -61,10 +59,10 @@ def replay_prompt_phase_analysis_legacy(
 
     suffix = ''
 
-    return enhance_prompt(user_message, prefix, suffix)
+    return enhance_prompt(prompt, prefix, suffix)
 
 
-def replay_prompt_phase_edit():
+def replay_prompt_phase_edit(obs: ReplayPhaseUpdateObservation) -> str:
     # Tell the agent to stop analyzing and start editing:
     return """
 You have concluded the analysis.
diff --git a/openhands/replay/replay_state_machine.py b/openhands/replay/replay_state_machine.py
index 68f45b93be25..e8a9fab4b643 100644
--- a/openhands/replay/replay_state_machine.py
+++ b/openhands/replay/replay_state_machine.py
@@ -10,7 +10,9 @@
     ReplayToolCmdOutputObservation,
 )
 from openhands.events.serialization.event import truncate_content
-from openhands.replay.replay_commands import handle_replay_internal_command_observation
+from openhands.replay.replay_initial_analysis import (
+    on_replay_internal_command_observation,
+)
 from openhands.replay.replay_prompts import replay_prompt_phase_edit
 
 
@@ -18,7 +20,7 @@ def on_replay_observation(obs: ReplayObservation, state: State, agent: Agent) ->
     """Handle the observation."""
     if isinstance(obs, ReplayInternalCmdOutputObservation):
         # NOTE: Currently, the only internal command is the initial-analysis command.
-        analysis_tool_metadata = handle_replay_internal_command_observation(state, obs)
+        analysis_tool_metadata = on_replay_internal_command_observation(state, obs)
         if analysis_tool_metadata:
             # Start analysis phase
             state.replay_recording_id = analysis_tool_metadata['recordingId']
@@ -52,10 +54,10 @@ def get_replay_observation_message(
     elif isinstance(obs, ReplayPhaseUpdateObservation):
         new_phase = obs.new_phase
         if new_phase == ReplayDebuggingPhase.Edit:
-            text = replay_prompt_phase_edit()
-            message = Message(role='user', content=[TextContent(text=text)])
+            text = replay_prompt_phase_edit(obs)
         else:
             raise NotImplementedError(f'Unhandled ReplayPhaseUpdateAction: {new_phase}')
+        message = Message(role='user', content=[TextContent(text=text)])
     else:
         raise NotImplementedError(
             f"Unhandled observation type: {obs.__class__.__name__} ({getattr(obs, 'observation', None)})"
diff --git a/openhands/replay/replay_tools.py b/openhands/replay/replay_tools.py
new file mode 100644
index 000000000000..820e98efb0ca
--- /dev/null
+++ b/openhands/replay/replay_tools.py
@@ -0,0 +1,236 @@
+"""This file contains the function calling implementation for different actions.
+
+This is similar to the functionality of `CodeActResponseParser`.
+"""
+
+import json
+
+from litellm import (
+    ChatCompletionMessageToolCall,
+    ChatCompletionToolParam,
+    ChatCompletionToolParamFunctionChunk,
+)
+
+from openhands.controller.state.state import State
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.schema.replay import ReplayDebuggingPhase
+from openhands.events.action.replay import (
+    ReplayAction,
+    ReplayPhaseUpdateAction,
+    ReplayToolCmdRunAction,
+)
+
+
+class ReplayTool(ChatCompletionToolParam):
+    pass
+
+
+def replay_tool(**kwargs):
+    f = ChatCompletionToolParamFunctionChunk(**kwargs)
+    return ReplayTool(type='function', function=f)
+
+
+# ---------------------------------------------------------
+# Tool: inspect-data
+# ---------------------------------------------------------
+_REPLAY_INSPECT_DATA_DESCRIPTION = """
+Explains value, data flow and origin information for `expression` at `point`.
+IMPORTANT: Prefer using inspect-data over inspect-point.
+"""
+
+ReplayInspectDataTool = replay_tool(
+    name='inspect-data',
+    description=_REPLAY_INSPECT_DATA_DESCRIPTION.strip(),
+    parameters={
+        'type': 'object',
+        'properties': {
+            'expression': {
+                'type': 'string',
+                'description': 'A valid JS expression. IMPORTANT: First pick the best expression. If the expression is an object: Prefer "array[0]" over "array" and "o.x" over "o" to get closer to the origin and creation site of important data points. Prefer nested object over primitive expressions.',
+            },
+            'point': {
+                'type': 'string',
+                'description': 'The point at which to inspect the runtime. The first point comes from the `thisPoint` in the Initial analysis.',
+            },
+            'explanation': {
+                'type': 'string',
+                'description': 'Give a concise explanation as to why you take this investigative step.',
+            },
+            'explanation_source': {
+                'type': 'string',
+                'description': 'Explain which data you saw in the previous analysis results that informs this step.',
+            },
+        },
+        'required': ['expression', 'point', 'explanation', 'explanation_source'],
+    },
+)
+
+# ---------------------------------------------------------
+# Tool: inspect-point
+# ---------------------------------------------------------
+_REPLAY_INSPECT_POINT_DESCRIPTION = """
+Explains dynamic control flow and data flow dependencies of the code at `point`.
+Use this tool instead of `inspect-data` only when you don't have a specific data point to investigate.
+"""
+
+ReplayInspectPointTool = replay_tool(
+    name='inspect-point',
+    description=_REPLAY_INSPECT_POINT_DESCRIPTION.strip(),
+    parameters={
+        'type': 'object',
+        'properties': {
+            'point': {'type': 'string'},
+        },
+        'required': ['point'],
+    },
+)
+
+# ---------------------------------------------------------
+# Tool: SubmitHypothesis
+# TODO: Divide this into multiple steps -
+#   1. The first submission must be as simple as possible to take little computational effort from the analysis steps.
+#   2. The second submission, after analysis has already concluded, must be as complete as possible.
+# ---------------------------------------------------------
+# _REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION = """
+# Your investigation has yielded a complete thin slice from symptom to root cause,
+# enough proof to let the `CodeEdit` agent take over to fix the bug.
+# DO NOT GUESS. You must provide exact code in the exact right location to fix this bug,
+# based on evidence you have gathered.
+# """
+
+# ReplaySubmitHypothesisTool = ReplayToolDefinition(
+#     name='submit-hypothesis',
+#     description=_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION.strip(),
+#     parameters={
+#         'type': 'object',
+#         'properties': {
+#             'rootCauseHypothesis': {'type': 'string'},
+#             'thinSlice': {
+#                 'type': 'array',
+#                 'items': {
+#                     'type': 'object',
+#                     'properties': {
+#                         'point': {'type': 'string'},
+#                         'code': {'type': 'string'},
+#                         'role': {'type': 'string'},
+#                     },
+#                     'required': ['point', 'code', 'role'],
+#                 },
+#             },
+#             'modifications': {
+#                 'type': 'array',
+#                 'items': {
+#                     'type': 'object',
+#                     'properties': {
+#                         'kind': {
+#                             'type': 'string',
+#                             'enum': ['add', 'remove', 'modify'],
+#                         },
+#                         'newCode': {'type': 'string'},
+#                         'oldCode': {'type': 'string'},
+#                         'location': {'type': 'string'},
+#                         'point': {'type': 'string'},
+#                         # NOTE: Even though, we really want the `line` here, it will lead to much worse performance because the agent has a hard time computing correct line numbers from its point-based investigation.
+#                         # Instead of requiring a line number, the final fix will be more involved, as explained in the issue.
+#                         # see: https://linear.app/replay/issue/PRO-939/use-tools-data-flow-analysis-for-10608#comment-3b7ae176
+#                         # 'line': {'type': 'number'},
+#                         'briefExplanation': {'type': 'string'},
+#                         'verificationProof': {'type': 'string'},
+#                     },
+#                     'required': [
+#                         'kind',
+#                         'location',
+#                         'briefExplanation',
+#                         # 'line',
+#                         'verificationProof',
+#                     ],
+#                 },
+#             },
+#         },
+#         'required': ['rootCauseHypothesis', 'thinSlice', 'modifications'],
+#     },
+# )
+_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION = """
+# Use this tool to conclude your analysis and move on to code editing.
+# """
+
+ReplaySubmitHypothesisTool = replay_tool(
+    name='submit-hypothesis',
+    description=_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION.strip(),
+    parameters={
+        'type': 'object',
+        'properties': {
+            'problem': {
+                'type': 'string',
+                'description': 'One-sentence explanation of the core problem that this will solve.',
+            },
+            'rootCauseHypothesis': {'type': 'string'},
+            'editSuggestions': {
+                'type': 'string',
+                'description': 'Provide suggestions to fix the bug, if you know enough about the code that requires modification.',
+            },
+        },
+        'required': ['rootCauseHypothesis'],
+    },
+)
+
+replay_tools: list[ReplayTool] = [
+    ReplayInspectDataTool,
+    ReplayInspectPointTool,
+    ReplaySubmitHypothesisTool,
+]
+replay_tool_names: set[str] = set([t.function['name'] for t in replay_tools])
+
+
+def is_replay_tool(tool_name: str) -> bool:
+    return tool_name in replay_tool_names
+
+
+def handle_replay_tool_call(
+    tool_call: ChatCompletionMessageToolCall, arguments: dict, state: State
+) -> ReplayAction:
+    logger.info(
+        f'[REPLAY] TOOL_CALL {tool_call.function.name} - arguments: {json.dumps(arguments, indent=2)}'
+    )
+    action: ReplayAction
+    if tool_call.function.name == 'inspect-data':
+        # Remove explanation props.
+        arguments = {k: v for k, v in arguments.items() if 'explanation' not in k}
+        action = ReplayToolCmdRunAction(
+            command_name='inspect-data',
+            command_args=arguments | {'recordingId': state.replay_recording_id},
+        )
+    elif tool_call.function.name == 'inspect-point':
+        # if arguments['expression'] == 'wiredRules':   # hackfix for 10608 experiment
+        #     raise FunctionCallValidationError(f'wiredRules is irrelevant to the problem. Try something else.')
+        action = ReplayToolCmdRunAction(
+            command_name='inspect-point',
+            command_args=arguments | {'recordingId': state.replay_recording_id},
+        )
+    elif tool_call.function.name == 'submit-hypothesis':
+        action = ReplayPhaseUpdateAction(
+            new_phase=ReplayDebuggingPhase.Edit, info=json.dumps(arguments)
+        )
+    else:
+        raise ValueError(
+            f'Unknown Replay tool. Make sure to add them all to REPLAY_TOOLS: {tool_call.function.name}'
+        )
+    return action
+
+
+def get_replay_tools(
+    replay_phase: ReplayDebuggingPhase, default_tools: list[ChatCompletionToolParam]
+) -> list[ChatCompletionToolParam]:
+    analysis_tools = [
+        ReplayInspectDataTool,
+        ReplayInspectPointTool,
+    ]
+    if replay_phase == ReplayDebuggingPhase.Analysis:
+        # Analysis tools only. This phase is concluded upon submit-hypothesis.
+        tools = analysis_tools + [ReplaySubmitHypothesisTool]
+    elif replay_phase == ReplayDebuggingPhase.Edit:
+        # Combine default and analysis tools.
+        tools = default_tools + analysis_tools
+    else:
+        raise ValueError(f'Unhandled ReplayDebuggingPhase in get_tools: {replay_phase}')
+    return tools
diff --git a/openhands/replay/replay_types.py b/openhands/replay/replay_types.py
index 2628da9e414f..213bc6d8ded0 100644
--- a/openhands/replay/replay_types.py
+++ b/openhands/replay/replay_types.py
@@ -8,12 +8,3 @@ class AnnotatedLocation(TypedDict, total=False):
 
 class AnalysisToolMetadata(TypedDict, total=False):
     recordingId: str
-
-
-class AnnotateResult(TypedDict, total=False):
-    point: str
-    commentText: str | None
-    annotatedRepo: str | None
-    annotatedLocations: list[AnnotatedLocation] | None
-    pointLocation: str | None
-    metadata: AnalysisToolMetadata | None

From ebc4d2152b0199a1eadce16105f7b0f10acbaba2 Mon Sep 17 00:00:00 2001
From: "D. Seifert" <Domiii@users.noreply.github.com>
Date: Tue, 21 Jan 2025 22:16:39 +0800
Subject: [PATCH 03/17] rename `ReplayPhase`

---
 openhands/agenthub/codeact_agent/codeact_agent.py    |  6 +++---
 openhands/agenthub/codeact_agent/function_calling.py |  6 +++---
 openhands/controller/agent.py                        |  4 ++--
 openhands/controller/agent_controller.py             |  4 ++--
 openhands/controller/state/state.py                  |  4 ++--
 openhands/core/schema/__init__.py                    |  4 ++--
 openhands/core/schema/replay.py                      |  2 +-
 openhands/events/action/replay.py                    |  4 ++--
 openhands/events/observation/replay.py               |  4 ++--
 openhands/replay/replay_state_machine.py             |  8 ++++----
 openhands/replay/replay_tools.py                     | 12 ++++++------
 openhands/runtime/action_execution_server.py         |  2 +-
 12 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index 7453c9df2d41..a072ce3f3033 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -9,7 +9,7 @@
 from openhands.core.config import AgentConfig
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import ImageContent, Message, TextContent
-from openhands.core.schema.replay import ReplayDebuggingPhase
+from openhands.core.schema.replay import ReplayPhase
 from openhands.events.action import (
     Action,
     AgentDelegateAction,
@@ -104,7 +104,7 @@ def __init__(
 
         # We're in normal mode by default (even if replay is not enabled).
         # This will initialize the set of tools the agent has access to.
-        self.replay_phase_changed(ReplayDebuggingPhase.Normal)
+        self.replay_phase_changed(ReplayPhase.Normal)
 
         self.prompt_manager = PromptManager(
             microagent_dir=os.path.join(os.path.dirname(__file__), 'micro')
@@ -316,7 +316,7 @@ def reset(self) -> None:
         """Resets the CodeAct Agent."""
         super().reset()
 
-    def replay_phase_changed(self, phase: ReplayDebuggingPhase) -> None:
+    def replay_phase_changed(self, phase: ReplayPhase) -> None:
         """Called whenenever the phase of the replay debugging process changes.
 
         We currently use this to give the agent access to different tools for the
diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py
index 16a7d1af848d..44e3bfc8eeec 100644
--- a/openhands/agenthub/codeact_agent/function_calling.py
+++ b/openhands/agenthub/codeact_agent/function_calling.py
@@ -15,7 +15,7 @@
 from openhands.controller.state.state import State
 from openhands.core.exceptions import FunctionCallNotExistsError
 from openhands.core.logger import openhands_logger as logger
-from openhands.core.schema import ReplayDebuggingPhase
+from openhands.core.schema import ReplayPhase
 from openhands.events.action import (
     Action,
     AgentDelegateAction,
@@ -541,14 +541,14 @@ def get_tools(
     codeact_enable_llm_editor: bool = False,
     codeact_enable_jupyter: bool = False,
     codeact_enable_replay: bool = False,
-    replay_phase: ReplayDebuggingPhase = ReplayDebuggingPhase.Normal,
+    replay_phase: ReplayPhase = ReplayPhase.Normal,
 ) -> list[ChatCompletionToolParam]:
     default_tools = get_default_tools(
         codeact_enable_browsing,
         codeact_enable_llm_editor,
         codeact_enable_jupyter,
     )
-    if not codeact_enable_replay or replay_phase == ReplayDebuggingPhase.Normal:
+    if not codeact_enable_replay or replay_phase == ReplayPhase.Normal:
         # Use the default tools when not in a Replay-specific phase.
         return default_tools
 
diff --git a/openhands/controller/agent.py b/openhands/controller/agent.py
index 0d767751c89d..c74783ce2f56 100644
--- a/openhands/controller/agent.py
+++ b/openhands/controller/agent.py
@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Type
 
-from openhands.core.schema.replay import ReplayDebuggingPhase
+from openhands.core.schema.replay import ReplayPhase
 
 if TYPE_CHECKING:
     from openhands.controller.state.state import State
@@ -65,7 +65,7 @@ def reset(self) -> None:
 
     # the "noqa" below is so we can add an empty but not abstract method. otherwise we'd need an empty
     # implmentation in every subclass other than CodeActAgent (which does override it.)
-    def replay_phase_changed(self, phase: ReplayDebuggingPhase) -> None:  # noqa: B027
+    def replay_phase_changed(self, phase: ReplayPhase) -> None:  # noqa: B027
         """Called when the phase of the replay debugging process changes. This method
         can be used to update the agent's behavior based on the phase.
         """
diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index 08d1521a8787..a9e23c4ec9dd 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -20,7 +20,7 @@
 )
 from openhands.core.logger import LOG_ALL_EVENTS
 from openhands.core.logger import openhands_logger as logger
-from openhands.core.schema import AgentState, ReplayDebuggingPhase
+from openhands.core.schema import AgentState, ReplayPhase
 from openhands.events import EventSource, EventStream, EventStreamSubscriber
 from openhands.events.action import (
     Action,
@@ -145,7 +145,7 @@ def __init__(
         self._stuck_detector = StuckDetector(self.state)
         self.status_callback = status_callback
 
-        self.replay_phase = ReplayDebuggingPhase.Normal
+        self.replay_phase = ReplayPhase.Normal
 
     async def close(self) -> None:
         """Closes the agent controller, canceling any ongoing tasks and unsubscribing from the event stream.
diff --git a/openhands/controller/state/state.py b/openhands/controller/state/state.py
index 9f715323b8a7..f279bad954b1 100644
--- a/openhands/controller/state/state.py
+++ b/openhands/controller/state/state.py
@@ -6,7 +6,7 @@
 
 from openhands.controller.state.task import RootTask
 from openhands.core.logger import openhands_logger as logger
-from openhands.core.schema import AgentState, ReplayDebuggingPhase
+from openhands.core.schema import AgentState, ReplayPhase
 from openhands.events.action import (
     MessageAction,
 )
@@ -102,7 +102,7 @@ class State:
     last_error: str = ''
 
     replay_recording_id: str = ''
-    replay_phase: ReplayDebuggingPhase = ReplayDebuggingPhase.Normal
+    replay_phase: ReplayPhase = ReplayPhase.Normal
 
     def save_to_session(self, sid: str, file_store: FileStore):
         pickled = pickle.dumps(self)
diff --git a/openhands/core/schema/__init__.py b/openhands/core/schema/__init__.py
index 4ad32752186b..069b2b323486 100644
--- a/openhands/core/schema/__init__.py
+++ b/openhands/core/schema/__init__.py
@@ -2,12 +2,12 @@
 from openhands.core.schema.agent import AgentState
 from openhands.core.schema.config import ConfigType
 from openhands.core.schema.observation import ObservationType
-from openhands.core.schema.replay import ReplayDebuggingPhase
+from openhands.core.schema.replay import ReplayPhase
 
 __all__ = [
     'ActionType',
     'ObservationType',
     'ConfigType',
     'AgentState',
-    'ReplayDebuggingPhase',
+    'ReplayPhase',
 ]
diff --git a/openhands/core/schema/replay.py b/openhands/core/schema/replay.py
index 91f80ce3edc3..9a3308f4fa56 100644
--- a/openhands/core/schema/replay.py
+++ b/openhands/core/schema/replay.py
@@ -1,7 +1,7 @@
 from enum import Enum
 
 
-class ReplayDebuggingPhase(str, Enum):
+class ReplayPhase(str, Enum):
     Normal = 'normal'
     """The agent is not doing anything related to Replay.
     """
diff --git a/openhands/events/action/replay.py b/openhands/events/action/replay.py
index 885b8f8eab1f..2221049ff3fa 100644
--- a/openhands/events/action/replay.py
+++ b/openhands/events/action/replay.py
@@ -3,7 +3,7 @@
 from typing import Any, ClassVar
 
 from openhands.core.schema import ActionType
-from openhands.core.schema.replay import ReplayDebuggingPhase
+from openhands.core.schema.replay import ReplayPhase
 from openhands.events.action.action import (
     Action,
     ActionConfirmationStatus,
@@ -68,7 +68,7 @@ class ReplayToolCmdRunAction(ReplayCmdRunActionBase):
 
 @dataclass
 class ReplayPhaseUpdateAction(ReplayAction):
-    new_phase: ReplayDebuggingPhase
+    new_phase: ReplayPhase
 
     thought: str = ''
     info: str = ''
diff --git a/openhands/events/observation/replay.py b/openhands/events/observation/replay.py
index 9d0425db40e6..b75c85952a2f 100644
--- a/openhands/events/observation/replay.py
+++ b/openhands/events/observation/replay.py
@@ -2,7 +2,7 @@
 from dataclasses import dataclass
 
 from openhands.core.schema import ObservationType
-from openhands.core.schema.replay import ReplayDebuggingPhase
+from openhands.core.schema.replay import ReplayPhase
 from openhands.events.observation.observation import Observation
 
 
@@ -45,7 +45,7 @@ class ReplayToolCmdOutputObservation(ReplayCmdOutputObservationBase):
 
 @dataclass
 class ReplayPhaseUpdateObservation(ReplayObservation):
-    new_phase: ReplayDebuggingPhase
+    new_phase: ReplayPhase
     observation: str = ObservationType.REPLAY_UPDATE_PHASE
 
     @property
diff --git a/openhands/replay/replay_state_machine.py b/openhands/replay/replay_state_machine.py
index e8a9fab4b643..b05e064f90d9 100644
--- a/openhands/replay/replay_state_machine.py
+++ b/openhands/replay/replay_state_machine.py
@@ -2,7 +2,7 @@
 from openhands.controller.state.state import State
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import Message, TextContent
-from openhands.core.schema.replay import ReplayDebuggingPhase
+from openhands.core.schema.replay import ReplayPhase
 from openhands.events.observation.replay import (
     ReplayInternalCmdOutputObservation,
     ReplayObservation,
@@ -24,8 +24,8 @@ def on_replay_observation(obs: ReplayObservation, state: State, agent: Agent) ->
         if analysis_tool_metadata:
             # Start analysis phase
             state.replay_recording_id = analysis_tool_metadata['recordingId']
-            state.replay_phase = ReplayDebuggingPhase.Analysis
-            agent.replay_phase_changed(ReplayDebuggingPhase.Analysis)
+            state.replay_phase = ReplayPhase.Analysis
+            agent.replay_phase_changed(ReplayPhase.Analysis)
     elif isinstance(obs, ReplayPhaseUpdateObservation):
         new_phase = obs.new_phase
         if state.replay_phase == new_phase:
@@ -53,7 +53,7 @@ def get_replay_observation_message(
         message = Message(role='user', content=[TextContent(text=text)])
     elif isinstance(obs, ReplayPhaseUpdateObservation):
         new_phase = obs.new_phase
-        if new_phase == ReplayDebuggingPhase.Edit:
+        if new_phase == ReplayPhase.Edit:
             text = replay_prompt_phase_edit(obs)
         else:
             raise NotImplementedError(f'Unhandled ReplayPhaseUpdateAction: {new_phase}')
diff --git a/openhands/replay/replay_tools.py b/openhands/replay/replay_tools.py
index 820e98efb0ca..dcc4c19eb32d 100644
--- a/openhands/replay/replay_tools.py
+++ b/openhands/replay/replay_tools.py
@@ -13,7 +13,7 @@
 
 from openhands.controller.state.state import State
 from openhands.core.logger import openhands_logger as logger
-from openhands.core.schema.replay import ReplayDebuggingPhase
+from openhands.core.schema.replay import ReplayPhase
 from openhands.events.action.replay import (
     ReplayAction,
     ReplayPhaseUpdateAction,
@@ -209,7 +209,7 @@ def handle_replay_tool_call(
         )
     elif tool_call.function.name == 'submit-hypothesis':
         action = ReplayPhaseUpdateAction(
-            new_phase=ReplayDebuggingPhase.Edit, info=json.dumps(arguments)
+            new_phase=ReplayPhase.Edit, info=json.dumps(arguments)
         )
     else:
         raise ValueError(
@@ -219,18 +219,18 @@ def handle_replay_tool_call(
 
 
 def get_replay_tools(
-    replay_phase: ReplayDebuggingPhase, default_tools: list[ChatCompletionToolParam]
+    replay_phase: ReplayPhase, default_tools: list[ChatCompletionToolParam]
 ) -> list[ChatCompletionToolParam]:
     analysis_tools = [
         ReplayInspectDataTool,
         ReplayInspectPointTool,
     ]
-    if replay_phase == ReplayDebuggingPhase.Analysis:
+    if replay_phase == ReplayPhase.Analysis:
         # Analysis tools only. This phase is concluded upon submit-hypothesis.
         tools = analysis_tools + [ReplaySubmitHypothesisTool]
-    elif replay_phase == ReplayDebuggingPhase.Edit:
+    elif replay_phase == ReplayPhase.Edit:
         # Combine default and analysis tools.
         tools = default_tools + analysis_tools
     else:
-        raise ValueError(f'Unhandled ReplayDebuggingPhase in get_tools: {replay_phase}')
+        raise ValueError(f'Unhandled ReplayPhase in get_tools: {replay_phase}')
     return tools
diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py
index b0516666a6f2..59baa0d4f352 100644
--- a/openhands/runtime/action_execution_server.py
+++ b/openhands/runtime/action_execution_server.py
@@ -206,7 +206,7 @@ async def _run_replay(
     async def replay_update_phase(self, action: ReplayPhaseUpdateAction) -> Observation:
         return ReplayPhaseUpdateObservation(
             new_phase=action.new_phase,
-            content=f'ReplayDebuggingPhase changed to {action.new_phase}',
+            content=f'ReplayPhase changed to {action.new_phase}',
         )
 
     async def run_ipython(self, action: IPythonRunCellAction) -> Observation:

From 128a6dbad85616c2726e55c46fc906e86eb069b4 Mon Sep 17 00:00:00 2001
From: "D. Seifert" <Domiii@users.noreply.github.com>
Date: Tue, 21 Jan 2025 22:53:55 +0800
Subject: [PATCH 04/17] isolate state transition logic

---
 .../agenthub/codeact_agent/codeact_agent.py   | 21 +++---
 .../codeact_agent/function_calling.py         | 10 ++-
 openhands/controller/agent.py                 |  9 +--
 openhands/controller/agent_controller.py      |  8 +--
 openhands/replay/replay_initial_analysis.py   |  4 +-
 ...play_state_machine.py => replay_phases.py} | 62 ++++++++++++-----
 openhands/replay/replay_prompts.py            | 67 +++++++++----------
 openhands/replay/replay_tools.py              |  4 ++
 8 files changed, 103 insertions(+), 82 deletions(-)
 rename openhands/replay/{replay_state_machine.py => replay_phases.py} (52%)

diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index a072ce3f3033..0014ad9c9c44 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -1,3 +1,4 @@
+import json
 import os
 from collections import deque
 
@@ -41,8 +42,8 @@
 from openhands.events.serialization.event import truncate_content
 from openhands.llm.llm import LLM
 from openhands.replay.replay_initial_analysis import replay_enhance_action
-from openhands.replay.replay_state_machine import (
-    get_replay_observation_message,
+from openhands.replay.replay_phases import (
+    on_agent_replay_observation,
 )
 from openhands.runtime.plugins import (
     AgentSkillsRequirement,
@@ -104,7 +105,7 @@ def __init__(
 
         # We're in normal mode by default (even if replay is not enabled).
         # This will initialize the set of tools the agent has access to.
-        self.replay_phase_changed(ReplayPhase.Normal)
+        self.update_tools(ReplayPhase.Normal)
 
         self.prompt_manager = PromptManager(
             microagent_dir=os.path.join(os.path.dirname(__file__), 'micro')
@@ -256,7 +257,7 @@ def get_observation_message(
             text += f'\n[Command finished with exit code {obs.exit_code}]'
             message = Message(role='user', content=[TextContent(text=text)])
         elif isinstance(obs, ReplayObservation):
-            message = get_replay_observation_message(obs, max_message_chars)
+            message = on_agent_replay_observation(obs, max_message_chars)
         elif isinstance(obs, IPythonRunCellObservation):
             text = obs.content
             # replace base64 images with a placeholder
@@ -316,12 +317,7 @@ def reset(self) -> None:
         """Resets the CodeAct Agent."""
         super().reset()
 
-    def replay_phase_changed(self, phase: ReplayPhase) -> None:
-        """Called whenenever the phase of the replay debugging process changes.
-
-        We currently use this to give the agent access to different tools for the
-        different phases.
-        """
+    def update_tools(self, phase: ReplayPhase) -> None:
         self.tools = codeact_function_calling.get_tools(
             codeact_enable_browsing=self.config.codeact_enable_browsing,
             codeact_enable_jupyter=self.config.codeact_enable_jupyter,
@@ -329,9 +325,8 @@ def replay_phase_changed(self, phase: ReplayPhase) -> None:
             codeact_enable_replay=self.config.codeact_enable_replay,
             replay_phase=phase,
         )
-        logger.debug(
-            f'[REPLAY] CodeActAgent.replay_phase_changed({phase}).'
-            # f'New tools: {json.dumps(self.tools, indent=2)}'
+        logger.info(
+            f'[REPLAY] update_tools: {json.dumps([t.function['name'] for t in self.tools], indent=2)}'
         )
 
     def step(self, state: State) -> Action:
diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py
index 44e3bfc8eeec..b81b6ee2ecab 100644
--- a/openhands/agenthub/codeact_agent/function_calling.py
+++ b/openhands/agenthub/codeact_agent/function_calling.py
@@ -548,11 +548,9 @@ def get_tools(
         codeact_enable_llm_editor,
         codeact_enable_jupyter,
     )
-    if not codeact_enable_replay or replay_phase == ReplayPhase.Normal:
-        # Use the default tools when not in a Replay-specific phase.
-        return default_tools
-
     if codeact_enable_replay:
-        tools = get_replay_tools(replay_phase, default_tools)
+        # Handle Replay tool updates.
+        return get_replay_tools(replay_phase, default_tools)
 
-    return tools
+    # Just the default tools.
+    return default_tools
diff --git a/openhands/controller/agent.py b/openhands/controller/agent.py
index c74783ce2f56..dd80911879ca 100644
--- a/openhands/controller/agent.py
+++ b/openhands/controller/agent.py
@@ -63,12 +63,9 @@ def reset(self) -> None:
         if self.llm:
             self.llm.reset()
 
-    # the "noqa" below is so we can add an empty but not abstract method. otherwise we'd need an empty
-    # implmentation in every subclass other than CodeActAgent (which does override it.)
-    def replay_phase_changed(self, phase: ReplayPhase) -> None:  # noqa: B027
-        """Called when the phase of the replay debugging process changes. This method
-        can be used to update the agent's behavior based on the phase.
-        """
+    # `noqa: B027` is necessary to have an empty method implementation.
+    def update_tools(self, phase: ReplayPhase) -> None:  # noqa: B027
+        """Agent tools might have changed due to some observed event."""
         pass
 
     @property
diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index a9e23c4ec9dd..9a2796f0aab1 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -20,7 +20,7 @@
 )
 from openhands.core.logger import LOG_ALL_EVENTS
 from openhands.core.logger import openhands_logger as logger
-from openhands.core.schema import AgentState, ReplayPhase
+from openhands.core.schema import AgentState
 from openhands.events import EventSource, EventStream, EventStreamSubscriber
 from openhands.events.action import (
     Action,
@@ -53,7 +53,7 @@
 )
 from openhands.events.serialization.event import truncate_content
 from openhands.llm.llm import LLM
-from openhands.replay.replay_state_machine import on_replay_observation
+from openhands.replay.replay_phases import on_controller_replay_observation
 from openhands.utils.shutdown_listener import should_continue
 
 # note: RESUME is only available on web GUI
@@ -145,8 +145,6 @@ def __init__(
         self._stuck_detector = StuckDetector(self.state)
         self.status_callback = status_callback
 
-        self.replay_phase = ReplayPhase.Normal
-
     async def close(self) -> None:
         """Closes the agent controller, canceling any ongoing tasks and unsubscribing from the event stream.
 
@@ -298,7 +296,7 @@ async def _handle_observation(self, observation: Observation) -> None:
         if self._pending_action and self._pending_action.id == observation.cause:
             self._pending_action = None
             if isinstance(observation, ReplayObservation):
-                on_replay_observation(observation, self.state, self.agent)
+                on_controller_replay_observation(observation, self.state, self.agent)
 
             if self.state.agent_state == AgentState.USER_CONFIRMED:
                 await self.set_agent_state_to(AgentState.RUNNING)
diff --git a/openhands/replay/replay_initial_analysis.py b/openhands/replay/replay_initial_analysis.py
index fe6fa1c6fa40..164f1c72aa3f 100644
--- a/openhands/replay/replay_initial_analysis.py
+++ b/openhands/replay/replay_initial_analysis.py
@@ -85,9 +85,7 @@ def on_replay_internal_command_observation(
     state: State, observation: ReplayInternalCmdOutputObservation
 ) -> AnalysisToolMetadata | None:
     """
-    Handle result for an internally sent command (not agent tool use or user action).
-
-    NOTE: Currently, the only internal command is the initial-analysis command.
+    Handle result for an internal (automatically or user-triggered sent) command.
     Enhance the user prompt with the results of the initial analysis.
     Returns the metadata needed for the agent to switch to analysis tools.
     """
diff --git a/openhands/replay/replay_state_machine.py b/openhands/replay/replay_phases.py
similarity index 52%
rename from openhands/replay/replay_state_machine.py
rename to openhands/replay/replay_phases.py
index b05e064f90d9..6b1957821da2 100644
--- a/openhands/replay/replay_state_machine.py
+++ b/openhands/replay/replay_phases.py
@@ -15,51 +15,83 @@
 )
 from openhands.replay.replay_prompts import replay_prompt_phase_edit
 
+# ###########################################################################
+# Phase events.
+# ###########################################################################
 
-def on_replay_observation(obs: ReplayObservation, state: State, agent: Agent) -> None:
+
+def on_controller_replay_observation(
+    obs: ReplayObservation, state: State, agent: Agent
+) -> None:
     """Handle the observation."""
+    new_phase: ReplayPhase | None = None
     if isinstance(obs, ReplayInternalCmdOutputObservation):
         # NOTE: Currently, the only internal command is the initial-analysis command.
         analysis_tool_metadata = on_replay_internal_command_observation(state, obs)
         if analysis_tool_metadata:
             # Start analysis phase
             state.replay_recording_id = analysis_tool_metadata['recordingId']
-            state.replay_phase = ReplayPhase.Analysis
-            agent.replay_phase_changed(ReplayPhase.Analysis)
+            new_phase = ReplayPhase.Analysis
     elif isinstance(obs, ReplayPhaseUpdateObservation):
+        # Agent action triggered a phase change.
         new_phase = obs.new_phase
+
+    if new_phase:
         if state.replay_phase == new_phase:
             logger.warning(
                 f'Unexpected ReplayPhaseUpdateAction. Already in phase. Observation:\n {repr(obs)}',
             )
         else:
-            state.replay_phase = new_phase
-            agent.replay_phase_changed(new_phase)
+            update_phase(new_phase, state, agent)
 
 
-def get_replay_observation_message(
+def on_agent_replay_observation(
     obs: ReplayObservation, max_message_chars: int
 ) -> Message:
     """Create a message to explain the observation."""
+    text: str
     if isinstance(obs, ReplayToolCmdOutputObservation):
-        # if it doesn't have tool call metadata, it was triggered by a user action
+        # Internal command result from an automatic or user-triggered replay command.
         if obs.tool_call_metadata is None:
+            # If it doesn't have tool call metadata, it was triggered by a user action.
             text = truncate_content(
                 f'\nObserved result of replay command executed by user:\n{obs.content}',
                 max_message_chars,
             )
         else:
             text = obs.content
-        message = Message(role='user', content=[TextContent(text=text)])
     elif isinstance(obs, ReplayPhaseUpdateObservation):
-        new_phase = obs.new_phase
-        if new_phase == ReplayPhase.Edit:
-            text = replay_prompt_phase_edit(obs)
-        else:
-            raise NotImplementedError(f'Unhandled ReplayPhaseUpdateAction: {new_phase}')
-        message = Message(role='user', content=[TextContent(text=text)])
+        # Agent requested a phase update.
+        text = get_new_phase_prompt(obs)
     else:
         raise NotImplementedError(
             f"Unhandled observation type: {obs.__class__.__name__} ({getattr(obs, 'observation', None)})"
         )
-    return message
+    return Message(role='user', content=[TextContent(text=text)])
+
+
+# ###########################################################################
+# Prompts.
+# ###########################################################################
+
+
+def get_new_phase_prompt(obs: ReplayPhaseUpdateObservation) -> str:
+    """Get the prompt for the new phase."""
+    new_phase = obs.new_phase
+    if new_phase == ReplayPhase.Edit:
+        new_phase_prompt = replay_prompt_phase_edit(obs)
+    else:
+        raise NotImplementedError(f'Unhandled ReplayPhaseUpdateAction: {new_phase}')
+    return new_phase_prompt
+
+
+# ###########################################################################
+# State machine transitions.
+# ###########################################################################
+
+
+def update_phase(new_phase: ReplayPhase, state: State, agent: Agent):
+    """Apply phase update side effects."""
+    state.replay_phase = new_phase
+    agent.update_tools(new_phase)
+    logger.info(f'[REPLAY] update_phase (replay_phase): {new_phase}')
diff --git a/openhands/replay/replay_prompts.py b/openhands/replay/replay_prompts.py
index 860c3ed3b155..31a35f0574b5 100644
--- a/openhands/replay/replay_prompts.py
+++ b/openhands/replay/replay_prompts.py
@@ -23,45 +23,11 @@ def replay_prompt_phase_analysis(command_result: dict, prompt: str) -> str:
 3. Then use the `inspect-*` tools to investigate.
 4. Once found, `submit-hypothesis`.
 
-
 # Initial Analysis
 """ + json.dumps(command_result, indent=2)
     return enhance_prompt(prompt, prefix, suffix)
 
 
-def replay_prompt_phase_analysis_legacy(command_result: dict, prompt: str) -> str:
-    # Old workflow: initial-analysis left hints in form of source code annotations.
-    annotated_repo_path = command_result.get('annotatedRepo', '')
-    comment_text = command_result.get('commentText', '')
-    react_component_name = command_result.get('reactComponentName', '')
-    console_error = command_result.get('consoleError', '')
-    # start_location = result.get('startLocation', '')
-    start_name = command_result.get('startName', '')
-
-    # TODO: Move this to a prompt template file.
-    if comment_text:
-        if react_component_name:
-            prefix = (
-                f'There is a change needed to the {react_component_name} component.\n'
-            )
-        else:
-            prefix = f'There is a change needed in {annotated_repo_path}:\n'
-        prefix += f'{comment_text}\n\n'
-    elif console_error:
-        prefix = f'There is a change needed in {annotated_repo_path} to fix a console error that has appeared unexpectedly:\n'
-        prefix += f'{console_error}\n\n'
-
-    prefix += '<IMPORTANT>\n'
-    prefix += 'Information about a reproduction of the problem is available in source comments.\n'
-    prefix += 'You must search for these comments and use them to get a better understanding of the problem.\n'
-    prefix += f'The first reproduction comment to search for is named {start_name}. Start your investigation there.\n'
-    prefix += '</IMPORTANT>\n'
-
-    suffix = ''
-
-    return enhance_prompt(prompt, prefix, suffix)
-
-
 def replay_prompt_phase_edit(obs: ReplayPhaseUpdateObservation) -> str:
     # Tell the agent to stop analyzing and start editing:
     return """
@@ -73,3 +39,36 @@ def replay_prompt_phase_edit(obs: ReplayPhaseUpdateObservation) -> str:
   3. Do the `editSuggestions` actually address the issue?
   4. Rephrase the hypothesis so that it is consistent and correct.
 """
+
+
+# def replay_prompt_phase_analysis_legacy(command_result: dict, prompt: str) -> str:
+#     # Old workflow: initial-analysis left hints in form of source code annotations.
+#     annotated_repo_path = command_result.get('annotatedRepo', '')
+#     comment_text = command_result.get('commentText', '')
+#     react_component_name = command_result.get('reactComponentName', '')
+#     console_error = command_result.get('consoleError', '')
+#     # start_location = result.get('startLocation', '')
+#     start_name = command_result.get('startName', '')
+
+#     # TODO: Move this to a prompt template file.
+#     if comment_text:
+#         if react_component_name:
+#             prefix = (
+#                 f'There is a change needed to the {react_component_name} component.\n'
+#             )
+#         else:
+#             prefix = f'There is a change needed in {annotated_repo_path}:\n'
+#         prefix += f'{comment_text}\n\n'
+#     elif console_error:
+#         prefix = f'There is a change needed in {annotated_repo_path} to fix a console error that has appeared unexpectedly:\n'
+#         prefix += f'{console_error}\n\n'
+
+#     prefix += '<IMPORTANT>\n'
+#     prefix += 'Information about a reproduction of the problem is available in source comments.\n'
+#     prefix += 'You must search for these comments and use them to get a better understanding of the problem.\n'
+#     prefix += f'The first reproduction comment to search for is named {start_name}. Start your investigation there.\n'
+#     prefix += '</IMPORTANT>\n'
+
+#     suffix = ''
+
+#     return enhance_prompt(prompt, prefix, suffix)
diff --git a/openhands/replay/replay_tools.py b/openhands/replay/replay_tools.py
index dcc4c19eb32d..be384f92889a 100644
--- a/openhands/replay/replay_tools.py
+++ b/openhands/replay/replay_tools.py
@@ -221,6 +221,10 @@ def handle_replay_tool_call(
 def get_replay_tools(
     replay_phase: ReplayPhase, default_tools: list[ChatCompletionToolParam]
 ) -> list[ChatCompletionToolParam]:
+    if replay_phase == ReplayPhase.Normal:
+        # Use the default tools when not in a Replay-specific phase.
+        return default_tools
+
     analysis_tools = [
         ReplayInspectDataTool,
         ReplayInspectPointTool,

From 1b4823c82295f6935dec5f0ba559e007c100562e Mon Sep 17 00:00:00 2001
From: "D. Seifert" <Domiii@users.noreply.github.com>
Date: Wed, 22 Jan 2025 00:09:14 +0800
Subject: [PATCH 05/17] Proper state machine rules

---
 .../agenthub/codeact_agent/codeact_agent.py   |   6 +-
 openhands/replay/replay_phases.py             |  96 ++++++--
 openhands/replay/replay_prompts.py            |  26 +-
 openhands/replay/replay_tools.py              | 230 ++++++++----------
 4 files changed, 205 insertions(+), 153 deletions(-)

diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index 0014ad9c9c44..a3d9a7651162 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -43,7 +43,7 @@
 from openhands.llm.llm import LLM
 from openhands.replay.replay_initial_analysis import replay_enhance_action
 from openhands.replay.replay_phases import (
-    on_agent_replay_observation,
+    get_replay_observation_prompt,
 )
 from openhands.runtime.plugins import (
     AgentSkillsRequirement,
@@ -257,7 +257,7 @@ def get_observation_message(
             text += f'\n[Command finished with exit code {obs.exit_code}]'
             message = Message(role='user', content=[TextContent(text=text)])
         elif isinstance(obs, ReplayObservation):
-            message = on_agent_replay_observation(obs, max_message_chars)
+            message = get_replay_observation_prompt(obs, max_message_chars)
         elif isinstance(obs, IPythonRunCellObservation):
             text = obs.content
             # replace base64 images with a placeholder
@@ -326,7 +326,7 @@ def update_tools(self, phase: ReplayPhase) -> None:
             replay_phase=phase,
         )
         logger.info(
-            f'[REPLAY] update_tools: {json.dumps([t.function['name'] for t in self.tools], indent=2)}'
+            f'[REPLAY] update_tools: {json.dumps([t["function"]['name'] for t in self.tools], indent=2)}'
         )
 
     def step(self, state: State) -> Action:
diff --git a/openhands/replay/replay_phases.py b/openhands/replay/replay_phases.py
index 6b1957821da2..cd7ea41c56dc 100644
--- a/openhands/replay/replay_phases.py
+++ b/openhands/replay/replay_phases.py
@@ -1,3 +1,5 @@
+"""Replay state machine logic."""
+
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.logger import openhands_logger as logger
@@ -13,7 +15,9 @@
 from openhands.replay.replay_initial_analysis import (
     on_replay_internal_command_observation,
 )
-from openhands.replay.replay_prompts import replay_prompt_phase_edit
+from openhands.replay.replay_prompts import (
+    get_phase_enter_prompt,
+)
 
 # ###########################################################################
 # Phase events.
@@ -45,15 +49,16 @@ def on_controller_replay_observation(
             update_phase(new_phase, state, agent)
 
 
-def on_agent_replay_observation(
+def get_replay_observation_prompt(
     obs: ReplayObservation, max_message_chars: int
 ) -> Message:
-    """Create a message to explain the observation."""
+    """Create a message to explain the Replay observation to the agent."""
     text: str
     if isinstance(obs, ReplayToolCmdOutputObservation):
         # Internal command result from an automatic or user-triggered replay command.
         if obs.tool_call_metadata is None:
             # If it doesn't have tool call metadata, it was triggered by a user action.
+            # TODO: Improve truncation. It currently can cut things off very aggressively.
             text = truncate_content(
                 f'\nObserved result of replay command executed by user:\n{obs.content}',
                 max_message_chars,
@@ -61,8 +66,8 @@ def on_agent_replay_observation(
         else:
             text = obs.content
     elif isinstance(obs, ReplayPhaseUpdateObservation):
-        # Agent requested a phase update.
-        text = get_new_phase_prompt(obs)
+        # A phase transition was requested.
+        text = get_phase_prompt(obs)
     else:
         raise NotImplementedError(
             f"Unhandled observation type: {obs.__class__.__name__} ({getattr(obs, 'observation', None)})"
@@ -71,23 +76,14 @@ def on_agent_replay_observation(
 
 
 # ###########################################################################
-# Prompts.
+# State machine transition.
 # ###########################################################################
 
 
-def get_new_phase_prompt(obs: ReplayPhaseUpdateObservation) -> str:
-    """Get the prompt for the new phase."""
-    new_phase = obs.new_phase
-    if new_phase == ReplayPhase.Edit:
-        new_phase_prompt = replay_prompt_phase_edit(obs)
-    else:
-        raise NotImplementedError(f'Unhandled ReplayPhaseUpdateAction: {new_phase}')
-    return new_phase_prompt
-
-
-# ###########################################################################
-# State machine transitions.
-# ###########################################################################
+def get_phase_prompt(obs) -> str:
+    """Prompt for agent when entering new phase."""
+    # NOTE: We might add more edge types later, but for now we only have 'enter'.
+    return get_phase_enter_prompt(obs)
 
 
 def update_phase(new_phase: ReplayPhase, state: State, agent: Agent):
@@ -95,3 +91,65 @@ def update_phase(new_phase: ReplayPhase, state: State, agent: Agent):
     state.replay_phase = new_phase
     agent.update_tools(new_phase)
     logger.info(f'[REPLAY] update_phase (replay_phase): {new_phase}')
+
+
+# ###########################################################################
+# ReplayStateMachine.
+# ###########################################################################
+
+replay_state_transitions: dict[ReplayPhase, list[ReplayPhase] | None] = {
+    ReplayPhase.Normal: None,
+    ReplayPhase.Analysis: [ReplayPhase.Edit],
+    ReplayPhase.Edit: None,
+}
+
+
+class ReplayStateMachine:
+    def __init__(self):
+        self.forward_edges = replay_state_transitions
+
+        self.reverse_edges: dict[ReplayPhase, list[ReplayPhase] | None] = {}
+
+        for source, targets in self.forward_edges.items():
+            if targets:
+                for target in targets:
+                    reverse_list = self.reverse_edges.get(target, None)
+                    if not reverse_list:
+                        reverse_list = self.reverse_edges[target] = []
+                    assert reverse_list is not None
+                    if source in reverse_list:
+                        raise ValueError(
+                            f'Cycle detected in ReplayStateMachine: {source} -> {target}'
+                        )
+                    reverse_list.append(source)
+
+    def get_unique_parent_phase(self, phase: ReplayPhase) -> ReplayPhase | None:
+        phases = self.get_parent_phases(phase)
+        if not phases:
+            return None
+        if len(phases) == 1:
+            return phases.pop()
+        assert len(phases) > 1
+        raise ValueError(f'Phase {phase} has multiple parent phases: {phases}')
+
+    def get_parent_phases(self, phase: ReplayPhase) -> list[ReplayPhase] | None:
+        return self.reverse_edges.get(phase, list())
+
+    def get_unique_child_phase(self, phase: ReplayPhase) -> ReplayPhase | None:
+        phases = self.get_child_phases(phase)
+        if not phases:
+            return None
+        if len(phases) == 1:
+            return phases.pop()
+        assert len(phases) > 1
+        raise ValueError(f'Phase {phase} has multiple child phases: {phases}')
+
+    def get_child_phases(self, phase: ReplayPhase) -> list[ReplayPhase] | None:
+        return self.forward_edges.get(phase, list())
+
+
+replay_state_machine = ReplayStateMachine()
+
+
+def get_replay_child_phase(phase: ReplayPhase) -> ReplayPhase | None:
+    return replay_state_machine.get_unique_child_phase(phase)
diff --git a/openhands/replay/replay_prompts.py b/openhands/replay/replay_prompts.py
index 31a35f0574b5..e8c421886cc4 100644
--- a/openhands/replay/replay_prompts.py
+++ b/openhands/replay/replay_prompts.py
@@ -1,6 +1,8 @@
 import json
+from typing import Callable, TypedDict, Union
 
 from openhands.core.logger import openhands_logger as logger
+from openhands.core.schema.replay import ReplayPhase
 from openhands.events.observation.replay import ReplayPhaseUpdateObservation
 
 
@@ -21,16 +23,20 @@ def replay_prompt_phase_analysis(command_result: dict, prompt: str) -> str:
 1. State the main problem statement. It MUST address `IMPORTANT_NOTES`. It must make sure that the application won't crash. It must fix the issue.
 2. Propose a plan to fix or investigate with multiple options in order of priority.
 3. Then use the `inspect-*` tools to investigate.
-4. Once found, `submit-hypothesis`.
+4. Once found, `submit`.
 
 # Initial Analysis
 """ + json.dumps(command_result, indent=2)
     return enhance_prompt(prompt, prefix, suffix)
 
 
-def replay_prompt_phase_edit(obs: ReplayPhaseUpdateObservation) -> str:
-    # Tell the agent to stop analyzing and start editing:
-    return """
+class PromptMap(TypedDict):
+    enter: Union[str, Callable[[ReplayPhaseUpdateObservation], str]]
+
+
+phase_prompts: dict[ReplayPhase, PromptMap] = {
+    ReplayPhase.Edit: {
+        'enter': """
 You have concluded the analysis.
 
 IMPORTANT: NOW review, then implement the hypothesized changes using tools. The code is available in the workspace. Start by answering these questions:
@@ -39,6 +45,18 @@ def replay_prompt_phase_edit(obs: ReplayPhaseUpdateObservation) -> str:
   3. Do the `editSuggestions` actually address the issue?
   4. Rephrase the hypothesis so that it is consistent and correct.
 """
+    }
+}
+
+
+def get_phase_enter_prompt(obs: ReplayPhaseUpdateObservation) -> str:
+    """The prompt to be shown to the agent when entering a new phase."""
+    prompts = phase_prompts[obs.new_phase]
+    if not prompts:
+        raise ValueError(f'[REPLAY] No prompt for entering phase: {obs.new_phase}')
+    p = prompts['enter']
+    prompt = p(obs) if callable(p) else p
+    return prompt.strip()
 
 
 # def replay_prompt_phase_analysis_legacy(command_result: dict, prompt: str) -> str:
diff --git a/openhands/replay/replay_tools.py b/openhands/replay/replay_tools.py
index be384f92889a..efa3372241b0 100644
--- a/openhands/replay/replay_tools.py
+++ b/openhands/replay/replay_tools.py
@@ -1,7 +1,4 @@
-"""This file contains the function calling implementation for different actions.
-
-This is similar to the functionality of `CodeActResponseParser`.
-"""
+"""Define tools to interact with Replay recordings and the Replay agentic workflow."""
 
 import json
 
@@ -19,20 +16,25 @@
     ReplayPhaseUpdateAction,
     ReplayToolCmdRunAction,
 )
+from openhands.replay.replay_phases import get_replay_child_phase
 
 
 class ReplayTool(ChatCompletionToolParam):
     pass
 
 
-def replay_tool(**kwargs):
-    f = ChatCompletionToolParamFunctionChunk(**kwargs)
+def replay_tool(name: str, description: str, parameters: dict) -> ReplayTool:
+    f = ChatCompletionToolParamFunctionChunk(
+        name=name, description=description, parameters=parameters
+    )
     return ReplayTool(type='function', function=f)
 
 
-# ---------------------------------------------------------
-# Tool: inspect-data
-# ---------------------------------------------------------
+# ###########################################################################
+# Analysis.
+# ###########################################################################
+
+
 _REPLAY_INSPECT_DATA_DESCRIPTION = """
 Explains value, data flow and origin information for `expression` at `point`.
 IMPORTANT: Prefer using inspect-data over inspect-point.
@@ -50,7 +52,7 @@ def replay_tool(**kwargs):
             },
             'point': {
                 'type': 'string',
-                'description': 'The point at which to inspect the runtime. The first point comes from the `thisPoint` in the Initial analysis.',
+                'description': 'The point at which to inspect the recorded program.',
             },
             'explanation': {
                 'type': 'string',
@@ -65,9 +67,6 @@ def replay_tool(**kwargs):
     },
 )
 
-# ---------------------------------------------------------
-# Tool: inspect-point
-# ---------------------------------------------------------
 _REPLAY_INSPECT_POINT_DESCRIPTION = """
 Explains dynamic control flow and data flow dependencies of the code at `point`.
 Use this tool instead of `inspect-data` only when you don't have a specific data point to investigate.
@@ -85,107 +84,105 @@ def replay_tool(**kwargs):
     },
 )
 
-# ---------------------------------------------------------
-# Tool: SubmitHypothesis
-# TODO: Divide this into multiple steps -
-#   1. The first submission must be as simple as possible to take little computational effort from the analysis steps.
-#   2. The second submission, after analysis has already concluded, must be as complete as possible.
-# ---------------------------------------------------------
-# _REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION = """
-# Your investigation has yielded a complete thin slice from symptom to root cause,
-# enough proof to let the `CodeEdit` agent take over to fix the bug.
-# DO NOT GUESS. You must provide exact code in the exact right location to fix this bug,
-# based on evidence you have gathered.
-# """
-
-# ReplaySubmitHypothesisTool = ReplayToolDefinition(
-#     name='submit-hypothesis',
-#     description=_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION.strip(),
-#     parameters={
-#         'type': 'object',
-#         'properties': {
-#             'rootCauseHypothesis': {'type': 'string'},
-#             'thinSlice': {
-#                 'type': 'array',
-#                 'items': {
-#                     'type': 'object',
-#                     'properties': {
-#                         'point': {'type': 'string'},
-#                         'code': {'type': 'string'},
-#                         'role': {'type': 'string'},
-#                     },
-#                     'required': ['point', 'code', 'role'],
-#                 },
-#             },
-#             'modifications': {
-#                 'type': 'array',
-#                 'items': {
-#                     'type': 'object',
-#                     'properties': {
-#                         'kind': {
-#                             'type': 'string',
-#                             'enum': ['add', 'remove', 'modify'],
-#                         },
-#                         'newCode': {'type': 'string'},
-#                         'oldCode': {'type': 'string'},
-#                         'location': {'type': 'string'},
-#                         'point': {'type': 'string'},
-#                         # NOTE: Even though, we really want the `line` here, it will lead to much worse performance because the agent has a hard time computing correct line numbers from its point-based investigation.
-#                         # Instead of requiring a line number, the final fix will be more involved, as explained in the issue.
-#                         # see: https://linear.app/replay/issue/PRO-939/use-tools-data-flow-analysis-for-10608#comment-3b7ae176
-#                         # 'line': {'type': 'number'},
-#                         'briefExplanation': {'type': 'string'},
-#                         'verificationProof': {'type': 'string'},
-#                     },
-#                     'required': [
-#                         'kind',
-#                         'location',
-#                         'briefExplanation',
-#                         # 'line',
-#                         'verificationProof',
-#                     ],
-#                 },
-#             },
-#         },
-#         'required': ['rootCauseHypothesis', 'thinSlice', 'modifications'],
-#     },
-# )
-_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION = """
-# Use this tool to conclude your analysis and move on to code editing.
-# """
-
-ReplaySubmitHypothesisTool = replay_tool(
-    name='submit-hypothesis',
-    description=_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION.strip(),
-    parameters={
-        'type': 'object',
-        'properties': {
-            'problem': {
-                'type': 'string',
-                'description': 'One-sentence explanation of the core problem that this will solve.',
-            },
-            'rootCauseHypothesis': {'type': 'string'},
-            'editSuggestions': {
-                'type': 'string',
-                'description': 'Provide suggestions to fix the bug, if you know enough about the code that requires modification.',
+
+# ###########################################################################
+# Phase transitions + submissions.
+# ###########################################################################
+
+
+class ReplaySubmitTool(ReplayTool):
+    new_phase: ReplayPhase
+
+
+def replay_submit_tool(
+    new_phase: ReplayPhase, name: str, description: str, parameters: dict
+):
+    return ReplaySubmitTool(
+        new_phase=new_phase,
+        type='function',
+        function=ChatCompletionToolParamFunctionChunk(
+            name=name, description=description, parameters=parameters
+        ),
+    )
+
+
+replay_phase_transition_tools: list[ReplaySubmitTool] = [
+    replay_submit_tool(
+        ReplayPhase.Edit,
+        'submit',
+        """Conclude your analysis.""",
+        {
+            'type': 'object',
+            'properties': {
+                'problem': {
+                    'type': 'string',
+                    'description': 'One-sentence explanation of the core problem that this will solve.',
+                },
+                'rootCauseHypothesis': {'type': 'string'},
+                'editSuggestions': {
+                    'type': 'string',
+                    'description': 'Provide suggestions to fix the bug, if you know enough about the code that requires modification.',
+                },
             },
+            'required': ['problem', 'rootCauseHypothesis'],
         },
-        'required': ['rootCauseHypothesis'],
-    },
-)
+    )
+]
 
-replay_tools: list[ReplayTool] = [
+# ###########################################################################
+# Bookkeeping + utilities.
+# ###########################################################################
+
+replay_analysis_tools: list[ReplayTool] = [
     ReplayInspectDataTool,
     ReplayInspectPointTool,
-    ReplaySubmitHypothesisTool,
 ]
-replay_tool_names: set[str] = set([t.function['name'] for t in replay_tools])
+
+replay_tools: list[ReplayTool] = [
+    *replay_analysis_tools,
+    *replay_phase_transition_tools,
+]
+replay_tool_names: set[str] = set([t['function']['name'] for t in replay_tools])
 
 
 def is_replay_tool(tool_name: str) -> bool:
     return tool_name in replay_tool_names
 
 
+# ###########################################################################
+# Compute tools based on the current ReplayPhase.
+# ###########################################################################
+
+
+def get_replay_tools(
+    replay_phase: ReplayPhase, default_tools: list[ChatCompletionToolParam]
+) -> list[ChatCompletionToolParam]:
+    if replay_phase == ReplayPhase.Normal:
+        # Use the default tools when not in a Replay-specific phase.
+        tools = default_tools
+    elif replay_phase == ReplayPhase.Analysis:
+        # Only allow analysis in this phase.
+        tools = replay_analysis_tools
+    elif replay_phase == ReplayPhase.Edit:
+        # Combine default and analysis tools.
+        tools = default_tools + replay_analysis_tools
+    else:
+        raise ValueError(f'Unhandled ReplayPhase in get_tools: {replay_phase}')
+
+    # Add phase transition tools.
+    next_phase = get_replay_child_phase(replay_phase)
+    if next_phase:
+        tools += [t for t in replay_phase_transition_tools if t.new_phase == next_phase]
+
+    # Return all tools.
+    return tools
+
+
+# ###########################################################################
+# Tool call handling.
+# ###########################################################################
+
+
 def handle_replay_tool_call(
     tool_call: ChatCompletionMessageToolCall, arguments: dict, state: State
 ) -> ReplayAction:
@@ -194,7 +191,7 @@ def handle_replay_tool_call(
     )
     action: ReplayAction
     if tool_call.function.name == 'inspect-data':
-        # Remove explanation props.
+        # Remove explanation arguments. Those are only used for self-consistency.
         arguments = {k: v for k, v in arguments.items() if 'explanation' not in k}
         action = ReplayToolCmdRunAction(
             command_name='inspect-data',
@@ -207,34 +204,13 @@ def handle_replay_tool_call(
             command_name='inspect-point',
             command_args=arguments | {'recordingId': state.replay_recording_id},
         )
-    elif tool_call.function.name == 'submit-hypothesis':
+    elif isinstance(tool_call, ReplaySubmitTool):
+        # Request a phase change.
         action = ReplayPhaseUpdateAction(
-            new_phase=ReplayPhase.Edit, info=json.dumps(arguments)
+            new_phase=tool_call.new_phase, info=json.dumps(arguments)
         )
     else:
         raise ValueError(
             f'Unknown Replay tool. Make sure to add them all to REPLAY_TOOLS: {tool_call.function.name}'
         )
     return action
-
-
-def get_replay_tools(
-    replay_phase: ReplayPhase, default_tools: list[ChatCompletionToolParam]
-) -> list[ChatCompletionToolParam]:
-    if replay_phase == ReplayPhase.Normal:
-        # Use the default tools when not in a Replay-specific phase.
-        return default_tools
-
-    analysis_tools = [
-        ReplayInspectDataTool,
-        ReplayInspectPointTool,
-    ]
-    if replay_phase == ReplayPhase.Analysis:
-        # Analysis tools only. This phase is concluded upon submit-hypothesis.
-        tools = analysis_tools + [ReplaySubmitHypothesisTool]
-    elif replay_phase == ReplayPhase.Edit:
-        # Combine default and analysis tools.
-        tools = default_tools + analysis_tools
-    else:
-        raise ValueError(f'Unhandled ReplayPhase in get_tools: {replay_phase}')
-    return tools

From 84fedb9f8539b3b7a0d17fc7db56b575f6aa78d2 Mon Sep 17 00:00:00 2001
From: "D. Seifert" <Domiii@users.noreply.github.com>
Date: Wed, 22 Jan 2025 00:13:39 +0800
Subject: [PATCH 06/17] tool bug fixes

---
 openhands/replay/replay_tools.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/openhands/replay/replay_tools.py b/openhands/replay/replay_tools.py
index efa3372241b0..ddf00e77ae60 100644
--- a/openhands/replay/replay_tools.py
+++ b/openhands/replay/replay_tools.py
@@ -169,10 +169,16 @@ def get_replay_tools(
     else:
         raise ValueError(f'Unhandled ReplayPhase in get_tools: {replay_phase}')
 
-    # Add phase transition tools.
+    # Add tools to allow transitioning to next phase.
     next_phase = get_replay_child_phase(replay_phase)
     if next_phase:
-        tools += [t for t in replay_phase_transition_tools if t.new_phase == next_phase]
+        transition_tools = [
+            t for t in replay_phase_transition_tools if t['new_phase'] == next_phase
+        ]
+        assert len(
+            transition_tools
+        ), f'replay_phase_transition_tools is missing tools for new_phase: {next_phase}'
+        tools += transition_tools
 
     # Return all tools.
     return tools
@@ -207,7 +213,7 @@ def handle_replay_tool_call(
     elif isinstance(tool_call, ReplaySubmitTool):
         # Request a phase change.
         action = ReplayPhaseUpdateAction(
-            new_phase=tool_call.new_phase, info=json.dumps(arguments)
+            new_phase=tool_call['new_phase'], info=json.dumps(arguments)
         )
     else:
         raise ValueError(

From 5742da3402de976f757232a8fcab670af31d5e09 Mon Sep 17 00:00:00 2001
From: "D. Seifert" <Domiii@users.noreply.github.com>
Date: Wed, 22 Jan 2025 00:17:02 +0800
Subject: [PATCH 07/17] whoops

---
 openhands/agenthub/codeact_agent/function_calling.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py
index b81b6ee2ecab..86bfaceeec5d 100644
--- a/openhands/agenthub/codeact_agent/function_calling.py
+++ b/openhands/agenthub/codeact_agent/function_calling.py
@@ -474,7 +474,7 @@ def response_to_actions(response: ModelResponse, state: State) -> list[Action]:
             if tool_call.function.name == 'execute_bash':
                 action = CmdRunAction(**arguments)
             elif is_replay_tool(tool_call.function.name):
-                handle_replay_tool_call(tool_call, arguments, state)
+                action = handle_replay_tool_call(tool_call, arguments, state)
             elif tool_call.function.name == 'execute_ipython_cell':
                 action = IPythonRunCellAction(**arguments)
             elif tool_call.function.name == 'delegate_to_browsing_agent':
@@ -500,6 +500,7 @@ def response_to_actions(response: ModelResponse, state: State) -> list[Action]:
                 raise FunctionCallNotExistsError(
                     f'Tool {tool_call.function.name} is not registered. (arguments: {arguments}). Please check the tool name and retry with an existing tool.'
                 )
+            assert action
 
             # We only add thought to the first action
             if i == 0:

From cd2a71516335f59174c740ed1729e1def96b10ad Mon Sep 17 00:00:00 2001
From: "D. Seifert" <Domiii@users.noreply.github.com>
Date: Wed, 22 Jan 2025 00:31:09 +0800
Subject: [PATCH 08/17] fixing python things

---
 openhands/replay/replay_tools.py | 75 ++++++++++++++++++++------------
 1 file changed, 47 insertions(+), 28 deletions(-)

diff --git a/openhands/replay/replay_tools.py b/openhands/replay/replay_tools.py
index ddf00e77ae60..1ee31d929dba 100644
--- a/openhands/replay/replay_tools.py
+++ b/openhands/replay/replay_tools.py
@@ -1,6 +1,7 @@
 """Define tools to interact with Replay recordings and the Replay agentic workflow."""
 
 import json
+from enum import Enum
 
 from litellm import (
     ChatCompletionMessageToolCall,
@@ -19,15 +20,24 @@
 from openhands.replay.replay_phases import get_replay_child_phase
 
 
+class ReplayToolType(Enum):
+    Analysis = ('analysis',)
+    PhaseTransition = 'phase_transition'
+
+
 class ReplayTool(ChatCompletionToolParam):
-    pass
+    replay_tool_type: ReplayToolType
+
+
+class ReplayAnalysisTool(ReplayTool):
+    replay_tool_type = ReplayToolType.Analysis
 
 
 def replay_tool(name: str, description: str, parameters: dict) -> ReplayTool:
     f = ChatCompletionToolParamFunctionChunk(
         name=name, description=description, parameters=parameters
     )
-    return ReplayTool(type='function', function=f)
+    return ReplayAnalysisTool(type='function', function=f)
 
 
 # ###########################################################################
@@ -90,14 +100,15 @@ def replay_tool(name: str, description: str, parameters: dict) -> ReplayTool:
 # ###########################################################################
 
 
-class ReplaySubmitTool(ReplayTool):
+class ReplayPhaseTransitionTool(ReplayTool):
+    replay_tool_type = ReplayToolType.PhaseTransition
     new_phase: ReplayPhase
 
 
-def replay_submit_tool(
+def replay_phase_tool(
     new_phase: ReplayPhase, name: str, description: str, parameters: dict
 ):
-    return ReplaySubmitTool(
+    return ReplayPhaseTransitionTool(
         new_phase=new_phase,
         type='function',
         function=ChatCompletionToolParamFunctionChunk(
@@ -106,8 +117,8 @@ def replay_submit_tool(
     )
 
 
-replay_phase_transition_tools: list[ReplaySubmitTool] = [
-    replay_submit_tool(
+replay_phase_transition_tools: list[ReplayPhaseTransitionTool] = [
+    replay_phase_tool(
         ReplayPhase.Edit,
         'submit',
         """Conclude your analysis.""",
@@ -143,10 +154,19 @@ def replay_submit_tool(
     *replay_phase_transition_tools,
 ]
 replay_tool_names: set[str] = set([t['function']['name'] for t in replay_tools])
+replay_replay_tool_type_by_name = {
+    t['function']['name']: t['function'].get('replay_tool_type', None)
+    for t in replay_tools
+}
 
 
-def is_replay_tool(tool_name: str) -> bool:
-    return tool_name in replay_tool_names
+def is_replay_tool(
+    tool_name: str, replay_tool_type: ReplayToolType | None = None
+) -> bool:
+    own_tool_type = replay_replay_tool_type_by_name.get(tool_name, None)
+    if not own_tool_type:
+        return False
+    return replay_tool_type is None or own_tool_type == replay_tool_type
 
 
 # ###########################################################################
@@ -196,27 +216,26 @@ def handle_replay_tool_call(
         f'[REPLAY] TOOL_CALL {tool_call.function.name} - arguments: {json.dumps(arguments, indent=2)}'
     )
     action: ReplayAction
-    if tool_call.function.name == 'inspect-data':
-        # Remove explanation arguments. Those are only used for self-consistency.
-        arguments = {k: v for k, v in arguments.items() if 'explanation' not in k}
-        action = ReplayToolCmdRunAction(
-            command_name='inspect-data',
-            command_args=arguments | {'recordingId': state.replay_recording_id},
-        )
-    elif tool_call.function.name == 'inspect-point':
-        # if arguments['expression'] == 'wiredRules':   # hackfix for 10608 experiment
-        #     raise FunctionCallValidationError(f'wiredRules is irrelevant to the problem. Try something else.')
-        action = ReplayToolCmdRunAction(
-            command_name='inspect-point',
-            command_args=arguments | {'recordingId': state.replay_recording_id},
-        )
-    elif isinstance(tool_call, ReplaySubmitTool):
+    name = tool_call.function.name
+    if is_replay_tool(name, ReplayToolType.Analysis):
+        if name == 'inspect-data':
+            # Remove explanation arguments. Those are only used for self-consistency.
+            arguments = {k: v for k, v in arguments.items() if 'explanation' not in k}
+            action = ReplayToolCmdRunAction(
+                command_name='inspect-data',
+                command_args=arguments | {'recordingId': state.replay_recording_id},
+            )
+        elif name == 'inspect-point':
+            # if arguments['expression'] == 'wiredRules':   # hackfix for 10608 experiment
+            #     raise FunctionCallValidationError(f'wiredRules is irrelevant to the problem. Try something else.')
+            action = ReplayToolCmdRunAction(
+                command_name='inspect-point',
+                command_args=arguments | {'recordingId': state.replay_recording_id},
+            )
+    elif is_replay_tool(name, ReplayToolType.PhaseTransition):
         # Request a phase change.
         action = ReplayPhaseUpdateAction(
             new_phase=tool_call['new_phase'], info=json.dumps(arguments)
         )
-    else:
-        raise ValueError(
-            f'Unknown Replay tool. Make sure to add them all to REPLAY_TOOLS: {tool_call.function.name}'
-        )
+    assert action, f'Unhandled Replay tool_call: {tool_call.function.name}'
     return action

From a4b540caddca0efb39d93d5f282719b91a8b4446 Mon Sep 17 00:00:00 2001
From: "D. Seifert" <Domiii@users.noreply.github.com>
Date: Wed, 22 Jan 2025 02:49:19 +0800
Subject: [PATCH 09/17] WIP: tool call fixes

---
 openhands/replay/replay_tools.py   | 101 +++++++++++++++++++----------
 replay_benchmarks/bolt/run-bolt.sh |  11 +++-
 2 files changed, 73 insertions(+), 39 deletions(-)

diff --git a/openhands/replay/replay_tools.py b/openhands/replay/replay_tools.py
index 1ee31d929dba..2508f6e52f52 100644
--- a/openhands/replay/replay_tools.py
+++ b/openhands/replay/replay_tools.py
@@ -21,7 +21,7 @@
 
 
 class ReplayToolType(Enum):
-    Analysis = ('analysis',)
+    Analysis = 'analysis'
     PhaseTransition = 'phase_transition'
 
 
@@ -33,11 +33,36 @@ class ReplayAnalysisTool(ReplayTool):
     replay_tool_type = ReplayToolType.Analysis
 
 
-def replay_tool(name: str, description: str, parameters: dict) -> ReplayTool:
-    f = ChatCompletionToolParamFunctionChunk(
-        name=name, description=description, parameters=parameters
+def replay_analysis_tool(name: str, description: str, parameters: dict) -> ReplayTool:
+    tool = ReplayAnalysisTool(
+        replay_tool_type=ReplayToolType.Analysis,
+        type='function',
+        function=ChatCompletionToolParamFunctionChunk(
+            name=name, description=description, parameters=parameters
+        ),
     )
-    return ReplayAnalysisTool(type='function', function=f)
+    return tool
+
+
+class ReplayPhaseTransitionTool(ReplayTool):
+    replay_tool_type = ReplayToolType.PhaseTransition
+    new_phase: ReplayPhase
+
+
+def replay_phase_tool(
+    new_phase: ReplayPhase, name: str, description: str, parameters: dict
+):
+    tool = ReplayPhaseTransitionTool(
+        replay_tool_type=ReplayToolType.PhaseTransition,
+        new_phase=new_phase,
+        type='function',
+        function=ChatCompletionToolParamFunctionChunk(
+            name=name,
+            description=description,
+            parameters=parameters,
+        ),
+    )
+    return tool
 
 
 # ###########################################################################
@@ -50,7 +75,7 @@ def replay_tool(name: str, description: str, parameters: dict) -> ReplayTool:
 IMPORTANT: Prefer using inspect-data over inspect-point.
 """
 
-ReplayInspectDataTool = replay_tool(
+ReplayInspectDataTool = replay_analysis_tool(
     name='inspect-data',
     description=_REPLAY_INSPECT_DATA_DESCRIPTION.strip(),
     parameters={
@@ -82,7 +107,7 @@ def replay_tool(name: str, description: str, parameters: dict) -> ReplayTool:
 Use this tool instead of `inspect-data` only when you don't have a specific data point to investigate.
 """
 
-ReplayInspectPointTool = replay_tool(
+ReplayInspectPointTool = replay_analysis_tool(
     name='inspect-point',
     description=_REPLAY_INSPECT_POINT_DESCRIPTION.strip(),
     parameters={
@@ -100,23 +125,6 @@ def replay_tool(name: str, description: str, parameters: dict) -> ReplayTool:
 # ###########################################################################
 
 
-class ReplayPhaseTransitionTool(ReplayTool):
-    replay_tool_type = ReplayToolType.PhaseTransition
-    new_phase: ReplayPhase
-
-
-def replay_phase_tool(
-    new_phase: ReplayPhase, name: str, description: str, parameters: dict
-):
-    return ReplayPhaseTransitionTool(
-        new_phase=new_phase,
-        type='function',
-        function=ChatCompletionToolParamFunctionChunk(
-            name=name, description=description, parameters=parameters
-        ),
-    )
-
-
 replay_phase_transition_tools: list[ReplayPhaseTransitionTool] = [
     replay_phase_tool(
         ReplayPhase.Edit,
@@ -155,8 +163,7 @@ def replay_phase_tool(
 ]
 replay_tool_names: set[str] = set([t['function']['name'] for t in replay_tools])
 replay_replay_tool_type_by_name = {
-    t['function']['name']: t['function'].get('replay_tool_type', None)
-    for t in replay_tools
+    t['function']['name']: t.get('replay_tool_type', None) for t in replay_tools
 }
 
 
@@ -174,6 +181,24 @@ def is_replay_tool(
 # ###########################################################################
 
 
+def get_replay_transition_tool_for_current_phase(
+    current_phase: ReplayPhase, name: str | None = None
+) -> ReplayTool | None:
+    next_phase = get_replay_child_phase(current_phase)
+    if next_phase:
+        transition_tools = [
+            t
+            for t in replay_phase_transition_tools
+            if t['new_phase'] == next_phase
+            and (not name or t['function']['name'] == name)
+        ]
+        assert len(
+            transition_tools
+        ), f'replay_phase_transition_tools is missing tools for new_phase: {next_phase}'
+        return transition_tools[0]
+    return None
+
+
 def get_replay_tools(
     replay_phase: ReplayPhase, default_tools: list[ChatCompletionToolParam]
 ) -> list[ChatCompletionToolParam]:
@@ -190,15 +215,9 @@ def get_replay_tools(
         raise ValueError(f'Unhandled ReplayPhase in get_tools: {replay_phase}')
 
     # Add tools to allow transitioning to next phase.
-    next_phase = get_replay_child_phase(replay_phase)
-    if next_phase:
-        transition_tools = [
-            t for t in replay_phase_transition_tools if t['new_phase'] == next_phase
-        ]
-        assert len(
-            transition_tools
-        ), f'replay_phase_transition_tools is missing tools for new_phase: {next_phase}'
-        tools += transition_tools
+    next_phase_tool = get_replay_transition_tool_for_current_phase(replay_phase)
+    if next_phase_tool:
+        tools.append(next_phase_tool)
 
     # Return all tools.
     return tools
@@ -234,8 +253,18 @@ def handle_replay_tool_call(
             )
     elif is_replay_tool(name, ReplayToolType.PhaseTransition):
         # Request a phase change.
+        tool = get_replay_transition_tool_for_current_phase(state.replay_phase, name)
+        assert tool, f'Missing ReplayPhaseTransitionTool for {state.replay_phase} in Replay tool_call: {tool_call.function.name}'
+        new_phase = tool['new_phase']
+        assert (
+            new_phase
+        ), f'Missing new_phase in Replay tool_call: {tool_call.function.name}'
+        assert (
+            new_phase
+        ), f'Missing new_phase in Replay tool_call: {tool_call.function.name}'
+        del arguments['new_phase']
         action = ReplayPhaseUpdateAction(
-            new_phase=tool_call['new_phase'], info=json.dumps(arguments)
+            new_phase=new_phase, info=json.dumps(arguments)
         )
     assert action, f'Unhandled Replay tool_call: {tool_call.function.name}'
     return action
diff --git a/replay_benchmarks/bolt/run-bolt.sh b/replay_benchmarks/bolt/run-bolt.sh
index c47d6751d1ce..bb2515c5606a 100755
--- a/replay_benchmarks/bolt/run-bolt.sh
+++ b/replay_benchmarks/bolt/run-bolt.sh
@@ -9,7 +9,7 @@ INSTANCE_ID=$1
 PROMPT_NAME="$2"
 
 THIS_DIR="$(dirname "$0")"
-OH_ROOT="$THIS_DIR/.."
+OH_ROOT="$THIS_DIR/../.."
 OH_ROOT="$(node -e 'console.log(require("path").resolve(process.argv[1]))' $OH_ROOT)"
 if [[ -z "$TMP_DIR" ]]; then
     TMP_DIR="/tmp"
@@ -65,11 +65,14 @@ else
 fi
 
 # Config overrides + sanity checks.
+set -x
 export DEBUG=1
 # export REPLAY_DEV_MODE=1
 export REPLAY_ENABLE_TOOL_CACHE=1
 export WORKSPACE_BASE="$WORKSPACE_ROOT"
 export LLM_MODEL="anthropic/claude-3-5-sonnet-20241022"
+set +x
+
 if [[ -z "$LLM_API_KEY" ]]; then
     if [[ -z "$ANTHROPIC_API_KEY" ]]; then
         echo "LLM_API_KEY or ANTHROPIC_API_KEY environment variable must be set."
@@ -84,9 +87,11 @@ echo "WORKSPACE_ROOT: \"$WORKSPACE_ROOT\""
 echo "Logging to \"$LOG_FILE\"..."
 
 # GO.
+PROMPT_ONELINE=$(echo "$PROMPT" | tr '\n' " ")
 cd $OH_ROOT
-poetry run python -m openhands.core.main -t "$PROMPT" \
-    > "$LOG_FILE" 2>&1
+set -x
+poetry run python -m openhands.core.main -t "$PROMPT_ONELINE" >"${LOG_FILE}" 2>&1
+set +x
 
 
 # Log the relevant diff.

From 9914f0328d0f528ab856b379eebfd85f47fd34c1 Mon Sep 17 00:00:00 2001
From: "D. Seifert" <Domiii@users.noreply.github.com>
Date: Fri, 24 Jan 2025 20:14:47 +0800
Subject: [PATCH 10/17] State machine fixes

---
 openhands/controller/agent_controller.py      |  5 ++++-
 openhands/controller/state/state.py           |  2 ++
 openhands/replay/replay_initial_analysis.py   | 14 ++++++--------
 openhands/replay/replay_phases.py             | 19 +++++++++++--------
 openhands/replay/replay_tools.py              |  4 ++--
 .../unit/replay/test_replay_state_machine.py  |  6 ++++++
 tests/unit/replay/test_replay_tools.py        | 14 ++++++++++++++
 7 files changed, 45 insertions(+), 19 deletions(-)
 create mode 100644 tests/unit/replay/test_replay_state_machine.py
 create mode 100644 tests/unit/replay/test_replay_tools.py

diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index 9a2796f0aab1..8b46a16ad5aa 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -53,7 +53,6 @@
 )
 from openhands.events.serialization.event import truncate_content
 from openhands.llm.llm import LLM
-from openhands.replay.replay_phases import on_controller_replay_observation
 from openhands.utils.shutdown_listener import should_continue
 
 # note: RESUME is only available on web GUI
@@ -296,6 +295,10 @@ async def _handle_observation(self, observation: Observation) -> None:
         if self._pending_action and self._pending_action.id == observation.cause:
             self._pending_action = None
             if isinstance(observation, ReplayObservation):
+                from openhands.replay.replay_phases import (
+                    on_controller_replay_observation,
+                )
+
                 on_controller_replay_observation(observation, self.state, self.agent)
 
             if self.state.agent_state == AgentState.USER_CONFIRMED:
diff --git a/openhands/controller/state/state.py b/openhands/controller/state/state.py
index f279bad954b1..6615e64b63f0 100644
--- a/openhands/controller/state/state.py
+++ b/openhands/controller/state/state.py
@@ -101,6 +101,8 @@ class State:
     extra_data: dict[str, Any] = field(default_factory=dict)
     last_error: str = ''
 
+    replay_enhance_prompt_id: int = -1
+    replay_enhanced: bool = False
     replay_recording_id: str = ''
     replay_phase: ReplayPhase = ReplayPhase.Normal
 
diff --git a/openhands/replay/replay_initial_analysis.py b/openhands/replay/replay_initial_analysis.py
index 164f1c72aa3f..1c18c37f7e64 100644
--- a/openhands/replay/replay_initial_analysis.py
+++ b/openhands/replay/replay_initial_analysis.py
@@ -44,8 +44,7 @@ def start_initial_analysis(
 
 
 def replay_enhance_action(state: State, is_workspace_repo: bool) -> Action | None:
-    enhance_action_id = state.extra_data.get('replay_enhance_prompt_id')
-    if enhance_action_id is None:
+    if state.replay_enhance_prompt_id == -1:
         # 1. Get current user prompt.
         latest_user_message = state.get_last_user_message()
         if latest_user_message:
@@ -57,7 +56,7 @@ def replay_enhance_action(state: State, is_workspace_repo: bool) -> Action | Non
                 logger.debug(
                     f'[REPLAY] Enhancing prompt for Replay recording "{recording_id}"...'
                 )
-                state.extra_data['replay_enhance_prompt_id'] = latest_user_message.id
+                state.replay_enhance_prompt_id = latest_user_message.id
                 logger.info('[REPLAY] stored latest_user_message id in state')
                 return start_initial_analysis(
                     latest_user_message.content, is_workspace_repo
@@ -89,19 +88,18 @@ def on_replay_internal_command_observation(
     Enhance the user prompt with the results of the initial analysis.
     Returns the metadata needed for the agent to switch to analysis tools.
     """
-    enhance_action_id = state.extra_data.get('replay_enhance_prompt_id')
-    enhance_observed = state.extra_data.get('replay_enhance_observed', False)
-    if enhance_action_id is not None and not enhance_observed:
+    if state.replay_enhance_prompt_id != -1 and not state.replay_enhanced:
         user_message: MessageAction | None = next(
             (
                 m
                 for m in state.history
-                if m.id == enhance_action_id and isinstance(m, MessageAction)
+                if m.id == state.replay_enhance_prompt_id
+                and isinstance(m, MessageAction)
             ),
             None,
         )
         assert user_message
-        state.extra_data['replay_enhance_observed'] = True
+        state.replay_enhanced = True
 
         # Deserialize stringified result.
         result = safe_parse_json(observation.content)
diff --git a/openhands/replay/replay_phases.py b/openhands/replay/replay_phases.py
index cd7ea41c56dc..22746420c8e9 100644
--- a/openhands/replay/replay_phases.py
+++ b/openhands/replay/replay_phases.py
@@ -97,18 +97,21 @@ def update_phase(new_phase: ReplayPhase, state: State, agent: Agent):
 # ReplayStateMachine.
 # ###########################################################################
 
-replay_state_transitions: dict[ReplayPhase, list[ReplayPhase] | None] = {
+replay_state_transitions: dict[ReplayPhase, tuple[ReplayPhase, ...] | None] = {
     ReplayPhase.Normal: None,
-    ReplayPhase.Analysis: [ReplayPhase.Edit],
+    ReplayPhase.Analysis: (ReplayPhase.Edit,),
     ReplayPhase.Edit: None,
 }
 
 
+# This machine represents state transitions that the agent can make.
 class ReplayStateMachine:
     def __init__(self):
-        self.forward_edges = replay_state_transitions
-
-        self.reverse_edges: dict[ReplayPhase, list[ReplayPhase] | None] = {}
+        self.forward_edges: dict[ReplayPhase, list[ReplayPhase] | None] = {
+            phase: list(targets) if targets is not None else None
+            for phase, targets in replay_state_transitions.items()
+        }
+        self.reverse_edges: dict[ReplayPhase, list[ReplayPhase]] = {}
 
         for source, targets in self.forward_edges.items():
             if targets:
@@ -128,7 +131,7 @@ def get_unique_parent_phase(self, phase: ReplayPhase) -> ReplayPhase | None:
         if not phases:
             return None
         if len(phases) == 1:
-            return phases.pop()
+            return phases[0]
         assert len(phases) > 1
         raise ValueError(f'Phase {phase} has multiple parent phases: {phases}')
 
@@ -140,7 +143,7 @@ def get_unique_child_phase(self, phase: ReplayPhase) -> ReplayPhase | None:
         if not phases:
             return None
         if len(phases) == 1:
-            return phases.pop()
+            return phases[0]
         assert len(phases) > 1
         raise ValueError(f'Phase {phase} has multiple child phases: {phases}')
 
@@ -151,5 +154,5 @@ def get_child_phases(self, phase: ReplayPhase) -> list[ReplayPhase] | None:
 replay_state_machine = ReplayStateMachine()
 
 
-def get_replay_child_phase(phase: ReplayPhase) -> ReplayPhase | None:
+def get_next_agent_replay_phase(phase: ReplayPhase) -> ReplayPhase | None:
     return replay_state_machine.get_unique_child_phase(phase)
diff --git a/openhands/replay/replay_tools.py b/openhands/replay/replay_tools.py
index 2508f6e52f52..4b90da7e27ff 100644
--- a/openhands/replay/replay_tools.py
+++ b/openhands/replay/replay_tools.py
@@ -17,7 +17,7 @@
     ReplayPhaseUpdateAction,
     ReplayToolCmdRunAction,
 )
-from openhands.replay.replay_phases import get_replay_child_phase
+from openhands.replay.replay_phases import get_next_agent_replay_phase
 
 
 class ReplayToolType(Enum):
@@ -184,7 +184,7 @@ def is_replay_tool(
 def get_replay_transition_tool_for_current_phase(
     current_phase: ReplayPhase, name: str | None = None
 ) -> ReplayTool | None:
-    next_phase = get_replay_child_phase(current_phase)
+    next_phase = get_next_agent_replay_phase(current_phase)
     if next_phase:
         transition_tools = [
             t
diff --git a/tests/unit/replay/test_replay_state_machine.py b/tests/unit/replay/test_replay_state_machine.py
new file mode 100644
index 000000000000..225c14068146
--- /dev/null
+++ b/tests/unit/replay/test_replay_state_machine.py
@@ -0,0 +1,6 @@
+from openhands.core.schema.replay import ReplayPhase
+from openhands.replay.replay_phases import get_next_agent_replay_phase
+
+
+def test_get_next_agent_replay_phase():
+    assert get_next_agent_replay_phase(ReplayPhase.Analysis) == ReplayPhase.Edit
diff --git a/tests/unit/replay/test_replay_tools.py b/tests/unit/replay/test_replay_tools.py
new file mode 100644
index 000000000000..4b5935cc8af2
--- /dev/null
+++ b/tests/unit/replay/test_replay_tools.py
@@ -0,0 +1,14 @@
+from openhands.core.schema.replay import ReplayPhase
+from openhands.replay.replay_phases import get_next_agent_replay_phase
+from openhands.replay.replay_tools import (
+    get_replay_transition_tool_for_current_phase,
+)
+
+
+def test_get_replay_transition_tool_for_analysis_phase():
+    tool = get_replay_transition_tool_for_current_phase(ReplayPhase.Analysis, 'submit')
+    assert tool is not None
+    assert tool['function']['name'] == 'submit'
+    assert tool['new_phase'] is not None
+    assert get_next_agent_replay_phase(ReplayPhase.Analysis) is not None
+    assert tool['new_phase'] == get_next_agent_replay_phase(ReplayPhase.Analysis)

From 26f6fefb1622c9f7343c3c3ee5f86263f53ae77a Mon Sep 17 00:00:00 2001
From: "D. Seifert" <Domiii@users.noreply.github.com>
Date: Fri, 24 Jan 2025 20:31:10 +0800
Subject: [PATCH 11/17] fix unit tests

---
 tests/unit/test_agent_skill.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/test_agent_skill.py b/tests/unit/test_agent_skill.py
index 6079eb659aea..c9e6651bdb1b 100644
--- a/tests/unit/test_agent_skill.py
+++ b/tests/unit/test_agent_skill.py
@@ -757,10 +757,10 @@ def test_file_editor_view(tmp_path):
         result.strip().split('\n')
         == f"""Here's the files and directories up to 2 levels deep in {tmp_path}, excluding hidden items:
 {tmp_path}
-{tmp_path}/dir_2
-{tmp_path}/dir_2/b.txt
 {tmp_path}/dir_1
 {tmp_path}/dir_1/a.txt
+{tmp_path}/dir_2
+{tmp_path}/dir_2/b.txt
 """.strip().split('\n')
     )
 

From 42bdeb4e144204bcf76977d499b7d6923c839cad Mon Sep 17 00:00:00 2001
From: "D. Seifert" <Domiii@users.noreply.github.com>
Date: Fri, 24 Jan 2025 20:31:29 +0800
Subject: [PATCH 12/17] fix runtime replay test

---
 tests/runtime/test_replay_cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/runtime/test_replay_cli.py b/tests/runtime/test_replay_cli.py
index 15a7d13a4b16..72ef9511ebf2 100644
--- a/tests/runtime/test_replay_cli.py
+++ b/tests/runtime/test_replay_cli.py
@@ -38,7 +38,7 @@ def test_initial_analysis(temp_dir, runtime_cls, run_as_openhands):
 
         try:
             result: dict = json.loads(obs.content)
-            assert result.get('thisPoint', '') == '78858008544042601258383216576823300'
+            assert result.get('thisPoint', '') == '78858008544042601258383216576823298'
         except json.JSONDecodeError:
             raise AssertionError(
                 f'obs.content should be a valid JSON string: {repr(obs)}'

From 5e320cb05869ca2d7400a9552891885bb9a02aae Mon Sep 17 00:00:00 2001
From: "D. Seifert" <Domiii@users.noreply.github.com>
Date: Fri, 24 Jan 2025 20:37:12 +0800
Subject: [PATCH 13/17] fix

---
 openhands/replay/replay_tools.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/openhands/replay/replay_tools.py b/openhands/replay/replay_tools.py
index 4b90da7e27ff..7cf39cfed1c7 100644
--- a/openhands/replay/replay_tools.py
+++ b/openhands/replay/replay_tools.py
@@ -254,17 +254,13 @@ def handle_replay_tool_call(
     elif is_replay_tool(name, ReplayToolType.PhaseTransition):
         # Request a phase change.
         tool = get_replay_transition_tool_for_current_phase(state.replay_phase, name)
-        assert tool, f'Missing ReplayPhaseTransitionTool for {state.replay_phase} in Replay tool_call: {tool_call.function.name}'
+        assert tool, f'[REPLAY] Missing ReplayPhaseTransitionTool for {state.replay_phase} in Replay tool_call: {tool_call.function.name}'
         new_phase = tool['new_phase']
         assert (
             new_phase
-        ), f'Missing new_phase in Replay tool_call: {tool_call.function.name}'
-        assert (
-            new_phase
-        ), f'Missing new_phase in Replay tool_call: {tool_call.function.name}'
-        del arguments['new_phase']
+        ), f'[REPLAY] Missing new_phase in Replay tool_call: {tool_call.function.name}'
         action = ReplayPhaseUpdateAction(
             new_phase=new_phase, info=json.dumps(arguments)
         )
-    assert action, f'Unhandled Replay tool_call: {tool_call.function.name}'
+    assert action, f'[REPLAY] Unhandled Replay tool_call: {tool_call.function.name}'
     return action

From 954e4352ed614696847c6656eaf34a90c3af799f Mon Sep 17 00:00:00 2001
From: "D. Seifert" <Domiii@users.noreply.github.com>
Date: Fri, 24 Jan 2025 21:06:58 +0800
Subject: [PATCH 14/17] more tool fixes

---
 .../agenthub/codeact_agent/codeact_agent.py   |  2 +-
 openhands/replay/replay_initial_analysis.py   |  5 +++
 openhands/replay/replay_phases.py             |  2 +-
 openhands/replay/replay_tools.py              | 17 +++-----
 tests/unit/replay/test_replay_tools.py        | 41 +++++++++++++++++--
 5 files changed, 51 insertions(+), 16 deletions(-)

diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index a3d9a7651162..a3c3b790ace4 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -326,7 +326,7 @@ def update_tools(self, phase: ReplayPhase) -> None:
             replay_phase=phase,
         )
         logger.info(
-            f'[REPLAY] update_tools: {json.dumps([t["function"]['name'] for t in self.tools], indent=2)}'
+            f'[REPLAY] update_tools for phase {phase}: {json.dumps([t["function"]['name'] for t in self.tools], indent=2)}'
         )
 
     def step(self, state: State) -> Action:
diff --git a/openhands/replay/replay_initial_analysis.py b/openhands/replay/replay_initial_analysis.py
index 1c18c37f7e64..b24be0a19f39 100644
--- a/openhands/replay/replay_initial_analysis.py
+++ b/openhands/replay/replay_initial_analysis.py
@@ -4,6 +4,7 @@
 
 from openhands.controller.state.state import State
 from openhands.core.logger import openhands_logger as logger
+from openhands.core.schema.replay import ReplayPhase
 from openhands.events.action.action import Action
 from openhands.events.action.message import MessageAction
 from openhands.events.action.replay import ReplayInternalCmdRunAction
@@ -44,6 +45,10 @@ def start_initial_analysis(
 
 
 def replay_enhance_action(state: State, is_workspace_repo: bool) -> Action | None:
+    if state.replay_phase != ReplayPhase.Normal:
+        # We currently only enhance prompts in the Normal phase.
+        return None
+
     if state.replay_enhance_prompt_id == -1:
         # 1. Get current user prompt.
         latest_user_message = state.get_last_user_message()
diff --git a/openhands/replay/replay_phases.py b/openhands/replay/replay_phases.py
index 22746420c8e9..a17257023c14 100644
--- a/openhands/replay/replay_phases.py
+++ b/openhands/replay/replay_phases.py
@@ -88,9 +88,9 @@ def get_phase_prompt(obs) -> str:
 
 def update_phase(new_phase: ReplayPhase, state: State, agent: Agent):
     """Apply phase update side effects."""
+    logger.info(f'[REPLAY] update_phase (replay_phase): {new_phase}')
     state.replay_phase = new_phase
     agent.update_tools(new_phase)
-    logger.info(f'[REPLAY] update_phase (replay_phase): {new_phase}')
 
 
 # ###########################################################################
diff --git a/openhands/replay/replay_tools.py b/openhands/replay/replay_tools.py
index 7cf39cfed1c7..e56660492700 100644
--- a/openhands/replay/replay_tools.py
+++ b/openhands/replay/replay_tools.py
@@ -152,15 +152,15 @@ def replay_phase_tool(
 # Bookkeeping + utilities.
 # ###########################################################################
 
-replay_analysis_tools: list[ReplayTool] = [
+replay_analysis_tools: tuple[ReplayTool, ...] = (
     ReplayInspectDataTool,
     ReplayInspectPointTool,
-]
+)
 
-replay_tools: list[ReplayTool] = [
+replay_tools: tuple[ReplayTool, ...] = (
     *replay_analysis_tools,
     *replay_phase_transition_tools,
-]
+)
 replay_tool_names: set[str] = set([t['function']['name'] for t in replay_tools])
 replay_replay_tool_type_by_name = {
     t['function']['name']: t.get('replay_tool_type', None) for t in replay_tools
@@ -203,23 +203,18 @@ def get_replay_tools(
     replay_phase: ReplayPhase, default_tools: list[ChatCompletionToolParam]
 ) -> list[ChatCompletionToolParam]:
     if replay_phase == ReplayPhase.Normal:
-        # Use the default tools when not in a Replay-specific phase.
         tools = default_tools
     elif replay_phase == ReplayPhase.Analysis:
-        # Only allow analysis in this phase.
-        tools = replay_analysis_tools
+        tools = list(replay_analysis_tools)
     elif replay_phase == ReplayPhase.Edit:
-        # Combine default and analysis tools.
-        tools = default_tools + replay_analysis_tools
+        tools = default_tools + list(replay_analysis_tools)
     else:
         raise ValueError(f'Unhandled ReplayPhase in get_tools: {replay_phase}')
 
-    # Add tools to allow transitioning to next phase.
     next_phase_tool = get_replay_transition_tool_for_current_phase(replay_phase)
     if next_phase_tool:
         tools.append(next_phase_tool)
 
-    # Return all tools.
     return tools
 
 
diff --git a/tests/unit/replay/test_replay_tools.py b/tests/unit/replay/test_replay_tools.py
index 4b5935cc8af2..08fa5d496c70 100644
--- a/tests/unit/replay/test_replay_tools.py
+++ b/tests/unit/replay/test_replay_tools.py
@@ -1,14 +1,49 @@
+import pytest
+
 from openhands.core.schema.replay import ReplayPhase
 from openhands.replay.replay_phases import get_next_agent_replay_phase
 from openhands.replay.replay_tools import (
+    get_replay_tools,
     get_replay_transition_tool_for_current_phase,
+    replay_analysis_tools,
 )
 
 
-def test_get_replay_transition_tool_for_analysis_phase():
+def test_get_replay_transition_tools_analysis():
+    assert get_next_agent_replay_phase(ReplayPhase.Analysis) is not None
     tool = get_replay_transition_tool_for_current_phase(ReplayPhase.Analysis, 'submit')
-    assert tool is not None
+    assert tool
     assert tool['function']['name'] == 'submit'
     assert tool['new_phase'] is not None
-    assert get_next_agent_replay_phase(ReplayPhase.Analysis) is not None
     assert tool['new_phase'] == get_next_agent_replay_phase(ReplayPhase.Analysis)
+
+
+def test_get_replay_transition_tools_edit():
+    assert get_next_agent_replay_phase(ReplayPhase.Edit) is None
+    tool = get_replay_transition_tool_for_current_phase(ReplayPhase.Edit, 'submit')
+    assert not tool
+
+
+def test_get_tools():
+    default_tools = []
+
+    # Test Normal phase
+    tools = get_replay_tools(ReplayPhase.Normal, default_tools)
+    assert len(tools) == len(default_tools)
+    assert all(t in tools for t in default_tools)
+
+    # Test Analysis phase
+    tools = get_replay_tools(ReplayPhase.Analysis, default_tools)
+    assert len(tools) == len(replay_analysis_tools) + 1  # +1 for transition tool
+    assert all(t in tools for t in replay_analysis_tools)
+    assert tools[-1]['function']['name'] == 'submit'
+
+    # Test Edit phase
+    tools = get_replay_tools(ReplayPhase.Edit, default_tools)
+    assert len(tools) == len(default_tools) + len(replay_analysis_tools)
+    assert all(t in tools for t in default_tools)
+    assert all(t in tools for t in replay_analysis_tools)
+
+    # Test invalid phase
+    with pytest.raises(ValueError):
+        get_replay_tools('invalid_phase', default_tools)

From 274973b2e39c73b1f68ee41b8dbf6af6d724d320 Mon Sep 17 00:00:00 2001
From: "D. Seifert" <Domiii@users.noreply.github.com>
Date: Fri, 24 Jan 2025 23:49:37 +0800
Subject: [PATCH 15/17] More tests, more fixes

---
 .../agenthub/codeact_agent/codeact_agent.py   |   5 +-
 openhands/core/schema/replay.py               |  13 +-
 openhands/replay/replay_phases.py             |  34 +++--
 openhands/replay/replay_prompts.py            |  18 ++-
 openhands/replay/replay_tools.py              | 125 +++++++++++++-----
 replay_benchmarks/bolt/run-bolt.sh            |   2 -
 tests/unit/replay/replay_test_util.py         |  40 ++++++
 tests/unit/replay/test_replay_prompts.py      |  14 ++
 .../unit/replay/test_replay_state_machine.py  |  17 ++-
 tests/unit/replay/test_replay_tools.py        |  68 ++++++++--
 10 files changed, 270 insertions(+), 66 deletions(-)
 create mode 100644 tests/unit/replay/replay_test_util.py
 create mode 100644 tests/unit/replay/test_replay_prompts.py

diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index a3c3b790ace4..d37eeb42e90c 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -369,7 +369,10 @@ def step(self, state: State) -> Action:
         params['tools'] = self.tools
         if self.mock_function_calling:
             params['mock_function_calling'] = True
-        # logger.debug(f'#######\nCodeActAgent.step: messages:\n{json.dumps(params)}\n\n#######\n')
+
+        # # Debug log the raw input to the LLM:
+        # logger.debug(f'#######\nCodeActAgent.step: RAW LLM INPUT:\n{repr(params)}\n\n#######\n')
+
         response = self.llm.completion(**params)
         actions = codeact_function_calling.response_to_actions(response, state)
         for action in actions:
diff --git a/openhands/core/schema/replay.py b/openhands/core/schema/replay.py
index 9a3308f4fa56..ffe0984637ef 100644
--- a/openhands/core/schema/replay.py
+++ b/openhands/core/schema/replay.py
@@ -2,12 +2,19 @@
 
 
 class ReplayPhase(str, Enum):
+    """All Replay phases that an agent can be in."""
+
     Normal = 'normal'
-    """The agent is not doing anything related to Replay.
+    """The agent does not have access to a recording.
     """
+
     Analysis = 'analysis'
-    """The agent is analyzing a recording.
+    """The agent uses initial-analysis data and dedicated tools to analyze a Replay recording.
     """
+    ConfirmAnalysis = 'confirm_analysis'
+    """The agent is confirming the analysis.
+    """
+
     Edit = 'edit'
-    """The agent is editing the code.
+    """The agent finally edits the code.
     """
diff --git a/openhands/replay/replay_phases.py b/openhands/replay/replay_phases.py
index a17257023c14..16f00e07b94b 100644
--- a/openhands/replay/replay_phases.py
+++ b/openhands/replay/replay_phases.py
@@ -97,21 +97,35 @@ def update_phase(new_phase: ReplayPhase, state: State, agent: Agent):
 # ReplayStateMachine.
 # ###########################################################################
 
-replay_state_transitions: dict[ReplayPhase, tuple[ReplayPhase, ...] | None] = {
-    ReplayPhase.Normal: None,
-    ReplayPhase.Analysis: (ReplayPhase.Edit,),
-    ReplayPhase.Edit: None,
+# Transition edges controlled by the agent.
+replay_agent_state_transitions: dict[ReplayPhase, tuple[ReplayPhase, ...]] = {
+    ReplayPhase.Analysis: (ReplayPhase.ConfirmAnalysis,),
+    ReplayPhase.ConfirmAnalysis: (ReplayPhase.Edit,),
 }
 
 
 # This machine represents state transitions that the agent can make.
-class ReplayStateMachine:
+class ReplayAgentStateMachine:
+    edges: list[tuple[ReplayPhase, ReplayPhase]]
+    """List of all edges."""
+
+    forward_edges: dict[ReplayPhase, list[ReplayPhase]]
+    """Adjacency list of forward edges."""
+
+    reverse_edges: dict[ReplayPhase, list[ReplayPhase]]
+    """Adjacency list of reverse edges."""
+
     def __init__(self):
-        self.forward_edges: dict[ReplayPhase, list[ReplayPhase] | None] = {
+        self.forward_edges = {
             phase: list(targets) if targets is not None else None
-            for phase, targets in replay_state_transitions.items()
+            for phase, targets in replay_agent_state_transitions.items()
         }
-        self.reverse_edges: dict[ReplayPhase, list[ReplayPhase]] = {}
+        self.edges = [
+            (source, target)
+            for source, targets in self.forward_edges.items()
+            for target in targets
+        ]
+        self.reverse_edges = {}
 
         for source, targets in self.forward_edges.items():
             if targets:
@@ -151,8 +165,8 @@ def get_child_phases(self, phase: ReplayPhase) -> list[ReplayPhase] | None:
         return self.forward_edges.get(phase, list())
 
 
-replay_state_machine = ReplayStateMachine()
+replay_agent_state_machine = ReplayAgentStateMachine()
 
 
 def get_next_agent_replay_phase(phase: ReplayPhase) -> ReplayPhase | None:
-    return replay_state_machine.get_unique_child_phase(phase)
+    return replay_agent_state_machine.get_unique_child_phase(phase)
diff --git a/openhands/replay/replay_prompts.py b/openhands/replay/replay_prompts.py
index e8c421886cc4..b18ee100d671 100644
--- a/openhands/replay/replay_prompts.py
+++ b/openhands/replay/replay_prompts.py
@@ -35,17 +35,25 @@ class PromptMap(TypedDict):
 
 
 phase_prompts: dict[ReplayPhase, PromptMap] = {
-    ReplayPhase.Edit: {
+    ReplayPhase.ConfirmAnalysis: {
         'enter': """
-You have concluded the analysis.
+You have submitted a draft of the hypothesis.
 
-IMPORTANT: NOW review, then implement the hypothesized changes using tools. The code is available in the workspace. Start by answering these questions:
+IMPORTANT: NOW review the hypothesis. When reviewing, PAY ATTENTION to this:
   1. What is the goal of the investigation according to the initial prompt and initial analysis? IMPORTANT. PAY ATTENTION TO THIS. THIS IS THE ENTRY POINT OF EVERYTHING.
   2. Given (1), is the hypothesis's `problem` description correct? Does it match the goal of the investigation?
   3. Do the `editSuggestions` actually address the issue?
-  4. Rephrase the hypothesis so that it is consistent and correct.
+
+Rephrase and confirm a revised hypothesis.
+"""
+    },
+    ReplayPhase.Edit: {
+        'enter': """
+You have confirmed the analysis.
+
+IMPORTANT: NOW implement the confirmed suggestions. The code is available in the workspace.
 """
-    }
+    },
 }
 
 
diff --git a/openhands/replay/replay_tools.py b/openhands/replay/replay_tools.py
index e56660492700..49eb8ffab9ae 100644
--- a/openhands/replay/replay_tools.py
+++ b/openhands/replay/replay_tools.py
@@ -12,17 +12,20 @@
 from openhands.controller.state.state import State
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.schema.replay import ReplayPhase
+from openhands.events.action.action import Action
+from openhands.events.action.empty import NullAction
 from openhands.events.action.replay import (
-    ReplayAction,
     ReplayPhaseUpdateAction,
     ReplayToolCmdRunAction,
 )
-from openhands.replay.replay_phases import get_next_agent_replay_phase
 
 
 class ReplayToolType(Enum):
     Analysis = 'analysis'
+    """Tools that allow analyzing a Replay recording."""
+
     PhaseTransition = 'phase_transition'
+    """Tools that trigger a phase transition. Phase transitions are generally used to improve self-consistency."""
 
 
 class ReplayTool(ChatCompletionToolParam):
@@ -45,16 +48,19 @@ def replay_analysis_tool(name: str, description: str, parameters: dict) -> Repla
 
 
 class ReplayPhaseTransitionTool(ReplayTool):
+    edges: list[tuple[ReplayPhase, ReplayPhase]]
     replay_tool_type = ReplayToolType.PhaseTransition
-    new_phase: ReplayPhase
 
 
 def replay_phase_tool(
-    new_phase: ReplayPhase, name: str, description: str, parameters: dict
+    edges: list[tuple[ReplayPhase, ReplayPhase]],
+    name: str,
+    description: str,
+    parameters: dict,
 ):
     tool = ReplayPhaseTransitionTool(
+        edges=edges,
         replay_tool_type=ReplayToolType.PhaseTransition,
-        new_phase=new_phase,
         type='function',
         function=ChatCompletionToolParamFunctionChunk(
             name=name,
@@ -127,27 +133,65 @@ def replay_phase_tool(
 
 replay_phase_transition_tools: list[ReplayPhaseTransitionTool] = [
     replay_phase_tool(
-        ReplayPhase.Edit,
+        [
+            (ReplayPhase.Analysis, ReplayPhase.ConfirmAnalysis),
+        ],
         'submit',
-        """Conclude your analysis.""",
+        """Submit your analysis.""",
         {
             'type': 'object',
             'properties': {
                 'problem': {
                     'type': 'string',
-                    'description': 'One-sentence explanation of the core problem that this will solve.',
+                    'description': 'One-sentence explanation of the core problem, according to user requirements and initial-analysis.',
                 },
                 'rootCauseHypothesis': {'type': 'string'},
+                'badCode': {
+                    'type': 'string',
+                    'description': 'Show exactly which code requires changing, according to the user requirements and initial-analysis.',
+                },
+            },
+            'required': ['problem', 'rootCauseHypothesis', 'badCode'],
+        },
+    ),
+    replay_phase_tool(
+        [
+            (ReplayPhase.ConfirmAnalysis, ReplayPhase.Edit),
+        ],
+        'confirm',
+        """Confirm your analysis and suggest specific code changes.""",
+        {
+            'type': 'object',
+            'properties': {
+                'problem': {
+                    'type': 'string',
+                    'description': 'One-sentence explanation of the core problem, according to user requirements and initial-analysis.',
+                },
+                'rootCauseHypothesis': {'type': 'string'},
+                'badCode': {
+                    'type': 'string',
+                    'description': 'Show exactly which code requires changing, according to the user requirements and initial-analysis.',
+                },
                 'editSuggestions': {
                     'type': 'string',
                     'description': 'Provide suggestions to fix the bug, if you know enough about the code that requires modification.',
                 },
             },
-            'required': ['problem', 'rootCauseHypothesis'],
+            'required': ['problem', 'rootCauseHypothesis', 'editSuggestions'],
         },
-    )
+    ),
 ]
 
+replay_phase_transition_tools_by_from_phase = {
+    phase: [
+        t
+        for t in replay_phase_transition_tools
+        for edge in t['edges']
+        if edge[0] == phase
+    ]
+    for phase in {edge[0] for t in replay_phase_transition_tools for edge in t['edges']}
+}
+
 # ###########################################################################
 # Bookkeeping + utilities.
 # ###########################################################################
@@ -181,22 +225,25 @@ def is_replay_tool(
 # ###########################################################################
 
 
-def get_replay_transition_tool_for_current_phase(
-    current_phase: ReplayPhase, name: str | None = None
+def get_replay_transition_tools(current_phase: ReplayPhase) -> list[ReplayTool] | None:
+    phase_tools = replay_phase_transition_tools_by_from_phase.get(current_phase, None)
+    if not phase_tools:
+        return None
+    assert len(phase_tools)
+    return phase_tools
+
+
+def get_replay_transition_tool(
+    current_phase: ReplayPhase, name: str
 ) -> ReplayTool | None:
-    next_phase = get_next_agent_replay_phase(current_phase)
-    if next_phase:
-        transition_tools = [
-            t
-            for t in replay_phase_transition_tools
-            if t['new_phase'] == next_phase
-            and (not name or t['function']['name'] == name)
-        ]
-        assert len(
-            transition_tools
-        ), f'replay_phase_transition_tools is missing tools for new_phase: {next_phase}'
-        return transition_tools[0]
-    return None
+    tools = get_replay_transition_tools(current_phase)
+    if not tools:
+        return None
+    matching = [t for t in tools if t['function']['name'] == name]
+    assert (
+        len(matching) == 1
+    ), f'replay_phase_transition_tools did not get unique matching tool for phase {current_phase} and name {name}'
+    return matching[0]
 
 
 def get_replay_tools(
@@ -206,14 +253,16 @@ def get_replay_tools(
         tools = default_tools
     elif replay_phase == ReplayPhase.Analysis:
         tools = list(replay_analysis_tools)
+    elif replay_phase == ReplayPhase.ConfirmAnalysis:
+        tools = list(replay_analysis_tools)
     elif replay_phase == ReplayPhase.Edit:
         tools = default_tools + list(replay_analysis_tools)
     else:
         raise ValueError(f'Unhandled ReplayPhase in get_tools: {replay_phase}')
 
-    next_phase_tool = get_replay_transition_tool_for_current_phase(replay_phase)
-    if next_phase_tool:
-        tools.append(next_phase_tool)
+    next_phase_tools = get_replay_transition_tools(replay_phase)
+    if next_phase_tools:
+        tools += next_phase_tools
 
     return tools
 
@@ -225,11 +274,11 @@ def get_replay_tools(
 
 def handle_replay_tool_call(
     tool_call: ChatCompletionMessageToolCall, arguments: dict, state: State
-) -> ReplayAction:
+) -> Action:
     logger.info(
         f'[REPLAY] TOOL_CALL {tool_call.function.name} - arguments: {json.dumps(arguments, indent=2)}'
     )
-    action: ReplayAction
+    action: Action
     name = tool_call.function.name
     if is_replay_tool(name, ReplayToolType.Analysis):
         if name == 'inspect-data':
@@ -248,14 +297,24 @@ def handle_replay_tool_call(
             )
     elif is_replay_tool(name, ReplayToolType.PhaseTransition):
         # Request a phase change.
-        tool = get_replay_transition_tool_for_current_phase(state.replay_phase, name)
-        assert tool, f'[REPLAY] Missing ReplayPhaseTransitionTool for {state.replay_phase} in Replay tool_call: {tool_call.function.name}'
-        new_phase = tool['new_phase']
+        tool = get_replay_transition_tool(state.replay_phase, name)
+        assert tool, f'[REPLAY] Missing ReplayPhaseTransitionTool for {state.replay_phase} in Replay tool_call({tool_call.function.name})'
+        new_phase = next(
+            (
+                to_phase
+                for [from_phase, to_phase] in tool['edges']
+                if from_phase == state.replay_phase
+            ),
+            None,
+        )
         assert (
             new_phase
         ), f'[REPLAY] Missing new_phase in Replay tool_call: {tool_call.function.name}'
         action = ReplayPhaseUpdateAction(
             new_phase=new_phase, info=json.dumps(arguments)
         )
+    else:
+        # NOTE: This is a weird bug where Claude sometimes might call a tool that it *had* but does not have anymore.
+        action = NullAction()
     assert action, f'[REPLAY] Unhandled Replay tool_call: {tool_call.function.name}'
     return action
diff --git a/replay_benchmarks/bolt/run-bolt.sh b/replay_benchmarks/bolt/run-bolt.sh
index bb2515c5606a..3271aa160a60 100755
--- a/replay_benchmarks/bolt/run-bolt.sh
+++ b/replay_benchmarks/bolt/run-bolt.sh
@@ -65,13 +65,11 @@ else
 fi
 
 # Config overrides + sanity checks.
-set -x
 export DEBUG=1
 # export REPLAY_DEV_MODE=1
 export REPLAY_ENABLE_TOOL_CACHE=1
 export WORKSPACE_BASE="$WORKSPACE_ROOT"
 export LLM_MODEL="anthropic/claude-3-5-sonnet-20241022"
-set +x
 
 if [[ -z "$LLM_API_KEY" ]]; then
     if [[ -z "$ANTHROPIC_API_KEY" ]]; then
diff --git a/tests/unit/replay/replay_test_util.py b/tests/unit/replay/replay_test_util.py
new file mode 100644
index 000000000000..1aa0878ba46d
--- /dev/null
+++ b/tests/unit/replay/replay_test_util.py
@@ -0,0 +1,40 @@
+import enum
+from typing import Collection, TypeVar
+
+from openhands.core.schema.replay import ReplayPhase
+from openhands.replay.replay_phases import (
+    replay_agent_state_machine,
+)
+
+T = TypeVar('T', bound=enum.Enum)
+
+
+def format_enums(enums: Collection[T]) -> set[str]:
+    """Convert any collection of enums to readable set of names."""
+    return {e.name for e in enums}
+
+
+def format_edge(edge: tuple[ReplayPhase, ReplayPhase]) -> str:
+    """Format state transition edge as 'from → to'."""
+    from_state, to_state = edge
+    return f'{from_state.name} → {to_state.name}'
+
+
+# The given edges should form a partition of the set of all agent edges.
+def assert_edges_partition(
+    edges: list[tuple[ReplayPhase, ReplayPhase]],
+) -> None:
+    """Verify tool edges form a partition of agent edges."""
+    # Get ground truth
+    all_agent_edges = replay_agent_state_machine.edges
+
+    # Convert to sorted lists of formatted strings for comparison
+    tool_edge_strs = sorted(f'{f.name} → {t.name}' for [f, t] in edges)
+    agent_edge_strs = sorted(f'{f.name} → {t.name}' for [f, t] in all_agent_edges)
+
+    # Check for duplicates in tool edges
+    if len(set(tool_edge_strs)) != len(tool_edge_strs):
+        raise AssertionError('Tool edges contain duplicates')
+
+    # Check lists match exactly
+    assert tool_edge_strs == agent_edge_strs
diff --git a/tests/unit/replay/test_replay_prompts.py b/tests/unit/replay/test_replay_prompts.py
new file mode 100644
index 000000000000..8ae77d2b6b37
--- /dev/null
+++ b/tests/unit/replay/test_replay_prompts.py
@@ -0,0 +1,14 @@
+from openhands.replay.replay_phases import (
+    replay_agent_state_machine,
+)
+from openhands.replay.replay_prompts import phase_prompts
+from tests.unit.replay.replay_test_util import format_enums
+
+
+def test_transition_prompts():
+    # Get ground truth
+    all_agent_edges = replay_agent_state_machine.edges
+    to_edges = {edge[1] for edge in all_agent_edges}
+
+    # All destination phases should have prompts.
+    assert format_enums(phase_prompts.keys()) == format_enums(to_edges)
diff --git a/tests/unit/replay/test_replay_state_machine.py b/tests/unit/replay/test_replay_state_machine.py
index 225c14068146..a0afe4f72dfa 100644
--- a/tests/unit/replay/test_replay_state_machine.py
+++ b/tests/unit/replay/test_replay_state_machine.py
@@ -1,6 +1,19 @@
 from openhands.core.schema.replay import ReplayPhase
-from openhands.replay.replay_phases import get_next_agent_replay_phase
+from openhands.replay.replay_phases import (
+    get_next_agent_replay_phase,
+    replay_agent_state_machine,
+)
+
+
+def test_edges():
+    all_agent_edges = replay_agent_state_machine.edges
+
+    # All agent edges should be unique.
+    assert len(all_agent_edges) == len(set(all_agent_edges))
 
 
 def test_get_next_agent_replay_phase():
-    assert get_next_agent_replay_phase(ReplayPhase.Analysis) == ReplayPhase.Edit
+    assert (
+        get_next_agent_replay_phase(ReplayPhase.Analysis) == ReplayPhase.ConfirmAnalysis
+    )
+    assert get_next_agent_replay_phase(ReplayPhase.ConfirmAnalysis) == ReplayPhase.Edit
diff --git a/tests/unit/replay/test_replay_tools.py b/tests/unit/replay/test_replay_tools.py
index 08fa5d496c70..0e6e6d76872b 100644
--- a/tests/unit/replay/test_replay_tools.py
+++ b/tests/unit/replay/test_replay_tools.py
@@ -1,44 +1,92 @@
 import pytest
 
 from openhands.core.schema.replay import ReplayPhase
-from openhands.replay.replay_phases import get_next_agent_replay_phase
+from openhands.replay.replay_phases import (
+    get_next_agent_replay_phase,
+    replay_agent_state_machine,
+)
 from openhands.replay.replay_tools import (
     get_replay_tools,
-    get_replay_transition_tool_for_current_phase,
+    get_replay_transition_tool,
+    get_replay_transition_tools,
     replay_analysis_tools,
+    replay_phase_transition_tools,
+    replay_phase_transition_tools_by_from_phase,
 )
+from tests.unit.replay.replay_test_util import assert_edges_partition
+
+
+def test_transition_tool_edges():
+    # Get tool edges.
+    tools = replay_phase_transition_tools
+    tool_edges: list[tuple[ReplayPhase, ReplayPhase]] = [
+        edge for t in tools for edge in t['edges']
+    ]
+
+    # All tool edges must form a partition of the set of all agent edges.
+    assert_edges_partition(tool_edges)
+
+
+def test_replay_phase_transition_tools_by_from_phase():
+    # Get ground truth
+    all_agent_edges = replay_agent_state_machine.edges
+    all_agent_from_phases = {edge[0] for edge in all_agent_edges}
+
+    # all_agent_from_phases should exactly match the set of all from_phases of all tools.
+    assert (
+        set(replay_phase_transition_tools_by_from_phase.keys()) == all_agent_from_phases
+    )
+
+    # All tool edges should form a partition of the set of all agent edges
+    all_tool_edges = [
+        edge
+        for _, tools in replay_phase_transition_tools_by_from_phase.items()
+        for tool in tools
+        for edge in tool['edges']
+    ]
+    assert_edges_partition(all_tool_edges)
 
 
 def test_get_replay_transition_tools_analysis():
-    assert get_next_agent_replay_phase(ReplayPhase.Analysis) is not None
-    tool = get_replay_transition_tool_for_current_phase(ReplayPhase.Analysis, 'submit')
+    tool = get_replay_transition_tool(ReplayPhase.Analysis, 'submit')
     assert tool
     assert tool['function']['name'] == 'submit'
-    assert tool['new_phase'] is not None
-    assert tool['new_phase'] == get_next_agent_replay_phase(ReplayPhase.Analysis)
+    assert (ReplayPhase.Analysis, ReplayPhase.ConfirmAnalysis) in tool['edges']
+
+    tools = get_replay_transition_tools(ReplayPhase.Analysis)
+    assert len([t['function']['name'] for t in tools]) == 1
+
+    assert tool in tools
 
 
 def test_get_replay_transition_tools_edit():
     assert get_next_agent_replay_phase(ReplayPhase.Edit) is None
-    tool = get_replay_transition_tool_for_current_phase(ReplayPhase.Edit, 'submit')
+    tool = get_replay_transition_tool(ReplayPhase.Edit, 'submit')
     assert not tool
 
 
 def test_get_tools():
     default_tools = []
 
-    # Test Normal phase
+    # Make sure that all ReplayPhases are handled and no error is raised.
+    for phase in ReplayPhase:
+        get_replay_tools(phase, default_tools)
+
+    # Test individual phases.
     tools = get_replay_tools(ReplayPhase.Normal, default_tools)
     assert len(tools) == len(default_tools)
     assert all(t in tools for t in default_tools)
 
-    # Test Analysis phase
     tools = get_replay_tools(ReplayPhase.Analysis, default_tools)
     assert len(tools) == len(replay_analysis_tools) + 1  # +1 for transition tool
     assert all(t in tools for t in replay_analysis_tools)
     assert tools[-1]['function']['name'] == 'submit'
 
-    # Test Edit phase
+    tools = get_replay_tools(ReplayPhase.ConfirmAnalysis, default_tools)
+    assert len(tools) == len(replay_analysis_tools) + 1  # +1 for transition tool
+    assert all(t in tools for t in replay_analysis_tools)
+    assert tools[-1]['function']['name'] == 'confirm'
+
     tools = get_replay_tools(ReplayPhase.Edit, default_tools)
     assert len(tools) == len(default_tools) + len(replay_analysis_tools)
     assert all(t in tools for t in default_tools)

From 3bc81535df6514f84bc66fff1e4de66c45049af3 Mon Sep 17 00:00:00 2001
From: "D. Seifert" <Domiii@users.noreply.github.com>
Date: Sat, 25 Jan 2025 00:04:53 +0800
Subject: [PATCH 16/17] replayapi SHA

---
 openhands/runtime/utils/runtime_templates/Dockerfile.j2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openhands/runtime/utils/runtime_templates/Dockerfile.j2 b/openhands/runtime/utils/runtime_templates/Dockerfile.j2
index 2ff313887fe4..184afc8e5237 100644
--- a/openhands/runtime/utils/runtime_templates/Dockerfile.j2
+++ b/openhands/runtime/utils/runtime_templates/Dockerfile.j2
@@ -88,7 +88,7 @@ RUN \
 # Clone replay and make sure the layer is not stale when HEAD changes.
 # For now we manually specify a revision as an argument to bust caches when building images.
 ARG REPLAYAPI_SHA
-ARG EXPECTED_REVISION=b942abcb31a6275a91ea9a64182f4258589f4a6b
+ARG EXPECTED_REVISION=0997f61c15156be8cbbe3d81d354c2215f939c73
 RUN \
     mkdir -p /replay && \
     git -C /replay clone https://github.com/replayio-public/replayapi.git && \

From 5a10356e1aef26c77ec1a98d43d053d39ba017c2 Mon Sep 17 00:00:00 2001
From: "D. Seifert" <Domiii@users.noreply.github.com>
Date: Sat, 25 Jan 2025 02:00:37 +0800
Subject: [PATCH 17/17] mini update

---
 openhands/replay/replay_prompts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openhands/replay/replay_prompts.py b/openhands/replay/replay_prompts.py
index b18ee100d671..adf21bae91f0 100644
--- a/openhands/replay/replay_prompts.py
+++ b/openhands/replay/replay_prompts.py
@@ -51,7 +51,7 @@ class PromptMap(TypedDict):
         'enter': """
 You have confirmed the analysis.
 
-IMPORTANT: NOW implement the confirmed suggestions. The code is available in the workspace.
+IMPORTANT: NOW implement the confirmed suggestions. The code is available in the workspace. Don't stop until you are finished implementing the proposed fix.
 """
     },
 }