From 0bed17758f05aee421bb0148ab0d6df59bd574fe Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Thu, 16 Jan 2025 12:27:00 -0500
Subject: [PATCH] fix: incorrect soft-timeout implementation & fix hard-timeout
 follow-up command (#6280)

---
 .../benchmarks/commit0_bench/run_infer.py     |  24 +--
 evaluation/benchmarks/swe_bench/eval_infer.py |  10 +-
 evaluation/benchmarks/swe_bench/run_infer.py  |  30 ++--
 .../benchmarks/the_agent_company/browsing.py  |   2 +-
 .../benchmarks/the_agent_company/run_infer.py |   4 +-
 openhands/events/event.py                     |  10 +-
 openhands/events/serialization/action.py      |   3 +-
 openhands/resolver/resolve_issue.py           |   2 +-
 openhands/runtime/action_execution_server.py  |   5 +-
 openhands/runtime/base.py                     |   3 +-
 .../action_execution_client.py                |   3 +-
 .../runtime/impl/docker/docker_runtime.py     |   2 +
 openhands/runtime/utils/bash.py               |  49 +++++-
 tests/runtime/test_bash.py                    | 160 +++++++++++++++---
 tests/runtime/test_stress_docker_runtime.py   |   2 +-
 tests/runtime/test_stress_remote_runtime.py   |   4 +-
 tests/unit/test_bash_session.py               |  16 +-
 tests/unit/test_runtime_reboot.py             |   4 +-
 18 files changed, 243 insertions(+), 90 deletions(-)

diff --git a/evaluation/benchmarks/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py
index d8f1f64b1a6b..e690952ab9a9 100644
--- a/evaluation/benchmarks/commit0_bench/run_infer.py
+++ b/evaluation/benchmarks/commit0_bench/run_infer.py
@@ -171,7 +171,7 @@ def initialize_runtime(
     action = CmdRunAction(
         command=f'git clone -b commit0_combined https://github.com/{instance["repo"]}.git'
     )
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -181,7 +181,7 @@ def initialize_runtime(
     )
 
     action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -191,7 +191,7 @@ def initialize_runtime(
     )
 
     action = CmdRunAction(command='git checkout -b openhands')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -201,7 +201,7 @@ def initialize_runtime(
 
     # Install commit0
     action = CmdRunAction(command='/root/.cargo/bin/uv pip install commit0')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -231,7 +231,7 @@ def complete_runtime(
     workspace_dir_name = _get_commit0_workspace_dir_name(instance)
 
     action = CmdRunAction(command='git add .')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -241,7 +241,7 @@ def complete_runtime(
     )
 
     action = CmdRunAction(command='git commit -m "openhands edits"')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -258,7 +258,7 @@ def complete_runtime(
         action = CmdRunAction(
             command=f"git diff {instance['base_commit']} HEAD -- . ':(exclude)spec.pdf.bz2'"
         )
-        action.timeout = 600 + 100 * n_retries
+        action.set_hard_timeout(600 + 100 * n_retries)
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -282,7 +282,7 @@ def complete_runtime(
     action = CmdRunAction(
         command=f"{instance['test']['test_cmd']} --json-report --json-report-file=report.json --continue-on-collection-errors {test_dir} > test_output.txt 2>&1"
     )
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -292,7 +292,7 @@ def complete_runtime(
     )
     # Read test output
     action = CmdRunAction(command='cat test_output.txt')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -305,7 +305,7 @@ def complete_runtime(
 
     # Save pytest exit code
     action = CmdRunAction(command='echo $?')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -318,7 +318,7 @@ def complete_runtime(
 
     # Read the test report
     action = CmdRunAction(command='cat report.json')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -330,7 +330,7 @@ def complete_runtime(
     repo_name = instance['repo'].split('/')[1]
     repo_name = repo_name.replace('.', '-')
     action = CmdRunAction(command=f'commit0 get-tests {repo_name}')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
diff --git a/evaluation/benchmarks/swe_bench/eval_infer.py b/evaluation/benchmarks/swe_bench/eval_infer.py
index 7beacf344408..ab7feb38b678 100644
--- a/evaluation/benchmarks/swe_bench/eval_infer.py
+++ b/evaluation/benchmarks/swe_bench/eval_infer.py
@@ -174,7 +174,7 @@ def process_instance(
 
     # Set +x
     action = CmdRunAction(command='chmod +x /tmp/eval.sh')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -189,7 +189,7 @@ def process_instance(
         "echo 'APPLY_PATCH_FAIL')))"
     )
     action = CmdRunAction(command=exec_command)
-    action.timeout = 600
+    action.set_hard_timeout(600)
     obs = runtime.run_action(action)
     assert isinstance(obs, CmdOutputObservation)
     apply_patch_output = obs.content
@@ -212,7 +212,7 @@ def process_instance(
             # Run eval script in background and save output to log file
             log_file = '/tmp/eval_output.log'
             action = CmdRunAction(command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!')
-            action.timeout = 60  # Short timeout just to get the process ID
+            action.set_hard_timeout(60)  # Short timeout just to get the process ID
             obs = runtime.run_action(action)
 
             if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0:
@@ -235,7 +235,7 @@ def process_instance(
                     check_action = CmdRunAction(
                         command=f'ps -p {pid} > /dev/null; echo $?'
                     )
-                    check_action.timeout = 60
+                    check_action.set_hard_timeout(60)
                     check_obs = runtime.run_action(check_action)
                     if (
                         isinstance(check_obs, CmdOutputObservation)
@@ -252,7 +252,7 @@ def process_instance(
 
                 # Read the log file
                 cat_action = CmdRunAction(command=f'cat {log_file}')
-                cat_action.timeout = 300
+                cat_action.set_hard_timeout(300)
                 cat_obs = runtime.run_action(cat_action)
 
                 # Grade answer
diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index bf065ada9734..ac9e85b60b10 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -173,7 +173,7 @@ def initialize_runtime(
     action = CmdRunAction(
         command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc"""
     )
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -182,7 +182,7 @@ def initialize_runtime(
     )
 
     action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -194,7 +194,7 @@ def initialize_runtime(
 
         # inject the instance info
         action = CmdRunAction(command='mkdir -p /swe_util/eval_data/instances')
-        action.timeout = 600
+        action.set_hard_timeout(600)
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -223,14 +223,14 @@ def initialize_runtime(
             '/swe_util/',
         )
         action = CmdRunAction(command='cat ~/.bashrc')
-        action.timeout = 600
+        action.set_hard_timeout(600)
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
         assert_and_raise(obs.exit_code == 0, f'Failed to cat ~/.bashrc: {str(obs)}')
 
         action = CmdRunAction(command='source ~/.bashrc')
-        action.timeout = 600
+        action.set_hard_timeout(600)
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -239,7 +239,7 @@ def initialize_runtime(
         assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')
 
         action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
-        action.timeout = 3600
+        action.set_hard_timeout(3600)
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -249,7 +249,7 @@ def initialize_runtime(
         )
     else:
         action = CmdRunAction(command='source /swe_util/swe_entry.sh')
-        action.timeout = 1800
+        action.set_hard_timeout(1800)
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -259,7 +259,7 @@ def initialize_runtime(
         )
 
     action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -269,7 +269,7 @@ def initialize_runtime(
     )
 
     action = CmdRunAction(command='git reset --hard')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -278,14 +278,14 @@ def initialize_runtime(
     action = CmdRunAction(
         command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
     )
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')
 
     action = CmdRunAction(command='which python')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -316,7 +316,7 @@ def complete_runtime(
     workspace_dir_name = _get_swebench_workspace_dir_name(instance)
 
     action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -326,7 +326,7 @@ def complete_runtime(
     )
 
     action = CmdRunAction(command='git config --global core.pager ""')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -336,7 +336,7 @@ def complete_runtime(
     )
 
     action = CmdRunAction(command='git add -A')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -351,7 +351,7 @@ def complete_runtime(
         action = CmdRunAction(
             command=f'git diff --no-color --cached {instance["base_commit"]}'
         )
-        action.timeout = 600 + 100 * n_retries
+        action.set_hard_timeout(600 + 100 * n_retries)
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
diff --git a/evaluation/benchmarks/the_agent_company/browsing.py b/evaluation/benchmarks/the_agent_company/browsing.py
index 7384dddbdfce..5ce97129777a 100644
--- a/evaluation/benchmarks/the_agent_company/browsing.py
+++ b/evaluation/benchmarks/the_agent_company/browsing.py
@@ -262,7 +262,7 @@ def pre_login(
             instruction = action.to_instruction()
 
             browser_action = BrowseInteractiveAction(browser_actions=instruction)
-            browser_action.timeout = 10000
+            browser_action.set_hard_timeout(10000)
             logger.info(browser_action, extra={'msg_type': 'ACTION'})
             obs: BrowserOutputObservation = runtime.run_action(browser_action)
             logger.debug(obs, extra={'msg_type': 'OBSERVATION'})
diff --git a/evaluation/benchmarks/the_agent_company/run_infer.py b/evaluation/benchmarks/the_agent_company/run_infer.py
index a82db6d56081..8f8a1b599e6f 100644
--- a/evaluation/benchmarks/the_agent_company/run_infer.py
+++ b/evaluation/benchmarks/the_agent_company/run_infer.py
@@ -86,7 +86,7 @@ def init_task_env(runtime: Runtime, hostname: str, env_llm_config: LLMConfig):
         'bash /utils/init.sh'
     )
     action = CmdRunAction(command=command)
-    action.timeout = 900
+    action.set_hard_timeout(900)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -172,7 +172,7 @@ def run_evaluator(
         f'python_default /utils/eval.py --trajectory_path {trajectory_path} --result_path {result_path}'
     )
     action = CmdRunAction(command=command)
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
diff --git a/openhands/events/event.py b/openhands/events/event.py
index 6c7a2d8a3ac1..1bdece59eb75 100644
--- a/openhands/events/event.py
+++ b/openhands/events/event.py
@@ -64,8 +64,12 @@ def timeout(self) -> int | None:
             return self._timeout  # type: ignore[attr-defined]
         return None
 
-    @timeout.setter
-    def timeout(self, value: int | None) -> None:
+    def set_hard_timeout(self, value: int | None, blocking: bool = True) -> None:
+        """Set the timeout for the event.
+
+        NOTE, this is a hard timeout, meaning that the event will be blocked
+        until the timeout is reached.
+        """
         self._timeout = value
         if value is not None and value > 600:
             from openhands.core.logger import openhands_logger as logger
@@ -78,7 +82,7 @@ def timeout(self, value: int | None) -> None:
         # Check if .blocking is an attribute of the event
         if hasattr(self, 'blocking'):
             # .blocking needs to be set to True if .timeout is set
-            self.blocking = True
+            self.blocking = blocking
 
     # optional metadata, LLM call cost of the edit
     @property
diff --git a/openhands/events/serialization/action.py b/openhands/events/serialization/action.py
index 90945c1d4dfd..be9990750fc6 100644
--- a/openhands/events/serialization/action.py
+++ b/openhands/events/serialization/action.py
@@ -74,7 +74,8 @@ def action_from_dict(action: dict) -> Action:
     try:
         decoded_action = action_class(**args)
         if 'timeout' in action:
-            decoded_action.timeout = action['timeout']
+            blocking = args.get('blocking', False)
+            decoded_action.set_hard_timeout(action['timeout'], blocking=blocking)
 
         # Set timestamp if it was provided
         if timestamp:
diff --git a/openhands/resolver/resolve_issue.py b/openhands/resolver/resolve_issue.py
index f50b37d79447..4e0b2b4ad96c 100644
--- a/openhands/resolver/resolve_issue.py
+++ b/openhands/resolver/resolve_issue.py
@@ -118,7 +118,7 @@ async def complete_runtime(
     git_patch = None
     while n_retries < 5:
         action = CmdRunAction(command=f'git diff --no-color --cached {base_commit}')
-        action.timeout = 600 + 100 * n_retries
+        action.set_hard_timeout(600 + 100 * n_retries)
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py
index b483c183cbdc..8a5fcdc0edd9 100644
--- a/openhands/runtime/action_execution_server.py
+++ b/openhands/runtime/action_execution_server.py
@@ -120,6 +120,9 @@ async def ainit(self):
         self.bash_session = BashSession(
             work_dir=self._initial_cwd,
             username=self.username,
+            no_change_timeout_seconds=int(
+                os.environ.get('NO_CHANGE_TIMEOUT_SECONDS', 30)
+            ),
         )
         self.bash_session.initialize()
         await wait_all(
@@ -163,7 +166,7 @@ async def _init_bash_commands(self):
         logger.debug(f'Initializing by running {len(INIT_COMMANDS)} bash commands...')
         for command in INIT_COMMANDS:
             action = CmdRunAction(command=command)
-            action.timeout = 300
+            action.set_hard_timeout(300)
             logger.debug(f'Executing init command: {command}')
             obs = await self.run(action)
             assert isinstance(obs, CmdOutputObservation)
diff --git a/openhands/runtime/base.py b/openhands/runtime/base.py
index 114289f390b2..94a059f06aa3 100644
--- a/openhands/runtime/base.py
+++ b/openhands/runtime/base.py
@@ -182,7 +182,8 @@ def on_event(self, event: Event) -> None:
 
     async def _handle_action(self, event: Action) -> None:
         if event.timeout is None:
-            event.timeout = self.config.sandbox.timeout
+            # We don't block the command if this is a default timeout action
+            event.set_hard_timeout(self.config.sandbox.timeout, blocking=False)
         assert event.timeout is not None
         try:
             observation: Observation = await call_sync_from_async(
diff --git a/openhands/runtime/impl/action_execution/action_execution_client.py b/openhands/runtime/impl/action_execution/action_execution_client.py
index 24fb8250b30e..f8c93dd5561f 100644
--- a/openhands/runtime/impl/action_execution/action_execution_client.py
+++ b/openhands/runtime/impl/action_execution/action_execution_client.py
@@ -216,7 +216,8 @@ def send_action_for_execution(self, action: Action) -> Observation:
 
         # set timeout to default if not set
         if action.timeout is None:
-            action.timeout = self.config.sandbox.timeout
+            # We don't block the command if this is a default timeout action
+            action.set_hard_timeout(self.config.sandbox.timeout, blocking=False)
 
         with self.action_semaphore:
             if not action.runnable:
diff --git a/openhands/runtime/impl/docker/docker_runtime.py b/openhands/runtime/impl/docker/docker_runtime.py
index 5111f0f36831..bf06e00e854f 100644
--- a/openhands/runtime/impl/docker/docker_runtime.py
+++ b/openhands/runtime/impl/docker/docker_runtime.py
@@ -228,6 +228,8 @@ def _init_container(self):
         }
         if self.config.debug or DEBUG:
             environment['DEBUG'] = 'true'
+        # also update with runtime_startup_env_vars
+        environment.update(self.config.sandbox.runtime_startup_env_vars)
 
         self.log('debug', f'Workspace Base: {self.config.workspace_base}')
         if (
diff --git a/openhands/runtime/utils/bash.py b/openhands/runtime/utils/bash.py
index 351d990dcda6..87b2ae405f1d 100644
--- a/openhands/runtime/utils/bash.py
+++ b/openhands/runtime/utils/bash.py
@@ -174,7 +174,7 @@ def __init__(
         self,
         work_dir: str,
         username: str | None = None,
-        no_change_timeout_seconds: float = 30.0,
+        no_change_timeout_seconds: int = 30,
     ):
         self.NO_CHANGE_TIMEOUT_SECONDS = no_change_timeout_seconds
         self.work_dir = work_dir
@@ -369,7 +369,7 @@ def _handle_nochange_timeout_command(
             command,
             raw_command_output,
             metadata,
-            continue_prefix='[Command output continued from previous command]\n',
+            continue_prefix='[Below is the output of the previous command.]\n',
         )
         return CmdOutputObservation(
             content=command_output,
@@ -404,7 +404,7 @@ def _handle_hard_timeout_command(
             command,
             raw_command_output,
             metadata,
-            continue_prefix='[Command output continued from previous command]\n',
+            continue_prefix='[Below is the output of the previous command.]\n',
         )
 
         return CmdOutputObservation(
@@ -441,6 +441,8 @@ def _combine_outputs_between_matches(
             else:
                 # The command output is the content after the last PS1 prompt
                 return pane_content[ps1_matches[0].end() + 1 :]
+        elif len(ps1_matches) == 0:
+            return pane_content
         combined_output = ''
         for i in range(len(ps1_matches) - 1):
             # Extract content between current and next PS1 prompt
@@ -459,6 +461,9 @@ def execute(self, action: CmdRunAction) -> CmdOutputObservation | ErrorObservati
         # Strip the command of any leading/trailing whitespace
         logger.debug(f'RECEIVED ACTION: {action}')
         command = action.command.strip()
+        is_special_key = self._is_special_key(command)
+
+        # Handle when prev command is hard timeout
 
         if command == '' and self.prev_status not in {
             BashCommandStatus.CONTINUE,
@@ -486,13 +491,45 @@ def execute(self, action: CmdRunAction) -> CmdOutputObservation | ErrorObservati
         last_change_time = start_time
         last_pane_output = self._get_pane_content()
 
-        if command != '':
+        # Do not check hard timeout if the command is a special key
+        if command != '' and is_special_key:
+            logger.debug(f'SENDING SPECIAL KEY: {command!r}')
+            self.pane.send_keys(command, enter=False)
+        # When prev command is hard timeout, and we are trying to execute new command
+        elif self.prev_status == BashCommandStatus.HARD_TIMEOUT and command != '':
+            if not last_pane_output.endswith(CMD_OUTPUT_PS1_END):
+                _ps1_matches = CmdOutputMetadata.matches_ps1_metadata(last_pane_output)
+                raw_command_output = self._combine_outputs_between_matches(
+                    last_pane_output, _ps1_matches
+                )
+                metadata = CmdOutputMetadata()  # No metadata available
+                metadata.suffix = (
+                    f'\n[Your command "{command}" is NOT executed. '
+                    f'The previous command was timed out but still running. Above is the output of the previous command. '
+                    "You may wait longer to see additional output of the previous command by sending empty command '', "
+                    'send other commands to interact with the current process, '
+                    'or send keys ("C-c", "C-z", "C-d") to interrupt/kill the previous command before sending your new command.]'
+                )
+                command_output = self._get_command_output(
+                    command,
+                    raw_command_output,
+                    metadata,
+                    continue_prefix='[Below is the output of the previous command.]\n',
+                )
+                return CmdOutputObservation(
+                    command=command,
+                    content=command_output,
+                    metadata=metadata,
+                )
+        # Only send the command to the pane if it's not a special key and it's not empty
+        # AND previous hard timeout command is resolved
+        elif command != '' and not is_special_key:
             # convert command to raw string
             command = escape_bash_special_chars(command)
             logger.debug(f'SENDING COMMAND: {command!r}')
             self.pane.send_keys(
                 command,
-                enter=not self._is_special_key(command),
+                enter=True,
             )
 
         # Loop until the command completes or times out
@@ -525,7 +562,7 @@ def execute(self, action: CmdRunAction) -> CmdOutputObservation | ErrorObservati
             # We ignore this if the command is *blocking
             time_since_last_change = time.time() - last_change_time
             logger.debug(
-                f'CHECKING NO CHANGE TIMEOUT ({self.NO_CHANGE_TIMEOUT_SECONDS}s): elapsed {time_since_last_change}'
+                f'CHECKING NO CHANGE TIMEOUT ({self.NO_CHANGE_TIMEOUT_SECONDS}s): elapsed {time_since_last_change}. Action blocking: {action.blocking}'
             )
             if (
                 not action.blocking
diff --git a/tests/runtime/test_bash.py b/tests/runtime/test_bash.py
index 3a25fd01ddee..828c859f11dd 100644
--- a/tests/runtime/test_bash.py
+++ b/tests/runtime/test_bash.py
@@ -1,6 +1,7 @@
 """Bash-related tests for the EventStreamRuntime, which connects to the ActionExecutor running in the sandbox."""
 
 import os
+import time
 from pathlib import Path
 
 import pytest
@@ -45,7 +46,7 @@ def test_bash_server(temp_dir, runtime_cls, run_as_openhands):
     runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
     try:
         action = CmdRunAction(command='python3 -m http.server 8080')
-        action.timeout = 1
+        action.set_hard_timeout(1)
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
         assert isinstance(obs, CmdOutputObservation)
@@ -57,7 +58,7 @@ def test_bash_server(temp_dir, runtime_cls, run_as_openhands):
         )
 
         action = CmdRunAction(command='C-c')
-        action.timeout = 30
+        action.set_hard_timeout(30)
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
         assert isinstance(obs, CmdOutputObservation)
@@ -66,7 +67,7 @@ def test_bash_server(temp_dir, runtime_cls, run_as_openhands):
         assert '/workspace' in obs.metadata.working_dir
 
         action = CmdRunAction(command='ls')
-        action.timeout = 1
+        action.set_hard_timeout(1)
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
         assert isinstance(obs, CmdOutputObservation)
@@ -76,7 +77,7 @@ def test_bash_server(temp_dir, runtime_cls, run_as_openhands):
 
         # run it again!
         action = CmdRunAction(command='python3 -m http.server 8080')
-        action.timeout = 1
+        action.set_hard_timeout(1)
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
         assert isinstance(obs, CmdOutputObservation)
@@ -555,15 +556,20 @@ def test_basic_command(temp_dir, runtime_cls, run_as_openhands):
 
 
 def test_interactive_command(temp_dir, runtime_cls, run_as_openhands):
-    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    runtime = _load_runtime(
+        temp_dir,
+        runtime_cls,
+        run_as_openhands,
+        runtime_startup_env_vars={'NO_CHANGE_TIMEOUT_SECONDS': '1'},
+    )
     try:
         # Test interactive command
         action = CmdRunAction('read -p "Enter name: " name && echo "Hello $name"')
-        action.timeout = 1
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        # assert 'Enter name:' in obs.content # FIXME: this is not working
-        assert '[The command timed out after 1 seconds.' in obs.metadata.suffix
+        # This should trigger SOFT timeout, so no need to set hard timeout
+        assert 'Enter name:' in obs.content
+        assert '[The command has no new output after 1 seconds.' in obs.metadata.suffix
 
         action = CmdRunAction('John')
         obs = runtime.run_action(action)
@@ -590,7 +596,7 @@ def test_long_output(temp_dir, runtime_cls, run_as_openhands):
     try:
         # Generate a long output
         action = CmdRunAction('for i in $(seq 1 5000); do echo "Line $i"; done')
-        action.timeout = 10
+        action.set_hard_timeout(10)
         obs = runtime.run_action(action)
         assert obs.exit_code == 0
         assert 'Line 1' in obs.content
@@ -604,7 +610,7 @@ def test_long_output_exceed_history_limit(temp_dir, runtime_cls, run_as_openhand
     try:
         # Generate a long output
         action = CmdRunAction('for i in $(seq 1 50000); do echo "Line $i"; done')
-        action.timeout = 30
+        action.set_hard_timeout(30)
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
         assert obs.exit_code == 0
@@ -621,13 +627,13 @@ def test_long_output_from_nested_directories(temp_dir, runtime_cls, run_as_openh
         # Create nested directories with many files
         setup_cmd = 'mkdir -p /tmp/test_dir && cd /tmp/test_dir && for i in $(seq 1 100); do mkdir -p "folder_$i"; for j in $(seq 1 100); do touch "folder_$i/file_$j.txt"; done; done'
         setup_action = CmdRunAction(setup_cmd.strip())
-        setup_action.timeout = 60
+        setup_action.set_hard_timeout(60)
         obs = runtime.run_action(setup_action)
         assert obs.exit_code == 0
 
         # List the directory structure recursively
         action = CmdRunAction('ls -R /tmp/test_dir')
-        action.timeout = 60
+        action.set_hard_timeout(60)
         obs = runtime.run_action(action)
         assert obs.exit_code == 0
 
@@ -672,7 +678,7 @@ def test_command_output_continuation(temp_dir, runtime_cls, run_as_openhands):
     try:
         # Start a command that produces output slowly
         action = CmdRunAction('for i in {1..5}; do echo $i; sleep 3; done')
-        action.timeout = 2.5  # Set timeout to 2.5 seconds
+        action.set_hard_timeout(2.5)
         obs = runtime.run_action(action)
         assert obs.content.strip() == '1'
         assert obs.metadata.prefix == ''
@@ -680,20 +686,19 @@ def test_command_output_continuation(temp_dir, runtime_cls, run_as_openhands):
 
         # Continue watching output
         action = CmdRunAction('')
-        action.timeout = 2.5
+        action.set_hard_timeout(2.5)
         obs = runtime.run_action(action)
-        assert '[Command output continued from previous command]' in obs.metadata.prefix
+        assert '[Below is the output of the previous command.]' in obs.metadata.prefix
         assert obs.content.strip() == '2'
         assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix
 
         # Continue until completion
         for expected in ['3', '4', '5']:
             action = CmdRunAction('')
-            action.timeout = 2.5
+            action.set_hard_timeout(2.5)
             obs = runtime.run_action(action)
             assert (
-                '[Command output continued from previous command]'
-                in obs.metadata.prefix
+                '[Below is the output of the previous command.]' in obs.metadata.prefix
             )
             assert obs.content.strip() == expected
             assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix
@@ -713,8 +718,7 @@ def test_long_running_command_follow_by_execute(
     try:
         # Test command that produces output slowly
         action = CmdRunAction('for i in {1..3}; do echo $i; sleep 3; done')
-        action.timeout = 2.5
-        action.blocking = False
+        action.set_hard_timeout(2.5)
         obs = runtime.run_action(action)
         assert '1' in obs.content  # First number should appear before timeout
         assert obs.metadata.exit_code == -1  # -1 indicates command is still running
@@ -723,25 +727,32 @@ def test_long_running_command_follow_by_execute(
 
         # Continue watching output
         action = CmdRunAction('')
-        action.timeout = 2.5
+        action.set_hard_timeout(2.5)
         obs = runtime.run_action(action)
         assert '2' in obs.content
-        assert (
-            obs.metadata.prefix == '[Command output continued from previous command]\n'
-        )
+        assert obs.metadata.prefix == '[Below is the output of the previous command.]\n'
         assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix
         assert obs.metadata.exit_code == -1  # -1 indicates command is still running
 
         # Test command that produces no output
         action = CmdRunAction('sleep 15')
-        action.timeout = 2.5
+        action.set_hard_timeout(2.5)
         obs = runtime.run_action(action)
-        assert '3' in obs.content
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert '3' not in obs.content
+        assert obs.metadata.prefix == '[Below is the output of the previous command.]\n'
         assert (
-            obs.metadata.prefix == '[Command output continued from previous command]\n'
+            'The previous command was timed out but still running.'
+            in obs.metadata.suffix
         )
-        assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix
         assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+
+        # Finally continue again
+        action = CmdRunAction('')
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert '3' in obs.content
+        assert '[The command completed with exit code 0.]' in obs.metadata.suffix
     finally:
         _close_test_runtime(runtime)
 
@@ -783,3 +794,96 @@ def test_python_interactive_input(temp_dir, runtime_cls, run_as_openhands):
         assert '[The command completed with exit code 0.]' in obs.metadata.suffix
     finally:
         _close_test_runtime(runtime)
+
+
+def test_stress_long_output_with_soft_and_hard_timeout(
+    temp_dir, runtime_cls, run_as_openhands
+):
+    runtime = _load_runtime(
+        temp_dir,
+        runtime_cls,
+        run_as_openhands,
+        runtime_startup_env_vars={'NO_CHANGE_TIMEOUT_SECONDS': '1'},
+        docker_runtime_kwargs={
+            'cpu_period': 100000,  # 100ms
+            'cpu_quota': 100000,  # Can use 100ms out of each 100ms period (1 CPU)
+            'mem_limit': '4G',  # 4 GB of memory
+        },
+    )
+    try:
+        # Run a command that generates long output multiple times
+        for i in range(10):
+            start_time = time.time()
+
+            # Check tmux memory usage (in KB)
+            mem_action = CmdRunAction(
+                'ps aux | awk \'{printf "%8.1f KB  %s\\n", $6, $0}\' | sort -nr | grep "/usr/bin/tmux" | grep -v grep | awk \'{print $1}\''
+            )
+            mem_obs = runtime.run_action(mem_action)
+            assert mem_obs.exit_code == 0
+            logger.info(
+                f'Tmux memory usage (iteration {i}): {mem_obs.content.strip()} KB'
+            )
+
+            # Check action_execution_server mem
+            mem_action = CmdRunAction(
+                'ps aux | awk \'{printf "%8.1f KB  %s\\n", $6, $0}\' | sort -nr | grep "action_execution_server" | grep "/openhands/poetry" | grep -v grep | awk \'{print $1}\''
+            )
+            mem_obs = runtime.run_action(mem_action)
+            assert mem_obs.exit_code == 0
+            logger.info(
+                f'Action execution server memory usage (iteration {i}): {mem_obs.content.strip()} KB'
+            )
+
+            # Test soft timeout
+            action = CmdRunAction(
+                'read -p "Do you want to continue? [Y/n] " answer; if [[ $answer == "Y" ]]; then echo "Proceeding with operation..."; echo "Operation completed successfully!"; else echo "Operation cancelled."; exit 1; fi'
+            )
+            obs = runtime.run_action(action)
+            assert 'Do you want to continue?' in obs.content
+            assert obs.exit_code == -1  # Command is still running, waiting for input
+
+            # Send the confirmation
+            action = CmdRunAction('Y')
+            obs = runtime.run_action(action)
+            assert 'Proceeding with operation...' in obs.content
+            assert 'Operation completed successfully!' in obs.content
+            assert obs.exit_code == 0
+            assert '[The command completed with exit code 0.]' in obs.metadata.suffix
+
+            # Test hard timeout w/ long output
+            # Generate long output with 1000 asterisks per line
+            action = CmdRunAction(
+                f'export i={i}; for j in $(seq 1 100); do echo "Line $j - Iteration $i - $(printf \'%1000s\' | tr " " "*")"; sleep 1; done'
+            )
+            action.set_hard_timeout(2)
+            obs = runtime.run_action(action)
+
+            # Verify the output
+            assert obs.exit_code == -1
+            assert f'Line 1 - Iteration {i}' in obs.content
+            # assert f'Line 1000 - Iteration {i}' in obs.content
+            # assert '[The command completed with exit code 0.]' in obs.metadata.suffix
+
+            # Because hard-timeout is triggered, the terminal will in a weird state
+            # where it will not accept any new commands.
+            obs = runtime.run_action(CmdRunAction('ls'))
+            assert obs.exit_code == -1
+            assert (
+                'The previous command was timed out but still running.'
+                in obs.metadata.suffix
+            )
+
+            # We need to send a Ctrl+C to reset the terminal.
+            obs = runtime.run_action(CmdRunAction('C-c'))
+            assert obs.exit_code == 130
+
+            # Now make sure the terminal is in a good state
+            obs = runtime.run_action(CmdRunAction('ls'))
+            assert obs.exit_code == 0
+
+            duration = time.time() - start_time
+            logger.info(f'Completed iteration {i} in {duration:.2f} seconds')
+
+    finally:
+        _close_test_runtime(runtime)
diff --git a/tests/runtime/test_stress_docker_runtime.py b/tests/runtime/test_stress_docker_runtime.py
index d0e141ee3142..6e8a9d5957e8 100644
--- a/tests/runtime/test_stress_docker_runtime.py
+++ b/tests/runtime/test_stress_docker_runtime.py
@@ -28,7 +28,7 @@ def test_stress_docker_runtime(temp_dir, runtime_cls, repeat=1):
     for _ in range(repeat):
         # run stress-ng stress tests for 1 minute
         action = CmdRunAction(command='stress-ng --all 1 -t 1m')
-        action.timeout = 120
+        action.set_hard_timeout(120)
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
diff --git a/tests/runtime/test_stress_remote_runtime.py b/tests/runtime/test_stress_remote_runtime.py
index 367af20467be..a2f6c7d2082b 100644
--- a/tests/runtime/test_stress_remote_runtime.py
+++ b/tests/runtime/test_stress_remote_runtime.py
@@ -92,14 +92,14 @@ def initialize_runtime(
     obs: CmdOutputObservation
 
     action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}')
 
     action = CmdRunAction(command='mkdir -p /dummy_dir')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
diff --git a/tests/unit/test_bash_session.py b/tests/unit/test_bash_session.py
index 13f32ad27d25..fc29eaffb2a5 100644
--- a/tests/unit/test_bash_session.py
+++ b/tests/unit/test_bash_session.py
@@ -94,7 +94,7 @@ def test_long_running_command_follow_by_execute():
     obs = session.execute(CmdRunAction(''))
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert '2' in obs.content
-    assert obs.metadata.prefix == '[Command output continued from previous command]\n'
+    assert obs.metadata.prefix == '[Below is the output of the previous command.]\n'
     assert obs.metadata.suffix == (
         '\n[The command has no new output after 2 seconds. '
         "You may wait longer to see additional output by sending empty command '', "
@@ -108,7 +108,7 @@ def test_long_running_command_follow_by_execute():
     obs = session.execute(CmdRunAction('sleep 15'))
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert '3' in obs.content
-    assert obs.metadata.prefix == '[Command output continued from previous command]\n'
+    assert obs.metadata.prefix == '[Below is the output of the previous command.]\n'
     assert obs.metadata.suffix == (
         '\n[The command has no new output after 2 seconds. '
         "You may wait longer to see additional output by sending empty command '', "
@@ -175,7 +175,7 @@ def test_interactive_command():
         'send other commands to interact with the current process, '
         'or send keys to interrupt/kill the command.]'
     )
-    assert obs.metadata.prefix == '[Command output continued from previous command]\n'
+    assert obs.metadata.prefix == '[Below is the output of the previous command.]\n'
 
     obs = session.execute(CmdRunAction('line 2'))
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -187,7 +187,7 @@ def test_interactive_command():
         'send other commands to interact with the current process, '
         'or send keys to interrupt/kill the command.]'
     )
-    assert obs.metadata.prefix == '[Command output continued from previous command]\n'
+    assert obs.metadata.prefix == '[Below is the output of the previous command.]\n'
 
     obs = session.execute(CmdRunAction('EOF'))
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -266,14 +266,14 @@ def test_command_output_continuation():
 
     obs = session.execute(CmdRunAction(''))
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert '[Command output continued from previous command]' in obs.metadata.prefix
+    assert '[Below is the output of the previous command.]' in obs.metadata.prefix
     assert obs.content.strip() == '2'
     assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix
     assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
 
     obs = session.execute(CmdRunAction(''))
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert '[Command output continued from previous command]' in obs.metadata.prefix
+    assert '[Below is the output of the previous command.]' in obs.metadata.prefix
     assert obs.content.strip() == '3'
 
     assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix
@@ -281,14 +281,14 @@ def test_command_output_continuation():
 
     obs = session.execute(CmdRunAction(''))
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert '[Command output continued from previous command]' in obs.metadata.prefix
+    assert '[Below is the output of the previous command.]' in obs.metadata.prefix
     assert obs.content.strip() == '4'
     assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix
     assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
 
     obs = session.execute(CmdRunAction(''))
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert '[Command output continued from previous command]' in obs.metadata.prefix
+    assert '[Below is the output of the previous command.]' in obs.metadata.prefix
     assert obs.content.strip() == '5'
     assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix
     assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
diff --git a/tests/unit/test_runtime_reboot.py b/tests/unit/test_runtime_reboot.py
index e3ae31815a3e..c78fbe029ad6 100644
--- a/tests/unit/test_runtime_reboot.py
+++ b/tests/unit/test_runtime_reboot.py
@@ -27,7 +27,7 @@ def runtime(mock_session):
 def test_runtime_timeout_error(runtime, mock_session):
     # Create a command action
     action = CmdRunAction(command='test command')
-    action.timeout = 120
+    action.set_hard_timeout(120)
 
     # Mock the runtime to raise a timeout error
     runtime.send_action_for_execution.side_effect = AgentRuntimeTimeoutError(
@@ -78,7 +78,7 @@ def test_runtime_disconnected_error(
 
     # Create a command action
     action = CmdRunAction(command='test command')
-    action.timeout = 120
+    action.set_hard_timeout(120)
 
     # Verify that the error message is correct
     with pytest.raises(AgentRuntimeDisconnectedError) as exc_info: