From 0bed17758f05aee421bb0148ab0d6df59bd574fe Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 16 Jan 2025 12:27:00 -0500 Subject: [PATCH] fix: incorrect soft-timeout implementation & fix hard-timeout follow-up command (#6280) --- .../benchmarks/commit0_bench/run_infer.py | 24 +-- evaluation/benchmarks/swe_bench/eval_infer.py | 10 +- evaluation/benchmarks/swe_bench/run_infer.py | 30 ++-- .../benchmarks/the_agent_company/browsing.py | 2 +- .../benchmarks/the_agent_company/run_infer.py | 4 +- openhands/events/event.py | 10 +- openhands/events/serialization/action.py | 3 +- openhands/resolver/resolve_issue.py | 2 +- openhands/runtime/action_execution_server.py | 5 +- openhands/runtime/base.py | 3 +- .../action_execution_client.py | 3 +- .../runtime/impl/docker/docker_runtime.py | 2 + openhands/runtime/utils/bash.py | 49 +++++- tests/runtime/test_bash.py | 160 +++++++++++++++--- tests/runtime/test_stress_docker_runtime.py | 2 +- tests/runtime/test_stress_remote_runtime.py | 4 +- tests/unit/test_bash_session.py | 16 +- tests/unit/test_runtime_reboot.py | 4 +- 18 files changed, 243 insertions(+), 90 deletions(-) diff --git a/evaluation/benchmarks/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py index d8f1f64b1a6b..e690952ab9a9 100644 --- a/evaluation/benchmarks/commit0_bench/run_infer.py +++ b/evaluation/benchmarks/commit0_bench/run_infer.py @@ -171,7 +171,7 @@ def initialize_runtime( action = CmdRunAction( command=f'git clone -b commit0_combined https://github.com/{instance["repo"]}.git' ) - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -181,7 +181,7 @@ def initialize_runtime( ) action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -191,7 +191,7 @@ def initialize_runtime( ) action = CmdRunAction(command='git checkout -b openhands') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -201,7 +201,7 @@ def initialize_runtime( # Install commit0 action = CmdRunAction(command='/root/.cargo/bin/uv pip install commit0') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) # logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -231,7 +231,7 @@ def complete_runtime( workspace_dir_name = _get_commit0_workspace_dir_name(instance) action = CmdRunAction(command='git add .') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -241,7 +241,7 @@ def complete_runtime( ) action = CmdRunAction(command='git commit -m "openhands edits"') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -258,7 +258,7 @@ def complete_runtime( action = CmdRunAction( command=f"git diff {instance['base_commit']} HEAD -- . ':(exclude)spec.pdf.bz2'" ) - action.timeout = 600 + 100 * n_retries + action.set_hard_timeout(600 + 100 * n_retries) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) # logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -282,7 +282,7 @@ def complete_runtime( action = CmdRunAction( command=f"{instance['test']['test_cmd']} --json-report --json-report-file=report.json --continue-on-collection-errors {test_dir} > test_output.txt 2>&1" ) - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -292,7 +292,7 @@ def complete_runtime( ) # Read test output action = CmdRunAction(command='cat test_output.txt') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) # logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -305,7 +305,7 @@ def complete_runtime( # Save pytest exit code action = CmdRunAction(command='echo $?') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) # logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -318,7 +318,7 @@ def complete_runtime( # Read the test report action = CmdRunAction(command='cat report.json') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) # logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -330,7 +330,7 @@ def complete_runtime( repo_name = instance['repo'].split('/')[1] repo_name = repo_name.replace('.', '-') action = CmdRunAction(command=f'commit0 get-tests {repo_name}') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) # logger.info(obs, extra={'msg_type': 'OBSERVATION'}) diff --git a/evaluation/benchmarks/swe_bench/eval_infer.py b/evaluation/benchmarks/swe_bench/eval_infer.py index 7beacf344408..ab7feb38b678 100644 --- a/evaluation/benchmarks/swe_bench/eval_infer.py +++ b/evaluation/benchmarks/swe_bench/eval_infer.py @@ -174,7 +174,7 @@ def process_instance( # Set +x action = CmdRunAction(command='chmod +x /tmp/eval.sh') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -189,7 +189,7 @@ def process_instance( "echo 'APPLY_PATCH_FAIL')))" ) action = CmdRunAction(command=exec_command) - action.timeout = 600 + action.set_hard_timeout(600) obs = runtime.run_action(action) assert isinstance(obs, CmdOutputObservation) apply_patch_output = obs.content @@ -212,7 +212,7 @@ def process_instance( # Run eval script in background and save output to log file log_file = '/tmp/eval_output.log' action = CmdRunAction(command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!') - action.timeout = 60 # Short timeout just to get the process ID + action.set_hard_timeout(60) # Short timeout just to get the process ID obs = runtime.run_action(action) if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0: @@ -235,7 +235,7 @@ def process_instance( check_action = CmdRunAction( command=f'ps -p {pid} > /dev/null; echo $?' ) - check_action.timeout = 60 + check_action.set_hard_timeout(60) check_obs = runtime.run_action(check_action) if ( isinstance(check_obs, CmdOutputObservation) @@ -252,7 +252,7 @@ def process_instance( # Read the log file cat_action = CmdRunAction(command=f'cat {log_file}') - cat_action.timeout = 300 + cat_action.set_hard_timeout(300) cat_obs = runtime.run_action(cat_action) # Grade answer diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py index bf065ada9734..ac9e85b60b10 100644 --- a/evaluation/benchmarks/swe_bench/run_infer.py +++ b/evaluation/benchmarks/swe_bench/run_infer.py @@ -173,7 +173,7 @@ def initialize_runtime( action = CmdRunAction( command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc""" ) - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -182,7 +182,7 @@ def initialize_runtime( ) action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """) - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -194,7 +194,7 @@ def initialize_runtime( # inject the instance info action = CmdRunAction(command='mkdir -p /swe_util/eval_data/instances') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -223,14 +223,14 @@ def initialize_runtime( '/swe_util/', ) action = CmdRunAction(command='cat ~/.bashrc') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert_and_raise(obs.exit_code == 0, f'Failed to cat ~/.bashrc: {str(obs)}') action = CmdRunAction(command='source ~/.bashrc') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -239,7 +239,7 @@ def initialize_runtime( assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}') action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh') - action.timeout = 3600 + action.set_hard_timeout(3600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -249,7 +249,7 @@ def initialize_runtime( ) else: action = CmdRunAction(command='source /swe_util/swe_entry.sh') - action.timeout = 1800 + action.set_hard_timeout(1800) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -259,7 +259,7 @@ def initialize_runtime( ) action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -269,7 +269,7 @@ def initialize_runtime( ) action = CmdRunAction(command='git reset --hard') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -278,14 +278,14 @@ def initialize_runtime( action = CmdRunAction( command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done' ) - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}') action = CmdRunAction(command='which python') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -316,7 +316,7 @@ def complete_runtime( workspace_dir_name = _get_swebench_workspace_dir_name(instance) action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -326,7 +326,7 @@ def complete_runtime( ) action = CmdRunAction(command='git config --global core.pager ""') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -336,7 +336,7 @@ def complete_runtime( ) action = CmdRunAction(command='git add -A') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -351,7 +351,7 @@ def complete_runtime( action = CmdRunAction( command=f'git diff --no-color --cached {instance["base_commit"]}' ) - action.timeout = 600 + 100 * n_retries + action.set_hard_timeout(600 + 100 * n_retries) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) diff --git a/evaluation/benchmarks/the_agent_company/browsing.py b/evaluation/benchmarks/the_agent_company/browsing.py index 7384dddbdfce..5ce97129777a 100644 --- a/evaluation/benchmarks/the_agent_company/browsing.py +++ b/evaluation/benchmarks/the_agent_company/browsing.py @@ -262,7 +262,7 @@ def pre_login( instruction = action.to_instruction() browser_action = BrowseInteractiveAction(browser_actions=instruction) - browser_action.timeout = 10000 + browser_action.set_hard_timeout(10000) logger.info(browser_action, extra={'msg_type': 'ACTION'}) obs: BrowserOutputObservation = runtime.run_action(browser_action) logger.debug(obs, extra={'msg_type': 'OBSERVATION'}) diff --git a/evaluation/benchmarks/the_agent_company/run_infer.py b/evaluation/benchmarks/the_agent_company/run_infer.py index a82db6d56081..8f8a1b599e6f 100644 --- a/evaluation/benchmarks/the_agent_company/run_infer.py +++ b/evaluation/benchmarks/the_agent_company/run_infer.py @@ -86,7 +86,7 @@ def init_task_env(runtime: Runtime, hostname: str, env_llm_config: LLMConfig): 'bash /utils/init.sh' ) action = CmdRunAction(command=command) - action.timeout = 900 + action.set_hard_timeout(900) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -172,7 +172,7 @@ def run_evaluator( f'python_default /utils/eval.py --trajectory_path {trajectory_path} --result_path {result_path}' ) action = CmdRunAction(command=command) - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) diff --git a/openhands/events/event.py b/openhands/events/event.py index 6c7a2d8a3ac1..1bdece59eb75 100644 --- a/openhands/events/event.py +++ b/openhands/events/event.py @@ -64,8 +64,12 @@ def timeout(self) -> int | None: return self._timeout # type: ignore[attr-defined] return None - @timeout.setter - def timeout(self, value: int | None) -> None: + def set_hard_timeout(self, value: int | None, blocking: bool = True) -> None: + """Set the timeout for the event. + + NOTE, this is a hard timeout, meaning that the event will be blocked + until the timeout is reached. + """ self._timeout = value if value is not None and value > 600: from openhands.core.logger import openhands_logger as logger @@ -78,7 +82,7 @@ def timeout(self, value: int | None) -> None: # Check if .blocking is an attribute of the event if hasattr(self, 'blocking'): # .blocking needs to be set to True if .timeout is set - self.blocking = True + self.blocking = blocking # optional metadata, LLM call cost of the edit @property diff --git a/openhands/events/serialization/action.py b/openhands/events/serialization/action.py index 90945c1d4dfd..be9990750fc6 100644 --- a/openhands/events/serialization/action.py +++ b/openhands/events/serialization/action.py @@ -74,7 +74,8 @@ def action_from_dict(action: dict) -> Action: try: decoded_action = action_class(**args) if 'timeout' in action: - decoded_action.timeout = action['timeout'] + blocking = args.get('blocking', False) + decoded_action.set_hard_timeout(action['timeout'], blocking=blocking) # Set timestamp if it was provided if timestamp: diff --git a/openhands/resolver/resolve_issue.py b/openhands/resolver/resolve_issue.py index f50b37d79447..4e0b2b4ad96c 100644 --- a/openhands/resolver/resolve_issue.py +++ b/openhands/resolver/resolve_issue.py @@ -118,7 +118,7 @@ async def complete_runtime( git_patch = None while n_retries < 5: action = CmdRunAction(command=f'git diff --no-color --cached {base_commit}') - action.timeout = 600 + 100 * n_retries + action.set_hard_timeout(600 + 100 * n_retries) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py index b483c183cbdc..8a5fcdc0edd9 100644 --- a/openhands/runtime/action_execution_server.py +++ b/openhands/runtime/action_execution_server.py @@ -120,6 +120,9 @@ async def ainit(self): self.bash_session = BashSession( work_dir=self._initial_cwd, username=self.username, + no_change_timeout_seconds=int( + os.environ.get('NO_CHANGE_TIMEOUT_SECONDS', 30) + ), ) self.bash_session.initialize() await wait_all( @@ -163,7 +166,7 @@ async def _init_bash_commands(self): logger.debug(f'Initializing by running {len(INIT_COMMANDS)} bash commands...') for command in INIT_COMMANDS: action = CmdRunAction(command=command) - action.timeout = 300 + action.set_hard_timeout(300) logger.debug(f'Executing init command: {command}') obs = await self.run(action) assert isinstance(obs, CmdOutputObservation) diff --git a/openhands/runtime/base.py b/openhands/runtime/base.py index 114289f390b2..94a059f06aa3 100644 --- a/openhands/runtime/base.py +++ b/openhands/runtime/base.py @@ -182,7 +182,8 @@ def on_event(self, event: Event) -> None: async def _handle_action(self, event: Action) -> None: if event.timeout is None: - event.timeout = self.config.sandbox.timeout + # We don't block the command if this is a default timeout action + event.set_hard_timeout(self.config.sandbox.timeout, blocking=False) assert event.timeout is not None try: observation: Observation = await call_sync_from_async( diff --git a/openhands/runtime/impl/action_execution/action_execution_client.py b/openhands/runtime/impl/action_execution/action_execution_client.py index 24fb8250b30e..f8c93dd5561f 100644 --- a/openhands/runtime/impl/action_execution/action_execution_client.py +++ b/openhands/runtime/impl/action_execution/action_execution_client.py @@ -216,7 +216,8 @@ def send_action_for_execution(self, action: Action) -> Observation: # set timeout to default if not set if action.timeout is None: - action.timeout = self.config.sandbox.timeout + # We don't block the command if this is a default timeout action + action.set_hard_timeout(self.config.sandbox.timeout, blocking=False) with self.action_semaphore: if not action.runnable: diff --git a/openhands/runtime/impl/docker/docker_runtime.py b/openhands/runtime/impl/docker/docker_runtime.py index 5111f0f36831..bf06e00e854f 100644 --- a/openhands/runtime/impl/docker/docker_runtime.py +++ b/openhands/runtime/impl/docker/docker_runtime.py @@ -228,6 +228,8 @@ def _init_container(self): } if self.config.debug or DEBUG: environment['DEBUG'] = 'true' + # also update with runtime_startup_env_vars + environment.update(self.config.sandbox.runtime_startup_env_vars) self.log('debug', f'Workspace Base: {self.config.workspace_base}') if ( diff --git a/openhands/runtime/utils/bash.py b/openhands/runtime/utils/bash.py index 351d990dcda6..87b2ae405f1d 100644 --- a/openhands/runtime/utils/bash.py +++ b/openhands/runtime/utils/bash.py @@ -174,7 +174,7 @@ def __init__( self, work_dir: str, username: str | None = None, - no_change_timeout_seconds: float = 30.0, + no_change_timeout_seconds: int = 30, ): self.NO_CHANGE_TIMEOUT_SECONDS = no_change_timeout_seconds self.work_dir = work_dir @@ -369,7 +369,7 @@ def _handle_nochange_timeout_command( command, raw_command_output, metadata, - continue_prefix='[Command output continued from previous command]\n', + continue_prefix='[Below is the output of the previous command.]\n', ) return CmdOutputObservation( content=command_output, @@ -404,7 +404,7 @@ def _handle_hard_timeout_command( command, raw_command_output, metadata, - continue_prefix='[Command output continued from previous command]\n', + continue_prefix='[Below is the output of the previous command.]\n', ) return CmdOutputObservation( @@ -441,6 +441,8 @@ def _combine_outputs_between_matches( else: # The command output is the content after the last PS1 prompt return pane_content[ps1_matches[0].end() + 1 :] + elif len(ps1_matches) == 0: + return pane_content combined_output = '' for i in range(len(ps1_matches) - 1): # Extract content between current and next PS1 prompt @@ -459,6 +461,9 @@ def execute(self, action: CmdRunAction) -> CmdOutputObservation | ErrorObservati # Strip the command of any leading/trailing whitespace logger.debug(f'RECEIVED ACTION: {action}') command = action.command.strip() + is_special_key = self._is_special_key(command) + + # Handle when prev command is hard timeout if command == '' and self.prev_status not in { BashCommandStatus.CONTINUE, @@ -486,13 +491,45 @@ def execute(self, action: CmdRunAction) -> CmdOutputObservation | ErrorObservati last_change_time = start_time last_pane_output = self._get_pane_content() - if command != '': + # Do not check hard timeout if the command is a special key + if command != '' and is_special_key: + logger.debug(f'SENDING SPECIAL KEY: {command!r}') + self.pane.send_keys(command, enter=False) + # When prev command is hard timeout, and we are trying to execute new command + elif self.prev_status == BashCommandStatus.HARD_TIMEOUT and command != '': + if not last_pane_output.endswith(CMD_OUTPUT_PS1_END): + _ps1_matches = CmdOutputMetadata.matches_ps1_metadata(last_pane_output) + raw_command_output = self._combine_outputs_between_matches( + last_pane_output, _ps1_matches + ) + metadata = CmdOutputMetadata() # No metadata available + metadata.suffix = ( + f'\n[Your command "{command}" is NOT executed. ' + f'The previous command was timed out but still running. Above is the output of the previous command. ' + "You may wait longer to see additional output of the previous command by sending empty command '', " + 'send other commands to interact with the current process, ' + 'or send keys ("C-c", "C-z", "C-d") to interrupt/kill the previous command before sending your new command.]' + ) + command_output = self._get_command_output( + command, + raw_command_output, + metadata, + continue_prefix='[Below is the output of the previous command.]\n', + ) + return CmdOutputObservation( + command=command, + content=command_output, + metadata=metadata, + ) + # Only send the command to the pane if it's not a special key and it's not empty + # AND previous hard timeout command is resolved + elif command != '' and not is_special_key: # convert command to raw string command = escape_bash_special_chars(command) logger.debug(f'SENDING COMMAND: {command!r}') self.pane.send_keys( command, - enter=not self._is_special_key(command), + enter=True, ) # Loop until the command completes or times out @@ -525,7 +562,7 @@ def execute(self, action: CmdRunAction) -> CmdOutputObservation | ErrorObservati # We ignore this if the command is *blocking time_since_last_change = time.time() - last_change_time logger.debug( - f'CHECKING NO CHANGE TIMEOUT ({self.NO_CHANGE_TIMEOUT_SECONDS}s): elapsed {time_since_last_change}' + f'CHECKING NO CHANGE TIMEOUT ({self.NO_CHANGE_TIMEOUT_SECONDS}s): elapsed {time_since_last_change}. Action blocking: {action.blocking}' ) if ( not action.blocking diff --git a/tests/runtime/test_bash.py b/tests/runtime/test_bash.py index 3a25fd01ddee..828c859f11dd 100644 --- a/tests/runtime/test_bash.py +++ b/tests/runtime/test_bash.py @@ -1,6 +1,7 @@ """Bash-related tests for the EventStreamRuntime, which connects to the ActionExecutor running in the sandbox.""" import os +import time from pathlib import Path import pytest @@ -45,7 +46,7 @@ def test_bash_server(temp_dir, runtime_cls, run_as_openhands): runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) try: action = CmdRunAction(command='python3 -m http.server 8080') - action.timeout = 1 + action.set_hard_timeout(1) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) @@ -57,7 +58,7 @@ def test_bash_server(temp_dir, runtime_cls, run_as_openhands): ) action = CmdRunAction(command='C-c') - action.timeout = 30 + action.set_hard_timeout(30) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) @@ -66,7 +67,7 @@ def test_bash_server(temp_dir, runtime_cls, run_as_openhands): assert '/workspace' in obs.metadata.working_dir action = CmdRunAction(command='ls') - action.timeout = 1 + action.set_hard_timeout(1) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) @@ -76,7 +77,7 @@ def test_bash_server(temp_dir, runtime_cls, run_as_openhands): # run it again! action = CmdRunAction(command='python3 -m http.server 8080') - action.timeout = 1 + action.set_hard_timeout(1) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) @@ -555,15 +556,20 @@ def test_basic_command(temp_dir, runtime_cls, run_as_openhands): def test_interactive_command(temp_dir, runtime_cls, run_as_openhands): - runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) + runtime = _load_runtime( + temp_dir, + runtime_cls, + run_as_openhands, + runtime_startup_env_vars={'NO_CHANGE_TIMEOUT_SECONDS': '1'}, + ) try: # Test interactive command action = CmdRunAction('read -p "Enter name: " name && echo "Hello $name"') - action.timeout = 1 obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - # assert 'Enter name:' in obs.content # FIXME: this is not working - assert '[The command timed out after 1 seconds.' in obs.metadata.suffix + # This should trigger SOFT timeout, so no need to set hard timeout + assert 'Enter name:' in obs.content + assert '[The command has no new output after 1 seconds.' in obs.metadata.suffix action = CmdRunAction('John') obs = runtime.run_action(action) @@ -590,7 +596,7 @@ def test_long_output(temp_dir, runtime_cls, run_as_openhands): try: # Generate a long output action = CmdRunAction('for i in $(seq 1 5000); do echo "Line $i"; done') - action.timeout = 10 + action.set_hard_timeout(10) obs = runtime.run_action(action) assert obs.exit_code == 0 assert 'Line 1' in obs.content @@ -604,7 +610,7 @@ def test_long_output_exceed_history_limit(temp_dir, runtime_cls, run_as_openhand try: # Generate a long output action = CmdRunAction('for i in $(seq 1 50000); do echo "Line $i"; done') - action.timeout = 30 + action.set_hard_timeout(30) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 @@ -621,13 +627,13 @@ def test_long_output_from_nested_directories(temp_dir, runtime_cls, run_as_openh # Create nested directories with many files setup_cmd = 'mkdir -p /tmp/test_dir && cd /tmp/test_dir && for i in $(seq 1 100); do mkdir -p "folder_$i"; for j in $(seq 1 100); do touch "folder_$i/file_$j.txt"; done; done' setup_action = CmdRunAction(setup_cmd.strip()) - setup_action.timeout = 60 + setup_action.set_hard_timeout(60) obs = runtime.run_action(setup_action) assert obs.exit_code == 0 # List the directory structure recursively action = CmdRunAction('ls -R /tmp/test_dir') - action.timeout = 60 + action.set_hard_timeout(60) obs = runtime.run_action(action) assert obs.exit_code == 0 @@ -672,7 +678,7 @@ def test_command_output_continuation(temp_dir, runtime_cls, run_as_openhands): try: # Start a command that produces output slowly action = CmdRunAction('for i in {1..5}; do echo $i; sleep 3; done') - action.timeout = 2.5 # Set timeout to 2.5 seconds + action.set_hard_timeout(2.5) obs = runtime.run_action(action) assert obs.content.strip() == '1' assert obs.metadata.prefix == '' @@ -680,20 +686,19 @@ def test_command_output_continuation(temp_dir, runtime_cls, run_as_openhands): # Continue watching output action = CmdRunAction('') - action.timeout = 2.5 + action.set_hard_timeout(2.5) obs = runtime.run_action(action) - assert '[Command output continued from previous command]' in obs.metadata.prefix + assert '[Below is the output of the previous command.]' in obs.metadata.prefix assert obs.content.strip() == '2' assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix # Continue until completion for expected in ['3', '4', '5']: action = CmdRunAction('') - action.timeout = 2.5 + action.set_hard_timeout(2.5) obs = runtime.run_action(action) assert ( - '[Command output continued from previous command]' - in obs.metadata.prefix + '[Below is the output of the previous command.]' in obs.metadata.prefix ) assert obs.content.strip() == expected assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix @@ -713,8 +718,7 @@ def test_long_running_command_follow_by_execute( try: # Test command that produces output slowly action = CmdRunAction('for i in {1..3}; do echo $i; sleep 3; done') - action.timeout = 2.5 - action.blocking = False + action.set_hard_timeout(2.5) obs = runtime.run_action(action) assert '1' in obs.content # First number should appear before timeout assert obs.metadata.exit_code == -1 # -1 indicates command is still running @@ -723,25 +727,32 @@ def test_long_running_command_follow_by_execute( # Continue watching output action = CmdRunAction('') - action.timeout = 2.5 + action.set_hard_timeout(2.5) obs = runtime.run_action(action) assert '2' in obs.content - assert ( - obs.metadata.prefix == '[Command output continued from previous command]\n' - ) + assert obs.metadata.prefix == '[Below is the output of the previous command.]\n' assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix assert obs.metadata.exit_code == -1 # -1 indicates command is still running # Test command that produces no output action = CmdRunAction('sleep 15') - action.timeout = 2.5 + action.set_hard_timeout(2.5) obs = runtime.run_action(action) - assert '3' in obs.content + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert '3' not in obs.content + assert obs.metadata.prefix == '[Below is the output of the previous command.]\n' assert ( - obs.metadata.prefix == '[Command output continued from previous command]\n' + 'The previous command was timed out but still running.' + in obs.metadata.suffix ) - assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix assert obs.metadata.exit_code == -1 # -1 indicates command is still running + + # Finally continue again + action = CmdRunAction('') + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert '3' in obs.content + assert '[The command completed with exit code 0.]' in obs.metadata.suffix finally: _close_test_runtime(runtime) @@ -783,3 +794,96 @@ def test_python_interactive_input(temp_dir, runtime_cls, run_as_openhands): assert '[The command completed with exit code 0.]' in obs.metadata.suffix finally: _close_test_runtime(runtime) + + +def test_stress_long_output_with_soft_and_hard_timeout( + temp_dir, runtime_cls, run_as_openhands +): + runtime = _load_runtime( + temp_dir, + runtime_cls, + run_as_openhands, + runtime_startup_env_vars={'NO_CHANGE_TIMEOUT_SECONDS': '1'}, + docker_runtime_kwargs={ + 'cpu_period': 100000, # 100ms + 'cpu_quota': 100000, # Can use 100ms out of each 100ms period (1 CPU) + 'mem_limit': '4G', # 4 GB of memory + }, + ) + try: + # Run a command that generates long output multiple times + for i in range(10): + start_time = time.time() + + # Check tmux memory usage (in KB) + mem_action = CmdRunAction( + 'ps aux | awk \'{printf "%8.1f KB %s\\n", $6, $0}\' | sort -nr | grep "/usr/bin/tmux" | grep -v grep | awk \'{print $1}\'' + ) + mem_obs = runtime.run_action(mem_action) + assert mem_obs.exit_code == 0 + logger.info( + f'Tmux memory usage (iteration {i}): {mem_obs.content.strip()} KB' + ) + + # Check action_execution_server mem + mem_action = CmdRunAction( + 'ps aux | awk \'{printf "%8.1f KB %s\\n", $6, $0}\' | sort -nr | grep "action_execution_server" | grep "/openhands/poetry" | grep -v grep | awk \'{print $1}\'' + ) + mem_obs = runtime.run_action(mem_action) + assert mem_obs.exit_code == 0 + logger.info( + f'Action execution server memory usage (iteration {i}): {mem_obs.content.strip()} KB' + ) + + # Test soft timeout + action = CmdRunAction( + 'read -p "Do you want to continue? [Y/n] " answer; if [[ $answer == "Y" ]]; then echo "Proceeding with operation..."; echo "Operation completed successfully!"; else echo "Operation cancelled."; exit 1; fi' + ) + obs = runtime.run_action(action) + assert 'Do you want to continue?' in obs.content + assert obs.exit_code == -1 # Command is still running, waiting for input + + # Send the confirmation + action = CmdRunAction('Y') + obs = runtime.run_action(action) + assert 'Proceeding with operation...' in obs.content + assert 'Operation completed successfully!' in obs.content + assert obs.exit_code == 0 + assert '[The command completed with exit code 0.]' in obs.metadata.suffix + + # Test hard timeout w/ long output + # Generate long output with 1000 asterisks per line + action = CmdRunAction( + f'export i={i}; for j in $(seq 1 100); do echo "Line $j - Iteration $i - $(printf \'%1000s\' | tr " " "*")"; sleep 1; done' + ) + action.set_hard_timeout(2) + obs = runtime.run_action(action) + + # Verify the output + assert obs.exit_code == -1 + assert f'Line 1 - Iteration {i}' in obs.content + # assert f'Line 1000 - Iteration {i}' in obs.content + # assert '[The command completed with exit code 0.]' in obs.metadata.suffix + + # Because hard-timeout is triggered, the terminal will in a weird state + # where it will not accept any new commands. + obs = runtime.run_action(CmdRunAction('ls')) + assert obs.exit_code == -1 + assert ( + 'The previous command was timed out but still running.' + in obs.metadata.suffix + ) + + # We need to send a Ctrl+C to reset the terminal. + obs = runtime.run_action(CmdRunAction('C-c')) + assert obs.exit_code == 130 + + # Now make sure the terminal is in a good state + obs = runtime.run_action(CmdRunAction('ls')) + assert obs.exit_code == 0 + + duration = time.time() - start_time + logger.info(f'Completed iteration {i} in {duration:.2f} seconds') + + finally: + _close_test_runtime(runtime) diff --git a/tests/runtime/test_stress_docker_runtime.py b/tests/runtime/test_stress_docker_runtime.py index d0e141ee3142..6e8a9d5957e8 100644 --- a/tests/runtime/test_stress_docker_runtime.py +++ b/tests/runtime/test_stress_docker_runtime.py @@ -28,7 +28,7 @@ def test_stress_docker_runtime(temp_dir, runtime_cls, repeat=1): for _ in range(repeat): # run stress-ng stress tests for 1 minute action = CmdRunAction(command='stress-ng --all 1 -t 1m') - action.timeout = 120 + action.set_hard_timeout(120) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) diff --git a/tests/runtime/test_stress_remote_runtime.py b/tests/runtime/test_stress_remote_runtime.py index 367af20467be..a2f6c7d2082b 100644 --- a/tests/runtime/test_stress_remote_runtime.py +++ b/tests/runtime/test_stress_remote_runtime.py @@ -92,14 +92,14 @@ def initialize_runtime( obs: CmdOutputObservation action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """) - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}') action = CmdRunAction(command='mkdir -p /dummy_dir') - action.timeout = 600 + action.set_hard_timeout(600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) diff --git a/tests/unit/test_bash_session.py b/tests/unit/test_bash_session.py index 13f32ad27d25..fc29eaffb2a5 100644 --- a/tests/unit/test_bash_session.py +++ b/tests/unit/test_bash_session.py @@ -94,7 +94,7 @@ def test_long_running_command_follow_by_execute(): obs = session.execute(CmdRunAction('')) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert '2' in obs.content - assert obs.metadata.prefix == '[Command output continued from previous command]\n' + assert obs.metadata.prefix == '[Below is the output of the previous command.]\n' assert obs.metadata.suffix == ( '\n[The command has no new output after 2 seconds. ' "You may wait longer to see additional output by sending empty command '', " @@ -108,7 +108,7 @@ def test_long_running_command_follow_by_execute(): obs = session.execute(CmdRunAction('sleep 15')) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert '3' in obs.content - assert obs.metadata.prefix == '[Command output continued from previous command]\n' + assert obs.metadata.prefix == '[Below is the output of the previous command.]\n' assert obs.metadata.suffix == ( '\n[The command has no new output after 2 seconds. ' "You may wait longer to see additional output by sending empty command '', " @@ -175,7 +175,7 @@ def test_interactive_command(): 'send other commands to interact with the current process, ' 'or send keys to interrupt/kill the command.]' ) - assert obs.metadata.prefix == '[Command output continued from previous command]\n' + assert obs.metadata.prefix == '[Below is the output of the previous command.]\n' obs = session.execute(CmdRunAction('line 2')) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -187,7 +187,7 @@ def test_interactive_command(): 'send other commands to interact with the current process, ' 'or send keys to interrupt/kill the command.]' ) - assert obs.metadata.prefix == '[Command output continued from previous command]\n' + assert obs.metadata.prefix == '[Below is the output of the previous command.]\n' obs = session.execute(CmdRunAction('EOF')) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -266,14 +266,14 @@ def test_command_output_continuation(): obs = session.execute(CmdRunAction('')) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert '[Command output continued from previous command]' in obs.metadata.prefix + assert '[Below is the output of the previous command.]' in obs.metadata.prefix assert obs.content.strip() == '2' assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT obs = session.execute(CmdRunAction('')) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert '[Command output continued from previous command]' in obs.metadata.prefix + assert '[Below is the output of the previous command.]' in obs.metadata.prefix assert obs.content.strip() == '3' assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix @@ -281,14 +281,14 @@ def test_command_output_continuation(): obs = session.execute(CmdRunAction('')) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert '[Command output continued from previous command]' in obs.metadata.prefix + assert '[Below is the output of the previous command.]' in obs.metadata.prefix assert obs.content.strip() == '4' assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT obs = session.execute(CmdRunAction('')) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert '[Command output continued from previous command]' in obs.metadata.prefix + assert '[Below is the output of the previous command.]' in obs.metadata.prefix assert obs.content.strip() == '5' assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT diff --git a/tests/unit/test_runtime_reboot.py b/tests/unit/test_runtime_reboot.py index e3ae31815a3e..c78fbe029ad6 100644 --- a/tests/unit/test_runtime_reboot.py +++ b/tests/unit/test_runtime_reboot.py @@ -27,7 +27,7 @@ def runtime(mock_session): def test_runtime_timeout_error(runtime, mock_session): # Create a command action action = CmdRunAction(command='test command') - action.timeout = 120 + action.set_hard_timeout(120) # Mock the runtime to raise a timeout error runtime.send_action_for_execution.side_effect = AgentRuntimeTimeoutError( @@ -78,7 +78,7 @@ def test_runtime_disconnected_error( # Create a command action action = CmdRunAction(command='test command') - action.timeout = 120 + action.set_hard_timeout(120) # Verify that the error message is correct with pytest.raises(AgentRuntimeDisconnectedError) as exc_info: