Skip to content

Commit

Permalink
chore: Cleanup runtime exception handling (#5696)
Browse files Browse the repository at this point in the history
  • Loading branch information
xingyaoww authored Dec 19, 2024
1 parent 13097f9 commit e9cafb0
Show file tree
Hide file tree
Showing 16 changed files with 219 additions and 95 deletions.
7 changes: 2 additions & 5 deletions evaluation/benchmarks/swe_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
EvalOutput,
assert_and_raise,
codeact_user_response,
is_fatal_evaluation_error,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -400,11 +401,7 @@ def process_instance(
)

# if fatal error, throw EvalError to trigger re-run
if (
state.last_error
and 'fatal error during agent execution' in state.last_error
and 'stuck in a loop' not in state.last_error
):
if is_fatal_evaluation_error(state.last_error):
raise EvalException('Fatal error detected: ' + state.last_error)

# ======= THIS IS SWE-Bench specific =======
Expand Down
32 changes: 32 additions & 0 deletions evaluation/utils/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,16 @@

from openhands.controller.state.state import State
from openhands.core.config import LLMConfig
from openhands.core.exceptions import (
AgentRuntimeBuildError,
AgentRuntimeDisconnectedError,
AgentRuntimeError,
AgentRuntimeNotFoundError,
AgentRuntimeNotReadyError,
AgentRuntimeTimeoutError,
AgentRuntimeUnavailableError,
AgentStuckInLoopError,
)
from openhands.core.logger import get_console_handler
from openhands.core.logger import openhands_logger as logger
from openhands.events.action import Action
Expand Down Expand Up @@ -503,3 +513,25 @@ def compatibility_for_eval_history_pairs(
history_pairs.append((event_to_dict(action), event_to_dict(observation)))

return history_pairs


def is_fatal_evaluation_error(error: str | None) -> bool:
if not error:
return False

FATAL_EXCEPTIONS = [
AgentRuntimeError,
AgentRuntimeBuildError,
AgentRuntimeTimeoutError,
AgentRuntimeUnavailableError,
AgentRuntimeNotReadyError,
AgentRuntimeDisconnectedError,
AgentRuntimeNotFoundError,
AgentStuckInLoopError,
]

if any(exception.__name__ in error for exception in FATAL_EXCEPTIONS):
logger.error(f'Fatal evaluation error detected: {error}')
return True

return False
7 changes: 5 additions & 2 deletions openhands/controller/agent_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from openhands.controller.stuck import StuckDetector
from openhands.core.config import AgentConfig, LLMConfig
from openhands.core.exceptions import (
AgentStuckInLoopError,
FunctionCallNotExistsError,
FunctionCallValidationError,
LLMMalformedActionError,
Expand Down Expand Up @@ -196,7 +197,7 @@ async def _react_to_exception(
err_id = ''
if isinstance(e, litellm.AuthenticationError):
err_id = 'STATUS$ERROR_LLM_AUTHENTICATION'
self.status_callback('error', err_id, str(e))
self.status_callback('error', err_id, type(e).__name__ + ': ' + str(e))

async def start_step_loop(self):
"""The main loop for the agent's step-by-step execution."""
Expand Down Expand Up @@ -502,7 +503,9 @@ async def _step(self) -> None:
return

if self._is_stuck():
await self._react_to_exception(RuntimeError('Agent got stuck in a loop'))
await self._react_to_exception(
AgentStuckInLoopError('Agent got stuck in a loop')
)
return

self.update_state_before_step()
Expand Down
113 changes: 98 additions & 15 deletions openhands/core/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,25 @@
class AgentNoInstructionError(Exception):
# ============================================
# Agent Exceptions
# ============================================


class AgentError(Exception):
    """Base class for all agent exceptions."""


class AgentNoInstructionError(AgentError):
    """Agent error whose default message is 'Instruction must be provided'."""

    def __init__(self, message: str = 'Instruction must be provided') -> None:
        # Forward the (possibly default) message to the base exception.
        super().__init__(message)


class AgentEventTypeError(Exception):
class AgentEventTypeError(AgentError):
    """Agent error whose default message is 'Event must be a dictionary'."""

    def __init__(self, message: str = 'Event must be a dictionary') -> None:
        # Forward the (possibly default) message to the base exception.
        super().__init__(message)


class AgentAlreadyRegisteredError(Exception):
class AgentAlreadyRegisteredError(AgentError):
def __init__(self, name=None):
if name is not None:
message = f"Agent class already registered under '{name}'"
Expand All @@ -17,7 +28,7 @@ def __init__(self, name=None):
super().__init__(message)


class AgentNotRegisteredError(Exception):
class AgentNotRegisteredError(AgentError):
def __init__(self, name=None):
if name is not None:
message = f"No agent class registered under '{name}'"
Expand All @@ -26,6 +37,16 @@ def __init__(self, name=None):
super().__init__(message)


class AgentStuckInLoopError(AgentError):
    """Agent error signalling that the agent got stuck in a loop."""

    def __init__(self, message: str = 'Agent got stuck in a loop') -> None:
        # Forward the (possibly default) message to the base exception.
        super().__init__(message)


# ============================================
# Agent Controller Exceptions
# ============================================


class TaskInvalidStateError(Exception):
def __init__(self, state=None):
if state is not None:
Expand All @@ -35,17 +56,9 @@ def __init__(self, state=None):
super().__init__(message)


class BrowserInitException(Exception):
def __init__(self, message='Failed to initialize browser environment'):
super().__init__(message)


class BrowserUnavailableException(Exception):
def __init__(
self,
message='Browser environment is not available, please check if has been initialized',
):
super().__init__(message)
# ============================================
# LLM Exceptions
# ============================================


# This exception gets sent back to the LLM
Expand Down Expand Up @@ -96,6 +109,11 @@ class CloudFlareBlockageError(Exception):
pass


# ============================================
# LLM function calling Exceptions
# ============================================


class FunctionCallConversionError(Exception):
"""Exception raised when FunctionCallingConverter failed to convert a non-function call message to a function call message.
Expand All @@ -121,3 +139,68 @@ class FunctionCallNotExistsError(Exception):

def __init__(self, message):
super().__init__(message)


# ============================================
# Agent Runtime Exceptions
# ============================================


class AgentRuntimeError(Exception):
    """Base class for all agent runtime exceptions."""


class AgentRuntimeBuildError(AgentRuntimeError):
    """Exception raised when an agent runtime build operation fails."""


class AgentRuntimeTimeoutError(AgentRuntimeError):
    """Exception raised when an agent runtime operation times out."""


class AgentRuntimeUnavailableError(AgentRuntimeError):
    """Exception raised when an agent runtime is unavailable."""


class AgentRuntimeNotReadyError(AgentRuntimeUnavailableError):
    """Exception raised when an agent runtime is not ready."""


class AgentRuntimeDisconnectedError(AgentRuntimeUnavailableError):
    """Exception raised when an agent runtime is disconnected."""


class AgentRuntimeNotFoundError(AgentRuntimeUnavailableError):
    """Exception raised when an agent runtime is not found."""


# ============================================
# Browser Exceptions
# ============================================


class BrowserInitException(Exception):
    """Exception whose default message reports a failed browser initialization."""

    def __init__(
        self, message: str = 'Failed to initialize browser environment'
    ) -> None:
        # Forward the (possibly default) message to the base exception.
        super().__init__(message)


class BrowserUnavailableException(Exception):
    """Exception whose default message reports an unavailable browser environment."""

    def __init__(
        self,
        message: str = 'Browser environment is not available, please check if has been initialized',
    ) -> None:
        # Forward the (possibly default) message to the base exception.
        super().__init__(message)
19 changes: 2 additions & 17 deletions openhands/runtime/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from requests.exceptions import ConnectionError

from openhands.core.config import AppConfig, SandboxConfig
from openhands.core.exceptions import AgentRuntimeDisconnectedError
from openhands.core.logger import openhands_logger as logger
from openhands.events import EventSource, EventStream, EventStreamSubscriber
from openhands.events.action import (
Expand Down Expand Up @@ -47,22 +48,6 @@
}


class RuntimeUnavailableError(Exception):
pass


class RuntimeNotReadyError(RuntimeUnavailableError):
pass


class RuntimeDisconnectedError(RuntimeUnavailableError):
pass


class RuntimeNotFoundError(RuntimeUnavailableError):
pass


def _default_env_vars(sandbox_config: SandboxConfig) -> dict[str, str]:
ret = {}
for key in os.environ:
Expand Down Expand Up @@ -193,7 +178,7 @@ async def on_event(self, event: Event) -> None:
except Exception as e:
err_id = ''
if isinstance(e, ConnectionError) or isinstance(
e, RuntimeDisconnectedError
e, AgentRuntimeDisconnectedError
):
err_id = 'STATUS$ERROR_RUNTIME_DISCONNECTED'
logger.error(
Expand Down
2 changes: 1 addition & 1 deletion openhands/runtime/builder/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def build(
registry prefix). This should be used for subsequent use (e.g., `docker run`).
Raises:
RuntimeError: If the build failed.
AgentRuntimeBuildError: If the build failed.
"""
pass

Expand Down
13 changes: 9 additions & 4 deletions openhands/runtime/builder/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import docker

from openhands import __version__ as oh_version
from openhands.core.exceptions import AgentRuntimeBuildError
from openhands.core.logger import RollingLogger
from openhands.core.logger import openhands_logger as logger
from openhands.runtime.builder.base import RuntimeBuilder
Expand All @@ -19,7 +20,9 @@ def __init__(self, docker_client: docker.DockerClient):
version_info = self.docker_client.version()
server_version = version_info.get('Version', '').replace('-', '.')
if tuple(map(int, server_version.split('.')[:2])) < (18, 9):
raise RuntimeError('Docker server version must be >= 18.09 to use BuildKit')
raise AgentRuntimeBuildError(
'Docker server version must be >= 18.09 to use BuildKit'
)

self.rolling_logger = RollingLogger(max_lines=10)

Expand All @@ -44,7 +47,7 @@ def build(
str: The name of the built Docker image.
Raises:
RuntimeError: If the Docker server version is incompatible or if the build process fails.
AgentRuntimeBuildError: If the Docker server version is incompatible or if the build process fails.
Note:
This method uses Docker BuildKit for improved build performance and caching capabilities.
Expand All @@ -55,7 +58,9 @@ def build(
version_info = self.docker_client.version()
server_version = version_info.get('Version', '').replace('-', '.')
if tuple(map(int, server_version.split('.'))) < (18, 9):
raise RuntimeError('Docker server version must be >= 18.09 to use BuildKit')
raise AgentRuntimeBuildError(
'Docker server version must be >= 18.09 to use BuildKit'
)

target_image_hash_name = tags[0]
target_image_repo, target_image_source_tag = target_image_hash_name.split(':')
Expand Down Expand Up @@ -154,7 +159,7 @@ def build(
# Check if the image is built successfully
image = self.docker_client.images.get(target_image_hash_name)
if image is None:
raise RuntimeError(
raise AgentRuntimeBuildError(
f'Build failed: Image {target_image_hash_name} not found'
)

Expand Down
15 changes: 10 additions & 5 deletions openhands/runtime/builder/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import requests

from openhands.core.exceptions import AgentRuntimeBuildError
from openhands.core.logger import openhands_logger as logger
from openhands.runtime.builder import RuntimeBuilder
from openhands.runtime.utils.request import send_request
Expand Down Expand Up @@ -77,7 +78,7 @@ def build(
while should_continue():
if time.time() - start_time > timeout:
logger.error('Build timed out after 30 minutes')
raise RuntimeError('Build timed out after 30 minutes')
raise AgentRuntimeBuildError('Build timed out after 30 minutes')

status_response = send_request(
self.session,
Expand All @@ -88,7 +89,7 @@ def build(

if status_response.status_code != 200:
logger.error(f'Failed to get build status: {status_response.text}')
raise RuntimeError(
raise AgentRuntimeBuildError(
f'Failed to get build status: {status_response.text}'
)

Expand All @@ -110,12 +111,14 @@ def build(
'error', f'Build failed with status: {status}. Build ID: {build_id}'
)
logger.error(error_message)
raise RuntimeError(error_message)
raise AgentRuntimeBuildError(error_message)

# Wait before polling again
sleep_if_should_continue(30)

raise RuntimeError('Build interrupted (likely received SIGTERM or SIGINT).')
raise AgentRuntimeBuildError(
'Build interrupted (likely received SIGTERM or SIGINT).'
)

def image_exists(self, image_name: str, pull_from_repo: bool = True) -> bool:
"""Checks if an image exists in the remote registry using the /image_exists endpoint."""
Expand All @@ -129,7 +132,9 @@ def image_exists(self, image_name: str, pull_from_repo: bool = True) -> bool:

if response.status_code != 200:
logger.error(f'Failed to check image existence: {response.text}')
raise RuntimeError(f'Failed to check image existence: {response.text}')
raise AgentRuntimeBuildError(
f'Failed to check image existence: {response.text}'
)

result = response.json()

Expand Down
Loading

0 comments on commit e9cafb0

Please sign in to comment.