Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: improve remote runtime reliability on large-scale evaluation #4869

Merged
merged 7 commits into from
Nov 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions evaluation/swe_bench/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def get_config(instance: pd.Series) -> AppConfig:
timeout=1800,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
remote_runtime_init_timeout=1800,
),
# do not mount workspace
workspace_base=None,
Expand Down
1 change: 1 addition & 0 deletions evaluation/swe_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ def get_config(
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_remote_runtime_alive=False,
remote_runtime_init_timeout=1800,
),
# do not mount workspace
workspace_base=None,
Expand Down
4 changes: 3 additions & 1 deletion openhands/core/config/sandbox_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ class SandboxConfig:
base_container_image: The base container image from which to build the runtime image.
runtime_container_image: The runtime container image to use.
user_id: The user ID for the sandbox.
timeout: The timeout for the sandbox.
timeout: The timeout for the default sandbox action execution.
remote_runtime_init_timeout: The timeout for the remote runtime to start.
enable_auto_lint: Whether to enable auto-lint.
use_host_network: Whether to use the host network.
initialize_plugins: Whether to initialize plugins.
Expand All @@ -41,6 +42,7 @@ class SandboxConfig:
runtime_container_image: str | None = None
user_id: int = os.getuid() if hasattr(os, 'getuid') else 1000
timeout: int = 120
remote_runtime_init_timeout: int = 180
enable_auto_lint: bool = (
False # once enabled, OpenHands would lint files after editing
)
Expand Down
20 changes: 13 additions & 7 deletions openhands/runtime/impl/remote/remote_runtime.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from pathlib import Path
import tempfile
import threading
from pathlib import Path
from typing import Callable, Optional
from zipfile import ZipFile

Expand Down Expand Up @@ -260,13 +260,19 @@ def _parse_runtime_response(self, response: requests.Response):
{'X-Session-API-Key': start_response['session_api_key']}
)

@tenacity.retry(
stop=tenacity.stop_after_delay(180) | stop_if_should_exit(),
reraise=True,
retry=tenacity.retry_if_exception_type(RuntimeNotReadyError),
wait=tenacity.wait_fixed(2),
)
def _wait_until_alive(self):
retry_decorator = tenacity.retry(
stop=tenacity.stop_after_delay(
self.config.sandbox.remote_runtime_init_timeout
)
| stop_if_should_exit(),
reraise=True,
retry=tenacity.retry_if_exception_type(RuntimeNotReadyError),
wait=tenacity.wait_fixed(2),
)
return retry_decorator(self._wait_until_alive_impl)()

def _wait_until_alive_impl(self):
self.log('debug', f'Waiting for runtime to be alive at url: {self.runtime_url}')
runtime_info_response = self._send_request(
'GET',
Expand Down
Loading