-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #280 from agential-ai/benchmark_module
Benchmark Module, OSWorld Benchmark
- Loading branch information
Showing
530 changed files
with
46,011 additions
and
1,894 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"""Base Benchmark.""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
"""Base benchmark class.""" | ||
|
||
from abc import ABC, abstractmethod | ||
from typing import Any | ||
|
||
|
||
class BaseBenchmark(ABC):
    """Abstract base class for defining a benchmark.

    `BaseBenchmark` defines the common interface shared by benchmarks: it stores the
    configuration it is constructed with and requires every concrete subclass to
    implement `evaluate`.

    Attributes:
        kwargs (dict): The keyword arguments passed to the constructor, kept so that
            subclasses can use them to configure or initialize the benchmark environment.
    """

    def __init__(self, **kwargs: Any) -> None:
        """Initializes the benchmark with the provided keyword arguments.

        Args:
            **kwargs: Configuration parameters for the benchmark. They are stored
                unchanged on `self.kwargs`; their interpretation is left to subclasses.
        """
        self.kwargs = kwargs

    @abstractmethod
    def evaluate(self) -> float:
        """Evaluates the performance of the agent, model, or system in the benchmark.

        Subclasses must override this method to define how the benchmark is scored.

        Returns:
            float: The evaluation result (a score or metric) for the task.

        Raises:
            NotImplementedError: If a subclass delegates to this base implementation
                (for example via `super().evaluate()`). A subclass that does not override
                this method cannot be instantiated at all; that fails with `TypeError`.
        """
        raise NotImplementedError
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"""Computer Use Base Benchmark.""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
"""Base benchmark class for computer-use benchmarks.""" | ||
|
||
from abc import abstractmethod | ||
from typing import Any | ||
|
||
from agential.benchmarks.base import BaseBenchmark | ||
|
||
|
||
class BaseComputerUseBenchmark(BaseBenchmark):
    """Abstract base class for computer-use benchmarks.

    Subclasses implement the methods that manage the lifecycle of a benchmark task:
    `reset` (setup), `step` (execution) and `close` (teardown).
    """

    def __init__(self, **kwargs: Any) -> None:
        """Initializes the computer-use benchmark with the provided configuration parameters.

        Args:
            **kwargs: Configuration parameters, forwarded unchanged to the parent class
                initializer.
        """
        super().__init__(**kwargs)

    @abstractmethod
    def close(self) -> None:
        """Closes the benchmark environment or any associated resources.

        Subclasses define how the environment is cleaned up once the benchmark task is
        finished (for example closing applications or releasing resources).

        Raises:
            NotImplementedError: If a subclass delegates to this base implementation
                (for example via `super().close()`).
        """
        raise NotImplementedError

    @abstractmethod
    def reset(self, **kwargs: Any) -> Any:
        """Resets the benchmark task to its initial state.

        Subclasses define how the environment or task is reset in preparation for a
        new evaluation.

        Args:
            **kwargs: Subclass-specific options controlling the reset.

        Returns:
            The result of the reset operation, as defined by the subclass (for example
            an updated state or configuration of the environment).

        Raises:
            NotImplementedError: If a subclass delegates to this base implementation
                (for example via `super().reset()`).
        """
        raise NotImplementedError

    @abstractmethod
    def step(self, **kwargs: Any) -> Any:
        """Executes a single step of the benchmark task.

        Subclasses define what happens at each step of the benchmark, such as
        interacting with the environment.

        Args:
            **kwargs: Subclass-specific inputs for the step.

        Returns:
            The result of the step, as defined by the subclass (for example updated
            states, scores, or other outcomes of the step execution).

        Raises:
            NotImplementedError: If a subclass delegates to this base implementation
                (for example via `super().step()`).
        """
        raise NotImplementedError
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"""OSWorld benchmark.""" |
Oops, something went wrong.