Skip to content

Commit

Permalink
Merge pull request #280 from agential-ai/benchmark_module
Browse files Browse the repository at this point in the history
Benchmark Module, OSWorld Benchmark
  • Loading branch information
alckasoc authored Jan 16, 2025
2 parents 8a7cd36 + 45468d3 commit a550366
Show file tree
Hide file tree
Showing 530 changed files with 46,011 additions and 1,894 deletions.
8 changes: 6 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -50,5 +50,9 @@ data/tabmwp/*.json
experiments/*/wandb
experiments/*/output

# Analysis
experiments/analysis/artifacts
# OSWorld data files
vmware_vm_data/*
.vmware_vms
.vmware_lck
settings/*
cache/*
2 changes: 1 addition & 1 deletion agential/agents/osworld_baseline/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

Code and implementations are originally from:
- [Code: OSWorld Repository](https://github.com/xlang-ai/OSWorld/tree/main)
- [Paper: OSWorld: A Framework for Multi-modal AI Agents](https://arxiv.org/pdf/2404.07972)
- [Paper: OSWorld: Benchmarking Multimodal Agents for Open-Ended Tasks in Real Computer Environments](https://arxiv.org/pdf/2404.07972)

### How to Use

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@

import io

from typing import List, Tuple
from typing import List, Tuple, cast
from xml.etree.ElementTree import Element

from PIL import Image, ImageDraw, ImageFont
from PIL import Image, ImageDraw, ImageFile, ImageFont
from PIL.ImageFont import FreeTypeFont

state_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/state"
state_ns_windows = "https://accessibility.windows.example.org/ns/state"
Expand Down Expand Up @@ -182,7 +183,7 @@ def draw_bounding_boxes(

# Load the screenshot image
image_stream = io.BytesIO(image_file_content)
image = Image.open(image_stream)
image: Image.Image = Image.open(image_stream)
if float(down_sampling_ratio) != 1.0:
image = image.resize(
(
Expand All @@ -197,10 +198,11 @@ def draw_bounding_boxes(

try:
# Adjust the path to the font file you have or use a default one
font = ImageFont.truetype("arial.ttf", 15)
font: FreeTypeFont = ImageFont.truetype("arial.ttf", 15)
except IOError:
# Fallback to a basic font if the specified font can't be loaded
font = ImageFont.load_default()
cur_font = ImageFont.load_default()
font = cast(FreeTypeFont, cur_font)

index = 1

Expand Down Expand Up @@ -245,7 +247,7 @@ def draw_bounding_boxes(
coords[0],
bottom_right[1],
) # Adjust Y to be above the bottom right
text_bbox: Tuple[int, int, int, int] = draw.textbbox(
text_bbox: Tuple[float, float, float, float] = draw.textbbox(
text_position, str(index), font=font, anchor="lb"
)
# offset: int = bottom_right[1]-text_bbox[3]
Expand Down
1 change: 1 addition & 0 deletions agential/benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Base Benchmark."""
44 changes: 44 additions & 0 deletions agential/benchmarks/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""Base benchmark class."""

from abc import ABC, abstractmethod
from typing import Any


class BaseBenchmark(ABC):
    """Abstract base class for defining a benchmark.

    The class stores the configuration it is constructed with and declares
    the one abstract hook, `evaluate`, that every concrete benchmark must
    implement. It cannot be instantiated directly.

    Attributes:
        kwargs (dict): The keyword arguments passed to the constructor, kept
            unmodified so subclasses can read their configuration from them.
    """

    def __init__(self, **kwargs: Any) -> None:
        """Initializes the benchmark with the provided keyword arguments.

        Args:
            **kwargs: Configuration parameters for the benchmark. They are not
                interpreted here; they are stored as-is on `self.kwargs`.
        """
        self.kwargs = kwargs

    @abstractmethod
    def evaluate(self) -> float:
        """Evaluates the performance of the agent, model, or system.

        Subclasses must override this method to define how the benchmark is
        scored.

        Returns:
            float: The evaluation result (a score or metric; its meaning is
                defined by the subclass).

        Raises:
            NotImplementedError: If a subclass delegates to this base
                implementation instead of providing its own.
        """
        raise NotImplementedError
1 change: 1 addition & 0 deletions agential/benchmarks/computer_use/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Computer Use Base Benchmark."""
70 changes: 70 additions & 0 deletions agential/benchmarks/computer_use/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Base benchmark class for computer-use benchmarks."""

from abc import abstractmethod
from typing import Any

from agential.benchmarks.base import BaseBenchmark


class BaseComputerUseBenchmark(BaseBenchmark):
    """Abstract base class for computer-use benchmarks.

    Extends `BaseBenchmark` with three abstract lifecycle hooks (`close`,
    `reset`, and `step`) that subclasses must implement to manage setup,
    execution steps, and teardown of a benchmark task.
    """

    def __init__(self, **kwargs: Any) -> None:
        """Initializes the computer-use benchmark.

        Args:
            **kwargs: Configuration parameters, forwarded unchanged to the
                parent class initializer.
        """
        super().__init__(**kwargs)

    @abstractmethod
    def close(self) -> None:
        """Closes the benchmark environment or any associated resources.

        Subclasses define what teardown means for their environment.

        Raises:
            NotImplementedError: If a subclass delegates to this base
                implementation instead of providing its own.
        """
        raise NotImplementedError

    @abstractmethod
    def reset(self, **kwargs: Any) -> Any:
        """Resets the benchmark task to its initial state.

        Args:
            **kwargs: Subclass-defined options for the reset.

        Returns:
            The result of the reset operation; its type and meaning are
            defined by the subclass.

        Raises:
            NotImplementedError: If a subclass delegates to this base
                implementation instead of providing its own.
        """
        raise NotImplementedError

    @abstractmethod
    def step(self, **kwargs: Any) -> Any:
        """Executes a single step of the benchmark task.

        Args:
            **kwargs: Subclass-defined inputs for the step.

        Returns:
            The result of the step; its type and meaning are defined by the
            subclass.

        Raises:
            NotImplementedError: If a subclass delegates to this base
                implementation instead of providing its own.
        """
        raise NotImplementedError
1 change: 1 addition & 0 deletions agential/benchmarks/computer_use/osworld/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""OSWorld benchmark."""
Loading

0 comments on commit a550366

Please sign in to comment.