Merge pull request #280 from agential-ai/benchmark_module

Benchmark Module, OSWorld Benchmark
agential-ai · Jan 16, 2025 · a550366 · a550366
2 parents 8a7cd36 + 45468d3
commit a550366
Show file tree

Hide file tree

Showing 530 changed files with 46,011 additions and 1,894 deletions.
diff --git a/.gitignore b/.gitignore
@@ -50,5 +50,9 @@ data/tabmwp/*.json
 experiments/*/wandb
 experiments/*/output
 
-# Analysis
-experiments/analysis/artifacts
+# OSWorld data files
+vmware_vm_data/*
+.vmware_vms
+.vmware_lck
+settings/*
+cache/*
diff --git a/agential/agents/osworld_baseline/README.md b/agential/agents/osworld_baseline/README.md
@@ -3,7 +3,7 @@
 
 Code and implementations are originally from:
 - [Code: OSWorld Repository](https://github.com/xlang-ai/OSWorld/tree/main)
-- [Paper: OSWorld: A Framework for Multi-modal AI Agents](https://arxiv.org/pdf/2404.07972)
+- [Paper: OSWorld: Benchmarking Multimodal Agents for Open-Ended Tasks in Real Computer Environments](https://arxiv.org/pdf/2404.07972)
 
 ### How to Use
 

diff --git a/agential/agents/osworld_baseline/accessibility_tree_wrap/heuristic_retrieve.py b/agential/agents/osworld_baseline/accessibility_tree_wrap/heuristic_retrieve.py
@@ -2,10 +2,11 @@
 
 import io
 
-from typing import List, Tuple
+from typing import List, Tuple, cast
 from xml.etree.ElementTree import Element
 
-from PIL import Image, ImageDraw, ImageFont
+from PIL import Image, ImageDraw, ImageFile, ImageFont
+from PIL.ImageFont import FreeTypeFont
 
 state_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/state"
 state_ns_windows = "https://accessibility.windows.example.org/ns/state"
@@ -182,7 +183,7 @@ def draw_bounding_boxes(
 
     # Load the screenshot image
     image_stream = io.BytesIO(image_file_content)
-    image = Image.open(image_stream)
+    image: Image.Image = Image.open(image_stream)
     if float(down_sampling_ratio) != 1.0:
         image = image.resize(
             (
@@ -197,10 +198,11 @@ def draw_bounding_boxes(
 
     try:
         # Adjust the path to the font file you have or use a default one
-        font = ImageFont.truetype("arial.ttf", 15)
+        font: FreeTypeFont = ImageFont.truetype("arial.ttf", 15)
     except IOError:
         # Fallback to a basic font if the specified font can't be loaded
-        font = ImageFont.load_default()
+        cur_font = ImageFont.load_default()
+        font = cast(FreeTypeFont, cur_font)
 
     index = 1
 
@@ -245,7 +247,7 @@ def draw_bounding_boxes(
                     coords[0],
                     bottom_right[1],
                 )  # Adjust Y to be above the bottom right
-                text_bbox: Tuple[int, int, int, int] = draw.textbbox(
+                text_bbox: Tuple[float, float, float, float] = draw.textbbox(
                     text_position, str(index), font=font, anchor="lb"
                 )
                 # offset: int = bottom_right[1]-text_bbox[3]

diff --git a/agential/benchmarks/__init__.py b/agential/benchmarks/__init__.py
@@ -0,0 +1 @@
+"""Benchmarks package."""
diff --git a/agential/benchmarks/base.py b/agential/benchmarks/base.py
@@ -0,0 +1,44 @@
+"""Base benchmark class."""
+
+from abc import ABC, abstractmethod
+from typing import Any
+
+
+class BaseBenchmark(ABC):
+    """Abstract base class for defining a benchmark.
+
+    The `BaseBenchmark` class defines the common interface for evaluating performance, providing
+    inputs, and retrieving outputs during benchmark execution. Subclasses should implement the
+    necessary methods to perform specific benchmarking tasks.
+
+    Attributes:
+        kwargs (dict): A dictionary of keyword arguments used for configuring or initializing
+            the benchmark environment.
+    """
+
+    def __init__(self, **kwargs: Any) -> None:
+        """Initializes the benchmark with the provided keyword arguments.
+
+        Args:
+            **kwargs: Configuration parameters for the benchmark. These could include settings
+                for the environment, task-specific parameters, or any other necessary
+                information for initializing the benchmark.
+        """
+        self.kwargs = kwargs
+
+    @abstractmethod
+    def evaluate(self) -> float:
+        """Evaluates the performance of the agent, model, or system in the benchmark.
+
+        This method must be implemented by subclasses to define how the benchmark is scored or
+        evaluated. It should return an evaluation metric or score that indicates how well the
+        agent or system performed on the task.
+
+        Returns:
+            float: The evaluation result, which could be a score, a metric, or another form of
+                outcome depending on the task.
+
+        Raises:
+            NotImplementedError: If this base implementation is invoked directly (e.g. via `super()`).
+        """
+        raise NotImplementedError
diff --git a/agential/benchmarks/computer_use/__init__.py b/agential/benchmarks/computer_use/__init__.py
@@ -0,0 +1 @@
+"""Computer-use benchmarks."""
diff --git a/agential/benchmarks/computer_use/base.py b/agential/benchmarks/computer_use/base.py
@@ -0,0 +1,70 @@
+"""Base benchmark class for computer-use benchmarks."""
+
+from abc import abstractmethod
+from typing import Any
+
+from agential.benchmarks.base import BaseBenchmark
+
+
+class BaseComputerUseBenchmark(BaseBenchmark):
+    """Abstract base class for computer-use benchmarks.
+
+    Subclasses should implement the methods to manage the lifecycle of the benchmark task,
+    including setup, execution steps, and teardown.
+    """
+
+    def __init__(self, **kwargs: Any) -> None:
+        """Initializes the computer-use benchmark with the provided configuration parameters.
+
+        Args:
+            **kwargs: Configuration parameters, passed on to the parent class for initializing
+                the benchmark environment.
+        """
+        super().__init__(**kwargs)
+
+    @abstractmethod
+    def close(self) -> None:
+        """Closes the benchmark environment or any associated resources.
+
+        This method should be implemented by subclasses to define how the benchmark environment
+        is closed or cleaned up after the benchmark task is finished. This may involve closing
+        applications, releasing resources, or any other necessary teardown operations.
+
+        Raises:
+            NotImplementedError: If this base implementation is invoked directly (e.g. via `super()`).
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def reset(self, **kargs: Any) -> Any:
+        """Resets the benchmark task to its initial state.
+
+        This method should be implemented by subclasses to define how the benchmark environment
+        or task is reset. It could involve resetting the environment, clearing data, or preparing
+        the system for a new evaluation.
+
+        Returns:
+            The result of the reset operation, which could be an updated state or configuration
+            of the environment.
+
+        Raises:
+            NotImplementedError: If this base implementation is invoked directly (e.g. via `super()`).
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def step(self, **kargs: Any) -> Any:
+        """Executes a single step of the benchmark task.
+
+        This method should be implemented by subclasses to define the actions that occur at each
+        step of the benchmark. This could involve interacting with the environment, making decisions,
+        or performing specific tasks as part of the benchmark evaluation.
+
+        Returns:
+            The result of the step, which could include updated states, scores, or other relevant
+            outcomes from the step execution.
+
+        Raises:
+            NotImplementedError: If this base implementation is invoked directly (e.g. via `super()`).
+        """
+        raise NotImplementedError
diff --git a/agential/benchmarks/computer_use/osworld/__init__.py b/agential/benchmarks/computer_use/osworld/__init__.py
@@ -0,0 +1 @@
+"""OSWorld benchmark."""