From 5ec0694ff1bee3de8a916a3124c4edb608e2fa80 Mon Sep 17 00:00:00 2001 From: Zaki Ali Date: Thu, 13 Feb 2025 21:20:50 -0800 Subject: [PATCH 01/10] initial commit of goosebench --- benchmark/goosebench/README.md | 1 + benchmark/goosebench/hello.py | 6 ++++++ benchmark/goosebench/pyproject.toml | 7 +++++++ 3 files changed, 14 insertions(+) create mode 100644 benchmark/goosebench/README.md create mode 100644 benchmark/goosebench/hello.py create mode 100644 benchmark/goosebench/pyproject.toml diff --git a/benchmark/goosebench/README.md b/benchmark/goosebench/README.md new file mode 100644 index 000000000..e9cbd25ee --- /dev/null +++ b/benchmark/goosebench/README.md @@ -0,0 +1 @@ +## GooseBench diff --git a/benchmark/goosebench/hello.py b/benchmark/goosebench/hello.py new file mode 100644 index 000000000..04e32c222 --- /dev/null +++ b/benchmark/goosebench/hello.py @@ -0,0 +1,6 @@ +def main(): + print("Hello from goosebench!") + + +if __name__ == "__main__": + main() diff --git a/benchmark/goosebench/pyproject.toml b/benchmark/goosebench/pyproject.toml new file mode 100644 index 000000000..e1d21d9c6 --- /dev/null +++ b/benchmark/goosebench/pyproject.toml @@ -0,0 +1,7 @@ +[project] +name = "goosebench" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.10" +dependencies = [] From e1c51be19050062db59d3dbe0ae37a31291565e8 Mon Sep 17 00:00:00 2001 From: Marcelle Bonterre Date: Tue, 18 Feb 2025 12:16:40 -0500 Subject: [PATCH 02/10] ext (in)validation prompts --- .../workflows/install-and-run-goose.yml | 33 ++ benchmark/goosebench/.gitignore | 172 ++++++++++ benchmark/goosebench/README.md | 1 - benchmark/goosebench/archive/test.sh | 209 ++++++++++++ benchmark/goosebench/bin/.python3@3.11.pkg | 1 + benchmark/goosebench/bin/README.hermit.md | 7 + benchmark/goosebench/bin/activate-hermit | 21 ++ benchmark/goosebench/bin/activate-hermit.fish | 24 ++ benchmark/goosebench/bin/hermit | 43 +++ benchmark/goosebench/bin/hermit.hcl | 2 + benchmark/goosebench/bin/pip | 1 + benchmark/goosebench/bin/pip3 | 1 + benchmark/goosebench/bin/pip3.11 | 1 + benchmark/goosebench/bin/pydoc3 | 1 + benchmark/goosebench/bin/pydoc3.11 | 1 + benchmark/goosebench/bin/python | 1 + benchmark/goosebench/bin/python3 | 1 + benchmark/goosebench/bin/python3-config | 1 + benchmark/goosebench/bin/python3.11 | 1 + benchmark/goosebench/bin/python3.11-config | 1 + benchmark/goosebench/config.yaml | 7 + benchmark/goosebench/goosebench/__init__.py | 0 .../goosebench/evaluate_tools/__init__.py | 0 .../computercontroller_tool/__init__.py | 0 .../automation_script.py | 29 ++ .../computercontroller_tool/cache.py | 29 ++ .../computer_control.py | 29 ++ .../computercontroller_tool/web_scrape.py | 29 ++ .../computercontroller_tool/web_search.py | 29 ++ .../evaluate_tools/developer_tool/__init__.py | 0 .../developer_tool/list_windows.py | 29 ++ .../developer_tool/screen_capture.py | 29 ++ .../evaluate_tools/developer_tool/shell.py | 29 ++ .../developer_tool/text_editor.py | 29 ++ .../google_drive_tool/__init__.py | 0 .../google_drive_tool/google_drive_read.py | 29 ++ .../google_drive_tool/google_drive_search.py | 29 ++ .../evaluate_tools/jetbrains_tool/__init__.py | 0 .../jetbrains_tool/jetbrains.py | 29 ++ .../evaluate_tools/memory_tool/__init__.py | 0 .../memory_tool/remember_memory.py | 29 ++ .../memory_tool/remove_memory_category.py | 29 ++ .../memory_tool/remove_specific_memory.py | 29 ++ .../memory_tool/retrieve_memories.py | 29 ++ .../evaluate_tools/tutorial_tool/__init__.py | 0 .../evaluate_tools/tutorial_tool/tutorial.py | 29 ++ benchmark/goosebench/goosebench/main.py | 237 +++++++++++++ benchmark/goosebench/hello.py | 6 - benchmark/goosebench/poetry.lock | 310 ++++++++++++++++++ benchmark/goosebench/pyproject.toml | 29 +- 50 files changed, 1594 insertions(+), 11 deletions(-) create mode 100644 benchmark/goosebench/.github/workflows/install-and-run-goose.yml create mode 100644 benchmark/goosebench/.gitignore delete mode 100644 benchmark/goosebench/README.md create mode 100755 benchmark/goosebench/archive/test.sh create mode 120000 benchmark/goosebench/bin/.python3@3.11.pkg create mode 100644 benchmark/goosebench/bin/README.hermit.md create mode 100755 benchmark/goosebench/bin/activate-hermit create mode 100755 benchmark/goosebench/bin/activate-hermit.fish create mode 100755 benchmark/goosebench/bin/hermit create mode 100644 benchmark/goosebench/bin/hermit.hcl create mode 120000 benchmark/goosebench/bin/pip create mode 120000 benchmark/goosebench/bin/pip3 create mode 120000 benchmark/goosebench/bin/pip3.11 create mode 120000 benchmark/goosebench/bin/pydoc3 create mode 120000 benchmark/goosebench/bin/pydoc3.11 create mode 120000 benchmark/goosebench/bin/python create mode 120000 benchmark/goosebench/bin/python3 create mode 120000 benchmark/goosebench/bin/python3-config create mode 120000 benchmark/goosebench/bin/python3.11 create mode 120000 benchmark/goosebench/bin/python3.11-config create mode 100644 benchmark/goosebench/config.yaml create mode 100644 benchmark/goosebench/goosebench/__init__.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/__init__.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/__init__.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/automation_script.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/cache.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/computer_control.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_scrape.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_search.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/developer_tool/__init__.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/developer_tool/list_windows.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/developer_tool/screen_capture.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/developer_tool/shell.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/developer_tool/text_editor.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/__init__.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_read.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_search.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/__init__.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/jetbrains.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/memory_tool/__init__.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remember_memory.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_memory_category.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_specific_memory.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/memory_tool/retrieve_memories.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/__init__.py create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/tutorial.py create mode 100755 benchmark/goosebench/goosebench/main.py delete mode 100644 benchmark/goosebench/hello.py create mode 100644 benchmark/goosebench/poetry.lock diff --git a/benchmark/goosebench/.github/workflows/install-and-run-goose.yml b/benchmark/goosebench/.github/workflows/install-and-run-goose.yml new file mode 100644 index 000000000..4b13b97e6 --- /dev/null +++ b/benchmark/goosebench/.github/workflows/install-and-run-goose.yml @@ -0,0 +1,33 @@ +name: Install and Run Goose + +on: + push: + branches: + - main # Or your preferred branch + +jobs: + install-and-run-goose: + runs-on: ubuntu-latest + steps: + - name: Checkout Repository + uses: actions/checkout@v2 + + - name: Set up Goose Environment + run: | + echo "GOOSE_BIN_DIR=\$HOME/.local/bin" >> $GITHUB_ENV + echo "CONFIGURE=false" >> $GITHUB_ENV + echo "DATABRICKS_HOST=https://block-lakehouse-production.cloud.databricks.com" >> $GITHUB_ENV + + - name: Install Goose + run: | + curl -fsSL https://github.com/block/goose/releases/download/stable/download_cli.sh | bash + + - name: Configure Goose + run: | + mkdir -p ~/.config/goose/ + cp $GITHUB_WORKSPACE/test.sh ~/test.sh + alias goose=$HOME/.local/bin/goose + + - name: Run Goose Command + run: | + ./test.sh -p databricks -m goose diff --git a/benchmark/goosebench/.gitignore b/benchmark/goosebench/.gitignore new file mode 100644 index 000000000..c4d4e4a7a --- /dev/null +++ b/benchmark/goosebench/.gitignore @@ -0,0 +1,172 @@ +.playpen/*.log +.hermit/ +*.ipynb +.idea/ +.vscode/ +.goose/ +run_slack_app-backup.sh +temp_creds.json +temp_merged_policy.json +token_google_docs.json +creds-*.json +assume_role_policy.json +finetune_data/content/ +finetune_data/datasets/ +insights/messages/ +insights/env.sh +src/qai_server/slack_app/modes/generated_images/ +src/qai_server/ingest/token_google_docs.json +src/qai_server/ingest/content/ +src/qai_server/ingest/ingest_project/**/content/ +src/qai_server/ingest/.env +src/qai_server/evals/messages/ +src/qai_server/evals/annotations_*.jsonl +projects/ingest_docs/ingest_docs/content/ +projects/ingest_docs/content/ +projects/ingest_docs/notion_example*.json +run_gdoc.sh + +# Datafiles +*.csv +*.gz +*.h5 +*.pkl +*.pk +*.html +*.log +*.db +*.db-journal + + +## From: https://github.com/github/gitignore/blob/main/Python.gitignore +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +.python-version +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ \ No newline at end of file diff --git a/benchmark/goosebench/README.md b/benchmark/goosebench/README.md deleted file mode 100644 index e9cbd25ee..000000000 --- a/benchmark/goosebench/README.md +++ /dev/null @@ -1 +0,0 @@ -## GooseBench diff --git a/benchmark/goosebench/archive/test.sh b/benchmark/goosebench/archive/test.sh new file mode 100755 index 000000000..b53752e8d --- /dev/null +++ b/benchmark/goosebench/archive/test.sh @@ -0,0 +1,209 @@ +#!/bin/bash + +# NOTE: MacOS ships with Bash 3.2 by default, which does NOT support declare -A for associative arrays. +# This script uses standard Bash arrays to remain compatible. + +# Color codes for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color +BOLD='\033[1m' + +# Initialize error log array +ERROR_LOG=() + +#---------------------------------------------------------------------------# +# EXTENSIONS +#---------------------------------------------------------------------------# +# We'll define each extension in an array of prompts. Then we define an array +# of extension names, so we can iterate over them. +#---------------------------------------------------------------------------# +EXTENSIONS=(developer computercontroller google_drive memory) + +developer_prompts=( + "List the contents of the current directory." + "Create a new file called test.txt with the content 'Hello, World!'" + "Read the contents of test.txt" +) + +computercontroller_prompts=( + "What are the headlines on hackernews? Organize the list into categories." + "Make a ding sound" +) + +google_drive_prompts=( + "List the files in my Google Drive." + "Search for documents containing 'meeting notes'" +) + +memory_prompts=( + "Save this fact: The capital of France is Paris." + "What is the capital of France?" +) + + +#---------------------------------------------------------------------------# +# LOGGING FUNCTION +#---------------------------------------------------------------------------# +log_error() { + local provider=$1 + local model=$2 + local extension=$3 + local error=$4 + ERROR_LOG+=("${RED}[ERROR]${NC} Provider: $provider, Model: $model, Extension: $extension\n$error\n") +} + +#---------------------------------------------------------------------------# +# MAIN TEST FUNCTION +#---------------------------------------------------------------------------# +run_test() { + local provider=$1 + local model=$2 + local extension=$3 + local prompt=$4 + local timeout_seconds=30 + + echo -e "${YELLOW}Testing:${NC} $provider/$model with $extension" + echo -e "${YELLOW}Prompt:${NC} $prompt" + + local temp_file + temp_file="$(mktemp)" + echo "$prompt" > "$temp_file" + + # Run goose with timeout + timeout $timeout_seconds goose run \ + --with-builtin "$extension" \ + -t "$(cat "$temp_file")" 2>&1 | tee test_output.log + + # Check for errors + if [ ${PIPESTATUS[0]} -ne 0 ]; then + log_error "$provider" "$model" "$extension" "$(cat test_output.log)" + echo -e "${RED}✗ Test failed${NC}" + else + echo -e "${GREEN}✓ Test passed${NC}" + fi + + rm -f "$temp_file" test_output.log +} + +#---------------------------------------------------------------------------# +# TESTING EXTENSION (ITERATING OVER PROMPTS) +#---------------------------------------------------------------------------# +test_extension() { + local provider=$1 + local model=$2 + local extension=$3 + + echo -e "\n${BOLD}Testing extension: $extension${NC}" + + # We'll build the array name dynamically, e.g. developer_prompts, memory_prompts, etc. + # Then we retrieve that array's contents via indirect expansion. + local arr_name="${extension}_prompts[@]" + local prompts=("${!arr_name}") + + for prompt in "${prompts[@]}"; do + run_test "$provider" "$model" "$extension" "$prompt" + sleep 2 # brief pause + done +} + +#---------------------------------------------------------------------------# +# USAGE FUNCTION +#---------------------------------------------------------------------------# +usage() { + echo "Usage: $0 [-p provider -m model[,model2,model3]...]..." + echo " -p provider : Provider to use" + echo " -m models : Comma-separated list of models to use with the provider" + echo " -h : Show this help message" + echo "" + echo "Examples:" + echo " $0 # Uses default: databricks/goose" + echo " $0 -p anthropic -m claude # Single provider/model" + echo " $0 -p anthropic -m claude,claude2 # One provider, multiple models" + echo " $0 -p anthropic -m claude -p databricks -m goose # Multiple providers" + echo " $0 -p anthropic -m claude,claude2 -p databricks -m goose,goose2 # Multiple of both" + exit 1 +} + +#---------------------------------------------------------------------------# +# MAIN WORKFLOW +#---------------------------------------------------------------------------# +main() { + # Arrays to store provider/model combinations + declare -a provider_model_pairs=() + local current_provider="" + + # Parse command line arguments + while [[ $# -gt 0 ]]; do + case "$1" in + -h) + usage + ;; + -p) + shift + if [[ -z "$1" ]]; then + echo "Error: -p requires a provider name" + usage + fi + current_provider="$1" + shift + ;; + -m) + if [[ -z "$current_provider" ]]; then + echo "Error: -m must follow a -p option" + usage + fi + shift + if [[ -z "$1" ]]; then + echo "Error: -m requires at least one model name" + usage + fi + # Split comma-separated models and create provider:model pairs + IFS=',' read -ra models <<< "$1" + for model in "${models[@]}"; do + provider_model_pairs+=("$current_provider:$model") + done + shift + ;; + *) + echo "Error: Unknown option $1" + usage + ;; + esac + done + + # If no providers/models specified, use defaults + if [ ${#provider_model_pairs[@]} -eq 0 ]; then + provider_model_pairs=("databricks:goose") + fi + + echo -e "${BOLD}Starting Goose CLI Integration Tests${NC}" + + # Iterate through provider/model pairs + for pair in "${provider_model_pairs[@]}"; do + # Split the pair into provider and model + IFS=':' read -r provider model <<< "$pair" + + echo -e "\n${BOLD}Testing provider: $provider${NC}" + echo -e "${BOLD}Testing model: $model${NC}" + + # Now test each extension for this provider/model pair + for extension in "${EXTENSIONS[@]}"; do + test_extension "$provider" "$model" "$extension" + done + done + + # Print summary + if [ ${#ERROR_LOG[@]} -eq 0 ]; then + echo -e "\n${GREEN}All tests completed successfully!${NC}" + else + echo -e "\n${RED}Test Summary - Errors Found:${NC}" + echo -e "================================" + printf '%b\n' "${ERROR_LOG[@]}" + exit 1 + fi +} + +# Call main with all arguments +main "$@" \ No newline at end of file diff --git a/benchmark/goosebench/bin/.python3@3.11.pkg b/benchmark/goosebench/bin/.python3@3.11.pkg new file mode 120000 index 000000000..383f4511d --- /dev/null +++ b/benchmark/goosebench/bin/.python3@3.11.pkg @@ -0,0 +1 @@ +hermit \ No newline at end of file diff --git a/benchmark/goosebench/bin/README.hermit.md b/benchmark/goosebench/bin/README.hermit.md new file mode 100644 index 000000000..e889550ba --- /dev/null +++ b/benchmark/goosebench/bin/README.hermit.md @@ -0,0 +1,7 @@ +# Hermit environment + +This is a [Hermit](https://github.com/cashapp/hermit) bin directory. + +The symlinks in this directory are managed by Hermit and will automatically +download and install Hermit itself as well as packages. These packages are +local to this environment. diff --git a/benchmark/goosebench/bin/activate-hermit b/benchmark/goosebench/bin/activate-hermit new file mode 100755 index 000000000..fe28214d3 --- /dev/null +++ b/benchmark/goosebench/bin/activate-hermit @@ -0,0 +1,21 @@ +#!/bin/bash +# This file must be used with "source bin/activate-hermit" from bash or zsh. +# You cannot run it directly +# +# THIS FILE IS GENERATED; DO NOT MODIFY + +if [ "${BASH_SOURCE-}" = "$0" ]; then + echo "You must source this script: \$ source $0" >&2 + exit 33 +fi + +BIN_DIR="$(dirname "${BASH_SOURCE[0]:-${(%):-%x}}")" +if "${BIN_DIR}/hermit" noop > /dev/null; then + eval "$("${BIN_DIR}/hermit" activate "${BIN_DIR}/..")" + + if [ -n "${BASH-}" ] || [ -n "${ZSH_VERSION-}" ]; then + hash -r 2>/dev/null + fi + + echo "Hermit environment $("${HERMIT_ENV}"/bin/hermit env HERMIT_ENV) activated" +fi diff --git a/benchmark/goosebench/bin/activate-hermit.fish b/benchmark/goosebench/bin/activate-hermit.fish new file mode 100755 index 000000000..0367d2331 --- /dev/null +++ b/benchmark/goosebench/bin/activate-hermit.fish @@ -0,0 +1,24 @@ +#!/usr/bin/env fish + +# This file must be sourced with "source bin/activate-hermit.fish" from Fish shell. +# You cannot run it directly. +# +# THIS FILE IS GENERATED; DO NOT MODIFY + +if status is-interactive + set BIN_DIR (dirname (status --current-filename)) + + if "$BIN_DIR/hermit" noop > /dev/null + # Source the activation script generated by Hermit + "$BIN_DIR/hermit" activate "$BIN_DIR/.." | source + + # Clear the command cache if applicable + functions -c > /dev/null 2>&1 + + # Display activation message + echo "Hermit environment $($HERMIT_ENV/bin/hermit env HERMIT_ENV) activated" + end +else + echo "You must source this script: source $argv[0]" >&2 + exit 33 +end diff --git a/benchmark/goosebench/bin/hermit b/benchmark/goosebench/bin/hermit new file mode 100755 index 000000000..6dbd60cce --- /dev/null +++ b/benchmark/goosebench/bin/hermit @@ -0,0 +1,43 @@ +#!/bin/bash +# +# THIS FILE IS GENERATED; DO NOT MODIFY + +set -eo pipefail + +export HERMIT_USER_HOME=~ + +if [ -z "${HERMIT_STATE_DIR}" ]; then + case "$(uname -s)" in + Darwin) + export HERMIT_STATE_DIR="${HERMIT_USER_HOME}/Library/Caches/hermit" + ;; + Linux) + export HERMIT_STATE_DIR="${XDG_CACHE_HOME:-${HERMIT_USER_HOME}/.cache}/hermit" + ;; + esac +fi + +export HERMIT_DIST_URL="${HERMIT_DIST_URL:-https://d1abdrezunyhdp.cloudfront.net/square}" +HERMIT_CHANNEL="$(basename "${HERMIT_DIST_URL}")" +export HERMIT_CHANNEL +export HERMIT_EXE=${HERMIT_EXE:-${HERMIT_STATE_DIR}/pkg/hermit@${HERMIT_CHANNEL}/hermit} + +if [ ! -x "${HERMIT_EXE}" ]; then + echo "Bootstrapping ${HERMIT_EXE} from ${HERMIT_DIST_URL}" 1>&2 + INSTALL_SCRIPT="$(mktemp)" + # This value must match that of the install script + INSTALL_SCRIPT_SHA256="d9774f75517f9a6d9e371daae9991cdb9fbbc390101b47c3fb2f6876d9094bab" + if [ "${INSTALL_SCRIPT_SHA256}" = "BYPASS" ]; then + curl -fsSL "${HERMIT_DIST_URL}/install.sh" -o "${INSTALL_SCRIPT}" + else + # Install script is versioned by its sha256sum value + curl -fsSL "${HERMIT_DIST_URL}/install-${INSTALL_SCRIPT_SHA256}.sh" -o "${INSTALL_SCRIPT}" + # Verify install script's sha256sum + openssl dgst -sha256 "${INSTALL_SCRIPT}" | \ + awk -v EXPECTED="$INSTALL_SCRIPT_SHA256" \ + '$2!=EXPECTED {print "Install script sha256 " $2 " does not match " EXPECTED; exit 1}' + fi + /bin/bash "${INSTALL_SCRIPT}" 1>&2 +fi + +exec "${HERMIT_EXE}" --level=fatal exec "$0" -- "$@" diff --git a/benchmark/goosebench/bin/hermit.hcl b/benchmark/goosebench/bin/hermit.hcl new file mode 100644 index 000000000..081cbe834 --- /dev/null +++ b/benchmark/goosebench/bin/hermit.hcl @@ -0,0 +1,2 @@ +github-token-auth { +} diff --git a/benchmark/goosebench/bin/pip b/benchmark/goosebench/bin/pip new file mode 120000 index 000000000..b78b40b15 --- /dev/null +++ b/benchmark/goosebench/bin/pip @@ -0,0 +1 @@ +.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/bin/pip3 b/benchmark/goosebench/bin/pip3 new file mode 120000 index 000000000..b78b40b15 --- /dev/null +++ b/benchmark/goosebench/bin/pip3 @@ -0,0 +1 @@ +.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/bin/pip3.11 b/benchmark/goosebench/bin/pip3.11 new file mode 120000 index 000000000..b78b40b15 --- /dev/null +++ b/benchmark/goosebench/bin/pip3.11 @@ -0,0 +1 @@ +.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/bin/pydoc3 b/benchmark/goosebench/bin/pydoc3 new file mode 120000 index 000000000..b78b40b15 --- /dev/null +++ b/benchmark/goosebench/bin/pydoc3 @@ -0,0 +1 @@ +.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/bin/pydoc3.11 b/benchmark/goosebench/bin/pydoc3.11 new file mode 120000 index 000000000..b78b40b15 --- /dev/null +++ b/benchmark/goosebench/bin/pydoc3.11 @@ -0,0 +1 @@ +.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/bin/python b/benchmark/goosebench/bin/python new file mode 120000 index 000000000..b78b40b15 --- /dev/null +++ b/benchmark/goosebench/bin/python @@ -0,0 +1 @@ +.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/bin/python3 b/benchmark/goosebench/bin/python3 new file mode 120000 index 000000000..b78b40b15 --- /dev/null +++ b/benchmark/goosebench/bin/python3 @@ -0,0 +1 @@ +.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/bin/python3-config b/benchmark/goosebench/bin/python3-config new file mode 120000 index 000000000..b78b40b15 --- /dev/null +++ b/benchmark/goosebench/bin/python3-config @@ -0,0 +1 @@ +.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/bin/python3.11 b/benchmark/goosebench/bin/python3.11 new file mode 120000 index 000000000..b78b40b15 --- /dev/null +++ b/benchmark/goosebench/bin/python3.11 @@ -0,0 +1 @@ +.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/bin/python3.11-config b/benchmark/goosebench/bin/python3.11-config new file mode 120000 index 000000000..b78b40b15 --- /dev/null +++ b/benchmark/goosebench/bin/python3.11-config @@ -0,0 +1 @@ +.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/config.yaml b/benchmark/goosebench/config.yaml new file mode 100644 index 000000000..50c26b306 --- /dev/null +++ b/benchmark/goosebench/config.yaml @@ -0,0 +1,7 @@ +GOOSE_MODEL: goose +GOOSE_PROVIDER: databricks +extensions: + developer: + enabled: true + name: developer + type: builtin diff --git a/benchmark/goosebench/goosebench/__init__.py b/benchmark/goosebench/goosebench/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmark/goosebench/goosebench/evaluate_tools/__init__.py b/benchmark/goosebench/goosebench/evaluate_tools/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/__init__.py b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/automation_script.py b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/automation_script.py new file mode 100644 index 000000000..9c6944ec8 --- /dev/null +++ b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/automation_script.py @@ -0,0 +1,29 @@ +"""Test cases for the automation script tool.""" + +# Prompts that should trigger valid automation script tool usage +valid_prompts = [ + "Create a shell script to sort unique lines in a file", + "Write a Ruby script to process some text data", + "Make a script to extract the second column from a CSV", + "Create a script to find pattern matches in a file", + "Write a shell script to process log files", + "Create a Ruby script for text manipulation", + "Make a script to analyze data in a text file", + "Write a script to format JSON data", + "Create a script to clean up file names", + "Write a script to extract specific data from files", +] + +# Prompts that should not trigger automation script tool usage based on tool description +invalid_prompts = [ + "Create a complex application with multiple files", + "Write a script that requires external dependencies", + "Create a script that needs a database", + "Write a GUI application", + "Create a web server application", + "Write a script that needs special system access", + "Create a script that requires third-party libraries", + "Write a script that needs network services", + "Create a distributed processing script", + "Write a script that requires system installation", +] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/cache.py b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/cache.py new file mode 100644 index 000000000..be78ad8c8 --- /dev/null +++ b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/cache.py @@ -0,0 +1,29 @@ +"""Test cases for the cache tool.""" + +# Prompts that should trigger valid cache tool usage +valid_prompts = [ + "List all cached files", + "Show me what's in the cache", + "View the content of this cached file", + "Delete this specific cached file", + "Clear all cached data", + "Show the contents of a cached file", + "Remove this file from cache", + "List the cache directory contents", + "View a cached text file", + "Delete everything from the cache", +] + +# Prompts that should not trigger cache tool usage based on tool description +invalid_prompts = [ + "Modify a cached file directly", + "Search within cached files", + "Compress the cache directory", + "Move cached files to another location", + "Change cache directory permissions", + "Reorganize cached files", + "Filter cache by file type", + "Sort cached files by size", + "Archive old cached files", + "Backup the cache directory", +] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/computer_control.py b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/computer_control.py new file mode 100644 index 000000000..3d17f19cd --- /dev/null +++ b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/computer_control.py @@ -0,0 +1,29 @@ +"""Test cases for the computer control tool.""" + +# Prompts that should trigger valid computer control tool usage +valid_prompts = [ + "Launch Safari and open a specific URL", + "Use AppleScript to automate Mail app", + "Click a button in the current application", + "Fill out a form in Safari", + "Control system volume using AppleScript", + "Organize files in a folder", + "Add an event to Calendar", + "Send an email using Mail app", + "Manage iTunes playlist", + "Automate document processing in Pages", +] + +# Prompts that should not trigger computer control tool usage based on tool description +invalid_prompts = [ + "Control applications that don't support AppleScript", + "Perform actions requiring root access", + "Modify system files directly", + "Access restricted system areas", + "Control non-Apple applications without AppleScript support", + "Perform actions requiring kernel modifications", + "Execute privileged system commands", + "Modify protected system settings", + "Access hardware directly", + "Control low-level system functions", +] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_scrape.py b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_scrape.py new file mode 100644 index 000000000..ebc7eed5a --- /dev/null +++ b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_scrape.py @@ -0,0 +1,29 @@ +"""Test cases for the web scrape tool.""" + +# Prompts that should trigger valid web scrape tool usage +valid_prompts = [ + "Fetch the content from https://example.com", + "Download the HTML from this webpage", + "Get JSON data from this API endpoint", + "Save this image from the web", + "Scrape text content from this URL", + "Download this webpage as text", + "Get the JSON response from this API", + "Save this binary file from the web", + "Fetch and cache this webpage", + "Download this document as text", +] + +# Prompts that should not trigger web scrape tool usage based on tool description +invalid_prompts = [ + "Scrape a complex web application with dynamic content", + "Extract data from a JavaScript-heavy website", + "Scrape content that requires login", + "Download content from multiple pages at once", + "Extract data from a site with anti-scraping measures", + "Scrape content that requires user interaction", + "Download content from a protected API", + "Extract data from pages requiring authentication", + "Scrape content from multiple URLs simultaneously", + "Download data from a site requiring cookies", +] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_search.py b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_search.py new file mode 100644 index 000000000..556ad0ea6 --- /dev/null +++ b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_search.py @@ -0,0 +1,29 @@ +"""Test cases for the web search tool.""" + +# Prompts that should trigger valid web search tool usage +valid_prompts = [ + "Search for information about 'Tesla'", + "Look up what 'Bitcoin' is", + "Find details about 'SpaceX'", + "Search for 'Python' programming language", + "What is 'Docker'?", + "Look up the company 'Microsoft'", + "Search for information about 'Linux'", + "Find out about 'AWS'", + "What is 'Kubernetes'?", + "Search for 'React' framework", +] + +# Prompts that should not trigger web search tool usage based on tool description +invalid_prompts = [ + "Search for multiple words at once", + "Look up a complex query with multiple terms", + "Search for a long phrase", + "Find results for this entire sentence", + "Search for 'word1 word2 word3'", + "Look up multiple topics at once", + "Search for a paragraph of text", + "Find results for multiple questions", + "Search for a list of items", + "Look up several different topics", +] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/__init__.py b/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/list_windows.py b/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/list_windows.py new file mode 100644 index 000000000..94906613f --- /dev/null +++ b/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/list_windows.py @@ -0,0 +1,29 @@ +"""Test cases for the list windows tool.""" + +# Prompts that should trigger valid list windows tool usage +valid_prompts = [ + "Show me all available windows", + "List the windows that can be captured", + "What windows are currently open?", + "Display available window titles", + "Get a list of windows for screen capture", + "Show window titles that I can screenshot", + "What windows can I take screenshots of?", + "List all window titles", + "Show me what windows are available for capture", + "Get available window names", +] + +# Prompts that should not trigger list windows tool usage based on tool description +invalid_prompts = [ + "Close all windows", + "Minimize the current window", + "Maximize the browser window", + "Move window to another display", + "Resize the current window", + "Change window focus", + "Arrange windows on screen", + "Hide inactive windows", + "Show desktop", + "Switch between windows", +] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/screen_capture.py b/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/screen_capture.py new file mode 100644 index 000000000..0daef846f --- /dev/null +++ b/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/screen_capture.py @@ -0,0 +1,29 @@ +"""Test cases for the screen capture tool.""" + +# Prompts that should trigger valid screen capture tool usage +valid_prompts = [ + "Take a screenshot of the main display", + "Capture the window titled 'Terminal'", + "Screenshot the current display", + "Take a picture of display 0", + "Capture a screenshot of the browser window", + "Take a screenshot of the active window", + "Capture display 1", + "Screenshot the window named 'Settings'", + "Take a capture of the main screen", + "Screenshot the specified window", +] + +# Prompts that should not trigger screen capture tool usage based on tool description +invalid_prompts = [ + "Capture multiple windows at once", + "Take a screenshot of all displays", + "Record a video of the screen", + "Capture a region of the screen", + "Take a partial screenshot", + "Screenshot a specific area", + "Capture screen with mouse cursor", + "Take a timed screenshot", + "Screenshot with specific dimensions", + "Capture screen without window decorations", +] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/shell.py b/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/shell.py new file mode 100644 index 000000000..916f0cfd4 --- /dev/null +++ b/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/shell.py @@ -0,0 +1,29 @@ +"""Test cases for the shell tool.""" + +# Prompts that should trigger valid shell tool usage +valid_prompts = [ + "Run the command 'ls' to list files", + "Execute 'pwd' to show current directory", + "Use ripgrep to search for files containing 'example'", + "Find all Python files using 'rg --files | rg .py'", + "Search for the string 'class Example' in files using ripgrep", + "Show the contents of a file using cat", + "Count lines in a file using wc -l", + "Check disk space with df -h", + "List processes with ps", + "Create a directory with mkdir test", +] + +# Prompts that should not trigger shell tool usage based on tool description +invalid_prompts = [ + "Run a command that will produce gigabytes of output", + "Start a long-running server without backgrounding it", + "Use find to recursively search for files", + "Use ls -R to list all files recursively", + "Execute a command that will run indefinitely", + "Run a command that streams continuous output", + "Use grep recursively to search files", + "Start a process that needs to be manually terminated", + "Run a command that generates unlimited output", + "Execute ls -la on the entire filesystem", +] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/text_editor.py b/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/text_editor.py new file mode 100644 index 000000000..689548c01 --- /dev/null +++ b/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/text_editor.py @@ -0,0 +1,29 @@ +"""Test cases for the text editor tool.""" + +# Prompts that should trigger valid text editor tool usage +valid_prompts = [ + "View the contents of file.txt", + "Show me what's in config.py", + "Create a new file called test.txt with 'Hello World' content", + "Write 'print(\"hello\")' to script.py", + "Replace the string 'old_version' with 'new_version' in config.txt", + "Change 'debug=True' to 'debug=False' in settings.py", + "Undo the last edit made to main.py", + "Revert the previous change in config.json", + "Write this JSON content to data.json", + "Update the version number in package.json", +] + +# Prompts that should not trigger text editor tool usage based on tool description +invalid_prompts = [ + "Edit multiple sections of the file at once", + "Replace all occurrences of a string in the file", + "Make changes to multiple files simultaneously", + "Modify a file that's larger than 400KB", + "Edit a file with more than 400,000 characters", + "Replace a string that appears multiple times in the file", + "Make partial updates to specific sections without full file content", + "Edit binary files", + "Modify files without providing full path", + "Replace text without exact string match", +] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/__init__.py b/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_read.py b/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_read.py new file mode 100644 index 000000000..4f81b9ec2 --- /dev/null +++ b/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_read.py @@ -0,0 +1,29 @@ +"""Test cases for the Google Drive read tool.""" + +# Prompts that should trigger valid read tool usage +valid_prompts = [ + "Read the file with URI gdrive:///abc123", + "Show me the contents of gdrive:///xyz789", + "Get the text from gdrive:///doc456", + "Read this Google Doc gdrive:///123abc", + "Show the contents of spreadsheet gdrive:///789xyz", + "Get the text of presentation gdrive:///456def", + "Read file gdrive:///def123 and include images", + "Show me gdrive:///789abc without images", + "Get the content of document gdrive:///xyz456", + "Read text file gdrive:///123xyz", +] + +# Prompts that should not trigger read tool usage based on tool description +invalid_prompts = [ + "Edit the file gdrive:///abc123", + "Write to document gdrive:///xyz789", + "Modify spreadsheet gdrive:///123def", + "Update presentation gdrive:///def789", + "Delete file gdrive:///789xyz", + "Create new document gdrive:///456abc", + "Share file gdrive:///xyz123", + "Move document gdrive:///789def", + "Copy file gdrive:///abc789", + "Rename document gdrive:///def456", +] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_search.py b/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_search.py new file mode 100644 index 000000000..3a093b5f5 --- /dev/null +++ b/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_search.py @@ -0,0 +1,29 @@ +"""Test cases for the Google Drive search tool.""" + +# Prompts that should trigger valid search tool usage +valid_prompts = [ + "Search for files named 'budget'", + "Find documents containing 'report'", + "Look for files with 'presentation' in the name", + "Search my drive for 'meeting notes'", + "Find files named 'project plan'", + "Search for 'invoice' in my files", + "Look up documents named 'proposal'", + "Find spreadsheets with 'data' in the name", + "Search for files containing 'schedule'", + "Find documents with 'summary' in the title", +] + +# Prompts that should not trigger search tool usage based on tool description +invalid_prompts = [ + "Search for files modified in the last week", + "Find files larger than 1MB", + "Search for files shared with me", + "Look for files in a specific folder", + "Find files by type", + "Search for files by owner", + "Look for recently modified files", + "Find files with specific permissions", + "Search for files by date", + "Find files in trash", +] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/__init__.py b/benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/jetbrains.py b/benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/jetbrains.py new file mode 100644 index 000000000..b5cf13ef8 --- /dev/null +++ b/benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/jetbrains.py @@ -0,0 +1,29 @@ +"""Test cases for the JetBrains IDE integration tools.""" + +# Prompts that should trigger valid JetBrains tool usage +valid_prompts = [ + "Open the current file in the IDE", + "Navigate to line 42 in the active file", + "Find usages of this class", + "Go to the definition of this method", + "Show documentation for this symbol", + "Run the current test file", + "Debug this application", + "Show project structure", + "Open recent files", + "Search everywhere in the project", +] + +# Prompts that should not trigger JetBrains tool usage based on tool description +invalid_prompts = [ + "Create a new IDE instance", + "Modify IDE settings", + "Install new plugins", + "Change IDE theme", + "Update the IDE version", + "Configure version control", + "Modify IDE keymap", + "Change project settings", + "Install new IDE features", + "Uninstall IDE components", +] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/__init__.py b/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remember_memory.py b/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remember_memory.py new file mode 100644 index 000000000..5be419786 --- /dev/null +++ b/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remember_memory.py @@ -0,0 +1,29 @@ +"""Test cases for the remember memory tool.""" + +# Prompts that should trigger valid remember memory tool usage +valid_prompts = [ + "Remember this development preference in the 'development' category", + "Store this setting globally with tags #config #setup", + "Save this workflow detail locally in 'workflow' category", + "Remember my name and email in the 'personal' category globally", + "Store project configuration locally with #settings tag", + "Save this formatting preference in development category", + "Remember this shortcut in 'keyboard' category with #shortcuts tag", + "Store build instructions locally in 'build' category", + "Save API credentials globally in 'credentials' category", + "Remember git configuration in 'git' category with #config tag", +] + +# Prompts that should not trigger remember memory tool usage based on tool description +invalid_prompts = [ + "Save this without specifying a category", + "Store this without indicating global or local scope", + "Remember this with invalid tags format", + "Save empty content in a category", + "Store this in multiple categories at once", + "Remember this with system-level access", + "Save this in a protected category", + "Store this with special file permissions", + "Remember this in a non-existent directory", + "Save this with binary content", +] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_memory_category.py b/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_memory_category.py new file mode 100644 index 000000000..fb29aebfe --- /dev/null +++ b/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_memory_category.py @@ -0,0 +1,29 @@ +"""Test cases for the remove memory category tool.""" + +# Prompts that should trigger valid remove memory category tool usage +valid_prompts = [ + "Delete all memories in the 'development' category", + "Clear the 'workflow' category from global storage", + "Remove all local project settings", + "Delete everything in the 'personal' category", + "Clear all global memories", + "Remove all local memories", + "Delete the 'build' category", + "Clear project configuration category", + "Remove the 'git' category memories", + "Delete all items in 'credentials' category", +] + +# Prompts that should not trigger remove memory category tool usage based on tool description +invalid_prompts = [ + "Delete memories across multiple categories", + "Remove memories without specifying scope", + "Clear memories by date range", + "Delete memories by content", + "Remove memories with specific tags", + "Clear memories by partial category match", + "Delete memories selectively", + "Remove memories by size", + "Clear recently modified memories", + "Delete memories by author", +] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_specific_memory.py b/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_specific_memory.py new file mode 100644 index 000000000..502a43895 --- /dev/null +++ b/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_specific_memory.py @@ -0,0 +1,29 @@ +"""Test cases for the remove specific memory tool.""" + +# Prompts that should trigger valid remove specific memory tool usage +valid_prompts = [ + "Delete the memory about code formatting from development category", + "Remove the git configuration memory from global storage", + "Delete project API key from credentials category", + "Remove my email setting from personal category", + "Delete the build instruction memory from local storage", + "Remove specific workflow step from workflow category", + "Delete keyboard shortcut memory from shortcuts category", + "Remove specific project setting from local config", + "Delete specific credential from global storage", + "Remove particular preference from settings category", +] + +# Prompts that should not trigger remove specific memory tool usage based on tool description +invalid_prompts = [ + "Delete multiple memories at once", + "Remove memories by pattern matching", + "Delete memories without exact content", + "Remove memories by tag only", + "Delete memories by date", + "Remove partial memory content", + "Delete memories by regex", + "Remove memories without category", + "Delete memories by approximate match", + "Remove memories without scope", +] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/retrieve_memories.py b/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/retrieve_memories.py new file mode 100644 index 000000000..59b187f8e --- /dev/null +++ b/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/retrieve_memories.py @@ -0,0 +1,29 @@ +"""Test cases for the retrieve memories tool.""" + +# Prompts that should trigger valid retrieve memories tool usage +valid_prompts = [ + "Show all memories in the 'development' category", + "Get my stored preferences from global memory", + "Retrieve local project settings", + "Show me what's stored in the 'workflow' category", + "Get all global memories", + "Retrieve everything from local storage", + "Show memories tagged with #config", + "Get all items from 'personal' category", + "Retrieve project-specific memories", + "Show what's saved in the 'build' category", +] + +# Prompts that should not trigger retrieve memories tool usage based on tool description +invalid_prompts = [ + "Search across multiple categories at once", + "Find memories without specifying scope", + "Get memories with complex search criteria", + "Retrieve memories by date range", + "Search memories by content", + "Get memories by partial category match", + "Retrieve memories with regex patterns", + "Find memories by size", + "Get memories modified recently", + "Search memories by author", +] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/__init__.py b/benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/tutorial.py b/benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/tutorial.py new file mode 100644 index 000000000..f208d6a76 --- /dev/null +++ b/benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/tutorial.py @@ -0,0 +1,29 @@ +"""Test cases for the load tutorial tool.""" + +# Prompts that should trigger valid load tutorial tool usage +valid_prompts = [ + "Show me the getting-started tutorial", + "Load the developer-mcp tutorial", + "I need help getting started, show the tutorial", + "Can you load the tutorial about development?", + "Show me how to use Goose with the tutorial", + "Load the beginner's guide tutorial", + "I'm new here, can you show me the introduction tutorial?", + "Display the tutorial for developers", + "Show the tutorial about MCP development", + "Load the basic usage tutorial", +] + +# Prompts that should not trigger load tutorial tool usage based on tool description +invalid_prompts = [ + "Create a new tutorial", + "Edit the existing tutorial", + "Delete this tutorial", + "Modify tutorial content", + "Save this as a tutorial", + "Update the tutorial text", + "Remove old tutorials", + "Change tutorial format", + "Add new tutorial section", + "Merge multiple tutorials", +] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/main.py b/benchmark/goosebench/goosebench/main.py new file mode 100755 index 000000000..ce6bf4b69 --- /dev/null +++ b/benchmark/goosebench/goosebench/main.py @@ -0,0 +1,237 @@ +#!/usr/bin/env python3 +import dataclasses +import os +import subprocess +import tempfile +import time +from enum import Enum +from typing import List, Optional + +import typer +from rich.console import Console +from rich.theme import Theme +from typing_extensions import Annotated + +# Initialize typer app and rich console +app = typer.Typer(help="Goose CLI Integration Tests") +console = Console(theme=Theme({ + "info": "cyan", + "warning": "yellow", + "error": "red", + "success": "green" +})) + + +# Define workflow types +class Workflow(str, Enum): + SERIAL = "serial" + CONVERSATIONAL = "conversational" + + +@dataclasses.dataclass +class Topic: + initial_prompt: str + follow_ups: List[str] + + +@dataclasses.dataclass +class Conversation: + topics: List[Topic] + + +# Extension configurations +EXTENSIONS = ['developer', 'computercontroller', 'google_drive', 'memory'] + +EXTENSION_PROMPTS = { + 'developer': [ + "List the contents of the current directory.", + "Create a new file called test.txt with the content 'Hello, World!'", + "Read the contents of test.txt" + ], + 'computercontroller': [ + "What are the headlines on hackernews? Organize the list into categories.", + "Make a ding sound" + ], + 'google_drive': [ + "List the files in my Google Drive.", + "Search for documents containing 'meeting notes'" + ], + 'memory': [ + "Save this fact: The capital of France is Paris.", + "What is the capital of France?" + ] +} + +CONV_EXTENSION_PROMPTS = { + k: Conversation(topics=[ + Topic(val, ["summarize"]) + for val in v + ]) + for k, v in EXTENSION_PROMPTS.items() +} + + +class Bench: + def __init__(self, workflow: Workflow): + self.error_log = [] + self.workflow = workflow + + def log_error(self, provider: str, model: str, extension: str, error: str) -> None: + """Log an error message.""" + self.error_log.append( + f"Provider: {provider}, Model: {model}, Extension: {extension}\n{error}\n" + ) + + def evaluate(self, + provider: str, + model: str, + extension: str, + prompt: str, + follow_ups: Optional[List[str]] = None) -> None: + """Run a single test with the given parameters using pexpect.""" + console.print(f"Testing: {provider}/{model} with {extension}", style="info") + console.print(f"Prompt: {prompt}", style="info") + console.print(f"Workflow: {self.workflow.value}", style="info") + + follow_ups = follow_ups or [] + + # Create temporary file for prompt + with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp: + temp.write(prompt) + temp_path = temp.name + + try: + # Run goose with timeout + cmd = ['goose', 'run', '--with-builtin', extension, '-t', prompt] + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=30 + ) + + if result.returncode != 0: + self.log_error(provider, model, extension, + result.stdout + result.stderr) + console.print("✗ Test failed", style="error") + + else: + console.print("✓ Test passed") + + except subprocess.TimeoutExpired: + self.log_error(provider, model, extension, + "Test timed out after 30 seconds") + console.print("✗ Test timed out", style="error") + except Exception as e: + self.log_error(provider, model, extension, str(e)) + console.print("✗ Test failed with unexpected error", style="error") + finally: + os.unlink(temp_path) + + def _run_serial(self, provider: str, model: str, extension: str) -> None: + prompts = EXTENSION_PROMPTS.get(extension, []) + for prompt in prompts: + self.evaluate(provider, model, extension, prompt) + time.sleep(2) # brief pause between tests + + def _run_conversational(self, provider: str, model: str, extension: str) -> None: + conv = CONV_EXTENSION_PROMPTS.get(extension, []) + for t in conv.topics: + self.evaluate( + provider, model, extension, t.initial_prompt, t.follow_ups + ) + time.sleep(2) # brief pause between tests + + def test_extension(self, provider: str, model: str, extension: str) -> None: + """Test all prompts for a given extension.""" + console.rule(f"Testing extension: {extension}") + + if self.workflow == Workflow.CONVERSATIONAL: + return self._run_conversational(provider, model, extension) + + return self._run_serial(provider, model, extension) + + +def parse_provider_model(ctx: typer.Context, provider_models: List[str]) -> List[ + tuple[str, str]]: + """Parse provider:model strings into tuples.""" + result = [] + for pm in provider_models: + try: + provider, models = pm.split(':') + for model in models.split(','): + result.append((provider.strip(), model.strip())) + except ValueError: + raise typer.BadParameter( + f"Invalid format: {pm}. Use format 'provider:model' or 'provider:model1,model2'" + ) + return result + + +@app.command() +def main( + provider_models: Annotated[ + Optional[List[str]], + typer.Option( + '--provider-model', '-pm', + help="Provider and model in format 'provider:model' or 'provider:model1,model2'" + ) + ] = None, + workflow: Annotated[ + Workflow, + typer.Option( + '--workflow', '-w', + help="Workflow type: serial or conversational" + ) + ] = Workflow.SERIAL, + verbose: Annotated[ + bool, + typer.Option('--verbose', '-v', help="Enable verbose output") + ] = False, +): + """ + Run Goose CLI Integration Tests. + + Example usage: + + python main.py # Uses default: databricks:goose with serial workflow + python main.py -pm anthropic:claude + python main.py -pm anthropic:claude,claude2 + python main.py -pm anthropic:claude -pm databricks:goose + python main.py --workflow conversational # Use conversational workflow + """ + console.print("Starting Goose CLI Integration Tests", style="bold") + + runner = Bench(workflow) + + # Use default if no provider-models specified + if not provider_models: + provider_models = ['databricks:goose'] + + # Parse provider-model pairs + try: + provider_model_pairs = parse_provider_model(typer.Context, provider_models) + except typer.BadParameter as e: + console.print(f"Error: {str(e)}", style="error") + raise typer.Exit(1) + + for provider, model in provider_model_pairs: + console.rule(f"Testing provider: {provider}") + console.print(f"Testing model: {model}", style="bold") + + for extension in EXTENSIONS: + runner.test_extension(provider, model, extension) + + # Print summary + if not runner.error_log: + console.print("\nAll tests completed successfully!", style="success") + else: + console.print("\nTest Summary - Errors Found:", style="error") + console.rule("Errors") + for error in runner.error_log: + console.print(error, style="error") + raise typer.Exit(1) + + +if __name__ == "__main__": + app() diff --git a/benchmark/goosebench/hello.py b/benchmark/goosebench/hello.py deleted file mode 100644 index 04e32c222..000000000 --- a/benchmark/goosebench/hello.py +++ /dev/null @@ -1,6 +0,0 @@ -def main(): - print("Hello from goosebench!") - - -if __name__ == "__main__": - main() diff --git a/benchmark/goosebench/poetry.lock b/benchmark/goosebench/poetry.lock new file mode 100644 index 000000000..7141b3575 --- /dev/null +++ b/benchmark/goosebench/poetry.lock @@ -0,0 +1,310 @@ +# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. + +[[package]] +name = "click" +version = "8.1.8" +description = "Composable command line interface toolkit" +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"}, + {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.source] +type = "legacy" +url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" +reference = "artifactory" + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["main", "dev"] +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] +markers = {main = "platform_system == \"Windows\"", dev = "sys_platform == \"win32\""} + +[package.source] +type = "legacy" +url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" +reference = "artifactory" + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +groups = ["dev"] +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + +[package.source] +type = "legacy" +url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" +reference = "artifactory" + +[[package]] +name = "markdown-it-py" +version = "3.0.0" +description = "Python port of markdown-it. Markdown parsing, done right!" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, + {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, +] + +[package.dependencies] +mdurl = ">=0.1,<1.0" + +[package.extras] +benchmarking = ["psutil", "pytest", "pytest-benchmark"] +code-style = ["pre-commit (>=3.0,<4.0)"] +compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] +linkify = ["linkify-it-py (>=1,<3)"] +plugins = ["mdit-py-plugins"] +profiling = ["gprof2dot"] +rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + +[package.source] +type = "legacy" +url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" +reference = "artifactory" + +[[package]] +name = "mdurl" +version = "0.1.2" +description = "Markdown URL utilities" +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, + {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, +] + +[package.source] +type = "legacy" +url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" +reference = "artifactory" + +[[package]] +name = "packaging" +version = "24.2" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, + {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, +] + +[package.source] +type = "legacy" +url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" +reference = "artifactory" + +[[package]] +name = "pexpect" +version = "4.9.0" +description = "Pexpect allows easy control of interactive console applications." +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523"}, + {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"}, +] + +[package.dependencies] +ptyprocess = ">=0.5" + +[package.source] +type = "legacy" +url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" +reference = "artifactory" + +[[package]] +name = "pluggy" +version = "1.5.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[package.source] +type = "legacy" +url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" +reference = "artifactory" + +[[package]] +name = "ptyprocess" +version = "0.7.0" +description = "Run a subprocess in a pseudo terminal" +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, + {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, +] + +[package.source] +type = "legacy" +url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" +reference = "artifactory" + +[[package]] +name = "pygments" +version = "2.19.1" +description = "Pygments is a syntax highlighting package written in Python." +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, + {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, +] + +[package.extras] +windows-terminal = ["colorama (>=0.4.6)"] + +[package.source] +type = "legacy" +url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" +reference = "artifactory" + +[[package]] +name = "pytest" +version = "8.3.4" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"}, + {file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=1.5,<2" + +[package.extras] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + +[package.source] +type = "legacy" +url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" +reference = "artifactory" + +[[package]] +name = "rich" +version = "13.9.4" +description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" +optional = false +python-versions = ">=3.8.0" +groups = ["main"] +files = [ + {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"}, + {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"}, +] + +[package.dependencies] +markdown-it-py = ">=2.2.0" +pygments = ">=2.13.0,<3.0.0" + +[package.extras] +jupyter = ["ipywidgets (>=7.5.1,<9)"] + +[package.source] +type = "legacy" +url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" +reference = "artifactory" + +[[package]] +name = "shellingham" +version = "1.5.4" +description = "Tool to Detect Surrounding Shell" +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686"}, + {file = "shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de"}, +] + +[package.source] +type = "legacy" +url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" +reference = "artifactory" + +[[package]] +name = "typer" +version = "0.15.1" +description = "Typer, build great CLIs. Easy to code. Based on Python type hints." +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "typer-0.15.1-py3-none-any.whl", hash = "sha256:7994fb7b8155b64d3402518560648446072864beefd44aa2dc36972a5972e847"}, + {file = "typer-0.15.1.tar.gz", hash = "sha256:a0588c0a7fa68a1978a069818657778f86abe6ff5ea6abf472f940a08bfe4f0a"}, +] + +[package.dependencies] +click = ">=8.0.0" +rich = ">=10.11.0" +shellingham = ">=1.3.0" +typing-extensions = ">=3.7.4.3" + +[package.source] +type = "legacy" +url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" +reference = "artifactory" + +[[package]] +name = "typing-extensions" +version = "4.12.2" +description = "Backported and Experimental Type Hints for Python 3.8+" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, + {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, +] + +[package.source] +type = "legacy" +url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" +reference = "artifactory" + +[metadata] +lock-version = "2.1" +python-versions = ">=3.11" +content-hash = "58e6c7676973e0793089a2eff7d2dc54f11c5760cd7b0c43ecd5143588ffa046" diff --git a/benchmark/goosebench/pyproject.toml b/benchmark/goosebench/pyproject.toml index e1d21d9c6..00f2cecce 100644 --- a/benchmark/goosebench/pyproject.toml +++ b/benchmark/goosebench/pyproject.toml @@ -1,7 +1,28 @@ [project] -name = "goosebench" +name = "goose-monitoring-job" version = "0.1.0" -description = "Add your description here" +description = "" readme = "README.md" -requires-python = ">=3.10" -dependencies = [] +authors = [ + { name = "Your Name", email = "you@example.com" } +] + +[[tool.poetry.source]] +name = "artifactory" +priority = "primary" +url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" + + +[tool.poetry.dependencies] +python = ">=3.11" +pexpect = "^4.9.0" +typer = "^0.15.1" +rich = "^13.9.4" + + +[tool.poetry.group.dev.dependencies] +pytest = "^8.3.4" + +[build-system] +requires = ["poetry-core>=2.0.0,<3.0.0"] +build-backend = "poetry.core.masonry.api" From 013933c5a61aea89ebd06b3141f8c9c7b4c82b7f Mon Sep 17 00:00:00 2001 From: Marcelle Bonterre Date: Tue, 18 Feb 2025 22:39:43 -0500 Subject: [PATCH 03/10] WIP --- benchmark/goosebench/goosebench/bench.py | 74 ++++++++++ .../__init__.py | 0 .../computercontroller_tool/__init__.py | 0 .../automation_script.py | 0 .../computercontroller_tool/cache.py | 0 .../computer_control.py | 0 .../computercontroller_tool/web_scrape.py | 0 .../computercontroller_tool/web_search.py | 0 .../developer_tool/__init__.py | 0 .../developer_tool/list_windows.py | 0 .../developer_tool/screen_capture.py | 0 .../developer_tool/shell.py | 0 .../developer_tool/text_editor.py | 0 .../memory_tool}/__init__.py | 0 .../memory_tool/remember_memory.py | 0 .../memory_tool/remove_memory_category.py | 0 .../memory_tool/remove_specific_memory.py | 0 .../memory_tool/retrieve_memories.py | 0 .../not_used}/__init__.py | 0 .../not_used/google_drive_tool}/__init__.py | 0 .../google_drive_tool/google_drive_read.py | 0 .../google_drive_tool/google_drive_search.py | 0 .../not_used/jetbrains_tool}/__init__.py | 0 .../not_used}/jetbrains_tool/jetbrains.py | 0 .../extensions/tutorial_tool/__init__.py | 0 .../tutorial_tool/tutorial.py | 0 benchmark/goosebench/goosebench/main.py | 127 +----------------- crates/goose-bench/Cargo.toml | 21 +++ .../goose-bench/src/eval_suites/evaluation.rs | 36 +++++ crates/goose-bench/src/eval_suites/factory.rs | 60 +++++++++ .../src/eval_suites/flappy_bird.rs | 29 ++++ crates/goose-bench/src/eval_suites/mod.rs | 5 + crates/goose-bench/src/lib.rs | 1 + crates/goose-cli/Cargo.toml | 1 + crates/goose-cli/src/commands/bench.rs | 44 ++++++ crates/goose-cli/src/commands/mod.rs | 1 + crates/goose-cli/src/main.rs | 8 ++ 37 files changed, 284 insertions(+), 123 deletions(-) create mode 100644 benchmark/goosebench/goosebench/bench.py rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/__init__.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/computercontroller_tool/__init__.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/computercontroller_tool/automation_script.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/computercontroller_tool/cache.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/computercontroller_tool/computer_control.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/computercontroller_tool/web_scrape.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/computercontroller_tool/web_search.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/developer_tool/__init__.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/developer_tool/list_windows.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/developer_tool/screen_capture.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/developer_tool/shell.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/developer_tool/text_editor.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools/google_drive_tool => extensions/memory_tool}/__init__.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/memory_tool/remember_memory.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/memory_tool/remove_memory_category.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/memory_tool/remove_specific_memory.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/memory_tool/retrieve_memories.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools/jetbrains_tool => extensions/not_used}/__init__.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools/memory_tool => extensions/not_used/google_drive_tool}/__init__.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools => extensions/not_used}/google_drive_tool/google_drive_read.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools => extensions/not_used}/google_drive_tool/google_drive_search.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools/tutorial_tool => extensions/not_used/jetbrains_tool}/__init__.py (100%) rename benchmark/goosebench/goosebench/{evaluate_tools => extensions/not_used}/jetbrains_tool/jetbrains.py (100%) create mode 100644 benchmark/goosebench/goosebench/extensions/tutorial_tool/__init__.py rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/tutorial_tool/tutorial.py (100%) create mode 100644 crates/goose-bench/Cargo.toml create mode 100644 crates/goose-bench/src/eval_suites/evaluation.rs create mode 100644 crates/goose-bench/src/eval_suites/factory.rs create mode 100644 crates/goose-bench/src/eval_suites/flappy_bird.rs create mode 100644 crates/goose-bench/src/eval_suites/mod.rs create mode 100644 crates/goose-bench/src/lib.rs create mode 100644 crates/goose-cli/src/commands/bench.rs diff --git a/benchmark/goosebench/goosebench/bench.py b/benchmark/goosebench/goosebench/bench.py new file mode 100644 index 000000000..7566ea43e --- /dev/null +++ b/benchmark/goosebench/goosebench/bench.py @@ -0,0 +1,74 @@ +import os +import subprocess +import tempfile +import time +from typing import Optional, List + +from goosebench.main import console, EXTENSION_PROMPTS + + +class Bench: + def __init__(self): + self.error_log = [] + + def log_error(self, provider: str, model: str, extension: str, error: str) -> None: + """Log an error message.""" + self.error_log.append( + f"Provider: {provider}, Model: {model}, Extension: {extension}\n{error}\n" + ) + + def evaluate(self, + provider: str, + model: str, + extension: str, + prompt: str, + follow_ups: Optional[List[str]] = None) -> None: + """Run a single test with the given parameters using pexpect.""" + console.print(f"Testing: {provider}/{model} with {extension}", style="info") + console.print(f"Prompt: {prompt}", style="info") + + follow_ups = follow_ups or [] + + # Create temporary file for prompt + with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp: + temp.write(prompt) + temp_path = temp.name + + try: + # Run goose with timeout + cmd = ['goose', 'run', '--with-builtin', extension, '-t', prompt] + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=30 + ) + + if result.returncode != 0: + self.log_error(provider, model, extension, + result.stdout + result.stderr) + console.print("✗ Test failed", style="error") + + else: + console.print("✓ Test passed") + + except subprocess.TimeoutExpired: + self.log_error(provider, model, extension, + "Test timed out after 30 seconds") + console.print("✗ Test timed out", style="error") + except Exception as e: + self.log_error(provider, model, extension, str(e)) + console.print("✗ Test failed with unexpected error", style="error") + finally: + os.unlink(temp_path) + + def _run_serial(self, provider: str, model: str, extension: str) -> None: + prompts = EXTENSION_PROMPTS.get(extension, []) + for prompt in prompts: + self.evaluate(provider, model, extension, prompt) + time.sleep(2) # brief pause between tests + + def test_extension(self, provider: str, model: str, extension: str) -> None: + """Test all prompts for a given extension.""" + console.rule(f"Testing extension: {extension}") + return self._run_serial(provider, model, extension) diff --git a/benchmark/goosebench/goosebench/evaluate_tools/__init__.py b/benchmark/goosebench/goosebench/extensions/__init__.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/__init__.py rename to benchmark/goosebench/goosebench/extensions/__init__.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/__init__.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/__init__.py rename to benchmark/goosebench/goosebench/extensions/computercontroller_tool/__init__.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/automation_script.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/automation_script.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/automation_script.py rename to benchmark/goosebench/goosebench/extensions/computercontroller_tool/automation_script.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/cache.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/cache.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/cache.py rename to benchmark/goosebench/goosebench/extensions/computercontroller_tool/cache.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/computer_control.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/computer_control.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/computer_control.py rename to benchmark/goosebench/goosebench/extensions/computercontroller_tool/computer_control.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_scrape.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_scrape.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_scrape.py rename to benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_scrape.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_search.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_search.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_search.py rename to benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_search.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/developer_tool/__init__.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/developer_tool/__init__.py rename to benchmark/goosebench/goosebench/extensions/developer_tool/__init__.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/list_windows.py b/benchmark/goosebench/goosebench/extensions/developer_tool/list_windows.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/developer_tool/list_windows.py rename to benchmark/goosebench/goosebench/extensions/developer_tool/list_windows.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/screen_capture.py b/benchmark/goosebench/goosebench/extensions/developer_tool/screen_capture.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/developer_tool/screen_capture.py rename to benchmark/goosebench/goosebench/extensions/developer_tool/screen_capture.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/shell.py b/benchmark/goosebench/goosebench/extensions/developer_tool/shell.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/developer_tool/shell.py rename to benchmark/goosebench/goosebench/extensions/developer_tool/shell.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/text_editor.py b/benchmark/goosebench/goosebench/extensions/developer_tool/text_editor.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/developer_tool/text_editor.py rename to benchmark/goosebench/goosebench/extensions/developer_tool/text_editor.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/memory_tool/__init__.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/__init__.py rename to benchmark/goosebench/goosebench/extensions/memory_tool/__init__.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remember_memory.py b/benchmark/goosebench/goosebench/extensions/memory_tool/remember_memory.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remember_memory.py rename to benchmark/goosebench/goosebench/extensions/memory_tool/remember_memory.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_memory_category.py b/benchmark/goosebench/goosebench/extensions/memory_tool/remove_memory_category.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_memory_category.py rename to benchmark/goosebench/goosebench/extensions/memory_tool/remove_memory_category.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_specific_memory.py b/benchmark/goosebench/goosebench/extensions/memory_tool/remove_specific_memory.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_specific_memory.py rename to benchmark/goosebench/goosebench/extensions/memory_tool/remove_specific_memory.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/retrieve_memories.py b/benchmark/goosebench/goosebench/extensions/memory_tool/retrieve_memories.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/memory_tool/retrieve_memories.py rename to benchmark/goosebench/goosebench/extensions/memory_tool/retrieve_memories.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/not_used/__init__.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/__init__.py rename to benchmark/goosebench/goosebench/extensions/not_used/__init__.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/__init__.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/memory_tool/__init__.py rename to benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/__init__.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_read.py b/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_read.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_read.py rename to benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_read.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_search.py b/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_search.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_search.py rename to benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_search.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/__init__.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/__init__.py rename to benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/__init__.py diff --git a/benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/jetbrains.py b/benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/jetbrains.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/jetbrains.py rename to benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/jetbrains.py diff --git a/benchmark/goosebench/goosebench/extensions/tutorial_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/tutorial_tool/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/tutorial.py b/benchmark/goosebench/goosebench/extensions/tutorial_tool/tutorial.py similarity index 100% rename from benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/tutorial.py rename to benchmark/goosebench/goosebench/extensions/tutorial_tool/tutorial.py diff --git a/benchmark/goosebench/goosebench/main.py b/benchmark/goosebench/goosebench/main.py index ce6bf4b69..d281412b3 100755 --- a/benchmark/goosebench/goosebench/main.py +++ b/benchmark/goosebench/goosebench/main.py @@ -1,10 +1,4 @@ #!/usr/bin/env python3 -import dataclasses -import os -import subprocess -import tempfile -import time -from enum import Enum from typing import List, Optional import typer @@ -12,6 +6,8 @@ from rich.theme import Theme from typing_extensions import Annotated +from goosebench.bench import Bench + # Initialize typer app and rich console app = typer.Typer(help="Goose CLI Integration Tests") console = Console(theme=Theme({ @@ -21,24 +17,6 @@ "success": "green" })) - -# Define workflow types -class Workflow(str, Enum): - SERIAL = "serial" - CONVERSATIONAL = "conversational" - - -@dataclasses.dataclass -class Topic: - initial_prompt: str - follow_ups: List[str] - - -@dataclasses.dataclass -class Conversation: - topics: List[Topic] - - # Extension configurations EXTENSIONS = ['developer', 'computercontroller', 'google_drive', 'memory'] @@ -62,95 +40,6 @@ class Conversation: ] } -CONV_EXTENSION_PROMPTS = { - k: Conversation(topics=[ - Topic(val, ["summarize"]) - for val in v - ]) - for k, v in EXTENSION_PROMPTS.items() -} - - -class Bench: - def __init__(self, workflow: Workflow): - self.error_log = [] - self.workflow = workflow - - def log_error(self, provider: str, model: str, extension: str, error: str) -> None: - """Log an error message.""" - self.error_log.append( - f"Provider: {provider}, Model: {model}, Extension: {extension}\n{error}\n" - ) - - def evaluate(self, - provider: str, - model: str, - extension: str, - prompt: str, - follow_ups: Optional[List[str]] = None) -> None: - """Run a single test with the given parameters using pexpect.""" - console.print(f"Testing: {provider}/{model} with {extension}", style="info") - console.print(f"Prompt: {prompt}", style="info") - console.print(f"Workflow: {self.workflow.value}", style="info") - - follow_ups = follow_ups or [] - - # Create temporary file for prompt - with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp: - temp.write(prompt) - temp_path = temp.name - - try: - # Run goose with timeout - cmd = ['goose', 'run', '--with-builtin', extension, '-t', prompt] - result = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=30 - ) - - if result.returncode != 0: - self.log_error(provider, model, extension, - result.stdout + result.stderr) - console.print("✗ Test failed", style="error") - - else: - console.print("✓ Test passed") - - except subprocess.TimeoutExpired: - self.log_error(provider, model, extension, - "Test timed out after 30 seconds") - console.print("✗ Test timed out", style="error") - except Exception as e: - self.log_error(provider, model, extension, str(e)) - console.print("✗ Test failed with unexpected error", style="error") - finally: - os.unlink(temp_path) - - def _run_serial(self, provider: str, model: str, extension: str) -> None: - prompts = EXTENSION_PROMPTS.get(extension, []) - for prompt in prompts: - self.evaluate(provider, model, extension, prompt) - time.sleep(2) # brief pause between tests - - def _run_conversational(self, provider: str, model: str, extension: str) -> None: - conv = CONV_EXTENSION_PROMPTS.get(extension, []) - for t in conv.topics: - self.evaluate( - provider, model, extension, t.initial_prompt, t.follow_ups - ) - time.sleep(2) # brief pause between tests - - def test_extension(self, provider: str, model: str, extension: str) -> None: - """Test all prompts for a given extension.""" - console.rule(f"Testing extension: {extension}") - - if self.workflow == Workflow.CONVERSATIONAL: - return self._run_conversational(provider, model, extension) - - return self._run_serial(provider, model, extension) - def parse_provider_model(ctx: typer.Context, provider_models: List[str]) -> List[ tuple[str, str]]: @@ -177,13 +66,6 @@ def main( help="Provider and model in format 'provider:model' or 'provider:model1,model2'" ) ] = None, - workflow: Annotated[ - Workflow, - typer.Option( - '--workflow', '-w', - help="Workflow type: serial or conversational" - ) - ] = Workflow.SERIAL, verbose: Annotated[ bool, typer.Option('--verbose', '-v', help="Enable verbose output") @@ -194,15 +76,14 @@ def main( Example usage: - python main.py # Uses default: databricks:goose with serial workflow + python main.py # Uses default: databricks:goose python main.py -pm anthropic:claude python main.py -pm anthropic:claude,claude2 python main.py -pm anthropic:claude -pm databricks:goose - python main.py --workflow conversational # Use conversational workflow """ console.print("Starting Goose CLI Integration Tests", style="bold") - runner = Bench(workflow) + runner = Bench() # Use default if no provider-models specified if not provider_models: diff --git a/crates/goose-bench/Cargo.toml b/crates/goose-bench/Cargo.toml new file mode 100644 index 000000000..69189bf20 --- /dev/null +++ b/crates/goose-bench/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "goose-bench" +version.workspace = true +edition.workspace = true +authors.workspace = true +license.workspace = true +repository.workspace = true +description.workspace = true + + +[dependencies] +anyhow = "1.0" +paste = "1.0" +ctor = "0.2.7" + +[target.'cfg(target_os = "windows")'.dependencies] +winapi = { version = "0.3", features = ["wincred"] } + +#[[bench]] +#name = "tokenization_benchmark" +#harness = false diff --git a/crates/goose-bench/src/eval_suites/evaluation.rs b/crates/goose-bench/src/eval_suites/evaluation.rs new file mode 100644 index 000000000..31532e882 --- /dev/null +++ b/crates/goose-bench/src/eval_suites/evaluation.rs @@ -0,0 +1,36 @@ +use anyhow::Result; + + +pub type Model = (String, String); +pub type Extension = String; + +#[derive(Debug)] +pub enum EvaluationMetric { + Integer(i64), + Float(f64), + String(String), + Boolean(bool), +} + +#[derive(Debug)] +pub struct EvaluationReport { + metrics: Vec, +} + +impl Default for EvaluationReport { + fn default() -> Self { + Self { metrics: vec![] } + } +} + +impl EvaluationReport { + pub fn new(metrics: Vec) -> Self { + EvaluationReport { metrics } + } +} + +pub trait Evaluation: Send + Sync { + fn run(&self) -> Result; + fn models(&self) -> Vec; + fn extensions(&self) -> Vec; +} diff --git a/crates/goose-bench/src/eval_suites/factory.rs b/crates/goose-bench/src/eval_suites/factory.rs new file mode 100644 index 000000000..9bcb233da --- /dev/null +++ b/crates/goose-bench/src/eval_suites/factory.rs @@ -0,0 +1,60 @@ +use std::collections::HashMap; +use std::sync::{OnceLock, RwLock}; + +pub use super::Evaluation; + +type EvaluationConstructor = Box Box + Send + Sync>; + +// Use std::sync::RwLock for interior mutability +static EVALUATION_REGISTRY: OnceLock>> = OnceLock::new(); + +/// Initialize the registry if it hasn't been initialized +fn registry() -> &'static RwLock> { + EVALUATION_REGISTRY.get_or_init(|| RwLock::new(HashMap::new())) +} + +/// Register a new evaluation version +pub fn register_evaluation( + version: &'static str, + constructor: impl Fn() -> Box + Send + Sync + 'static, +) { + let registry = registry(); + if let Ok(mut map) = registry.write() { + map.insert(version, Box::new(constructor)); + } +} + +pub struct EvaluationFactory; + +impl EvaluationFactory { + pub fn create(version: &str) -> Option> { + let registry = registry(); + let map = registry + .read() + .expect("Failed to read the benchmark evaluation registry."); + let constructor = map.get(version)?; + Some(constructor()) + } + + pub fn available_evaluations() -> Vec<&'static str> { + registry() + .read() + .map(|map| map.keys().copied().collect()) + .unwrap_or_default() + } +} + +#[macro_export] +macro_rules! register_evaluation { + ($version:expr, $evaluation_type:ty) => { + paste::paste! { + #[ctor::ctor] + #[allow(non_snake_case)] + fn [<__register_evaluation_ $version>]() { + $crate::eval_suites::factory::register_evaluation($version, || { + Box::new(<$evaluation_type>::new()) + }); + } + } + }; +} diff --git a/crates/goose-bench/src/eval_suites/flappy_bird.rs b/crates/goose-bench/src/eval_suites/flappy_bird.rs new file mode 100644 index 000000000..eb75818b3 --- /dev/null +++ b/crates/goose-bench/src/eval_suites/flappy_bird.rs @@ -0,0 +1,29 @@ +use crate::eval_suites::{Evaluation, Extension, Model}; +use crate::eval_suites::evaluation::EvaluationReport; +use crate::register_evaluation; + +pub struct FlappyBird {} + +impl FlappyBird { + fn new() -> FlappyBird { + FlappyBird {} + } +} + +impl Evaluation for FlappyBird { + fn run(&self) -> anyhow::Result { + let mut metrics = Vec::new(); + + Ok(EvaluationReport::new(metrics)) + } + + fn models(&self) -> Vec { + todo!() + } + + fn extensions(&self) -> Vec { + todo!() + } +} + +register_evaluation!("flappy_bird", FlappyBird); diff --git a/crates/goose-bench/src/eval_suites/mod.rs b/crates/goose-bench/src/eval_suites/mod.rs new file mode 100644 index 000000000..1f7d2992d --- /dev/null +++ b/crates/goose-bench/src/eval_suites/mod.rs @@ -0,0 +1,5 @@ +mod factory; +mod flappy_bird; +mod evaluation; +pub use evaluation::*; +pub use factory::{register_evaluation, EvaluationFactory}; diff --git a/crates/goose-bench/src/lib.rs b/crates/goose-bench/src/lib.rs new file mode 100644 index 000000000..0c41da7e8 --- /dev/null +++ b/crates/goose-bench/src/lib.rs @@ -0,0 +1 @@ +pub mod eval_suites; \ No newline at end of file diff --git a/crates/goose-cli/Cargo.toml b/crates/goose-cli/Cargo.toml index 41cb85a60..62dfd85a6 100644 --- a/crates/goose-cli/Cargo.toml +++ b/crates/goose-cli/Cargo.toml @@ -13,6 +13,7 @@ path = "src/main.rs" [dependencies] goose = { path = "../goose" } +goose-bench = { path = "../goose-bench" } goose-mcp = { path = "../goose-mcp" } mcp-client = { path = "../mcp-client" } mcp-server = { path = "../mcp-server" } diff --git a/crates/goose-cli/src/commands/bench.rs b/crates/goose-cli/src/commands/bench.rs new file mode 100644 index 000000000..3cb1825cf --- /dev/null +++ b/crates/goose-cli/src/commands/bench.rs @@ -0,0 +1,44 @@ +use goose::message::Message; +use crate::session::build_session; +use goose_bench::eval_suites::{EvaluationFactory, EvaluationReport}; + +// use std::error::Error; +// build custom run-func that constructs agent from session, then uses custom loop to manage collecting and returning agent messages. +async fn foo(ext) { + let extension = Vec::new(); // todo + let name = None; + let mut session = build_session(name, false, extension, ext).await; + let _ = session.headless_start(prompt).await; +} + +pub async fn headless_start(&mut self, initial_message: String) -> anyhow::Result<()> { + self.messages.push(Message::user().with_text(&initial_message)); + self.process_agent_response().await?; + Ok(()) +} + +pub async fn run_benchmark() { + let mut all_reports: Vec = vec![]; + + for eval in EvaluationFactory::available_evaluations() { + let evaluation = match EvaluationFactory::create(&eval) { + Some(evaluation) => evaluation, + None => continue, + }; + + for (provider, model) in evaluation.models() { + for ext in evaluation.extensions() { + let report = match evaluation.run() { + Ok(report) => report, + _ => continue, + }; + + // print report? + all_reports.push(report); + } + } + } + + // let summary = report_summary(all_reports)? + // print summary? +} diff --git a/crates/goose-cli/src/commands/mod.rs b/crates/goose-cli/src/commands/mod.rs index e9ed50ce5..7302d695e 100644 --- a/crates/goose-cli/src/commands/mod.rs +++ b/crates/goose-cli/src/commands/mod.rs @@ -1,3 +1,4 @@ pub mod agent_version; pub mod configure; pub mod mcp; +pub mod bench; diff --git a/crates/goose-cli/src/main.rs b/crates/goose-cli/src/main.rs index 5443e142c..ecd0c1215 100644 --- a/crates/goose-cli/src/main.rs +++ b/crates/goose-cli/src/main.rs @@ -9,6 +9,7 @@ use goose_cli::commands::mcp::run_server; use goose_cli::logging::setup_logging; use goose_cli::session::build_session; use std::io::{self, Read}; +use goose_cli::commands::bench::run_benchmark; #[derive(Parser)] #[command(author, version, display_name = "", about, long_about = None)] @@ -140,6 +141,9 @@ enum Command { /// List available agent versions Agents(AgentCommand), + + /// Run benchmark suite + Bench {}, } #[derive(clap::ValueEnum, Clone, Debug)] @@ -207,6 +211,10 @@ async fn main() -> Result<()> { cmd.run()?; return Ok(()); } + Some(Command::Bench {}) => { + run_benchmark().await; + return Ok(()); + } None => { Cli::command().print_help()?; println!(); From 49dc81fd95ec6f6b5477a69c34695aab394a182a Mon Sep 17 00:00:00 2001 From: Marcelle Bonterre Date: Tue, 18 Feb 2025 22:41:10 -0500 Subject: [PATCH 04/10] removed py benchmark proj --- .../workflows/install-and-run-goose.yml | 33 -- benchmark/goosebench/.gitignore | 172 ---------- benchmark/goosebench/archive/test.sh | 209 ------------ benchmark/goosebench/bin/.python3@3.11.pkg | 1 - benchmark/goosebench/bin/README.hermit.md | 7 - benchmark/goosebench/bin/activate-hermit | 21 -- benchmark/goosebench/bin/activate-hermit.fish | 24 -- benchmark/goosebench/bin/hermit | 43 --- benchmark/goosebench/bin/hermit.hcl | 2 - benchmark/goosebench/bin/pip | 1 - benchmark/goosebench/bin/pip3 | 1 - benchmark/goosebench/bin/pip3.11 | 1 - benchmark/goosebench/bin/pydoc3 | 1 - benchmark/goosebench/bin/pydoc3.11 | 1 - benchmark/goosebench/bin/python | 1 - benchmark/goosebench/bin/python3 | 1 - benchmark/goosebench/bin/python3-config | 1 - benchmark/goosebench/bin/python3.11 | 1 - benchmark/goosebench/bin/python3.11-config | 1 - benchmark/goosebench/config.yaml | 7 - benchmark/goosebench/goosebench/__init__.py | 0 benchmark/goosebench/goosebench/bench.py | 74 ----- .../goosebench/extensions/__init__.py | 0 .../computercontroller_tool/__init__.py | 0 .../automation_script.py | 29 -- .../computercontroller_tool/cache.py | 29 -- .../computer_control.py | 29 -- .../computercontroller_tool/web_scrape.py | 29 -- .../computercontroller_tool/web_search.py | 29 -- .../extensions/developer_tool/__init__.py | 0 .../extensions/developer_tool/list_windows.py | 29 -- .../developer_tool/screen_capture.py | 29 -- .../extensions/developer_tool/shell.py | 29 -- .../extensions/developer_tool/text_editor.py | 29 -- .../extensions/memory_tool/__init__.py | 0 .../extensions/memory_tool/remember_memory.py | 29 -- .../memory_tool/remove_memory_category.py | 29 -- .../memory_tool/remove_specific_memory.py | 29 -- .../memory_tool/retrieve_memories.py | 29 -- .../extensions/not_used/__init__.py | 0 .../not_used/google_drive_tool/__init__.py | 0 .../google_drive_tool/google_drive_read.py | 29 -- .../google_drive_tool/google_drive_search.py | 29 -- .../not_used/jetbrains_tool/__init__.py | 0 .../not_used/jetbrains_tool/jetbrains.py | 29 -- .../extensions/tutorial_tool/__init__.py | 0 .../extensions/tutorial_tool/tutorial.py | 29 -- benchmark/goosebench/goosebench/main.py | 118 ------- benchmark/goosebench/poetry.lock | 310 ------------------ benchmark/goosebench/pyproject.toml | 28 -- 50 files changed, 1552 deletions(-) delete mode 100644 benchmark/goosebench/.github/workflows/install-and-run-goose.yml delete mode 100644 benchmark/goosebench/.gitignore delete mode 100755 benchmark/goosebench/archive/test.sh delete mode 120000 benchmark/goosebench/bin/.python3@3.11.pkg delete mode 100644 benchmark/goosebench/bin/README.hermit.md delete mode 100755 benchmark/goosebench/bin/activate-hermit delete mode 100755 benchmark/goosebench/bin/activate-hermit.fish delete mode 100755 benchmark/goosebench/bin/hermit delete mode 100644 benchmark/goosebench/bin/hermit.hcl delete mode 120000 benchmark/goosebench/bin/pip delete mode 120000 benchmark/goosebench/bin/pip3 delete mode 120000 benchmark/goosebench/bin/pip3.11 delete mode 120000 benchmark/goosebench/bin/pydoc3 delete mode 120000 benchmark/goosebench/bin/pydoc3.11 delete mode 120000 benchmark/goosebench/bin/python delete mode 120000 benchmark/goosebench/bin/python3 delete mode 120000 benchmark/goosebench/bin/python3-config delete mode 120000 benchmark/goosebench/bin/python3.11 delete mode 120000 benchmark/goosebench/bin/python3.11-config delete mode 100644 benchmark/goosebench/config.yaml delete mode 100644 benchmark/goosebench/goosebench/__init__.py delete mode 100644 benchmark/goosebench/goosebench/bench.py delete mode 100644 benchmark/goosebench/goosebench/extensions/__init__.py delete mode 100644 benchmark/goosebench/goosebench/extensions/computercontroller_tool/__init__.py delete mode 100644 benchmark/goosebench/goosebench/extensions/computercontroller_tool/automation_script.py delete mode 100644 benchmark/goosebench/goosebench/extensions/computercontroller_tool/cache.py delete mode 100644 benchmark/goosebench/goosebench/extensions/computercontroller_tool/computer_control.py delete mode 100644 benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_scrape.py delete mode 100644 benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_search.py delete mode 100644 benchmark/goosebench/goosebench/extensions/developer_tool/__init__.py delete mode 100644 benchmark/goosebench/goosebench/extensions/developer_tool/list_windows.py delete mode 100644 benchmark/goosebench/goosebench/extensions/developer_tool/screen_capture.py delete mode 100644 benchmark/goosebench/goosebench/extensions/developer_tool/shell.py delete mode 100644 benchmark/goosebench/goosebench/extensions/developer_tool/text_editor.py delete mode 100644 benchmark/goosebench/goosebench/extensions/memory_tool/__init__.py delete mode 100644 benchmark/goosebench/goosebench/extensions/memory_tool/remember_memory.py delete mode 100644 benchmark/goosebench/goosebench/extensions/memory_tool/remove_memory_category.py delete mode 100644 benchmark/goosebench/goosebench/extensions/memory_tool/remove_specific_memory.py delete mode 100644 benchmark/goosebench/goosebench/extensions/memory_tool/retrieve_memories.py delete mode 100644 benchmark/goosebench/goosebench/extensions/not_used/__init__.py delete mode 100644 benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/__init__.py delete mode 100644 benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_read.py delete mode 100644 benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_search.py delete mode 100644 benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/__init__.py delete mode 100644 benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/jetbrains.py delete mode 100644 benchmark/goosebench/goosebench/extensions/tutorial_tool/__init__.py delete mode 100644 benchmark/goosebench/goosebench/extensions/tutorial_tool/tutorial.py delete mode 100755 benchmark/goosebench/goosebench/main.py delete mode 100644 benchmark/goosebench/poetry.lock delete mode 100644 benchmark/goosebench/pyproject.toml diff --git a/benchmark/goosebench/.github/workflows/install-and-run-goose.yml b/benchmark/goosebench/.github/workflows/install-and-run-goose.yml deleted file mode 100644 index 4b13b97e6..000000000 --- a/benchmark/goosebench/.github/workflows/install-and-run-goose.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Install and Run Goose - -on: - push: - branches: - - main # Or your preferred branch - -jobs: - install-and-run-goose: - runs-on: ubuntu-latest - steps: - - name: Checkout Repository - uses: actions/checkout@v2 - - - name: Set up Goose Environment - run: | - echo "GOOSE_BIN_DIR=\$HOME/.local/bin" >> $GITHUB_ENV - echo "CONFIGURE=false" >> $GITHUB_ENV - echo "DATABRICKS_HOST=https://block-lakehouse-production.cloud.databricks.com" >> $GITHUB_ENV - - - name: Install Goose - run: | - curl -fsSL https://github.com/block/goose/releases/download/stable/download_cli.sh | bash - - - name: Configure Goose - run: | - mkdir -p ~/.config/goose/ - cp $GITHUB_WORKSPACE/test.sh ~/test.sh - alias goose=$HOME/.local/bin/goose - - - name: Run Goose Command - run: | - ./test.sh -p databricks -m goose diff --git a/benchmark/goosebench/.gitignore b/benchmark/goosebench/.gitignore deleted file mode 100644 index c4d4e4a7a..000000000 --- a/benchmark/goosebench/.gitignore +++ /dev/null @@ -1,172 +0,0 @@ -.playpen/*.log -.hermit/ -*.ipynb -.idea/ -.vscode/ -.goose/ -run_slack_app-backup.sh -temp_creds.json -temp_merged_policy.json -token_google_docs.json -creds-*.json -assume_role_policy.json -finetune_data/content/ -finetune_data/datasets/ -insights/messages/ -insights/env.sh -src/qai_server/slack_app/modes/generated_images/ -src/qai_server/ingest/token_google_docs.json -src/qai_server/ingest/content/ -src/qai_server/ingest/ingest_project/**/content/ -src/qai_server/ingest/.env -src/qai_server/evals/messages/ -src/qai_server/evals/annotations_*.jsonl -projects/ingest_docs/ingest_docs/content/ -projects/ingest_docs/content/ -projects/ingest_docs/notion_example*.json -run_gdoc.sh - -# Datafiles -*.csv -*.gz -*.h5 -*.pkl -*.pk -*.html -*.log -*.db -*.db-journal - - -## From: https://github.com/github/gitignore/blob/main/Python.gitignore -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -.python-version -.pdm.toml -.pdm-python -.pdm-build/ - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ \ No newline at end of file diff --git a/benchmark/goosebench/archive/test.sh b/benchmark/goosebench/archive/test.sh deleted file mode 100755 index b53752e8d..000000000 --- a/benchmark/goosebench/archive/test.sh +++ /dev/null @@ -1,209 +0,0 @@ -#!/bin/bash - -# NOTE: MacOS ships with Bash 3.2 by default, which does NOT support declare -A for associative arrays. -# This script uses standard Bash arrays to remain compatible. - -# Color codes for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -NC='\033[0m' # No Color -BOLD='\033[1m' - -# Initialize error log array -ERROR_LOG=() - -#---------------------------------------------------------------------------# -# EXTENSIONS -#---------------------------------------------------------------------------# -# We'll define each extension in an array of prompts. Then we define an array -# of extension names, so we can iterate over them. -#---------------------------------------------------------------------------# -EXTENSIONS=(developer computercontroller google_drive memory) - -developer_prompts=( - "List the contents of the current directory." - "Create a new file called test.txt with the content 'Hello, World!'" - "Read the contents of test.txt" -) - -computercontroller_prompts=( - "What are the headlines on hackernews? Organize the list into categories." - "Make a ding sound" -) - -google_drive_prompts=( - "List the files in my Google Drive." - "Search for documents containing 'meeting notes'" -) - -memory_prompts=( - "Save this fact: The capital of France is Paris." - "What is the capital of France?" -) - - -#---------------------------------------------------------------------------# -# LOGGING FUNCTION -#---------------------------------------------------------------------------# -log_error() { - local provider=$1 - local model=$2 - local extension=$3 - local error=$4 - ERROR_LOG+=("${RED}[ERROR]${NC} Provider: $provider, Model: $model, Extension: $extension\n$error\n") -} - -#---------------------------------------------------------------------------# -# MAIN TEST FUNCTION -#---------------------------------------------------------------------------# -run_test() { - local provider=$1 - local model=$2 - local extension=$3 - local prompt=$4 - local timeout_seconds=30 - - echo -e "${YELLOW}Testing:${NC} $provider/$model with $extension" - echo -e "${YELLOW}Prompt:${NC} $prompt" - - local temp_file - temp_file="$(mktemp)" - echo "$prompt" > "$temp_file" - - # Run goose with timeout - timeout $timeout_seconds goose run \ - --with-builtin "$extension" \ - -t "$(cat "$temp_file")" 2>&1 | tee test_output.log - - # Check for errors - if [ ${PIPESTATUS[0]} -ne 0 ]; then - log_error "$provider" "$model" "$extension" "$(cat test_output.log)" - echo -e "${RED}✗ Test failed${NC}" - else - echo -e "${GREEN}✓ Test passed${NC}" - fi - - rm -f "$temp_file" test_output.log -} - -#---------------------------------------------------------------------------# -# TESTING EXTENSION (ITERATING OVER PROMPTS) -#---------------------------------------------------------------------------# -test_extension() { - local provider=$1 - local model=$2 - local extension=$3 - - echo -e "\n${BOLD}Testing extension: $extension${NC}" - - # We'll build the array name dynamically, e.g. developer_prompts, memory_prompts, etc. - # Then we retrieve that array's contents via indirect expansion. - local arr_name="${extension}_prompts[@]" - local prompts=("${!arr_name}") - - for prompt in "${prompts[@]}"; do - run_test "$provider" "$model" "$extension" "$prompt" - sleep 2 # brief pause - done -} - -#---------------------------------------------------------------------------# -# USAGE FUNCTION -#---------------------------------------------------------------------------# -usage() { - echo "Usage: $0 [-p provider -m model[,model2,model3]...]..." - echo " -p provider : Provider to use" - echo " -m models : Comma-separated list of models to use with the provider" - echo " -h : Show this help message" - echo "" - echo "Examples:" - echo " $0 # Uses default: databricks/goose" - echo " $0 -p anthropic -m claude # Single provider/model" - echo " $0 -p anthropic -m claude,claude2 # One provider, multiple models" - echo " $0 -p anthropic -m claude -p databricks -m goose # Multiple providers" - echo " $0 -p anthropic -m claude,claude2 -p databricks -m goose,goose2 # Multiple of both" - exit 1 -} - -#---------------------------------------------------------------------------# -# MAIN WORKFLOW -#---------------------------------------------------------------------------# -main() { - # Arrays to store provider/model combinations - declare -a provider_model_pairs=() - local current_provider="" - - # Parse command line arguments - while [[ $# -gt 0 ]]; do - case "$1" in - -h) - usage - ;; - -p) - shift - if [[ -z "$1" ]]; then - echo "Error: -p requires a provider name" - usage - fi - current_provider="$1" - shift - ;; - -m) - if [[ -z "$current_provider" ]]; then - echo "Error: -m must follow a -p option" - usage - fi - shift - if [[ -z "$1" ]]; then - echo "Error: -m requires at least one model name" - usage - fi - # Split comma-separated models and create provider:model pairs - IFS=',' read -ra models <<< "$1" - for model in "${models[@]}"; do - provider_model_pairs+=("$current_provider:$model") - done - shift - ;; - *) - echo "Error: Unknown option $1" - usage - ;; - esac - done - - # If no providers/models specified, use defaults - if [ ${#provider_model_pairs[@]} -eq 0 ]; then - provider_model_pairs=("databricks:goose") - fi - - echo -e "${BOLD}Starting Goose CLI Integration Tests${NC}" - - # Iterate through provider/model pairs - for pair in "${provider_model_pairs[@]}"; do - # Split the pair into provider and model - IFS=':' read -r provider model <<< "$pair" - - echo -e "\n${BOLD}Testing provider: $provider${NC}" - echo -e "${BOLD}Testing model: $model${NC}" - - # Now test each extension for this provider/model pair - for extension in "${EXTENSIONS[@]}"; do - test_extension "$provider" "$model" "$extension" - done - done - - # Print summary - if [ ${#ERROR_LOG[@]} -eq 0 ]; then - echo -e "\n${GREEN}All tests completed successfully!${NC}" - else - echo -e "\n${RED}Test Summary - Errors Found:${NC}" - echo -e "================================" - printf '%b\n' "${ERROR_LOG[@]}" - exit 1 - fi -} - -# Call main with all arguments -main "$@" \ No newline at end of file diff --git a/benchmark/goosebench/bin/.python3@3.11.pkg b/benchmark/goosebench/bin/.python3@3.11.pkg deleted file mode 120000 index 383f4511d..000000000 --- a/benchmark/goosebench/bin/.python3@3.11.pkg +++ /dev/null @@ -1 +0,0 @@ -hermit \ No newline at end of file diff --git a/benchmark/goosebench/bin/README.hermit.md b/benchmark/goosebench/bin/README.hermit.md deleted file mode 100644 index e889550ba..000000000 --- a/benchmark/goosebench/bin/README.hermit.md +++ /dev/null @@ -1,7 +0,0 @@ -# Hermit environment - -This is a [Hermit](https://github.com/cashapp/hermit) bin directory. - -The symlinks in this directory are managed by Hermit and will automatically -download and install Hermit itself as well as packages. These packages are -local to this environment. diff --git a/benchmark/goosebench/bin/activate-hermit b/benchmark/goosebench/bin/activate-hermit deleted file mode 100755 index fe28214d3..000000000 --- a/benchmark/goosebench/bin/activate-hermit +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# This file must be used with "source bin/activate-hermit" from bash or zsh. -# You cannot run it directly -# -# THIS FILE IS GENERATED; DO NOT MODIFY - -if [ "${BASH_SOURCE-}" = "$0" ]; then - echo "You must source this script: \$ source $0" >&2 - exit 33 -fi - -BIN_DIR="$(dirname "${BASH_SOURCE[0]:-${(%):-%x}}")" -if "${BIN_DIR}/hermit" noop > /dev/null; then - eval "$("${BIN_DIR}/hermit" activate "${BIN_DIR}/..")" - - if [ -n "${BASH-}" ] || [ -n "${ZSH_VERSION-}" ]; then - hash -r 2>/dev/null - fi - - echo "Hermit environment $("${HERMIT_ENV}"/bin/hermit env HERMIT_ENV) activated" -fi diff --git a/benchmark/goosebench/bin/activate-hermit.fish b/benchmark/goosebench/bin/activate-hermit.fish deleted file mode 100755 index 0367d2331..000000000 --- a/benchmark/goosebench/bin/activate-hermit.fish +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env fish - -# This file must be sourced with "source bin/activate-hermit.fish" from Fish shell. -# You cannot run it directly. -# -# THIS FILE IS GENERATED; DO NOT MODIFY - -if status is-interactive - set BIN_DIR (dirname (status --current-filename)) - - if "$BIN_DIR/hermit" noop > /dev/null - # Source the activation script generated by Hermit - "$BIN_DIR/hermit" activate "$BIN_DIR/.." | source - - # Clear the command cache if applicable - functions -c > /dev/null 2>&1 - - # Display activation message - echo "Hermit environment $($HERMIT_ENV/bin/hermit env HERMIT_ENV) activated" - end -else - echo "You must source this script: source $argv[0]" >&2 - exit 33 -end diff --git a/benchmark/goosebench/bin/hermit b/benchmark/goosebench/bin/hermit deleted file mode 100755 index 6dbd60cce..000000000 --- a/benchmark/goosebench/bin/hermit +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -# -# THIS FILE IS GENERATED; DO NOT MODIFY - -set -eo pipefail - -export HERMIT_USER_HOME=~ - -if [ -z "${HERMIT_STATE_DIR}" ]; then - case "$(uname -s)" in - Darwin) - export HERMIT_STATE_DIR="${HERMIT_USER_HOME}/Library/Caches/hermit" - ;; - Linux) - export HERMIT_STATE_DIR="${XDG_CACHE_HOME:-${HERMIT_USER_HOME}/.cache}/hermit" - ;; - esac -fi - -export HERMIT_DIST_URL="${HERMIT_DIST_URL:-https://d1abdrezunyhdp.cloudfront.net/square}" -HERMIT_CHANNEL="$(basename "${HERMIT_DIST_URL}")" -export HERMIT_CHANNEL -export HERMIT_EXE=${HERMIT_EXE:-${HERMIT_STATE_DIR}/pkg/hermit@${HERMIT_CHANNEL}/hermit} - -if [ ! -x "${HERMIT_EXE}" ]; then - echo "Bootstrapping ${HERMIT_EXE} from ${HERMIT_DIST_URL}" 1>&2 - INSTALL_SCRIPT="$(mktemp)" - # This value must match that of the install script - INSTALL_SCRIPT_SHA256="d9774f75517f9a6d9e371daae9991cdb9fbbc390101b47c3fb2f6876d9094bab" - if [ "${INSTALL_SCRIPT_SHA256}" = "BYPASS" ]; then - curl -fsSL "${HERMIT_DIST_URL}/install.sh" -o "${INSTALL_SCRIPT}" - else - # Install script is versioned by its sha256sum value - curl -fsSL "${HERMIT_DIST_URL}/install-${INSTALL_SCRIPT_SHA256}.sh" -o "${INSTALL_SCRIPT}" - # Verify install script's sha256sum - openssl dgst -sha256 "${INSTALL_SCRIPT}" | \ - awk -v EXPECTED="$INSTALL_SCRIPT_SHA256" \ - '$2!=EXPECTED {print "Install script sha256 " $2 " does not match " EXPECTED; exit 1}' - fi - /bin/bash "${INSTALL_SCRIPT}" 1>&2 -fi - -exec "${HERMIT_EXE}" --level=fatal exec "$0" -- "$@" diff --git a/benchmark/goosebench/bin/hermit.hcl b/benchmark/goosebench/bin/hermit.hcl deleted file mode 100644 index 081cbe834..000000000 --- a/benchmark/goosebench/bin/hermit.hcl +++ /dev/null @@ -1,2 +0,0 @@ -github-token-auth { -} diff --git a/benchmark/goosebench/bin/pip b/benchmark/goosebench/bin/pip deleted file mode 120000 index b78b40b15..000000000 --- a/benchmark/goosebench/bin/pip +++ /dev/null @@ -1 +0,0 @@ -.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/bin/pip3 b/benchmark/goosebench/bin/pip3 deleted file mode 120000 index b78b40b15..000000000 --- a/benchmark/goosebench/bin/pip3 +++ /dev/null @@ -1 +0,0 @@ -.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/bin/pip3.11 b/benchmark/goosebench/bin/pip3.11 deleted file mode 120000 index b78b40b15..000000000 --- a/benchmark/goosebench/bin/pip3.11 +++ /dev/null @@ -1 +0,0 @@ -.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/bin/pydoc3 b/benchmark/goosebench/bin/pydoc3 deleted file mode 120000 index b78b40b15..000000000 --- a/benchmark/goosebench/bin/pydoc3 +++ /dev/null @@ -1 +0,0 @@ -.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/bin/pydoc3.11 b/benchmark/goosebench/bin/pydoc3.11 deleted file mode 120000 index b78b40b15..000000000 --- a/benchmark/goosebench/bin/pydoc3.11 +++ /dev/null @@ -1 +0,0 @@ -.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/bin/python b/benchmark/goosebench/bin/python deleted file mode 120000 index b78b40b15..000000000 --- a/benchmark/goosebench/bin/python +++ /dev/null @@ -1 +0,0 @@ -.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/bin/python3 b/benchmark/goosebench/bin/python3 deleted file mode 120000 index b78b40b15..000000000 --- a/benchmark/goosebench/bin/python3 +++ /dev/null @@ -1 +0,0 @@ -.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/bin/python3-config b/benchmark/goosebench/bin/python3-config deleted file mode 120000 index b78b40b15..000000000 --- a/benchmark/goosebench/bin/python3-config +++ /dev/null @@ -1 +0,0 @@ -.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/bin/python3.11 b/benchmark/goosebench/bin/python3.11 deleted file mode 120000 index b78b40b15..000000000 --- a/benchmark/goosebench/bin/python3.11 +++ /dev/null @@ -1 +0,0 @@ -.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/bin/python3.11-config b/benchmark/goosebench/bin/python3.11-config deleted file mode 120000 index b78b40b15..000000000 --- a/benchmark/goosebench/bin/python3.11-config +++ /dev/null @@ -1 +0,0 @@ -.python3@3.11.pkg \ No newline at end of file diff --git a/benchmark/goosebench/config.yaml b/benchmark/goosebench/config.yaml deleted file mode 100644 index 50c26b306..000000000 --- a/benchmark/goosebench/config.yaml +++ /dev/null @@ -1,7 +0,0 @@ -GOOSE_MODEL: goose -GOOSE_PROVIDER: databricks -extensions: - developer: - enabled: true - name: developer - type: builtin diff --git a/benchmark/goosebench/goosebench/__init__.py b/benchmark/goosebench/goosebench/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/benchmark/goosebench/goosebench/bench.py b/benchmark/goosebench/goosebench/bench.py deleted file mode 100644 index 7566ea43e..000000000 --- a/benchmark/goosebench/goosebench/bench.py +++ /dev/null @@ -1,74 +0,0 @@ -import os -import subprocess -import tempfile -import time -from typing import Optional, List - -from goosebench.main import console, EXTENSION_PROMPTS - - -class Bench: - def __init__(self): - self.error_log = [] - - def log_error(self, provider: str, model: str, extension: str, error: str) -> None: - """Log an error message.""" - self.error_log.append( - f"Provider: {provider}, Model: {model}, Extension: {extension}\n{error}\n" - ) - - def evaluate(self, - provider: str, - model: str, - extension: str, - prompt: str, - follow_ups: Optional[List[str]] = None) -> None: - """Run a single test with the given parameters using pexpect.""" - console.print(f"Testing: {provider}/{model} with {extension}", style="info") - console.print(f"Prompt: {prompt}", style="info") - - follow_ups = follow_ups or [] - - # Create temporary file for prompt - with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp: - temp.write(prompt) - temp_path = temp.name - - try: - # Run goose with timeout - cmd = ['goose', 'run', '--with-builtin', extension, '-t', prompt] - result = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=30 - ) - - if result.returncode != 0: - self.log_error(provider, model, extension, - result.stdout + result.stderr) - console.print("✗ Test failed", style="error") - - else: - console.print("✓ Test passed") - - except subprocess.TimeoutExpired: - self.log_error(provider, model, extension, - "Test timed out after 30 seconds") - console.print("✗ Test timed out", style="error") - except Exception as e: - self.log_error(provider, model, extension, str(e)) - console.print("✗ Test failed with unexpected error", style="error") - finally: - os.unlink(temp_path) - - def _run_serial(self, provider: str, model: str, extension: str) -> None: - prompts = EXTENSION_PROMPTS.get(extension, []) - for prompt in prompts: - self.evaluate(provider, model, extension, prompt) - time.sleep(2) # brief pause between tests - - def test_extension(self, provider: str, model: str, extension: str) -> None: - """Test all prompts for a given extension.""" - console.rule(f"Testing extension: {extension}") - return self._run_serial(provider, model, extension) diff --git a/benchmark/goosebench/goosebench/extensions/__init__.py b/benchmark/goosebench/goosebench/extensions/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/automation_script.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/automation_script.py deleted file mode 100644 index 9c6944ec8..000000000 --- a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/automation_script.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Test cases for the automation script tool.""" - -# Prompts that should trigger valid automation script tool usage -valid_prompts = [ - "Create a shell script to sort unique lines in a file", - "Write a Ruby script to process some text data", - "Make a script to extract the second column from a CSV", - "Create a script to find pattern matches in a file", - "Write a shell script to process log files", - "Create a Ruby script for text manipulation", - "Make a script to analyze data in a text file", - "Write a script to format JSON data", - "Create a script to clean up file names", - "Write a script to extract specific data from files", -] - -# Prompts that should not trigger automation script tool usage based on tool description -invalid_prompts = [ - "Create a complex application with multiple files", - "Write a script that requires external dependencies", - "Create a script that needs a database", - "Write a GUI application", - "Create a web server application", - "Write a script that needs special system access", - "Create a script that requires third-party libraries", - "Write a script that needs network services", - "Create a distributed processing script", - "Write a script that requires system installation", -] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/cache.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/cache.py deleted file mode 100644 index be78ad8c8..000000000 --- a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/cache.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Test cases for the cache tool.""" - -# Prompts that should trigger valid cache tool usage -valid_prompts = [ - "List all cached files", - "Show me what's in the cache", - "View the content of this cached file", - "Delete this specific cached file", - "Clear all cached data", - "Show the contents of a cached file", - "Remove this file from cache", - "List the cache directory contents", - "View a cached text file", - "Delete everything from the cache", -] - -# Prompts that should not trigger cache tool usage based on tool description -invalid_prompts = [ - "Modify a cached file directly", - "Search within cached files", - "Compress the cache directory", - "Move cached files to another location", - "Change cache directory permissions", - "Reorganize cached files", - "Filter cache by file type", - "Sort cached files by size", - "Archive old cached files", - "Backup the cache directory", -] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/computer_control.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/computer_control.py deleted file mode 100644 index 3d17f19cd..000000000 --- a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/computer_control.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Test cases for the computer control tool.""" - -# Prompts that should trigger valid computer control tool usage -valid_prompts = [ - "Launch Safari and open a specific URL", - "Use AppleScript to automate Mail app", - "Click a button in the current application", - "Fill out a form in Safari", - "Control system volume using AppleScript", - "Organize files in a folder", - "Add an event to Calendar", - "Send an email using Mail app", - "Manage iTunes playlist", - "Automate document processing in Pages", -] - -# Prompts that should not trigger computer control tool usage based on tool description -invalid_prompts = [ - "Control applications that don't support AppleScript", - "Perform actions requiring root access", - "Modify system files directly", - "Access restricted system areas", - "Control non-Apple applications without AppleScript support", - "Perform actions requiring kernel modifications", - "Execute privileged system commands", - "Modify protected system settings", - "Access hardware directly", - "Control low-level system functions", -] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_scrape.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_scrape.py deleted file mode 100644 index ebc7eed5a..000000000 --- a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_scrape.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Test cases for the web scrape tool.""" - -# Prompts that should trigger valid web scrape tool usage -valid_prompts = [ - "Fetch the content from https://example.com", - "Download the HTML from this webpage", - "Get JSON data from this API endpoint", - "Save this image from the web", - "Scrape text content from this URL", - "Download this webpage as text", - "Get the JSON response from this API", - "Save this binary file from the web", - "Fetch and cache this webpage", - "Download this document as text", -] - -# Prompts that should not trigger web scrape tool usage based on tool description -invalid_prompts = [ - "Scrape a complex web application with dynamic content", - "Extract data from a JavaScript-heavy website", - "Scrape content that requires login", - "Download content from multiple pages at once", - "Extract data from a site with anti-scraping measures", - "Scrape content that requires user interaction", - "Download content from a protected API", - "Extract data from pages requiring authentication", - "Scrape content from multiple URLs simultaneously", - "Download data from a site requiring cookies", -] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_search.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_search.py deleted file mode 100644 index 556ad0ea6..000000000 --- a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_search.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Test cases for the web search tool.""" - -# Prompts that should trigger valid web search tool usage -valid_prompts = [ - "Search for information about 'Tesla'", - "Look up what 'Bitcoin' is", - "Find details about 'SpaceX'", - "Search for 'Python' programming language", - "What is 'Docker'?", - "Look up the company 'Microsoft'", - "Search for information about 'Linux'", - "Find out about 'AWS'", - "What is 'Kubernetes'?", - "Search for 'React' framework", -] - -# Prompts that should not trigger web search tool usage based on tool description -invalid_prompts = [ - "Search for multiple words at once", - "Look up a complex query with multiple terms", - "Search for a long phrase", - "Find results for this entire sentence", - "Search for 'word1 word2 word3'", - "Look up multiple topics at once", - "Search for a paragraph of text", - "Find results for multiple questions", - "Search for a list of items", - "Look up several different topics", -] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/extensions/developer_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/developer_tool/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/benchmark/goosebench/goosebench/extensions/developer_tool/list_windows.py b/benchmark/goosebench/goosebench/extensions/developer_tool/list_windows.py deleted file mode 100644 index 94906613f..000000000 --- a/benchmark/goosebench/goosebench/extensions/developer_tool/list_windows.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Test cases for the list windows tool.""" - -# Prompts that should trigger valid list windows tool usage -valid_prompts = [ - "Show me all available windows", - "List the windows that can be captured", - "What windows are currently open?", - "Display available window titles", - "Get a list of windows for screen capture", - "Show window titles that I can screenshot", - "What windows can I take screenshots of?", - "List all window titles", - "Show me what windows are available for capture", - "Get available window names", -] - -# Prompts that should not trigger list windows tool usage based on tool description -invalid_prompts = [ - "Close all windows", - "Minimize the current window", - "Maximize the browser window", - "Move window to another display", - "Resize the current window", - "Change window focus", - "Arrange windows on screen", - "Hide inactive windows", - "Show desktop", - "Switch between windows", -] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/extensions/developer_tool/screen_capture.py b/benchmark/goosebench/goosebench/extensions/developer_tool/screen_capture.py deleted file mode 100644 index 0daef846f..000000000 --- a/benchmark/goosebench/goosebench/extensions/developer_tool/screen_capture.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Test cases for the screen capture tool.""" - -# Prompts that should trigger valid screen capture tool usage -valid_prompts = [ - "Take a screenshot of the main display", - "Capture the window titled 'Terminal'", - "Screenshot the current display", - "Take a picture of display 0", - "Capture a screenshot of the browser window", - "Take a screenshot of the active window", - "Capture display 1", - "Screenshot the window named 'Settings'", - "Take a capture of the main screen", - "Screenshot the specified window", -] - -# Prompts that should not trigger screen capture tool usage based on tool description -invalid_prompts = [ - "Capture multiple windows at once", - "Take a screenshot of all displays", - "Record a video of the screen", - "Capture a region of the screen", - "Take a partial screenshot", - "Screenshot a specific area", - "Capture screen with mouse cursor", - "Take a timed screenshot", - "Screenshot with specific dimensions", - "Capture screen without window decorations", -] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/extensions/developer_tool/shell.py b/benchmark/goosebench/goosebench/extensions/developer_tool/shell.py deleted file mode 100644 index 916f0cfd4..000000000 --- a/benchmark/goosebench/goosebench/extensions/developer_tool/shell.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Test cases for the shell tool.""" - -# Prompts that should trigger valid shell tool usage -valid_prompts = [ - "Run the command 'ls' to list files", - "Execute 'pwd' to show current directory", - "Use ripgrep to search for files containing 'example'", - "Find all Python files using 'rg --files | rg .py'", - "Search for the string 'class Example' in files using ripgrep", - "Show the contents of a file using cat", - "Count lines in a file using wc -l", - "Check disk space with df -h", - "List processes with ps", - "Create a directory with mkdir test", -] - -# Prompts that should not trigger shell tool usage based on tool description -invalid_prompts = [ - "Run a command that will produce gigabytes of output", - "Start a long-running server without backgrounding it", - "Use find to recursively search for files", - "Use ls -R to list all files recursively", - "Execute a command that will run indefinitely", - "Run a command that streams continuous output", - "Use grep recursively to search files", - "Start a process that needs to be manually terminated", - "Run a command that generates unlimited output", - "Execute ls -la on the entire filesystem", -] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/extensions/developer_tool/text_editor.py b/benchmark/goosebench/goosebench/extensions/developer_tool/text_editor.py deleted file mode 100644 index 689548c01..000000000 --- a/benchmark/goosebench/goosebench/extensions/developer_tool/text_editor.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Test cases for the text editor tool.""" - -# Prompts that should trigger valid text editor tool usage -valid_prompts = [ - "View the contents of file.txt", - "Show me what's in config.py", - "Create a new file called test.txt with 'Hello World' content", - "Write 'print(\"hello\")' to script.py", - "Replace the string 'old_version' with 'new_version' in config.txt", - "Change 'debug=True' to 'debug=False' in settings.py", - "Undo the last edit made to main.py", - "Revert the previous change in config.json", - "Write this JSON content to data.json", - "Update the version number in package.json", -] - -# Prompts that should not trigger text editor tool usage based on tool description -invalid_prompts = [ - "Edit multiple sections of the file at once", - "Replace all occurrences of a string in the file", - "Make changes to multiple files simultaneously", - "Modify a file that's larger than 400KB", - "Edit a file with more than 400,000 characters", - "Replace a string that appears multiple times in the file", - "Make partial updates to specific sections without full file content", - "Edit binary files", - "Modify files without providing full path", - "Replace text without exact string match", -] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/extensions/memory_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/memory_tool/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/benchmark/goosebench/goosebench/extensions/memory_tool/remember_memory.py b/benchmark/goosebench/goosebench/extensions/memory_tool/remember_memory.py deleted file mode 100644 index 5be419786..000000000 --- a/benchmark/goosebench/goosebench/extensions/memory_tool/remember_memory.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Test cases for the remember memory tool.""" - -# Prompts that should trigger valid remember memory tool usage -valid_prompts = [ - "Remember this development preference in the 'development' category", - "Store this setting globally with tags #config #setup", - "Save this workflow detail locally in 'workflow' category", - "Remember my name and email in the 'personal' category globally", - "Store project configuration locally with #settings tag", - "Save this formatting preference in development category", - "Remember this shortcut in 'keyboard' category with #shortcuts tag", - "Store build instructions locally in 'build' category", - "Save API credentials globally in 'credentials' category", - "Remember git configuration in 'git' category with #config tag", -] - -# Prompts that should not trigger remember memory tool usage based on tool description -invalid_prompts = [ - "Save this without specifying a category", - "Store this without indicating global or local scope", - "Remember this with invalid tags format", - "Save empty content in a category", - "Store this in multiple categories at once", - "Remember this with system-level access", - "Save this in a protected category", - "Store this with special file permissions", - "Remember this in a non-existent directory", - "Save this with binary content", -] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/extensions/memory_tool/remove_memory_category.py b/benchmark/goosebench/goosebench/extensions/memory_tool/remove_memory_category.py deleted file mode 100644 index fb29aebfe..000000000 --- a/benchmark/goosebench/goosebench/extensions/memory_tool/remove_memory_category.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Test cases for the remove memory category tool.""" - -# Prompts that should trigger valid remove memory category tool usage -valid_prompts = [ - "Delete all memories in the 'development' category", - "Clear the 'workflow' category from global storage", - "Remove all local project settings", - "Delete everything in the 'personal' category", - "Clear all global memories", - "Remove all local memories", - "Delete the 'build' category", - "Clear project configuration category", - "Remove the 'git' category memories", - "Delete all items in 'credentials' category", -] - -# Prompts that should not trigger remove memory category tool usage based on tool description -invalid_prompts = [ - "Delete memories across multiple categories", - "Remove memories without specifying scope", - "Clear memories by date range", - "Delete memories by content", - "Remove memories with specific tags", - "Clear memories by partial category match", - "Delete memories selectively", - "Remove memories by size", - "Clear recently modified memories", - "Delete memories by author", -] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/extensions/memory_tool/remove_specific_memory.py b/benchmark/goosebench/goosebench/extensions/memory_tool/remove_specific_memory.py deleted file mode 100644 index 502a43895..000000000 --- a/benchmark/goosebench/goosebench/extensions/memory_tool/remove_specific_memory.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Test cases for the remove specific memory tool.""" - -# Prompts that should trigger valid remove specific memory tool usage -valid_prompts = [ - "Delete the memory about code formatting from development category", - "Remove the git configuration memory from global storage", - "Delete project API key from credentials category", - "Remove my email setting from personal category", - "Delete the build instruction memory from local storage", - "Remove specific workflow step from workflow category", - "Delete keyboard shortcut memory from shortcuts category", - "Remove specific project setting from local config", - "Delete specific credential from global storage", - "Remove particular preference from settings category", -] - -# Prompts that should not trigger remove specific memory tool usage based on tool description -invalid_prompts = [ - "Delete multiple memories at once", - "Remove memories by pattern matching", - "Delete memories without exact content", - "Remove memories by tag only", - "Delete memories by date", - "Remove partial memory content", - "Delete memories by regex", - "Remove memories without category", - "Delete memories by approximate match", - "Remove memories without scope", -] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/extensions/memory_tool/retrieve_memories.py b/benchmark/goosebench/goosebench/extensions/memory_tool/retrieve_memories.py deleted file mode 100644 index 59b187f8e..000000000 --- a/benchmark/goosebench/goosebench/extensions/memory_tool/retrieve_memories.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Test cases for the retrieve memories tool.""" - -# Prompts that should trigger valid retrieve memories tool usage -valid_prompts = [ - "Show all memories in the 'development' category", - "Get my stored preferences from global memory", - "Retrieve local project settings", - "Show me what's stored in the 'workflow' category", - "Get all global memories", - "Retrieve everything from local storage", - "Show memories tagged with #config", - "Get all items from 'personal' category", - "Retrieve project-specific memories", - "Show what's saved in the 'build' category", -] - -# Prompts that should not trigger retrieve memories tool usage based on tool description -invalid_prompts = [ - "Search across multiple categories at once", - "Find memories without specifying scope", - "Get memories with complex search criteria", - "Retrieve memories by date range", - "Search memories by content", - "Get memories by partial category match", - "Retrieve memories with regex patterns", - "Find memories by size", - "Get memories modified recently", - "Search memories by author", -] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/extensions/not_used/__init__.py b/benchmark/goosebench/goosebench/extensions/not_used/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_read.py b/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_read.py deleted file mode 100644 index 4f81b9ec2..000000000 --- a/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_read.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Test cases for the Google Drive read tool.""" - -# Prompts that should trigger valid read tool usage -valid_prompts = [ - "Read the file with URI gdrive:///abc123", - "Show me the contents of gdrive:///xyz789", - "Get the text from gdrive:///doc456", - "Read this Google Doc gdrive:///123abc", - "Show the contents of spreadsheet gdrive:///789xyz", - "Get the text of presentation gdrive:///456def", - "Read file gdrive:///def123 and include images", - "Show me gdrive:///789abc without images", - "Get the content of document gdrive:///xyz456", - "Read text file gdrive:///123xyz", -] - -# Prompts that should not trigger read tool usage based on tool description -invalid_prompts = [ - "Edit the file gdrive:///abc123", - "Write to document gdrive:///xyz789", - "Modify spreadsheet gdrive:///123def", - "Update presentation gdrive:///def789", - "Delete file gdrive:///789xyz", - "Create new document gdrive:///456abc", - "Share file gdrive:///xyz123", - "Move document gdrive:///789def", - "Copy file gdrive:///abc789", - "Rename document gdrive:///def456", -] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_search.py b/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_search.py deleted file mode 100644 index 3a093b5f5..000000000 --- a/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_search.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Test cases for the Google Drive search tool.""" - -# Prompts that should trigger valid search tool usage -valid_prompts = [ - "Search for files named 'budget'", - "Find documents containing 'report'", - "Look for files with 'presentation' in the name", - "Search my drive for 'meeting notes'", - "Find files named 'project plan'", - "Search for 'invoice' in my files", - "Look up documents named 'proposal'", - "Find spreadsheets with 'data' in the name", - "Search for files containing 'schedule'", - "Find documents with 'summary' in the title", -] - -# Prompts that should not trigger search tool usage based on tool description -invalid_prompts = [ - "Search for files modified in the last week", - "Find files larger than 1MB", - "Search for files shared with me", - "Look for files in a specific folder", - "Find files by type", - "Search for files by owner", - "Look for recently modified files", - "Find files with specific permissions", - "Search for files by date", - "Find files in trash", -] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/jetbrains.py b/benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/jetbrains.py deleted file mode 100644 index b5cf13ef8..000000000 --- a/benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/jetbrains.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Test cases for the JetBrains IDE integration tools.""" - -# Prompts that should trigger valid JetBrains tool usage -valid_prompts = [ - "Open the current file in the IDE", - "Navigate to line 42 in the active file", - "Find usages of this class", - "Go to the definition of this method", - "Show documentation for this symbol", - "Run the current test file", - "Debug this application", - "Show project structure", - "Open recent files", - "Search everywhere in the project", -] - -# Prompts that should not trigger JetBrains tool usage based on tool description -invalid_prompts = [ - "Create a new IDE instance", - "Modify IDE settings", - "Install new plugins", - "Change IDE theme", - "Update the IDE version", - "Configure version control", - "Modify IDE keymap", - "Change project settings", - "Install new IDE features", - "Uninstall IDE components", -] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/extensions/tutorial_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/tutorial_tool/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/benchmark/goosebench/goosebench/extensions/tutorial_tool/tutorial.py b/benchmark/goosebench/goosebench/extensions/tutorial_tool/tutorial.py deleted file mode 100644 index f208d6a76..000000000 --- a/benchmark/goosebench/goosebench/extensions/tutorial_tool/tutorial.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Test cases for the load tutorial tool.""" - -# Prompts that should trigger valid load tutorial tool usage -valid_prompts = [ - "Show me the getting-started tutorial", - "Load the developer-mcp tutorial", - "I need help getting started, show the tutorial", - "Can you load the tutorial about development?", - "Show me how to use Goose with the tutorial", - "Load the beginner's guide tutorial", - "I'm new here, can you show me the introduction tutorial?", - "Display the tutorial for developers", - "Show the tutorial about MCP development", - "Load the basic usage tutorial", -] - -# Prompts that should not trigger load tutorial tool usage based on tool description -invalid_prompts = [ - "Create a new tutorial", - "Edit the existing tutorial", - "Delete this tutorial", - "Modify tutorial content", - "Save this as a tutorial", - "Update the tutorial text", - "Remove old tutorials", - "Change tutorial format", - "Add new tutorial section", - "Merge multiple tutorials", -] \ No newline at end of file diff --git a/benchmark/goosebench/goosebench/main.py b/benchmark/goosebench/goosebench/main.py deleted file mode 100755 index d281412b3..000000000 --- a/benchmark/goosebench/goosebench/main.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python3 -from typing import List, Optional - -import typer -from rich.console import Console -from rich.theme import Theme -from typing_extensions import Annotated - -from goosebench.bench import Bench - -# Initialize typer app and rich console -app = typer.Typer(help="Goose CLI Integration Tests") -console = Console(theme=Theme({ - "info": "cyan", - "warning": "yellow", - "error": "red", - "success": "green" -})) - -# Extension configurations -EXTENSIONS = ['developer', 'computercontroller', 'google_drive', 'memory'] - -EXTENSION_PROMPTS = { - 'developer': [ - "List the contents of the current directory.", - "Create a new file called test.txt with the content 'Hello, World!'", - "Read the contents of test.txt" - ], - 'computercontroller': [ - "What are the headlines on hackernews? Organize the list into categories.", - "Make a ding sound" - ], - 'google_drive': [ - "List the files in my Google Drive.", - "Search for documents containing 'meeting notes'" - ], - 'memory': [ - "Save this fact: The capital of France is Paris.", - "What is the capital of France?" - ] -} - - -def parse_provider_model(ctx: typer.Context, provider_models: List[str]) -> List[ - tuple[str, str]]: - """Parse provider:model strings into tuples.""" - result = [] - for pm in provider_models: - try: - provider, models = pm.split(':') - for model in models.split(','): - result.append((provider.strip(), model.strip())) - except ValueError: - raise typer.BadParameter( - f"Invalid format: {pm}. Use format 'provider:model' or 'provider:model1,model2'" - ) - return result - - -@app.command() -def main( - provider_models: Annotated[ - Optional[List[str]], - typer.Option( - '--provider-model', '-pm', - help="Provider and model in format 'provider:model' or 'provider:model1,model2'" - ) - ] = None, - verbose: Annotated[ - bool, - typer.Option('--verbose', '-v', help="Enable verbose output") - ] = False, -): - """ - Run Goose CLI Integration Tests. - - Example usage: - - python main.py # Uses default: databricks:goose - python main.py -pm anthropic:claude - python main.py -pm anthropic:claude,claude2 - python main.py -pm anthropic:claude -pm databricks:goose - """ - console.print("Starting Goose CLI Integration Tests", style="bold") - - runner = Bench() - - # Use default if no provider-models specified - if not provider_models: - provider_models = ['databricks:goose'] - - # Parse provider-model pairs - try: - provider_model_pairs = parse_provider_model(typer.Context, provider_models) - except typer.BadParameter as e: - console.print(f"Error: {str(e)}", style="error") - raise typer.Exit(1) - - for provider, model in provider_model_pairs: - console.rule(f"Testing provider: {provider}") - console.print(f"Testing model: {model}", style="bold") - - for extension in EXTENSIONS: - runner.test_extension(provider, model, extension) - - # Print summary - if not runner.error_log: - console.print("\nAll tests completed successfully!", style="success") - else: - console.print("\nTest Summary - Errors Found:", style="error") - console.rule("Errors") - for error in runner.error_log: - console.print(error, style="error") - raise typer.Exit(1) - - -if __name__ == "__main__": - app() diff --git a/benchmark/goosebench/poetry.lock b/benchmark/goosebench/poetry.lock deleted file mode 100644 index 7141b3575..000000000 --- a/benchmark/goosebench/poetry.lock +++ /dev/null @@ -1,310 +0,0 @@ -# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. - -[[package]] -name = "click" -version = "8.1.8" -description = "Composable command line interface toolkit" -optional = false -python-versions = ">=3.7" -groups = ["main"] -files = [ - {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"}, - {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"}, -] - -[package.dependencies] -colorama = {version = "*", markers = "platform_system == \"Windows\""} - -[package.source] -type = "legacy" -url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" -reference = "artifactory" - -[[package]] -name = "colorama" -version = "0.4.6" -description = "Cross-platform colored terminal text." -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" -groups = ["main", "dev"] -files = [ - {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, - {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, -] -markers = {main = "platform_system == \"Windows\"", dev = "sys_platform == \"win32\""} - -[package.source] -type = "legacy" -url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" -reference = "artifactory" - -[[package]] -name = "iniconfig" -version = "2.0.0" -description = "brain-dead simple config-ini parsing" -optional = false -python-versions = ">=3.7" -groups = ["dev"] -files = [ - {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, - {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, -] - -[package.source] -type = "legacy" -url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" -reference = "artifactory" - -[[package]] -name = "markdown-it-py" -version = "3.0.0" -description = "Python port of markdown-it. Markdown parsing, done right!" -optional = false -python-versions = ">=3.8" -groups = ["main"] -files = [ - {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, - {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, -] - -[package.dependencies] -mdurl = ">=0.1,<1.0" - -[package.extras] -benchmarking = ["psutil", "pytest", "pytest-benchmark"] -code-style = ["pre-commit (>=3.0,<4.0)"] -compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] -linkify = ["linkify-it-py (>=1,<3)"] -plugins = ["mdit-py-plugins"] -profiling = ["gprof2dot"] -rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] -testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] - -[package.source] -type = "legacy" -url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" -reference = "artifactory" - -[[package]] -name = "mdurl" -version = "0.1.2" -description = "Markdown URL utilities" -optional = false -python-versions = ">=3.7" -groups = ["main"] -files = [ - {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, - {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, -] - -[package.source] -type = "legacy" -url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" -reference = "artifactory" - -[[package]] -name = "packaging" -version = "24.2" -description = "Core utilities for Python packages" -optional = false -python-versions = ">=3.8" -groups = ["dev"] -files = [ - {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, - {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, -] - -[package.source] -type = "legacy" -url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" -reference = "artifactory" - -[[package]] -name = "pexpect" -version = "4.9.0" -description = "Pexpect allows easy control of interactive console applications." -optional = false -python-versions = "*" -groups = ["main"] -files = [ - {file = "pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523"}, - {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"}, -] - -[package.dependencies] -ptyprocess = ">=0.5" - -[package.source] -type = "legacy" -url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" -reference = "artifactory" - -[[package]] -name = "pluggy" -version = "1.5.0" -description = "plugin and hook calling mechanisms for python" -optional = false -python-versions = ">=3.8" -groups = ["dev"] -files = [ - {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, - {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, -] - -[package.extras] -dev = ["pre-commit", "tox"] -testing = ["pytest", "pytest-benchmark"] - -[package.source] -type = "legacy" -url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" -reference = "artifactory" - -[[package]] -name = "ptyprocess" -version = "0.7.0" -description = "Run a subprocess in a pseudo terminal" -optional = false -python-versions = "*" -groups = ["main"] -files = [ - {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, - {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, -] - -[package.source] -type = "legacy" -url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" -reference = "artifactory" - -[[package]] -name = "pygments" -version = "2.19.1" -description = "Pygments is a syntax highlighting package written in Python." -optional = false -python-versions = ">=3.8" -groups = ["main"] -files = [ - {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, - {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, -] - -[package.extras] -windows-terminal = ["colorama (>=0.4.6)"] - -[package.source] -type = "legacy" -url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" -reference = "artifactory" - -[[package]] -name = "pytest" -version = "8.3.4" -description = "pytest: simple powerful testing with Python" -optional = false -python-versions = ">=3.8" -groups = ["dev"] -files = [ - {file = "pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"}, - {file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"}, -] - -[package.dependencies] -colorama = {version = "*", markers = "sys_platform == \"win32\""} -iniconfig = "*" -packaging = "*" -pluggy = ">=1.5,<2" - -[package.extras] -dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] - -[package.source] -type = "legacy" -url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" -reference = "artifactory" - -[[package]] -name = "rich" -version = "13.9.4" -description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" -optional = false -python-versions = ">=3.8.0" -groups = ["main"] -files = [ - {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"}, - {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"}, -] - -[package.dependencies] -markdown-it-py = ">=2.2.0" -pygments = ">=2.13.0,<3.0.0" - -[package.extras] -jupyter = ["ipywidgets (>=7.5.1,<9)"] - -[package.source] -type = "legacy" -url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" -reference = "artifactory" - -[[package]] -name = "shellingham" -version = "1.5.4" -description = "Tool to Detect Surrounding Shell" -optional = false -python-versions = ">=3.7" -groups = ["main"] -files = [ - {file = "shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686"}, - {file = "shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de"}, -] - -[package.source] -type = "legacy" -url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" -reference = "artifactory" - -[[package]] -name = "typer" -version = "0.15.1" -description = "Typer, build great CLIs. Easy to code. Based on Python type hints." -optional = false -python-versions = ">=3.7" -groups = ["main"] -files = [ - {file = "typer-0.15.1-py3-none-any.whl", hash = "sha256:7994fb7b8155b64d3402518560648446072864beefd44aa2dc36972a5972e847"}, - {file = "typer-0.15.1.tar.gz", hash = "sha256:a0588c0a7fa68a1978a069818657778f86abe6ff5ea6abf472f940a08bfe4f0a"}, -] - -[package.dependencies] -click = ">=8.0.0" -rich = ">=10.11.0" -shellingham = ">=1.3.0" -typing-extensions = ">=3.7.4.3" - -[package.source] -type = "legacy" -url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" -reference = "artifactory" - -[[package]] -name = "typing-extensions" -version = "4.12.2" -description = "Backported and Experimental Type Hints for Python 3.8+" -optional = false -python-versions = ">=3.8" -groups = ["main"] -files = [ - {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, - {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, -] - -[package.source] -type = "legacy" -url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" -reference = "artifactory" - -[metadata] -lock-version = "2.1" -python-versions = ">=3.11" -content-hash = "58e6c7676973e0793089a2eff7d2dc54f11c5760cd7b0c43ecd5143588ffa046" diff --git a/benchmark/goosebench/pyproject.toml b/benchmark/goosebench/pyproject.toml deleted file mode 100644 index 00f2cecce..000000000 --- a/benchmark/goosebench/pyproject.toml +++ /dev/null @@ -1,28 +0,0 @@ -[project] -name = "goose-monitoring-job" -version = "0.1.0" -description = "" -readme = "README.md" -authors = [ - { name = "Your Name", email = "you@example.com" } -] - -[[tool.poetry.source]] -name = "artifactory" -priority = "primary" -url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple" - - -[tool.poetry.dependencies] -python = ">=3.11" -pexpect = "^4.9.0" -typer = "^0.15.1" -rich = "^13.9.4" - - -[tool.poetry.group.dev.dependencies] -pytest = "^8.3.4" - -[build-system] -requires = ["poetry-core>=2.0.0,<3.0.0"] -build-backend = "poetry.core.masonry.api" From 60a6100d042343ac1c3da8c50f2b624c905c3b37 Mon Sep 17 00:00:00 2001 From: Marcelle Bonterre Date: Wed, 19 Feb 2025 15:37:17 -0500 Subject: [PATCH 05/10] add cli opt to specify which eval suite to run + support nested suites --- crates/goose-bench/Cargo.toml | 6 +- .../core/complex_tasks/flappy_bird.rs | 21 ++++++ .../src/eval_suites/core/complex_tasks/mod.rs | 1 + .../goose-bench/src/eval_suites/core/mod.rs | 1 + .../goose-bench/src/eval_suites/evaluation.rs | 16 +++-- crates/goose-bench/src/eval_suites/factory.rs | 41 +++++++----- .../src/eval_suites/flappy_bird.rs | 29 --------- crates/goose-bench/src/eval_suites/mod.rs | 5 +- crates/goose-cli/Cargo.toml | 1 + crates/goose-cli/src/commands/bench.rs | 64 ++++++++++--------- crates/goose-cli/src/main.rs | 45 ++++++++----- crates/goose-cli/src/session/mod.rs | 4 ++ 12 files changed, 131 insertions(+), 103 deletions(-) create mode 100644 crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs create mode 100644 crates/goose-bench/src/eval_suites/core/complex_tasks/mod.rs create mode 100644 crates/goose-bench/src/eval_suites/core/mod.rs delete mode 100644 crates/goose-bench/src/eval_suites/flappy_bird.rs diff --git a/crates/goose-bench/Cargo.toml b/crates/goose-bench/Cargo.toml index 69189bf20..78fac6b22 100644 --- a/crates/goose-bench/Cargo.toml +++ b/crates/goose-bench/Cargo.toml @@ -12,10 +12,8 @@ description.workspace = true anyhow = "1.0" paste = "1.0" ctor = "0.2.7" +goose = { path = "../goose" } +async-trait = "0.1.86" [target.'cfg(target_os = "windows")'.dependencies] winapi = { version = "0.3", features = ["wincred"] } - -#[[bench]] -#name = "tokenization_benchmark" -#harness = false diff --git a/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs b/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs new file mode 100644 index 000000000..3b0fd2a09 --- /dev/null +++ b/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs @@ -0,0 +1,21 @@ +use crate::eval_suites::evaluation::EvaluationReport; +use crate::eval_suites::{BenchAgent, Evaluation}; +use crate::register_evaluation; +use async_trait::async_trait; + +pub struct FlappyBird {} + +impl FlappyBird { + pub fn new() -> Self {FlappyBird {}} +} + +#[async_trait] +impl Evaluation for FlappyBird { + async fn run(&self, mut agent: Box) -> anyhow::Result { + let metrics = Vec::new(); + let _ = agent.prompt("What can you do?".to_string()).await; + Ok(EvaluationReport::new(metrics)) + } +} + +register_evaluation!("core", FlappyBird); \ No newline at end of file diff --git a/crates/goose-bench/src/eval_suites/core/complex_tasks/mod.rs b/crates/goose-bench/src/eval_suites/core/complex_tasks/mod.rs new file mode 100644 index 000000000..024794fe8 --- /dev/null +++ b/crates/goose-bench/src/eval_suites/core/complex_tasks/mod.rs @@ -0,0 +1 @@ +mod flappy_bird; \ No newline at end of file diff --git a/crates/goose-bench/src/eval_suites/core/mod.rs b/crates/goose-bench/src/eval_suites/core/mod.rs new file mode 100644 index 000000000..e47d12dd9 --- /dev/null +++ b/crates/goose-bench/src/eval_suites/core/mod.rs @@ -0,0 +1 @@ +mod complex_tasks; \ No newline at end of file diff --git a/crates/goose-bench/src/eval_suites/evaluation.rs b/crates/goose-bench/src/eval_suites/evaluation.rs index 31532e882..97b44f593 100644 --- a/crates/goose-bench/src/eval_suites/evaluation.rs +++ b/crates/goose-bench/src/eval_suites/evaluation.rs @@ -1,5 +1,6 @@ use anyhow::Result; - +use async_trait::async_trait; +use goose::message::Message; pub type Model = (String, String); pub type Extension = String; @@ -12,7 +13,6 @@ pub enum EvaluationMetric { Boolean(bool), } -#[derive(Debug)] pub struct EvaluationReport { metrics: Vec, } @@ -29,8 +29,12 @@ impl EvaluationReport { } } -pub trait Evaluation: Send + Sync { - fn run(&self) -> Result; - fn models(&self) -> Vec; - fn extensions(&self) -> Vec; +#[async_trait] +pub trait BenchAgent: Send + Sync { + async fn prompt(&mut self, p: String) -> Result>; } + +#[async_trait] +pub trait Evaluation: Send + Sync { + async fn run(&self, agent: Box) -> Result; +} \ No newline at end of file diff --git a/crates/goose-bench/src/eval_suites/factory.rs b/crates/goose-bench/src/eval_suites/factory.rs index 9bcb233da..ec7763a07 100644 --- a/crates/goose-bench/src/eval_suites/factory.rs +++ b/crates/goose-bench/src/eval_suites/factory.rs @@ -1,39 +1,46 @@ +pub use super::Evaluation; use std::collections::HashMap; use std::sync::{OnceLock, RwLock}; -pub use super::Evaluation; - -type EvaluationConstructor = Box Box + Send + Sync>; +type EvaluationConstructor = fn() -> Box; // Use std::sync::RwLock for interior mutability -static EVALUATION_REGISTRY: OnceLock>> = OnceLock::new(); +static EVALUATION_REGISTRY: OnceLock>>> = OnceLock::new(); /// Initialize the registry if it hasn't been initialized -fn registry() -> &'static RwLock> { +fn registry() -> &'static RwLock>> { EVALUATION_REGISTRY.get_or_init(|| RwLock::new(HashMap::new())) } /// Register a new evaluation version pub fn register_evaluation( - version: &'static str, - constructor: impl Fn() -> Box + Send + Sync + 'static, + suite_name: &'static str, + constructor: fn() -> Box, ) { let registry = registry(); if let Ok(mut map) = registry.write() { - map.insert(version, Box::new(constructor)); + map.entry(suite_name) + .or_insert_with(Vec::new) + .push(constructor); } } -pub struct EvaluationFactory; +pub struct EvaluationSuiteFactory; -impl EvaluationFactory { - pub fn create(version: &str) -> Option> { +impl EvaluationSuiteFactory { + pub fn create(suite_name: &str) -> Option>> { let registry = registry(); let map = registry .read() .expect("Failed to read the benchmark evaluation registry."); - let constructor = map.get(version)?; - Some(constructor()) + + let constructors = map.get(suite_name)?; + let instances = constructors + .iter() + .map(|&constructor| constructor()) + .collect::>(); + + Some(instances) } pub fn available_evaluations() -> Vec<&'static str> { @@ -46,15 +53,15 @@ impl EvaluationFactory { #[macro_export] macro_rules! register_evaluation { - ($version:expr, $evaluation_type:ty) => { + ($suite_name:expr, $evaluation_type:ty) => { paste::paste! { #[ctor::ctor] #[allow(non_snake_case)] - fn [<__register_evaluation_ $version>]() { - $crate::eval_suites::factory::register_evaluation($version, || { + fn [<__register_evaluation_ $suite_name>]() { + $crate::eval_suites::factory::register_evaluation($suite_name, || { Box::new(<$evaluation_type>::new()) }); } } }; -} +} \ No newline at end of file diff --git a/crates/goose-bench/src/eval_suites/flappy_bird.rs b/crates/goose-bench/src/eval_suites/flappy_bird.rs deleted file mode 100644 index eb75818b3..000000000 --- a/crates/goose-bench/src/eval_suites/flappy_bird.rs +++ /dev/null @@ -1,29 +0,0 @@ -use crate::eval_suites::{Evaluation, Extension, Model}; -use crate::eval_suites::evaluation::EvaluationReport; -use crate::register_evaluation; - -pub struct FlappyBird {} - -impl FlappyBird { - fn new() -> FlappyBird { - FlappyBird {} - } -} - -impl Evaluation for FlappyBird { - fn run(&self) -> anyhow::Result { - let mut metrics = Vec::new(); - - Ok(EvaluationReport::new(metrics)) - } - - fn models(&self) -> Vec { - todo!() - } - - fn extensions(&self) -> Vec { - todo!() - } -} - -register_evaluation!("flappy_bird", FlappyBird); diff --git a/crates/goose-bench/src/eval_suites/mod.rs b/crates/goose-bench/src/eval_suites/mod.rs index 1f7d2992d..17a975c57 100644 --- a/crates/goose-bench/src/eval_suites/mod.rs +++ b/crates/goose-bench/src/eval_suites/mod.rs @@ -1,5 +1,6 @@ mod factory; -mod flappy_bird; mod evaluation; +mod core; + pub use evaluation::*; -pub use factory::{register_evaluation, EvaluationFactory}; +pub use factory::{register_evaluation, EvaluationSuiteFactory}; diff --git a/crates/goose-cli/Cargo.toml b/crates/goose-cli/Cargo.toml index 62dfd85a6..a4554fba2 100644 --- a/crates/goose-cli/Cargo.toml +++ b/crates/goose-cli/Cargo.toml @@ -48,6 +48,7 @@ chrono = "0.4" tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt", "json", "time"] } tracing-appender = "0.2" once_cell = "1.20.2" +async-trait = "0.1.86" [target.'cfg(target_os = "windows")'.dependencies] winapi = { version = "0.3", features = ["wincred"] } diff --git a/crates/goose-cli/src/commands/bench.rs b/crates/goose-cli/src/commands/bench.rs index 3cb1825cf..eb06088e0 100644 --- a/crates/goose-cli/src/commands/bench.rs +++ b/crates/goose-cli/src/commands/bench.rs @@ -1,44 +1,50 @@ -use goose::message::Message; use crate::session::build_session; -use goose_bench::eval_suites::{EvaluationFactory, EvaluationReport}; - +use crate::Session; +use async_trait::async_trait; +use goose::message::Message; +use goose_bench::eval_suites::{BenchAgent, EvaluationReport, EvaluationSuiteFactory}; // use std::error::Error; -// build custom run-func that constructs agent from session, then uses custom loop to manage collecting and returning agent messages. -async fn foo(ext) { - let extension = Vec::new(); // todo - let name = None; - let mut session = build_session(name, false, extension, ext).await; - let _ = session.headless_start(prompt).await; -} -pub async fn headless_start(&mut self, initial_message: String) -> anyhow::Result<()> { - self.messages.push(Message::user().with_text(&initial_message)); - self.process_agent_response().await?; - Ok(()) +// cli flag for suite_name [done] +// default suite_name called core [done] +// pass session messages in to run [done] +// eval suite = suite_name / eval_name / test_file_name [done] +// use session config expecting external proc to manage swapping out config + + +#[async_trait] +impl BenchAgent for Session { + async fn prompt(&mut self, p: String) -> anyhow::Result> { + self.headless_start(p).await?; + Ok(self.message_history()) + } } -pub async fn run_benchmark() { +pub async fn run_benchmark(suites: Vec) { let mut all_reports: Vec = vec![]; - for eval in EvaluationFactory::available_evaluations() { - let evaluation = match EvaluationFactory::create(&eval) { - Some(evaluation) => evaluation, + let suites = EvaluationSuiteFactory::available_evaluations() + .into_iter() + .filter(|&s| suites.contains(&s.to_string())) + .collect::>(); + + for suite in suites { + let evaluations = match EvaluationSuiteFactory::create(&suite) { + Some(evaluations) => evaluations, None => continue, }; + for evaluation in evaluations { + let session = build_session(None, false, Vec::new(), Vec::new()).await; + let report = match evaluation.run(Box::new(session)).await { + Ok(report) => report, + _ => continue, + }; - for (provider, model) in evaluation.models() { - for ext in evaluation.extensions() { - let report = match evaluation.run() { - Ok(report) => report, - _ => continue, - }; - - // print report? - all_reports.push(report); - } + // print report? + all_reports.push(report); } } // let summary = report_summary(all_reports)? // print summary? -} +} \ No newline at end of file diff --git a/crates/goose-cli/src/main.rs b/crates/goose-cli/src/main.rs index ecd0c1215..893dde906 100644 --- a/crates/goose-cli/src/main.rs +++ b/crates/goose-cli/src/main.rs @@ -4,12 +4,12 @@ use clap::{CommandFactory, Parser, Subcommand}; use console::style; use goose::config::Config; use goose_cli::commands::agent_version::AgentCommand; +use goose_cli::commands::bench::run_benchmark; use goose_cli::commands::configure::handle_configure; use goose_cli::commands::mcp::run_server; use goose_cli::logging::setup_logging; use goose_cli::session::build_session; use std::io::{self, Read}; -use goose_cli::commands::bench::run_benchmark; #[derive(Parser)] #[command(author, version, display_name = "", about, long_about = None)] @@ -143,7 +143,17 @@ enum Command { Agents(AgentCommand), /// Run benchmark suite - Bench {}, + Bench { + #[arg( + short = 's', + long = "suites", + value_name = "BENCH_SUITE_NAME", + help = "Run this list of bench-suites.", + long_help = "Specify a comma-separated list of evaluation-suite names to be run.", + value_delimiter = ',' + )] + suites: Vec, + }, } #[derive(clap::ValueEnum, Clone, Debug)] @@ -166,24 +176,24 @@ async fn main() -> Result<()> { let _ = run_server(&name).await; } Some(Command::Session { - name, - resume, - extension, - builtin, - }) => { + name, + resume, + extension, + builtin, + }) => { let mut session = build_session(name, resume, extension, builtin).await; setup_logging(session.session_file().file_stem().and_then(|s| s.to_str()))?; let _ = session.start().await; return Ok(()); } Some(Command::Run { - instructions, - input_text, - name, - resume, - extension, - builtin, - }) => { + instructions, + input_text, + name, + resume, + extension, + builtin, + }) => { // Validate that we have some input source if instructions.is_none() && input_text.is_none() { eprintln!("Error: Must provide either --instructions or --text"); @@ -211,8 +221,11 @@ async fn main() -> Result<()> { cmd.run()?; return Ok(()); } - Some(Command::Bench {}) => { - run_benchmark().await; + Some(Command::Bench { + suites, + }) => { + let suites = if suites.is_empty() { vec!["core".to_string()] } else { suites }; + run_benchmark(suites).await; return Ok(()); } None => { diff --git a/crates/goose-cli/src/session/mod.rs b/crates/goose-cli/src/session/mod.rs index 2a65ef058..37acf739f 100644 --- a/crates/goose-cli/src/session/mod.rs +++ b/crates/goose-cli/src/session/mod.rs @@ -299,4 +299,8 @@ impl Session { pub fn session_file(&self) -> PathBuf { self.session_file.clone() } + + pub fn message_history(&self) -> Vec { + self.messages.clone() + } } From e85a8954b4518d52ce83d9e674e208d7c2caa366 Mon Sep 17 00:00:00 2001 From: Marcelle Bonterre Date: Wed, 19 Feb 2025 15:39:08 -0500 Subject: [PATCH 06/10] fmt --- .../core/complex_tasks/flappy_bird.rs | 6 ++-- .../src/eval_suites/core/complex_tasks/mod.rs | 2 +- .../goose-bench/src/eval_suites/core/mod.rs | 2 +- .../goose-bench/src/eval_suites/evaluation.rs | 2 +- crates/goose-bench/src/eval_suites/factory.rs | 10 +++--- crates/goose-bench/src/eval_suites/mod.rs | 4 +-- crates/goose-bench/src/lib.rs | 2 +- crates/goose-cli/src/commands/bench.rs | 3 +- crates/goose-cli/src/commands/mod.rs | 2 +- crates/goose-cli/src/main.rs | 34 ++++++++++--------- 10 files changed, 34 insertions(+), 33 deletions(-) diff --git a/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs b/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs index 3b0fd2a09..1c43ed16b 100644 --- a/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs +++ b/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs @@ -6,7 +6,9 @@ use async_trait::async_trait; pub struct FlappyBird {} impl FlappyBird { - pub fn new() -> Self {FlappyBird {}} + pub fn new() -> Self { + FlappyBird {} + } } #[async_trait] @@ -18,4 +20,4 @@ impl Evaluation for FlappyBird { } } -register_evaluation!("core", FlappyBird); \ No newline at end of file +register_evaluation!("core", FlappyBird); diff --git a/crates/goose-bench/src/eval_suites/core/complex_tasks/mod.rs b/crates/goose-bench/src/eval_suites/core/complex_tasks/mod.rs index 024794fe8..c09f5510f 100644 --- a/crates/goose-bench/src/eval_suites/core/complex_tasks/mod.rs +++ b/crates/goose-bench/src/eval_suites/core/complex_tasks/mod.rs @@ -1 +1 @@ -mod flappy_bird; \ No newline at end of file +mod flappy_bird; diff --git a/crates/goose-bench/src/eval_suites/core/mod.rs b/crates/goose-bench/src/eval_suites/core/mod.rs index e47d12dd9..a1efebf95 100644 --- a/crates/goose-bench/src/eval_suites/core/mod.rs +++ b/crates/goose-bench/src/eval_suites/core/mod.rs @@ -1 +1 @@ -mod complex_tasks; \ No newline at end of file +mod complex_tasks; diff --git a/crates/goose-bench/src/eval_suites/evaluation.rs b/crates/goose-bench/src/eval_suites/evaluation.rs index 97b44f593..1589cc542 100644 --- a/crates/goose-bench/src/eval_suites/evaluation.rs +++ b/crates/goose-bench/src/eval_suites/evaluation.rs @@ -37,4 +37,4 @@ pub trait BenchAgent: Send + Sync { #[async_trait] pub trait Evaluation: Send + Sync { async fn run(&self, agent: Box) -> Result; -} \ No newline at end of file +} diff --git a/crates/goose-bench/src/eval_suites/factory.rs b/crates/goose-bench/src/eval_suites/factory.rs index ec7763a07..0f361ac44 100644 --- a/crates/goose-bench/src/eval_suites/factory.rs +++ b/crates/goose-bench/src/eval_suites/factory.rs @@ -5,7 +5,8 @@ use std::sync::{OnceLock, RwLock}; type EvaluationConstructor = fn() -> Box; // Use std::sync::RwLock for interior mutability -static EVALUATION_REGISTRY: OnceLock>>> = OnceLock::new(); +static EVALUATION_REGISTRY: OnceLock>>> = + OnceLock::new(); /// Initialize the registry if it hasn't been initialized fn registry() -> &'static RwLock>> { @@ -13,10 +14,7 @@ fn registry() -> &'static RwLock Box, -) { +pub fn register_evaluation(suite_name: &'static str, constructor: fn() -> Box) { let registry = registry(); if let Ok(mut map) = registry.write() { map.entry(suite_name) @@ -64,4 +62,4 @@ macro_rules! register_evaluation { } } }; -} \ No newline at end of file +} diff --git a/crates/goose-bench/src/eval_suites/mod.rs b/crates/goose-bench/src/eval_suites/mod.rs index 17a975c57..82404e34b 100644 --- a/crates/goose-bench/src/eval_suites/mod.rs +++ b/crates/goose-bench/src/eval_suites/mod.rs @@ -1,6 +1,6 @@ -mod factory; -mod evaluation; mod core; +mod evaluation; +mod factory; pub use evaluation::*; pub use factory::{register_evaluation, EvaluationSuiteFactory}; diff --git a/crates/goose-bench/src/lib.rs b/crates/goose-bench/src/lib.rs index 0c41da7e8..2881661ea 100644 --- a/crates/goose-bench/src/lib.rs +++ b/crates/goose-bench/src/lib.rs @@ -1 +1 @@ -pub mod eval_suites; \ No newline at end of file +pub mod eval_suites; diff --git a/crates/goose-cli/src/commands/bench.rs b/crates/goose-cli/src/commands/bench.rs index eb06088e0..756d58921 100644 --- a/crates/goose-cli/src/commands/bench.rs +++ b/crates/goose-cli/src/commands/bench.rs @@ -11,7 +11,6 @@ use goose_bench::eval_suites::{BenchAgent, EvaluationReport, EvaluationSuiteFact // eval suite = suite_name / eval_name / test_file_name [done] // use session config expecting external proc to manage swapping out config - #[async_trait] impl BenchAgent for Session { async fn prompt(&mut self, p: String) -> anyhow::Result> { @@ -47,4 +46,4 @@ pub async fn run_benchmark(suites: Vec) { // let summary = report_summary(all_reports)? // print summary? -} \ No newline at end of file +} diff --git a/crates/goose-cli/src/commands/mod.rs b/crates/goose-cli/src/commands/mod.rs index 7302d695e..b702cddea 100644 --- a/crates/goose-cli/src/commands/mod.rs +++ b/crates/goose-cli/src/commands/mod.rs @@ -1,4 +1,4 @@ pub mod agent_version; +pub mod bench; pub mod configure; pub mod mcp; -pub mod bench; diff --git a/crates/goose-cli/src/main.rs b/crates/goose-cli/src/main.rs index 893dde906..e53883b83 100644 --- a/crates/goose-cli/src/main.rs +++ b/crates/goose-cli/src/main.rs @@ -176,24 +176,24 @@ async fn main() -> Result<()> { let _ = run_server(&name).await; } Some(Command::Session { - name, - resume, - extension, - builtin, - }) => { + name, + resume, + extension, + builtin, + }) => { let mut session = build_session(name, resume, extension, builtin).await; setup_logging(session.session_file().file_stem().and_then(|s| s.to_str()))?; let _ = session.start().await; return Ok(()); } Some(Command::Run { - instructions, - input_text, - name, - resume, - extension, - builtin, - }) => { + instructions, + input_text, + name, + resume, + extension, + builtin, + }) => { // Validate that we have some input source if instructions.is_none() && input_text.is_none() { eprintln!("Error: Must provide either --instructions or --text"); @@ -221,10 +221,12 @@ async fn main() -> Result<()> { cmd.run()?; return Ok(()); } - Some(Command::Bench { - suites, - }) => { - let suites = if suites.is_empty() { vec!["core".to_string()] } else { suites }; + Some(Command::Bench { suites }) => { + let suites = if suites.is_empty() { + vec!["core".to_string()] + } else { + suites + }; run_benchmark(suites).await; return Ok(()); } From 826050fb8d37826d9d050aab15af31f6f0f4e965 Mon Sep 17 00:00:00 2001 From: Marcelle Bonterre Date: Wed, 19 Feb 2025 15:41:48 -0500 Subject: [PATCH 07/10] remove to-list comments --- crates/goose-cli/src/commands/bench.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/crates/goose-cli/src/commands/bench.rs b/crates/goose-cli/src/commands/bench.rs index 756d58921..a4a7c5696 100644 --- a/crates/goose-cli/src/commands/bench.rs +++ b/crates/goose-cli/src/commands/bench.rs @@ -3,13 +3,6 @@ use crate::Session; use async_trait::async_trait; use goose::message::Message; use goose_bench::eval_suites::{BenchAgent, EvaluationReport, EvaluationSuiteFactory}; -// use std::error::Error; - -// cli flag for suite_name [done] -// default suite_name called core [done] -// pass session messages in to run [done] -// eval suite = suite_name / eval_name / test_file_name [done] -// use session config expecting external proc to manage swapping out config #[async_trait] impl BenchAgent for Session { From 4cb274030b7a67ddf85e9330acd355fc17845596 Mon Sep 17 00:00:00 2001 From: Marcelle Bonterre Date: Wed, 19 Feb 2025 16:03:54 -0500 Subject: [PATCH 08/10] remove report struct --- .../core/complex_tasks/flappy_bird.rs | 7 +++---- .../goose-bench/src/eval_suites/evaluation.rs | 17 +---------------- crates/goose-cli/src/commands/bench.rs | 12 ++---------- 3 files changed, 6 insertions(+), 30 deletions(-) diff --git a/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs b/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs index 1c43ed16b..61d345355 100644 --- a/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs +++ b/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs @@ -1,5 +1,4 @@ -use crate::eval_suites::evaluation::EvaluationReport; -use crate::eval_suites::{BenchAgent, Evaluation}; +use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric}; use crate::register_evaluation; use async_trait::async_trait; @@ -13,10 +12,10 @@ impl FlappyBird { #[async_trait] impl Evaluation for FlappyBird { - async fn run(&self, mut agent: Box) -> anyhow::Result { + async fn run(&self, mut agent: Box) -> anyhow::Result> { let metrics = Vec::new(); let _ = agent.prompt("What can you do?".to_string()).await; - Ok(EvaluationReport::new(metrics)) + Ok(metrics) } } diff --git a/crates/goose-bench/src/eval_suites/evaluation.rs b/crates/goose-bench/src/eval_suites/evaluation.rs index 1589cc542..18ba9cbbe 100644 --- a/crates/goose-bench/src/eval_suites/evaluation.rs +++ b/crates/goose-bench/src/eval_suites/evaluation.rs @@ -13,21 +13,6 @@ pub enum EvaluationMetric { Boolean(bool), } -pub struct EvaluationReport { - metrics: Vec, -} - -impl Default for EvaluationReport { - fn default() -> Self { - Self { metrics: vec![] } - } -} - -impl EvaluationReport { - pub fn new(metrics: Vec) -> Self { - EvaluationReport { metrics } - } -} #[async_trait] pub trait BenchAgent: Send + Sync { @@ -36,5 +21,5 @@ pub trait BenchAgent: Send + Sync { #[async_trait] pub trait Evaluation: Send + Sync { - async fn run(&self, agent: Box) -> Result; + async fn run(&self, agent: Box) -> Result>; } diff --git a/crates/goose-cli/src/commands/bench.rs b/crates/goose-cli/src/commands/bench.rs index a4a7c5696..51371d280 100644 --- a/crates/goose-cli/src/commands/bench.rs +++ b/crates/goose-cli/src/commands/bench.rs @@ -2,7 +2,7 @@ use crate::session::build_session; use crate::Session; use async_trait::async_trait; use goose::message::Message; -use goose_bench::eval_suites::{BenchAgent, EvaluationReport, EvaluationSuiteFactory}; +use goose_bench::eval_suites::{BenchAgent, EvaluationSuiteFactory}; #[async_trait] impl BenchAgent for Session { @@ -13,8 +13,6 @@ impl BenchAgent for Session { } pub async fn run_benchmark(suites: Vec) { - let mut all_reports: Vec = vec![]; - let suites = EvaluationSuiteFactory::available_evaluations() .into_iter() .filter(|&s| suites.contains(&s.to_string())) @@ -27,16 +25,10 @@ pub async fn run_benchmark(suites: Vec) { }; for evaluation in evaluations { let session = build_session(None, false, Vec::new(), Vec::new()).await; - let report = match evaluation.run(Box::new(session)).await { + let _ = match evaluation.run(Box::new(session)).await { Ok(report) => report, _ => continue, }; - - // print report? - all_reports.push(report); } } - - // let summary = report_summary(all_reports)? - // print summary? } From 64826298de883a39879c874a7a7753bef44d20fa Mon Sep 17 00:00:00 2001 From: Marcelle Bonterre Date: Wed, 19 Feb 2025 16:04:21 -0500 Subject: [PATCH 09/10] fmt --- crates/goose-bench/src/eval_suites/evaluation.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/goose-bench/src/eval_suites/evaluation.rs b/crates/goose-bench/src/eval_suites/evaluation.rs index 18ba9cbbe..87890b772 100644 --- a/crates/goose-bench/src/eval_suites/evaluation.rs +++ b/crates/goose-bench/src/eval_suites/evaluation.rs @@ -13,7 +13,6 @@ pub enum EvaluationMetric { Boolean(bool), } - #[async_trait] pub trait BenchAgent: Send + Sync { async fn prompt(&mut self, p: String) -> Result>; From 91511aefdfd2cf5f44bda28e0253e712cada7036 Mon Sep 17 00:00:00 2001 From: Marcelle Bonterre Date: Wed, 19 Feb 2025 16:10:27 -0500 Subject: [PATCH 10/10] clippy --- crates/goose-cli/src/commands/bench.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/goose-cli/src/commands/bench.rs b/crates/goose-cli/src/commands/bench.rs index 51371d280..59be398f9 100644 --- a/crates/goose-cli/src/commands/bench.rs +++ b/crates/goose-cli/src/commands/bench.rs @@ -19,7 +19,7 @@ pub async fn run_benchmark(suites: Vec) { .collect::>(); for suite in suites { - let evaluations = match EvaluationSuiteFactory::create(&suite) { + let evaluations = match EvaluationSuiteFactory::create(suite) { Some(evaluations) => evaluations, None => continue, };