From 5ec0694ff1bee3de8a916a3124c4edb608e2fa80 Mon Sep 17 00:00:00 2001
From: Zaki Ali <zaki@squareup.com>
Date: Thu, 13 Feb 2025 21:20:50 -0800
Subject: [PATCH 01/10] initial commit of goosebench

---
 benchmark/goosebench/README.md      | 1 +
 benchmark/goosebench/hello.py       | 6 ++++++
 benchmark/goosebench/pyproject.toml | 7 +++++++
 3 files changed, 14 insertions(+)
 create mode 100644 benchmark/goosebench/README.md
 create mode 100644 benchmark/goosebench/hello.py
 create mode 100644 benchmark/goosebench/pyproject.toml

diff --git a/benchmark/goosebench/README.md b/benchmark/goosebench/README.md
new file mode 100644
index 000000000..e9cbd25ee
--- /dev/null
+++ b/benchmark/goosebench/README.md
@@ -0,0 +1 @@
+## GooseBench
diff --git a/benchmark/goosebench/hello.py b/benchmark/goosebench/hello.py
new file mode 100644
index 000000000..04e32c222
--- /dev/null
+++ b/benchmark/goosebench/hello.py
@@ -0,0 +1,6 @@
+def main():
+    print("Hello from goosebench!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/goosebench/pyproject.toml b/benchmark/goosebench/pyproject.toml
new file mode 100644
index 000000000..e1d21d9c6
--- /dev/null
+++ b/benchmark/goosebench/pyproject.toml
@@ -0,0 +1,7 @@
+[project]
+name = "goosebench"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = []

From e1c51be19050062db59d3dbe0ae37a31291565e8 Mon Sep 17 00:00:00 2001
From: Marcelle Bonterre <marcelle@squareup.com>
Date: Tue, 18 Feb 2025 12:16:40 -0500
Subject: [PATCH 02/10] ext (in)validation prompts

---
 .../workflows/install-and-run-goose.yml       |  33 ++
 benchmark/goosebench/.gitignore               | 172 ++++++++++
 benchmark/goosebench/README.md                |   1 -
 benchmark/goosebench/archive/test.sh          | 209 ++++++++++++
 benchmark/goosebench/bin/.python3@3.11.pkg    |   1 +
 benchmark/goosebench/bin/README.hermit.md     |   7 +
 benchmark/goosebench/bin/activate-hermit      |  21 ++
 benchmark/goosebench/bin/activate-hermit.fish |  24 ++
 benchmark/goosebench/bin/hermit               |  43 +++
 benchmark/goosebench/bin/hermit.hcl           |   2 +
 benchmark/goosebench/bin/pip                  |   1 +
 benchmark/goosebench/bin/pip3                 |   1 +
 benchmark/goosebench/bin/pip3.11              |   1 +
 benchmark/goosebench/bin/pydoc3               |   1 +
 benchmark/goosebench/bin/pydoc3.11            |   1 +
 benchmark/goosebench/bin/python               |   1 +
 benchmark/goosebench/bin/python3              |   1 +
 benchmark/goosebench/bin/python3-config       |   1 +
 benchmark/goosebench/bin/python3.11           |   1 +
 benchmark/goosebench/bin/python3.11-config    |   1 +
 benchmark/goosebench/config.yaml              |   7 +
 benchmark/goosebench/goosebench/__init__.py   |   0
 .../goosebench/evaluate_tools/__init__.py     |   0
 .../computercontroller_tool/__init__.py       |   0
 .../automation_script.py                      |  29 ++
 .../computercontroller_tool/cache.py          |  29 ++
 .../computer_control.py                       |  29 ++
 .../computercontroller_tool/web_scrape.py     |  29 ++
 .../computercontroller_tool/web_search.py     |  29 ++
 .../evaluate_tools/developer_tool/__init__.py |   0
 .../developer_tool/list_windows.py            |  29 ++
 .../developer_tool/screen_capture.py          |  29 ++
 .../evaluate_tools/developer_tool/shell.py    |  29 ++
 .../developer_tool/text_editor.py             |  29 ++
 .../google_drive_tool/__init__.py             |   0
 .../google_drive_tool/google_drive_read.py    |  29 ++
 .../google_drive_tool/google_drive_search.py  |  29 ++
 .../evaluate_tools/jetbrains_tool/__init__.py |   0
 .../jetbrains_tool/jetbrains.py               |  29 ++
 .../evaluate_tools/memory_tool/__init__.py    |   0
 .../memory_tool/remember_memory.py            |  29 ++
 .../memory_tool/remove_memory_category.py     |  29 ++
 .../memory_tool/remove_specific_memory.py     |  29 ++
 .../memory_tool/retrieve_memories.py          |  29 ++
 .../evaluate_tools/tutorial_tool/__init__.py  |   0
 .../evaluate_tools/tutorial_tool/tutorial.py  |  29 ++
 benchmark/goosebench/goosebench/main.py       | 237 +++++++++++++
 benchmark/goosebench/hello.py                 |   6 -
 benchmark/goosebench/poetry.lock              | 310 ++++++++++++++++++
 benchmark/goosebench/pyproject.toml           |  29 +-
 50 files changed, 1594 insertions(+), 11 deletions(-)
 create mode 100644 benchmark/goosebench/.github/workflows/install-and-run-goose.yml
 create mode 100644 benchmark/goosebench/.gitignore
 delete mode 100644 benchmark/goosebench/README.md
 create mode 100755 benchmark/goosebench/archive/test.sh
 create mode 120000 benchmark/goosebench/bin/.python3@3.11.pkg
 create mode 100644 benchmark/goosebench/bin/README.hermit.md
 create mode 100755 benchmark/goosebench/bin/activate-hermit
 create mode 100755 benchmark/goosebench/bin/activate-hermit.fish
 create mode 100755 benchmark/goosebench/bin/hermit
 create mode 100644 benchmark/goosebench/bin/hermit.hcl
 create mode 120000 benchmark/goosebench/bin/pip
 create mode 120000 benchmark/goosebench/bin/pip3
 create mode 120000 benchmark/goosebench/bin/pip3.11
 create mode 120000 benchmark/goosebench/bin/pydoc3
 create mode 120000 benchmark/goosebench/bin/pydoc3.11
 create mode 120000 benchmark/goosebench/bin/python
 create mode 120000 benchmark/goosebench/bin/python3
 create mode 120000 benchmark/goosebench/bin/python3-config
 create mode 120000 benchmark/goosebench/bin/python3.11
 create mode 120000 benchmark/goosebench/bin/python3.11-config
 create mode 100644 benchmark/goosebench/config.yaml
 create mode 100644 benchmark/goosebench/goosebench/__init__.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/__init__.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/__init__.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/automation_script.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/cache.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/computer_control.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_scrape.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_search.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/developer_tool/__init__.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/developer_tool/list_windows.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/developer_tool/screen_capture.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/developer_tool/shell.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/developer_tool/text_editor.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/__init__.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_read.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_search.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/__init__.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/jetbrains.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/memory_tool/__init__.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remember_memory.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_memory_category.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_specific_memory.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/memory_tool/retrieve_memories.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/__init__.py
 create mode 100644 benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/tutorial.py
 create mode 100755 benchmark/goosebench/goosebench/main.py
 delete mode 100644 benchmark/goosebench/hello.py
 create mode 100644 benchmark/goosebench/poetry.lock

diff --git a/benchmark/goosebench/.github/workflows/install-and-run-goose.yml b/benchmark/goosebench/.github/workflows/install-and-run-goose.yml
new file mode 100644
index 000000000..4b13b97e6
--- /dev/null
+++ b/benchmark/goosebench/.github/workflows/install-and-run-goose.yml
@@ -0,0 +1,33 @@
+name: Install and Run Goose
+# Installs the goose CLI on a fresh runner and runs the extension smoke tests.
+on:
+  push:
+    branches:
+      - main  # Or your preferred branch
+
+jobs:
+  install-and-run-goose:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+      # GITHUB_ENV values are stored literally, so $HOME must be expanded here.
+      - name: Set up Goose Environment
+        run: |
+          echo "GOOSE_BIN_DIR=$HOME/.local/bin" >> "$GITHUB_ENV"
+          echo "CONFIGURE=false" >> "$GITHUB_ENV"
+          echo "DATABRICKS_HOST=https://block-lakehouse-production.cloud.databricks.com" >> "$GITHUB_ENV"
+
+      - name: Install Goose
+        run: |
+          curl -fsSL https://github.com/block/goose/releases/download/stable/download_cli.sh | bash
+      # Aliases do not survive across steps; publish the bin dir via GITHUB_PATH.
+      - name: Configure Goose
+        run: |
+          mkdir -p ~/.config/goose/
+          cp "$GITHUB_WORKSPACE/benchmark/goosebench/archive/test.sh" ~/test.sh  # NOTE(review): path assumes the monorepo layout this patch creates
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+
+      - name: Run Goose Command
+        run: |
+            ~/test.sh -p databricks -m goose
diff --git a/benchmark/goosebench/.gitignore b/benchmark/goosebench/.gitignore
new file mode 100644
index 000000000..c4d4e4a7a
--- /dev/null
+++ b/benchmark/goosebench/.gitignore
@@ -0,0 +1,172 @@
+.playpen/*.log
+.hermit/
+*.ipynb
+.idea/
+.vscode/
+.goose/
+run_slack_app-backup.sh
+temp_creds.json
+temp_merged_policy.json
+token_google_docs.json
+creds-*.json
+assume_role_policy.json
+finetune_data/content/
+finetune_data/datasets/
+insights/messages/
+insights/env.sh
+src/qai_server/slack_app/modes/generated_images/
+src/qai_server/ingest/token_google_docs.json
+src/qai_server/ingest/content/
+src/qai_server/ingest/ingest_project/**/content/
+src/qai_server/ingest/.env
+src/qai_server/evals/messages/
+src/qai_server/evals/annotations_*.jsonl
+projects/ingest_docs/ingest_docs/content/
+projects/ingest_docs/content/
+projects/ingest_docs/notion_example*.json
+run_gdoc.sh
+
+# Datafiles
+*.csv
+*.gz
+*.h5
+*.pkl
+*.pk
+*.html
+*.log
+*.db
+*.db-journal
+
+
+## From: https://github.com/github/gitignore/blob/main/Python.gitignore
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+.python-version
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
\ No newline at end of file
diff --git a/benchmark/goosebench/README.md b/benchmark/goosebench/README.md
deleted file mode 100644
index e9cbd25ee..000000000
--- a/benchmark/goosebench/README.md
+++ /dev/null
@@ -1 +0,0 @@
-## GooseBench
diff --git a/benchmark/goosebench/archive/test.sh b/benchmark/goosebench/archive/test.sh
new file mode 100755
index 000000000..b53752e8d
--- /dev/null
+++ b/benchmark/goosebench/archive/test.sh
@@ -0,0 +1,209 @@
+#!/bin/bash
+
+# NOTE: MacOS ships with Bash 3.2 by default, which does NOT support declare -A for associative arrays.
+# This script uses standard Bash arrays to remain compatible.
+
+# Color codes for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+BOLD='\033[1m'
+
+# Failure records accumulated by log_error and printed by main at the end
+ERROR_LOG=()
+
+#---------------------------------------------------------------------------#
+# EXTENSIONS
+#---------------------------------------------------------------------------#
+# EXTENSIONS lists the builtin extensions to exercise. Each name needs a matching
+# <name>_prompts array below; test_extension resolves it by indirect expansion.
+#---------------------------------------------------------------------------#
+EXTENSIONS=(developer computercontroller google_drive memory)
+
+developer_prompts=(
+  "List the contents of the current directory."
+  "Create a new file called test.txt with the content 'Hello, World!'"
+  "Read the contents of test.txt"
+)
+
+computercontroller_prompts=(
+    "What are the headlines on hackernews? Organize the list into categories."
+    "Make a ding sound"
+)
+
+google_drive_prompts=(
+  "List the files in my Google Drive."
+  "Search for documents containing 'meeting notes'"
+)
+
+memory_prompts=(
+  "Save this fact: The capital of France is Paris."
+  "What is the capital of France?"
+)
+
+
+#---------------------------------------------------------------------------#
+# LOGGING FUNCTION
+#---------------------------------------------------------------------------#
+log_error() {
+  local provider=$1
+  local model=$2
+  local extension=$3
+  local error=$4
+  ERROR_LOG+=("${RED}[ERROR]${NC} Provider: $provider, Model: $model, Extension: $extension\n$error\n")
+}
+
+#---------------------------------------------------------------------------#
+# MAIN TEST FUNCTION
+#---------------------------------------------------------------------------#
+run_test() {
+  # Runs one prompt against one builtin extension; failures go to ERROR_LOG.
+  # Args: $1 provider, $2 model, $3 extension name, $4 prompt text.
+  local provider=$1 model=$2 extension=$3 prompt=$4
+  local timeout_seconds=30
+
+  echo -e "${YELLOW}Testing:${NC} $provider/$model with $extension"
+  echo -e "${YELLOW}Prompt:${NC} $prompt"
+
+  # Private temp file for captured output; avoids clobbering ./test_output.log.
+  local output_file status
+  output_file="$(mktemp)"
+
+  # Select provider/model via goose's environment overrides; previously both
+  # were only printed, so every pair ran against the locally configured model.
+  GOOSE_PROVIDER="$provider" GOOSE_MODEL="$model" \
+    timeout "$timeout_seconds" goose run \
+    --with-builtin "$extension" \
+    -t "$prompt" 2>&1 | tee "$output_file"
+  status=${PIPESTATUS[0]}  # status of timeout/goose, not of tee
+
+  if [ "$status" -ne 0 ]; then
+    log_error "$provider" "$model" "$extension" "$(cat "$output_file")"
+    echo -e "${RED}✗ Test failed${NC}"
+  else
+    echo -e "${GREEN}✓ Test passed${NC}"
+  fi
+  rm -f "$output_file"
+}
+
+#---------------------------------------------------------------------------#
+# TESTING EXTENSION (ITERATING OVER PROMPTS)
+#---------------------------------------------------------------------------#
+test_extension() {
+  local provider=$1
+  local model=$2
+  local extension=$3
+
+  echo -e "\n${BOLD}Testing extension: $extension${NC}"
+
+  # We'll build the array name dynamically, e.g. developer_prompts, memory_prompts, etc.
+  # Then we retrieve that array's contents via indirect expansion.
+  local arr_name="${extension}_prompts[@]"
+  local prompts=("${!arr_name}")
+
+  for prompt in "${prompts[@]}"; do
+    run_test "$provider" "$model" "$extension" "$prompt"
+    sleep 2  # brief pause
+  done
+}
+
+#---------------------------------------------------------------------------#
+# USAGE FUNCTION
+#---------------------------------------------------------------------------#
+usage() {
+  printf '%s\n' "Usage: $0 [-p provider -m model[,model2,model3]...]..." \
+    "  -p provider : Provider to use" \
+    "  -m models   : Comma-separated list of models to use with the provider" \
+    "  -h         : Show this help message" \
+    "" \
+    "Examples:" \
+    "  $0                                    # Uses default: databricks/goose" \
+    "  $0 -p anthropic -m claude             # Single provider/model" \
+    "  $0 -p anthropic -m claude,claude2     # One provider, multiple models" \
+    "  $0 -p anthropic -m claude -p databricks -m goose  # Multiple providers" \
+    "  $0 -p anthropic -m claude,claude2 -p databricks -m goose,goose2  # Multiple of both"
+  exit 1  # help text always ends the script with a non-zero status
+}
+
+#---------------------------------------------------------------------------#
+# MAIN WORKFLOW
+#---------------------------------------------------------------------------#
+main() {
+  # Entry point: parse -p/-m flags into provider:model pairs, test every extension per pair, print a summary.
+  declare -a provider_model_pairs=()
+  local current_provider=""
+
+  # Parse command line arguments; each -m binds its models to the most recent -p
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      -h)
+        usage  # usage exits the script, so control never returns here
+        ;;
+      -p)
+        shift
+        if [[ -z "$1" ]]; then
+          echo "Error: -p requires a provider name"
+          usage
+        fi
+        current_provider="$1"
+        shift
+        ;;
+      -m)
+        if [[ -z "$current_provider" ]]; then
+          echo "Error: -m must follow a -p option"
+          usage
+        fi
+        shift
+        if [[ -z "$1" ]]; then
+          echo "Error: -m requires at least one model name"
+          usage
+        fi
+        # Split the comma-separated model list and record one provider:model pair per model
+        IFS=',' read -ra models <<< "$1"
+        for model in "${models[@]}"; do
+          provider_model_pairs+=("$current_provider:$model")
+        done
+        shift
+        ;;
+      *)
+        echo "Error: Unknown option $1"
+        usage
+        ;;
+    esac
+  done
+
+  # No -m was given (pairs are only added by -m, even if -p was present): use the default pair
+  if [ ${#provider_model_pairs[@]} -eq 0 ]; then
+    provider_model_pairs=("databricks:goose")
+  fi
+
+  echo -e "${BOLD}Starting Goose CLI Integration Tests${NC}"
+
+  # Run the full extension suite once per provider/model pair
+  for pair in "${provider_model_pairs[@]}"; do
+    # Split at the first ':'; anything after it (including further colons) lands in $model
+    IFS=':' read -r provider model <<< "$pair"
+    
+    echo -e "\n${BOLD}Testing provider: $provider${NC}"
+    echo -e "${BOLD}Testing model: $model${NC}"
+
+    # Exercise each extension named in the global EXTENSIONS array for this pair
+    for extension in "${EXTENSIONS[@]}"; do
+      test_extension "$provider" "$model" "$extension"
+    done
+  done
+
+  # Print summary; exit 1 if any entry was appended to ERROR_LOG
+  if [ ${#ERROR_LOG[@]} -eq 0 ]; then
+    echo -e "\n${GREEN}All tests completed successfully!${NC}"
+  else
+    echo -e "\n${RED}Test Summary - Errors Found:${NC}"
+    echo -e "================================"
+    printf '%b\n' "${ERROR_LOG[@]}"
+    exit 1
+  fi
+}
+
+# Call main with all arguments
+main "$@"
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/.python3@3.11.pkg b/benchmark/goosebench/bin/.python3@3.11.pkg
new file mode 120000
index 000000000..383f4511d
--- /dev/null
+++ b/benchmark/goosebench/bin/.python3@3.11.pkg
@@ -0,0 +1 @@
+hermit
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/README.hermit.md b/benchmark/goosebench/bin/README.hermit.md
new file mode 100644
index 000000000..e889550ba
--- /dev/null
+++ b/benchmark/goosebench/bin/README.hermit.md
@@ -0,0 +1,7 @@
+# Hermit environment
+
+This is a [Hermit](https://github.com/cashapp/hermit) bin directory.
+
+The symlinks in this directory are managed by Hermit and will automatically
+download and install Hermit itself as well as packages. These packages are
+local to this environment.
diff --git a/benchmark/goosebench/bin/activate-hermit b/benchmark/goosebench/bin/activate-hermit
new file mode 100755
index 000000000..fe28214d3
--- /dev/null
+++ b/benchmark/goosebench/bin/activate-hermit
@@ -0,0 +1,21 @@
+#!/bin/bash
+# This file must be used with "source bin/activate-hermit" from bash or zsh.
+# You cannot run it directly
+#
+# THIS FILE IS GENERATED; DO NOT MODIFY
+
+if [ "${BASH_SOURCE-}" = "$0" ]; then
+  echo "You must source this script: \$ source $0" >&2
+  exit 33
+fi
+
+BIN_DIR="$(dirname "${BASH_SOURCE[0]:-${(%):-%x}}")"
+if "${BIN_DIR}/hermit" noop > /dev/null; then
+  eval "$("${BIN_DIR}/hermit" activate "${BIN_DIR}/..")"
+
+  if [ -n "${BASH-}" ] || [ -n "${ZSH_VERSION-}" ]; then
+      hash -r 2>/dev/null
+    fi
+
+    echo "Hermit environment $("${HERMIT_ENV}"/bin/hermit env HERMIT_ENV) activated"
+fi
diff --git a/benchmark/goosebench/bin/activate-hermit.fish b/benchmark/goosebench/bin/activate-hermit.fish
new file mode 100755
index 000000000..0367d2331
--- /dev/null
+++ b/benchmark/goosebench/bin/activate-hermit.fish
@@ -0,0 +1,24 @@
+#!/usr/bin/env fish
+
+# This file must be sourced with "source bin/activate-hermit.fish" from Fish shell.
+# You cannot run it directly.
+#
+# THIS FILE IS GENERATED; DO NOT MODIFY
+
+if status is-interactive
+    set BIN_DIR (dirname (status --current-filename))
+
+    if "$BIN_DIR/hermit" noop > /dev/null
+        # Source the activation script generated by Hermit
+        "$BIN_DIR/hermit" activate "$BIN_DIR/.." | source
+
+        # Clear the command cache if applicable
+        functions -c > /dev/null 2>&1
+
+        # Display activation message
+        echo "Hermit environment $($HERMIT_ENV/bin/hermit env HERMIT_ENV) activated"
+    end
+else
+    echo "You must source this script: source $argv[0]" >&2
+    exit 33
+end
diff --git a/benchmark/goosebench/bin/hermit b/benchmark/goosebench/bin/hermit
new file mode 100755
index 000000000..6dbd60cce
--- /dev/null
+++ b/benchmark/goosebench/bin/hermit
@@ -0,0 +1,43 @@
+#!/bin/bash
+#
+# THIS FILE IS GENERATED; DO NOT MODIFY
+
+set -eo pipefail
+
+export HERMIT_USER_HOME=~
+
+if [ -z "${HERMIT_STATE_DIR}" ]; then
+  case "$(uname -s)" in
+  Darwin)
+    export HERMIT_STATE_DIR="${HERMIT_USER_HOME}/Library/Caches/hermit"
+    ;;
+  Linux)
+    export HERMIT_STATE_DIR="${XDG_CACHE_HOME:-${HERMIT_USER_HOME}/.cache}/hermit"
+    ;;
+  esac
+fi
+
+export HERMIT_DIST_URL="${HERMIT_DIST_URL:-https://d1abdrezunyhdp.cloudfront.net/square}"
+HERMIT_CHANNEL="$(basename "${HERMIT_DIST_URL}")"
+export HERMIT_CHANNEL
+export HERMIT_EXE=${HERMIT_EXE:-${HERMIT_STATE_DIR}/pkg/hermit@${HERMIT_CHANNEL}/hermit}
+
+if [ ! -x "${HERMIT_EXE}" ]; then
+  echo "Bootstrapping ${HERMIT_EXE} from ${HERMIT_DIST_URL}" 1>&2
+  INSTALL_SCRIPT="$(mktemp)"
+  # This value must match that of the install script
+  INSTALL_SCRIPT_SHA256="d9774f75517f9a6d9e371daae9991cdb9fbbc390101b47c3fb2f6876d9094bab"
+  if [ "${INSTALL_SCRIPT_SHA256}" = "BYPASS" ]; then
+    curl -fsSL "${HERMIT_DIST_URL}/install.sh" -o "${INSTALL_SCRIPT}"
+  else
+    # Install script is versioned by its sha256sum value
+    curl -fsSL "${HERMIT_DIST_URL}/install-${INSTALL_SCRIPT_SHA256}.sh" -o "${INSTALL_SCRIPT}"
+    # Verify install script's sha256sum
+    openssl dgst -sha256 "${INSTALL_SCRIPT}" | \
+      awk -v EXPECTED="$INSTALL_SCRIPT_SHA256" \
+      '$2!=EXPECTED {print "Install script sha256 " $2 " does not match " EXPECTED; exit 1}'
+  fi
+  /bin/bash "${INSTALL_SCRIPT}" 1>&2
+fi
+
+exec "${HERMIT_EXE}" --level=fatal exec "$0" -- "$@"
diff --git a/benchmark/goosebench/bin/hermit.hcl b/benchmark/goosebench/bin/hermit.hcl
new file mode 100644
index 000000000..081cbe834
--- /dev/null
+++ b/benchmark/goosebench/bin/hermit.hcl
@@ -0,0 +1,2 @@
+github-token-auth {
+}
diff --git a/benchmark/goosebench/bin/pip b/benchmark/goosebench/bin/pip
new file mode 120000
index 000000000..b78b40b15
--- /dev/null
+++ b/benchmark/goosebench/bin/pip
@@ -0,0 +1 @@
+.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/pip3 b/benchmark/goosebench/bin/pip3
new file mode 120000
index 000000000..b78b40b15
--- /dev/null
+++ b/benchmark/goosebench/bin/pip3
@@ -0,0 +1 @@
+.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/pip3.11 b/benchmark/goosebench/bin/pip3.11
new file mode 120000
index 000000000..b78b40b15
--- /dev/null
+++ b/benchmark/goosebench/bin/pip3.11
@@ -0,0 +1 @@
+.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/pydoc3 b/benchmark/goosebench/bin/pydoc3
new file mode 120000
index 000000000..b78b40b15
--- /dev/null
+++ b/benchmark/goosebench/bin/pydoc3
@@ -0,0 +1 @@
+.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/pydoc3.11 b/benchmark/goosebench/bin/pydoc3.11
new file mode 120000
index 000000000..b78b40b15
--- /dev/null
+++ b/benchmark/goosebench/bin/pydoc3.11
@@ -0,0 +1 @@
+.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/python b/benchmark/goosebench/bin/python
new file mode 120000
index 000000000..b78b40b15
--- /dev/null
+++ b/benchmark/goosebench/bin/python
@@ -0,0 +1 @@
+.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/python3 b/benchmark/goosebench/bin/python3
new file mode 120000
index 000000000..b78b40b15
--- /dev/null
+++ b/benchmark/goosebench/bin/python3
@@ -0,0 +1 @@
+.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/python3-config b/benchmark/goosebench/bin/python3-config
new file mode 120000
index 000000000..b78b40b15
--- /dev/null
+++ b/benchmark/goosebench/bin/python3-config
@@ -0,0 +1 @@
+.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/python3.11 b/benchmark/goosebench/bin/python3.11
new file mode 120000
index 000000000..b78b40b15
--- /dev/null
+++ b/benchmark/goosebench/bin/python3.11
@@ -0,0 +1 @@
+.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/python3.11-config b/benchmark/goosebench/bin/python3.11-config
new file mode 120000
index 000000000..b78b40b15
--- /dev/null
+++ b/benchmark/goosebench/bin/python3.11-config
@@ -0,0 +1 @@
+.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/config.yaml b/benchmark/goosebench/config.yaml
new file mode 100644
index 000000000..50c26b306
--- /dev/null
+++ b/benchmark/goosebench/config.yaml
@@ -0,0 +1,7 @@
+GOOSE_MODEL: goose
+GOOSE_PROVIDER: databricks
+extensions:
+  developer:
+    enabled: true
+    name: developer
+    type: builtin
diff --git a/benchmark/goosebench/goosebench/__init__.py b/benchmark/goosebench/goosebench/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/__init__.py b/benchmark/goosebench/goosebench/evaluate_tools/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/__init__.py b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/automation_script.py b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/automation_script.py
new file mode 100644
index 000000000..9c6944ec8
--- /dev/null
+++ b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/automation_script.py
@@ -0,0 +1,29 @@
+"""Test cases for the automation script tool."""
+
+# Prompts that should trigger valid automation script tool usage
+valid_prompts = [
+    "Create a shell script to sort unique lines in a file",
+    "Write a Ruby script to process some text data",
+    "Make a script to extract the second column from a CSV",
+    "Create a script to find pattern matches in a file",
+    "Write a shell script to process log files",
+    "Create a Ruby script for text manipulation",
+    "Make a script to analyze data in a text file",
+    "Write a script to format JSON data",
+    "Create a script to clean up file names",
+    "Write a script to extract specific data from files",
+]
+
+# Prompts that should not trigger automation script tool usage based on tool description
+invalid_prompts = [
+    "Create a complex application with multiple files",
+    "Write a script that requires external dependencies",
+    "Create a script that needs a database",
+    "Write a GUI application",
+    "Create a web server application",
+    "Write a script that needs special system access",
+    "Create a script that requires third-party libraries",
+    "Write a script that needs network services",
+    "Create a distributed processing script",
+    "Write a script that requires system installation",
+]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/cache.py b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/cache.py
new file mode 100644
index 000000000..be78ad8c8
--- /dev/null
+++ b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/cache.py
@@ -0,0 +1,29 @@
+"""Test cases for the cache tool."""
+
+# Prompts that should trigger valid cache tool usage
+valid_prompts = [
+    "List all cached files",
+    "Show me what's in the cache",
+    "View the content of this cached file",
+    "Delete this specific cached file",
+    "Clear all cached data",
+    "Show the contents of a cached file",
+    "Remove this file from cache",
+    "List the cache directory contents",
+    "View a cached text file",
+    "Delete everything from the cache",
+]
+
+# Prompts that should not trigger cache tool usage based on tool description
+invalid_prompts = [
+    "Modify a cached file directly",
+    "Search within cached files",
+    "Compress the cache directory",
+    "Move cached files to another location",
+    "Change cache directory permissions",
+    "Reorganize cached files",
+    "Filter cache by file type",
+    "Sort cached files by size",
+    "Archive old cached files",
+    "Backup the cache directory",
+]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/computer_control.py b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/computer_control.py
new file mode 100644
index 000000000..3d17f19cd
--- /dev/null
+++ b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/computer_control.py
@@ -0,0 +1,29 @@
+"""Test cases for the computer control tool."""
+
+# Prompts that should trigger valid computer control tool usage
+valid_prompts = [
+    "Launch Safari and open a specific URL",
+    "Use AppleScript to automate Mail app",
+    "Click a button in the current application",
+    "Fill out a form in Safari",
+    "Control system volume using AppleScript",
+    "Organize files in a folder",
+    "Add an event to Calendar",
+    "Send an email using Mail app",
+    "Manage iTunes playlist",
+    "Automate document processing in Pages",
+]
+
+# Prompts that should not trigger computer control tool usage based on tool description
+invalid_prompts = [
+    "Control applications that don't support AppleScript",
+    "Perform actions requiring root access",
+    "Modify system files directly",
+    "Access restricted system areas",
+    "Control non-Apple applications without AppleScript support",
+    "Perform actions requiring kernel modifications",
+    "Execute privileged system commands",
+    "Modify protected system settings",
+    "Access hardware directly",
+    "Control low-level system functions",
+]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_scrape.py b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_scrape.py
new file mode 100644
index 000000000..ebc7eed5a
--- /dev/null
+++ b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_scrape.py
@@ -0,0 +1,29 @@
+"""Test cases for the web scrape tool."""
+
+# Prompts that should trigger valid web scrape tool usage
+valid_prompts = [
+    "Fetch the content from https://example.com",
+    "Download the HTML from this webpage",
+    "Get JSON data from this API endpoint",
+    "Save this image from the web",
+    "Scrape text content from this URL",
+    "Download this webpage as text",
+    "Get the JSON response from this API",
+    "Save this binary file from the web",
+    "Fetch and cache this webpage",
+    "Download this document as text",
+]
+
+# Prompts that should not trigger web scrape tool usage based on tool description
+invalid_prompts = [
+    "Scrape a complex web application with dynamic content",
+    "Extract data from a JavaScript-heavy website",
+    "Scrape content that requires login",
+    "Download content from multiple pages at once",
+    "Extract data from a site with anti-scraping measures",
+    "Scrape content that requires user interaction",
+    "Download content from a protected API",
+    "Extract data from pages requiring authentication",
+    "Scrape content from multiple URLs simultaneously",
+    "Download data from a site requiring cookies",
+]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_search.py b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_search.py
new file mode 100644
index 000000000..556ad0ea6
--- /dev/null
+++ b/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_search.py
@@ -0,0 +1,29 @@
+"""Test cases for the web search tool."""
+
+# Prompts that should trigger valid web search tool usage
+valid_prompts = [
+    "Search for information about 'Tesla'",
+    "Look up what 'Bitcoin' is",
+    "Find details about 'SpaceX'",
+    "Search for 'Python' programming language",
+    "What is 'Docker'?",
+    "Look up the company 'Microsoft'",
+    "Search for information about 'Linux'",
+    "Find out about 'AWS'",
+    "What is 'Kubernetes'?",
+    "Search for 'React' framework",
+]
+
+# Prompts that should not trigger web search tool usage based on tool description
+invalid_prompts = [
+    "Search for multiple words at once",
+    "Look up a complex query with multiple terms",
+    "Search for a long phrase",
+    "Find results for this entire sentence",
+    "Search for 'word1 word2 word3'",
+    "Look up multiple topics at once",
+    "Search for a paragraph of text",
+    "Find results for multiple questions",
+    "Search for a list of items",
+    "Look up several different topics",
+]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/__init__.py b/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/list_windows.py b/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/list_windows.py
new file mode 100644
index 000000000..94906613f
--- /dev/null
+++ b/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/list_windows.py
@@ -0,0 +1,29 @@
+"""Test cases for the list windows tool."""
+
+# Prompts that should trigger valid list windows tool usage
+valid_prompts = [
+    "Show me all available windows",
+    "List the windows that can be captured",
+    "What windows are currently open?",
+    "Display available window titles",
+    "Get a list of windows for screen capture",
+    "Show window titles that I can screenshot",
+    "What windows can I take screenshots of?",
+    "List all window titles",
+    "Show me what windows are available for capture",
+    "Get available window names",
+]
+
+# Prompts that should not trigger list windows tool usage based on tool description
+invalid_prompts = [
+    "Close all windows",
+    "Minimize the current window",
+    "Maximize the browser window",
+    "Move window to another display",
+    "Resize the current window",
+    "Change window focus",
+    "Arrange windows on screen",
+    "Hide inactive windows",
+    "Show desktop",
+    "Switch between windows",
+]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/screen_capture.py b/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/screen_capture.py
new file mode 100644
index 000000000..0daef846f
--- /dev/null
+++ b/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/screen_capture.py
@@ -0,0 +1,29 @@
+"""Test cases for the screen capture tool."""
+
+# Prompts that should trigger valid screen capture tool usage
+valid_prompts = [
+    "Take a screenshot of the main display",
+    "Capture the window titled 'Terminal'",
+    "Screenshot the current display",
+    "Take a picture of display 0",
+    "Capture a screenshot of the browser window",
+    "Take a screenshot of the active window",
+    "Capture display 1",
+    "Screenshot the window named 'Settings'",
+    "Take a capture of the main screen",
+    "Screenshot the specified window",
+]
+
+# Prompts that should not trigger screen capture tool usage based on tool description
+invalid_prompts = [
+    "Capture multiple windows at once",
+    "Take a screenshot of all displays",
+    "Record a video of the screen",
+    "Capture a region of the screen",
+    "Take a partial screenshot",
+    "Screenshot a specific area",
+    "Capture screen with mouse cursor",
+    "Take a timed screenshot",
+    "Screenshot with specific dimensions",
+    "Capture screen without window decorations",
+]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/shell.py b/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/shell.py
new file mode 100644
index 000000000..916f0cfd4
--- /dev/null
+++ b/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/shell.py
@@ -0,0 +1,29 @@
+"""Test cases for the shell tool."""
+
+# Prompts that should trigger valid shell tool usage
+valid_prompts = [
+    "Run the command 'ls' to list files",
+    "Execute 'pwd' to show current directory",
+    "Use ripgrep to search for files containing 'example'",
+    "Find all Python files using 'rg --files | rg .py'",
+    "Search for the string 'class Example' in files using ripgrep",
+    "Show the contents of a file using cat",
+    "Count lines in a file using wc -l",
+    "Check disk space with df -h",
+    "List processes with ps",
+    "Create a directory with mkdir test",
+]
+
+# Prompts that should not trigger shell tool usage based on tool description
+invalid_prompts = [
+    "Run a command that will produce gigabytes of output",
+    "Start a long-running server without backgrounding it",
+    "Use find to recursively search for files",
+    "Use ls -R to list all files recursively",
+    "Execute a command that will run indefinitely",
+    "Run a command that streams continuous output",
+    "Use grep recursively to search files",
+    "Start a process that needs to be manually terminated",
+    "Run a command that generates unlimited output",
+    "Execute ls -la on the entire filesystem",
+]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/text_editor.py b/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/text_editor.py
new file mode 100644
index 000000000..689548c01
--- /dev/null
+++ b/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/text_editor.py
@@ -0,0 +1,29 @@
+"""Test cases for the text editor tool."""
+
+# Prompts that should trigger valid text editor tool usage
+valid_prompts = [
+    "View the contents of file.txt",
+    "Show me what's in config.py",
+    "Create a new file called test.txt with 'Hello World' content",
+    "Write 'print(\"hello\")' to script.py",
+    "Replace the string 'old_version' with 'new_version' in config.txt",
+    "Change 'debug=True' to 'debug=False' in settings.py",
+    "Undo the last edit made to main.py",
+    "Revert the previous change in config.json",
+    "Write this JSON content to data.json",
+    "Update the version number in package.json",
+]
+
+# Prompts that should not trigger text editor tool usage based on tool description
+invalid_prompts = [
+    "Edit multiple sections of the file at once",
+    "Replace all occurrences of a string in the file",
+    "Make changes to multiple files simultaneously",
+    "Modify a file that's larger than 400KB",
+    "Edit a file with more than 400,000 characters",
+    "Replace a string that appears multiple times in the file",
+    "Make partial updates to specific sections without full file content",
+    "Edit binary files",
+    "Modify files without providing full path",
+    "Replace text without exact string match",
+]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/__init__.py b/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_read.py b/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_read.py
new file mode 100644
index 000000000..4f81b9ec2
--- /dev/null
+++ b/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_read.py
@@ -0,0 +1,29 @@
+"""Test cases for the Google Drive read tool."""
+
+# Prompts that should trigger valid read tool usage
+valid_prompts = [
+    "Read the file with URI gdrive:///abc123",
+    "Show me the contents of gdrive:///xyz789",
+    "Get the text from gdrive:///doc456",
+    "Read this Google Doc gdrive:///123abc",
+    "Show the contents of spreadsheet gdrive:///789xyz",
+    "Get the text of presentation gdrive:///456def",
+    "Read file gdrive:///def123 and include images",
+    "Show me gdrive:///789abc without images",
+    "Get the content of document gdrive:///xyz456",
+    "Read text file gdrive:///123xyz",
+]
+
+# Prompts that should not trigger read tool usage based on tool description
+invalid_prompts = [
+    "Edit the file gdrive:///abc123",
+    "Write to document gdrive:///xyz789",
+    "Modify spreadsheet gdrive:///123def",
+    "Update presentation gdrive:///def789",
+    "Delete file gdrive:///789xyz",
+    "Create new document gdrive:///456abc",
+    "Share file gdrive:///xyz123",
+    "Move document gdrive:///789def",
+    "Copy file gdrive:///abc789",
+    "Rename document gdrive:///def456",
+]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_search.py b/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_search.py
new file mode 100644
index 000000000..3a093b5f5
--- /dev/null
+++ b/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_search.py
@@ -0,0 +1,29 @@
+"""Test cases for the Google Drive search tool."""
+
+# Prompts that should trigger valid search tool usage
+valid_prompts = [
+    "Search for files named 'budget'",
+    "Find documents containing 'report'",
+    "Look for files with 'presentation' in the name",
+    "Search my drive for 'meeting notes'",
+    "Find files named 'project plan'",
+    "Search for 'invoice' in my files",
+    "Look up documents named 'proposal'",
+    "Find spreadsheets with 'data' in the name",
+    "Search for files containing 'schedule'",
+    "Find documents with 'summary' in the title",
+]
+
+# Prompts that should not trigger search tool usage based on tool description
+invalid_prompts = [
+    "Search for files modified in the last week",
+    "Find files larger than 1MB",
+    "Search for files shared with me",
+    "Look for files in a specific folder",
+    "Find files by type",
+    "Search for files by owner",
+    "Look for recently modified files",
+    "Find files with specific permissions",
+    "Search for files by date",
+    "Find files in trash",
+]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/__init__.py b/benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/jetbrains.py b/benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/jetbrains.py
new file mode 100644
index 000000000..b5cf13ef8
--- /dev/null
+++ b/benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/jetbrains.py
@@ -0,0 +1,29 @@
+"""Test cases for the JetBrains IDE integration tools."""
+
+# Prompts that should trigger valid JetBrains tool usage
+valid_prompts = [
+    "Open the current file in the IDE",
+    "Navigate to line 42 in the active file",
+    "Find usages of this class",
+    "Go to the definition of this method",
+    "Show documentation for this symbol",
+    "Run the current test file",
+    "Debug this application",
+    "Show project structure",
+    "Open recent files",
+    "Search everywhere in the project",
+]
+
+# Prompts that should not trigger JetBrains tool usage based on tool description
+invalid_prompts = [
+    "Create a new IDE instance",
+    "Modify IDE settings",
+    "Install new plugins",
+    "Change IDE theme",
+    "Update the IDE version",
+    "Configure version control",
+    "Modify IDE keymap",
+    "Change project settings",
+    "Install new IDE features",
+    "Uninstall IDE components",
+]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/__init__.py b/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remember_memory.py b/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remember_memory.py
new file mode 100644
index 000000000..5be419786
--- /dev/null
+++ b/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remember_memory.py
@@ -0,0 +1,29 @@
+"""Test cases for the remember memory tool."""
+
+# Prompts that should trigger valid remember memory tool usage
+valid_prompts = [
+    "Remember this development preference in the 'development' category",
+    "Store this setting globally with tags #config #setup",
+    "Save this workflow detail locally in 'workflow' category",
+    "Remember my name and email in the 'personal' category globally",
+    "Store project configuration locally with #settings tag",
+    "Save this formatting preference in development category",
+    "Remember this shortcut in 'keyboard' category with #shortcuts tag",
+    "Store build instructions locally in 'build' category",
+    "Save API credentials globally in 'credentials' category",
+    "Remember git configuration in 'git' category with #config tag",
+]
+
+# Prompts that should not trigger remember memory tool usage based on tool description
+invalid_prompts = [
+    "Save this without specifying a category",
+    "Store this without indicating global or local scope",
+    "Remember this with invalid tags format",
+    "Save empty content in a category",
+    "Store this in multiple categories at once",
+    "Remember this with system-level access",
+    "Save this in a protected category",
+    "Store this with special file permissions",
+    "Remember this in a non-existent directory",
+    "Save this with binary content",
+]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_memory_category.py b/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_memory_category.py
new file mode 100644
index 000000000..fb29aebfe
--- /dev/null
+++ b/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_memory_category.py
@@ -0,0 +1,29 @@
+"""Test cases for the remove memory category tool."""
+
+# Prompts that should trigger valid remove memory category tool usage
+valid_prompts = [
+    "Delete all memories in the 'development' category",
+    "Clear the 'workflow' category from global storage",
+    "Remove all local project settings",
+    "Delete everything in the 'personal' category",
+    "Clear all global memories",
+    "Remove all local memories",
+    "Delete the 'build' category",
+    "Clear project configuration category",
+    "Remove the 'git' category memories",
+    "Delete all items in 'credentials' category",
+]
+
+# Prompts that should not trigger remove memory category tool usage based on tool description
+invalid_prompts = [
+    "Delete memories across multiple categories",
+    "Remove memories without specifying scope",
+    "Clear memories by date range",
+    "Delete memories by content",
+    "Remove memories with specific tags",
+    "Clear memories by partial category match",
+    "Delete memories selectively",
+    "Remove memories by size",
+    "Clear recently modified memories",
+    "Delete memories by author",
+]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_specific_memory.py b/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_specific_memory.py
new file mode 100644
index 000000000..502a43895
--- /dev/null
+++ b/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_specific_memory.py
@@ -0,0 +1,29 @@
+"""Test cases for the remove specific memory tool."""
+
+# Prompts that should trigger valid remove specific memory tool usage
+valid_prompts = [
+    "Delete the memory about code formatting from development category",
+    "Remove the git configuration memory from global storage",
+    "Delete project API key from credentials category",
+    "Remove my email setting from personal category",
+    "Delete the build instruction memory from local storage",
+    "Remove specific workflow step from workflow category",
+    "Delete keyboard shortcut memory from shortcuts category",
+    "Remove specific project setting from local config",
+    "Delete specific credential from global storage",
+    "Remove particular preference from settings category",
+]
+
+# Prompts that should not trigger remove specific memory tool usage based on tool description
+invalid_prompts = [
+    "Delete multiple memories at once",
+    "Remove memories by pattern matching",
+    "Delete memories without exact content",
+    "Remove memories by tag only",
+    "Delete memories by date",
+    "Remove partial memory content",
+    "Delete memories by regex",
+    "Remove memories without category",
+    "Delete memories by approximate match",
+    "Remove memories without scope",
+]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/retrieve_memories.py b/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/retrieve_memories.py
new file mode 100644
index 000000000..59b187f8e
--- /dev/null
+++ b/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/retrieve_memories.py
@@ -0,0 +1,29 @@
+"""Test cases for the retrieve memories tool."""
+
+# Prompts that should trigger valid retrieve memories tool usage
+valid_prompts = [
+    "Show all memories in the 'development' category",
+    "Get my stored preferences from global memory",
+    "Retrieve local project settings",
+    "Show me what's stored in the 'workflow' category",
+    "Get all global memories",
+    "Retrieve everything from local storage",
+    "Show memories tagged with #config",
+    "Get all items from 'personal' category",
+    "Retrieve project-specific memories",
+    "Show what's saved in the 'build' category",
+]
+
+# Prompts that should not trigger retrieve memories tool usage based on tool description
+invalid_prompts = [
+    "Search across multiple categories at once",
+    "Find memories without specifying scope",
+    "Get memories with complex search criteria",
+    "Retrieve memories by date range",
+    "Search memories by content",
+    "Get memories by partial category match",
+    "Retrieve memories with regex patterns",
+    "Find memories by size",
+    "Get memories modified recently",
+    "Search memories by author",
+]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/__init__.py b/benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/tutorial.py b/benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/tutorial.py
new file mode 100644
index 000000000..f208d6a76
--- /dev/null
+++ b/benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/tutorial.py
@@ -0,0 +1,29 @@
+"""Test cases for the load tutorial tool."""
+
+# Prompts that should trigger valid load tutorial tool usage
+valid_prompts = [
+    "Show me the getting-started tutorial",
+    "Load the developer-mcp tutorial",
+    "I need help getting started, show the tutorial",
+    "Can you load the tutorial about development?",
+    "Show me how to use Goose with the tutorial",
+    "Load the beginner's guide tutorial",
+    "I'm new here, can you show me the introduction tutorial?",
+    "Display the tutorial for developers",
+    "Show the tutorial about MCP development",
+    "Load the basic usage tutorial",
+]
+
+# Prompts that should not trigger load tutorial tool usage based on tool description
+invalid_prompts = [
+    "Create a new tutorial",
+    "Edit the existing tutorial",
+    "Delete this tutorial",
+    "Modify tutorial content",
+    "Save this as a tutorial",
+    "Update the tutorial text",
+    "Remove old tutorials",
+    "Change tutorial format",
+    "Add new tutorial section",
+    "Merge multiple tutorials",
+]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/main.py b/benchmark/goosebench/goosebench/main.py
new file mode 100755
index 000000000..ce6bf4b69
--- /dev/null
+++ b/benchmark/goosebench/goosebench/main.py
@@ -0,0 +1,237 @@
+#!/usr/bin/env python3
+import dataclasses
+import os
+import subprocess
+import tempfile
+import time
+from enum import Enum
+from typing import List, Optional
+
+import typer
+from rich.console import Console
+from rich.theme import Theme
+from typing_extensions import Annotated
+
+# Initialize typer app and rich console
+app = typer.Typer(help="Goose CLI Integration Tests")
+console = Console(theme=Theme({
+    "info": "cyan",
+    "warning": "yellow",
+    "error": "red",
+    "success": "green"
+}))
+
+
+# Define workflow types
+class Workflow(str, Enum):
+    SERIAL = "serial"
+    CONVERSATIONAL = "conversational"
+
+
+@dataclasses.dataclass
+class Topic:
+    initial_prompt: str
+    follow_ups: List[str]
+
+
+@dataclasses.dataclass
+class Conversation:
+    topics: List[Topic]
+
+
+# Extension configurations
+EXTENSIONS = ['developer', 'computercontroller', 'google_drive', 'memory']
+
+EXTENSION_PROMPTS = {
+    'developer': [
+        "List the contents of the current directory.",
+        "Create a new file called test.txt with the content 'Hello, World!'",
+        "Read the contents of test.txt"
+    ],
+    'computercontroller': [
+        "What are the headlines on hackernews? Organize the list into categories.",
+        "Make a ding sound"
+    ],
+    'google_drive': [
+        "List the files in my Google Drive.",
+        "Search for documents containing 'meeting notes'"
+    ],
+    'memory': [
+        "Save this fact: The capital of France is Paris.",
+        "What is the capital of France?"
+    ]
+}
+
+CONV_EXTENSION_PROMPTS = {
+    k: Conversation(topics=[
+        Topic(val, ["summarize"])
+        for val in v
+    ])
+    for k, v in EXTENSION_PROMPTS.items()
+}
+
+
+class Bench:
+    def __init__(self, workflow: Workflow):
+        self.error_log = []
+        self.workflow = workflow
+
+    def log_error(self, provider: str, model: str, extension: str, error: str) -> None:
+        """Log an error message."""
+        self.error_log.append(
+            f"Provider: {provider}, Model: {model}, Extension: {extension}\n{error}\n"
+        )
+
+    def evaluate(self,
+                 provider: str,
+                 model: str,
+                 extension: str,
+                 prompt: str,
+                 follow_ups: Optional[List[str]] = None) -> None:
+        """Run one prompt through `goose run` and record any failure.
+
+        The prompt is passed on the command line via `-t` with `extension`
+        enabled through `--with-builtin`; a non-zero exit status, a timeout
+        or any other exception is appended to `self.error_log`.
+
+        NOTE(review): `provider`/`model` are only used for labelling and
+        `follow_ups` are accepted but not sent to goose yet.
+        """
+        console.print(f"Testing: {provider}/{model} with {extension}", style="info")
+        console.print(f"Prompt: {prompt}", style="info")
+        console.print(f"Workflow: {self.workflow.value}", style="info")
+
+        follow_ups = follow_ups or []
+
+        try:
+            # Run goose with timeout
+            cmd = ['goose', 'run', '--with-builtin', extension, '-t', prompt]
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=30
+            )
+
+            if result.returncode != 0:
+                self.log_error(provider, model, extension,
+                               result.stdout + result.stderr)
+                console.print("✗ Test failed", style="error")
+            else:
+                console.print("✓ Test passed", style="success")
+
+        except subprocess.TimeoutExpired:
+            self.log_error(provider, model, extension,
+                           "Test timed out after 30 seconds")
+            console.print("✗ Test timed out", style="error")
+        except Exception as e:
+            self.log_error(provider, model, extension, str(e))
+            console.print("✗ Test failed with unexpected error", style="error")
+
+    def _run_serial(self, provider: str, model: str, extension: str) -> None:
+        prompts = EXTENSION_PROMPTS.get(extension, [])
+        for prompt in prompts:
+            self.evaluate(provider, model, extension, prompt)
+            time.sleep(2)  # brief pause between tests
+
+    def _run_conversational(self, provider: str, model: str, extension: str) -> None:
+        """Evaluate every conversation topic registered for `extension`."""
+        # Unknown extensions yield no topics (a `[]` default has no `.topics`).
+        conv = CONV_EXTENSION_PROMPTS.get(extension)
+        for t in (conv.topics if conv is not None else []):
+            self.evaluate(provider, model, extension, t.initial_prompt, t.follow_ups)
+            time.sleep(2)  # brief pause between tests
+
+    def test_extension(self, provider: str, model: str, extension: str) -> None:
+        """Test all prompts for a given extension."""
+        console.rule(f"Testing extension: {extension}")
+
+        if self.workflow == Workflow.CONVERSATIONAL:
+            return self._run_conversational(provider, model, extension)
+
+        return self._run_serial(provider, model, extension)
+
+
+def parse_provider_model(ctx: typer.Context, provider_models: List[str]) -> List[
+    tuple[str, str]]:
+    """Parse 'provider:model[,model2]' strings into (provider, model) tuples."""
+    result = []
+    for pm in provider_models:
+        # Split on the first ':' only so model names like 'name:tag' stay intact.
+        provider, sep, models = pm.partition(':')
+        names = [m.strip() for m in models.split(',')]
+        if not (sep and provider.strip() and all(names)):
+            raise typer.BadParameter(
+                f"Invalid format: {pm}. Use format 'provider:model' or 'provider:model1,model2'"
+            )
+        result.extend((provider.strip(), name) for name in names)
+    return result
+
+
+@app.command()
+def main(
+        provider_models: Annotated[
+            Optional[List[str]],
+            typer.Option(
+                '--provider-model', '-pm',
+                help="Provider and model in format 'provider:model' or 'provider:model1,model2'"
+            )
+        ] = None,
+        workflow: Annotated[
+            Workflow,
+            typer.Option(
+                '--workflow', '-w',
+                help="Workflow type: serial or conversational"
+            )
+        ] = Workflow.SERIAL,
+        verbose: Annotated[
+            bool,
+            typer.Option('--verbose', '-v', help="Enable verbose output")
+        ] = False,
+):
+    """
+    Run Goose CLI Integration Tests.
+    
+    Example usage:
+    
+    python main.py  # Uses default: databricks:goose with serial workflow
+    python main.py -pm anthropic:claude
+    python main.py -pm anthropic:claude,claude2
+    python main.py -pm anthropic:claude -pm databricks:goose
+    python main.py --workflow conversational  # Use conversational workflow
+    """
+    console.print("Starting Goose CLI Integration Tests", style="bold")
+
+    runner = Bench(workflow)
+
+    # Use default if no provider-models specified
+    if not provider_models:
+        provider_models = ['databricks:goose']
+
+    # Parse pairs; the typer.Context class is only a placeholder (the parser ignores ctx)
+    try:
+        provider_model_pairs = parse_provider_model(typer.Context, provider_models)
+    except typer.BadParameter as e:
+        console.print(f"Error: {str(e)}", style="error")
+        raise typer.Exit(1)
+
+    for provider, model in provider_model_pairs:
+        console.rule(f"Testing provider: {provider}")
+        console.print(f"Testing model: {model}", style="bold")
+
+        for extension in EXTENSIONS:
+            runner.test_extension(provider, model, extension)
+
+    # Print summary; exit with status 1 if any test logged an error
+    if not runner.error_log:
+        console.print("\nAll tests completed successfully!", style="success")
+    else:
+        console.print("\nTest Summary - Errors Found:", style="error")
+        console.rule("Errors")
+        for error in runner.error_log:
+            console.print(error, style="error")
+        raise typer.Exit(1)
+
+
+if __name__ == "__main__":
+    app()
diff --git a/benchmark/goosebench/hello.py b/benchmark/goosebench/hello.py
deleted file mode 100644
index 04e32c222..000000000
--- a/benchmark/goosebench/hello.py
+++ /dev/null
@@ -1,6 +0,0 @@
-def main():
-    print("Hello from goosebench!")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark/goosebench/poetry.lock b/benchmark/goosebench/poetry.lock
new file mode 100644
index 000000000..7141b3575
--- /dev/null
+++ b/benchmark/goosebench/poetry.lock
@@ -0,0 +1,310 @@
+# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand.
+
+[[package]]
+name = "click"
+version = "8.1.8"
+description = "Composable command line interface toolkit"
+optional = false
+python-versions = ">=3.7"
+groups = ["main"]
+files = [
+    {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"},
+    {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"},
+]
+
+[package.dependencies]
+colorama = {version = "*", markers = "platform_system == \"Windows\""}
+
+[package.source]
+type = "legacy"
+url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
+reference = "artifactory"
+
+[[package]]
+name = "colorama"
+version = "0.4.6"
+description = "Cross-platform colored terminal text."
+optional = false
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
+groups = ["main", "dev"]
+files = [
+    {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
+    {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
+]
+markers = {main = "platform_system == \"Windows\"", dev = "sys_platform == \"win32\""}
+
+[package.source]
+type = "legacy"
+url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
+reference = "artifactory"
+
+[[package]]
+name = "iniconfig"
+version = "2.0.0"
+description = "brain-dead simple config-ini parsing"
+optional = false
+python-versions = ">=3.7"
+groups = ["dev"]
+files = [
+    {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"},
+    {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
+]
+
+[package.source]
+type = "legacy"
+url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
+reference = "artifactory"
+
+[[package]]
+name = "markdown-it-py"
+version = "3.0.0"
+description = "Python port of markdown-it. Markdown parsing, done right!"
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"},
+    {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"},
+]
+
+[package.dependencies]
+mdurl = ">=0.1,<1.0"
+
+[package.extras]
+benchmarking = ["psutil", "pytest", "pytest-benchmark"]
+code-style = ["pre-commit (>=3.0,<4.0)"]
+compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"]
+linkify = ["linkify-it-py (>=1,<3)"]
+plugins = ["mdit-py-plugins"]
+profiling = ["gprof2dot"]
+rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"]
+testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"]
+
+[package.source]
+type = "legacy"
+url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
+reference = "artifactory"
+
+[[package]]
+name = "mdurl"
+version = "0.1.2"
+description = "Markdown URL utilities"
+optional = false
+python-versions = ">=3.7"
+groups = ["main"]
+files = [
+    {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"},
+    {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
+]
+
+[package.source]
+type = "legacy"
+url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
+reference = "artifactory"
+
+[[package]]
+name = "packaging"
+version = "24.2"
+description = "Core utilities for Python packages"
+optional = false
+python-versions = ">=3.8"
+groups = ["dev"]
+files = [
+    {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"},
+    {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"},
+]
+
+[package.source]
+type = "legacy"
+url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
+reference = "artifactory"
+
+[[package]]
+name = "pexpect"
+version = "4.9.0"
+description = "Pexpect allows easy control of interactive console applications."
+optional = false
+python-versions = "*"
+groups = ["main"]
+files = [
+    {file = "pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523"},
+    {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"},
+]
+
+[package.dependencies]
+ptyprocess = ">=0.5"
+
+[package.source]
+type = "legacy"
+url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
+reference = "artifactory"
+
+[[package]]
+name = "pluggy"
+version = "1.5.0"
+description = "plugin and hook calling mechanisms for python"
+optional = false
+python-versions = ">=3.8"
+groups = ["dev"]
+files = [
+    {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"},
+    {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"},
+]
+
+[package.extras]
+dev = ["pre-commit", "tox"]
+testing = ["pytest", "pytest-benchmark"]
+
+[package.source]
+type = "legacy"
+url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
+reference = "artifactory"
+
+[[package]]
+name = "ptyprocess"
+version = "0.7.0"
+description = "Run a subprocess in a pseudo terminal"
+optional = false
+python-versions = "*"
+groups = ["main"]
+files = [
+    {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"},
+    {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"},
+]
+
+[package.source]
+type = "legacy"
+url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
+reference = "artifactory"
+
+[[package]]
+name = "pygments"
+version = "2.19.1"
+description = "Pygments is a syntax highlighting package written in Python."
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"},
+    {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"},
+]
+
+[package.extras]
+windows-terminal = ["colorama (>=0.4.6)"]
+
+[package.source]
+type = "legacy"
+url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
+reference = "artifactory"
+
+[[package]]
+name = "pytest"
+version = "8.3.4"
+description = "pytest: simple powerful testing with Python"
+optional = false
+python-versions = ">=3.8"
+groups = ["dev"]
+files = [
+    {file = "pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"},
+    {file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"},
+]
+
+[package.dependencies]
+colorama = {version = "*", markers = "sys_platform == \"win32\""}
+iniconfig = "*"
+packaging = "*"
+pluggy = ">=1.5,<2"
+
+[package.extras]
+dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
+
+[package.source]
+type = "legacy"
+url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
+reference = "artifactory"
+
+[[package]]
+name = "rich"
+version = "13.9.4"
+description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
+optional = false
+python-versions = ">=3.8.0"
+groups = ["main"]
+files = [
+    {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"},
+    {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"},
+]
+
+[package.dependencies]
+markdown-it-py = ">=2.2.0"
+pygments = ">=2.13.0,<3.0.0"
+
+[package.extras]
+jupyter = ["ipywidgets (>=7.5.1,<9)"]
+
+[package.source]
+type = "legacy"
+url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
+reference = "artifactory"
+
+[[package]]
+name = "shellingham"
+version = "1.5.4"
+description = "Tool to Detect Surrounding Shell"
+optional = false
+python-versions = ">=3.7"
+groups = ["main"]
+files = [
+    {file = "shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686"},
+    {file = "shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de"},
+]
+
+[package.source]
+type = "legacy"
+url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
+reference = "artifactory"
+
+[[package]]
+name = "typer"
+version = "0.15.1"
+description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
+optional = false
+python-versions = ">=3.7"
+groups = ["main"]
+files = [
+    {file = "typer-0.15.1-py3-none-any.whl", hash = "sha256:7994fb7b8155b64d3402518560648446072864beefd44aa2dc36972a5972e847"},
+    {file = "typer-0.15.1.tar.gz", hash = "sha256:a0588c0a7fa68a1978a069818657778f86abe6ff5ea6abf472f940a08bfe4f0a"},
+]
+
+[package.dependencies]
+click = ">=8.0.0"
+rich = ">=10.11.0"
+shellingham = ">=1.3.0"
+typing-extensions = ">=3.7.4.3"
+
+[package.source]
+type = "legacy"
+url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
+reference = "artifactory"
+
+[[package]]
+name = "typing-extensions"
+version = "4.12.2"
+description = "Backported and Experimental Type Hints for Python 3.8+"
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"},
+    {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
+]
+
+[package.source]
+type = "legacy"
+url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
+reference = "artifactory"
+
+[metadata]
+lock-version = "2.1"
+python-versions = ">=3.11"
+content-hash = "58e6c7676973e0793089a2eff7d2dc54f11c5760cd7b0c43ecd5143588ffa046"
diff --git a/benchmark/goosebench/pyproject.toml b/benchmark/goosebench/pyproject.toml
index e1d21d9c6..00f2cecce 100644
--- a/benchmark/goosebench/pyproject.toml
+++ b/benchmark/goosebench/pyproject.toml
@@ -1,7 +1,28 @@
 [project]
-name = "goosebench"
+name = "goose-monitoring-job"
 version = "0.1.0"
-description = "Add your description here"
+description = ""
 readme = "README.md"
-requires-python = ">=3.10"
-dependencies = []
+authors = [
+    { name = "Your Name", email = "you@example.com" }
+]
+
+[[tool.poetry.source]]
+name = "artifactory"
+priority = "primary"
+url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
+
+
+[tool.poetry.dependencies]
+python = ">=3.11"
+pexpect = "^4.9.0"
+typer = "^0.15.1"
+rich = "^13.9.4"
+
+
+[tool.poetry.group.dev.dependencies]
+pytest = "^8.3.4"
+
+[build-system]
+requires = ["poetry-core>=2.0.0,<3.0.0"]
+build-backend = "poetry.core.masonry.api"

From 013933c5a61aea89ebd06b3141f8c9c7b4c82b7f Mon Sep 17 00:00:00 2001
From: Marcelle Bonterre <marcelle@squareup.com>
Date: Tue, 18 Feb 2025 22:39:43 -0500
Subject: [PATCH 03/10] WIP

---
 benchmark/goosebench/goosebench/bench.py      |  74 ++++++++++
 .../__init__.py                               |   0
 .../computercontroller_tool/__init__.py       |   0
 .../automation_script.py                      |   0
 .../computercontroller_tool/cache.py          |   0
 .../computer_control.py                       |   0
 .../computercontroller_tool/web_scrape.py     |   0
 .../computercontroller_tool/web_search.py     |   0
 .../developer_tool/__init__.py                |   0
 .../developer_tool/list_windows.py            |   0
 .../developer_tool/screen_capture.py          |   0
 .../developer_tool/shell.py                   |   0
 .../developer_tool/text_editor.py             |   0
 .../memory_tool}/__init__.py                  |   0
 .../memory_tool/remember_memory.py            |   0
 .../memory_tool/remove_memory_category.py     |   0
 .../memory_tool/remove_specific_memory.py     |   0
 .../memory_tool/retrieve_memories.py          |   0
 .../not_used}/__init__.py                     |   0
 .../not_used/google_drive_tool}/__init__.py   |   0
 .../google_drive_tool/google_drive_read.py    |   0
 .../google_drive_tool/google_drive_search.py  |   0
 .../not_used/jetbrains_tool}/__init__.py      |   0
 .../not_used}/jetbrains_tool/jetbrains.py     |   0
 .../extensions/tutorial_tool/__init__.py      |   0
 .../tutorial_tool/tutorial.py                 |   0
 benchmark/goosebench/goosebench/main.py       | 127 +-----------------
 crates/goose-bench/Cargo.toml                 |  21 +++
 .../goose-bench/src/eval_suites/evaluation.rs |  36 +++++
 crates/goose-bench/src/eval_suites/factory.rs |  60 +++++++++
 .../src/eval_suites/flappy_bird.rs            |  29 ++++
 crates/goose-bench/src/eval_suites/mod.rs     |   5 +
 crates/goose-bench/src/lib.rs                 |   1 +
 crates/goose-cli/Cargo.toml                   |   1 +
 crates/goose-cli/src/commands/bench.rs        |  44 ++++++
 crates/goose-cli/src/commands/mod.rs          |   1 +
 crates/goose-cli/src/main.rs                  |   8 ++
 37 files changed, 284 insertions(+), 123 deletions(-)
 create mode 100644 benchmark/goosebench/goosebench/bench.py
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/__init__.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/computercontroller_tool/__init__.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/computercontroller_tool/automation_script.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/computercontroller_tool/cache.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/computercontroller_tool/computer_control.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/computercontroller_tool/web_scrape.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/computercontroller_tool/web_search.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/developer_tool/__init__.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/developer_tool/list_windows.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/developer_tool/screen_capture.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/developer_tool/shell.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/developer_tool/text_editor.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools/google_drive_tool => extensions/memory_tool}/__init__.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/memory_tool/remember_memory.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/memory_tool/remove_memory_category.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/memory_tool/remove_specific_memory.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/memory_tool/retrieve_memories.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools/jetbrains_tool => extensions/not_used}/__init__.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools/memory_tool => extensions/not_used/google_drive_tool}/__init__.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions/not_used}/google_drive_tool/google_drive_read.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions/not_used}/google_drive_tool/google_drive_search.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools/tutorial_tool => extensions/not_used/jetbrains_tool}/__init__.py (100%)
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions/not_used}/jetbrains_tool/jetbrains.py (100%)
 create mode 100644 benchmark/goosebench/goosebench/extensions/tutorial_tool/__init__.py
 rename benchmark/goosebench/goosebench/{evaluate_tools => extensions}/tutorial_tool/tutorial.py (100%)
 create mode 100644 crates/goose-bench/Cargo.toml
 create mode 100644 crates/goose-bench/src/eval_suites/evaluation.rs
 create mode 100644 crates/goose-bench/src/eval_suites/factory.rs
 create mode 100644 crates/goose-bench/src/eval_suites/flappy_bird.rs
 create mode 100644 crates/goose-bench/src/eval_suites/mod.rs
 create mode 100644 crates/goose-bench/src/lib.rs
 create mode 100644 crates/goose-cli/src/commands/bench.rs

diff --git a/benchmark/goosebench/goosebench/bench.py b/benchmark/goosebench/goosebench/bench.py
new file mode 100644
index 000000000..7566ea43e
--- /dev/null
+++ b/benchmark/goosebench/goosebench/bench.py
@@ -0,0 +1,74 @@
+import os
+import subprocess
+import tempfile
+import time
+from typing import Optional, List
+
+
+class Bench:
+    """Runs extension prompts through the ``goose`` CLI and collects failures."""
+
+    def __init__(self):
+        # Human-readable failure reports, one entry per failed evaluation.
+        self.error_log = []
+
+    def log_error(self, provider: str, model: str, extension: str, error: str) -> None:
+        """Append a formatted failure report to ``error_log``."""
+        self.error_log.append(
+            f"Provider: {provider}, Model: {model}, Extension: {extension}\n{error}\n"
+        )
+
+    def evaluate(self,
+                 provider: str,
+                 model: str,
+                 extension: str,
+                 prompt: str,
+                 follow_ups: Optional[List[str]] = None,
+                 timeout: int = 30) -> None:
+        """Run one prompt via ``goose run`` in a subprocess and record any failure.
+
+        ``provider`` and ``model`` only label console output and error reports;
+        ``follow_ups`` is accepted for interface compatibility and is not sent.
+        NOTE(review): provider/model are not forwarded to goose -- confirm intent.
+        A non-zero exit, a timeout after ``timeout`` seconds, or any other
+        exception is appended to ``error_log`` instead of being raised.
+        """
+        # Imported lazily: goosebench.main imports this module at load time, so a
+        # module-level import of goosebench.main here would be circular.
+        from goosebench.main import console
+
+        console.print(f"Testing: {provider}/{model} with {extension}", style="info")
+        console.print(f"Prompt: {prompt}", style="info")
+
+        try:
+            # The prompt is passed inline via -t, so no temporary file is needed.
+            cmd = ['goose', 'run', '--with-builtin', extension, '-t', prompt]
+            result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
+
+            if result.returncode != 0:
+                self.log_error(provider, model, extension,
+                               result.stdout + result.stderr)
+                console.print("✗ Test failed", style="error")
+            else:
+                console.print("✓ Test passed", style="success")
+
+        except subprocess.TimeoutExpired:
+            self.log_error(provider, model, extension,
+                           f"Test timed out after {timeout} seconds")
+            console.print("✗ Test timed out", style="error")
+        except Exception as e:
+            self.log_error(provider, model, extension, str(e))
+            console.print("✗ Test failed with unexpected error", style="error")
+
+    def _run_serial(self, provider: str, model: str, extension: str) -> None:
+        """Evaluate every prompt registered for ``extension``, one after another."""
+        from goosebench.main import EXTENSION_PROMPTS  # lazy: avoids circular import
+        for prompt in EXTENSION_PROMPTS.get(extension, []):
+            self.evaluate(provider, model, extension, prompt)
+            time.sleep(2)  # brief pause between tests
+
+    def test_extension(self, provider: str, model: str, extension: str) -> None:
+        """Test all prompts for a given extension."""
+        from goosebench.main import console  # lazy: avoids circular import
+        console.rule(f"Testing extension: {extension}")
+        return self._run_serial(provider, model, extension)
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/__init__.py b/benchmark/goosebench/goosebench/extensions/__init__.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/__init__.py
rename to benchmark/goosebench/goosebench/extensions/__init__.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/__init__.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/__init__.py
rename to benchmark/goosebench/goosebench/extensions/computercontroller_tool/__init__.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/automation_script.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/automation_script.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/automation_script.py
rename to benchmark/goosebench/goosebench/extensions/computercontroller_tool/automation_script.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/cache.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/cache.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/cache.py
rename to benchmark/goosebench/goosebench/extensions/computercontroller_tool/cache.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/computer_control.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/computer_control.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/computer_control.py
rename to benchmark/goosebench/goosebench/extensions/computercontroller_tool/computer_control.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_scrape.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_scrape.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_scrape.py
rename to benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_scrape.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_search.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_search.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/computercontroller_tool/web_search.py
rename to benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_search.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/developer_tool/__init__.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/developer_tool/__init__.py
rename to benchmark/goosebench/goosebench/extensions/developer_tool/__init__.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/list_windows.py b/benchmark/goosebench/goosebench/extensions/developer_tool/list_windows.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/developer_tool/list_windows.py
rename to benchmark/goosebench/goosebench/extensions/developer_tool/list_windows.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/screen_capture.py b/benchmark/goosebench/goosebench/extensions/developer_tool/screen_capture.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/developer_tool/screen_capture.py
rename to benchmark/goosebench/goosebench/extensions/developer_tool/screen_capture.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/shell.py b/benchmark/goosebench/goosebench/extensions/developer_tool/shell.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/developer_tool/shell.py
rename to benchmark/goosebench/goosebench/extensions/developer_tool/shell.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/developer_tool/text_editor.py b/benchmark/goosebench/goosebench/extensions/developer_tool/text_editor.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/developer_tool/text_editor.py
rename to benchmark/goosebench/goosebench/extensions/developer_tool/text_editor.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/memory_tool/__init__.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/__init__.py
rename to benchmark/goosebench/goosebench/extensions/memory_tool/__init__.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remember_memory.py b/benchmark/goosebench/goosebench/extensions/memory_tool/remember_memory.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remember_memory.py
rename to benchmark/goosebench/goosebench/extensions/memory_tool/remember_memory.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_memory_category.py b/benchmark/goosebench/goosebench/extensions/memory_tool/remove_memory_category.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_memory_category.py
rename to benchmark/goosebench/goosebench/extensions/memory_tool/remove_memory_category.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_specific_memory.py b/benchmark/goosebench/goosebench/extensions/memory_tool/remove_specific_memory.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/memory_tool/remove_specific_memory.py
rename to benchmark/goosebench/goosebench/extensions/memory_tool/remove_specific_memory.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/retrieve_memories.py b/benchmark/goosebench/goosebench/extensions/memory_tool/retrieve_memories.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/memory_tool/retrieve_memories.py
rename to benchmark/goosebench/goosebench/extensions/memory_tool/retrieve_memories.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/not_used/__init__.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/__init__.py
rename to benchmark/goosebench/goosebench/extensions/not_used/__init__.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/memory_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/__init__.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/memory_tool/__init__.py
rename to benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/__init__.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_read.py b/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_read.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_read.py
rename to benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_read.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_search.py b/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_search.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/google_drive_tool/google_drive_search.py
rename to benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_search.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/__init__.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/__init__.py
rename to benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/__init__.py
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/jetbrains.py b/benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/jetbrains.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/jetbrains_tool/jetbrains.py
rename to benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/jetbrains.py
diff --git a/benchmark/goosebench/goosebench/extensions/tutorial_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/tutorial_tool/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/tutorial.py b/benchmark/goosebench/goosebench/extensions/tutorial_tool/tutorial.py
similarity index 100%
rename from benchmark/goosebench/goosebench/evaluate_tools/tutorial_tool/tutorial.py
rename to benchmark/goosebench/goosebench/extensions/tutorial_tool/tutorial.py
diff --git a/benchmark/goosebench/goosebench/main.py b/benchmark/goosebench/goosebench/main.py
index ce6bf4b69..d281412b3 100755
--- a/benchmark/goosebench/goosebench/main.py
+++ b/benchmark/goosebench/goosebench/main.py
@@ -1,10 +1,4 @@
 #!/usr/bin/env python3
-import dataclasses
-import os
-import subprocess
-import tempfile
-import time
-from enum import Enum
 from typing import List, Optional
 
 import typer
@@ -12,6 +6,8 @@
 from rich.theme import Theme
 from typing_extensions import Annotated
 
+from goosebench.bench import Bench
+
 # Initialize typer app and rich console
 app = typer.Typer(help="Goose CLI Integration Tests")
 console = Console(theme=Theme({
@@ -21,24 +17,6 @@
     "success": "green"
 }))
 
-
-# Define workflow types
-class Workflow(str, Enum):
-    SERIAL = "serial"
-    CONVERSATIONAL = "conversational"
-
-
-@dataclasses.dataclass
-class Topic:
-    initial_prompt: str
-    follow_ups: List[str]
-
-
-@dataclasses.dataclass
-class Conversation:
-    topics: List[Topic]
-
-
 # Extension configurations
 EXTENSIONS = ['developer', 'computercontroller', 'google_drive', 'memory']
 
@@ -62,95 +40,6 @@ class Conversation:
     ]
 }
 
-CONV_EXTENSION_PROMPTS = {
-    k: Conversation(topics=[
-        Topic(val, ["summarize"])
-        for val in v
-    ])
-    for k, v in EXTENSION_PROMPTS.items()
-}
-
-
-class Bench:
-    def __init__(self, workflow: Workflow):
-        self.error_log = []
-        self.workflow = workflow
-
-    def log_error(self, provider: str, model: str, extension: str, error: str) -> None:
-        """Log an error message."""
-        self.error_log.append(
-            f"Provider: {provider}, Model: {model}, Extension: {extension}\n{error}\n"
-        )
-
-    def evaluate(self,
-                 provider: str,
-                 model: str,
-                 extension: str,
-                 prompt: str,
-                 follow_ups: Optional[List[str]] = None) -> None:
-        """Run a single test with the given parameters using pexpect."""
-        console.print(f"Testing: {provider}/{model} with {extension}", style="info")
-        console.print(f"Prompt: {prompt}", style="info")
-        console.print(f"Workflow: {self.workflow.value}", style="info")
-
-        follow_ups = follow_ups or []
-
-        # Create temporary file for prompt
-        with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp:
-            temp.write(prompt)
-            temp_path = temp.name
-
-        try:
-            # Run goose with timeout
-            cmd = ['goose', 'run', '--with-builtin', extension, '-t', prompt]
-            result = subprocess.run(
-                cmd,
-                capture_output=True,
-                text=True,
-                timeout=30
-            )
-
-            if result.returncode != 0:
-                self.log_error(provider, model, extension,
-                               result.stdout + result.stderr)
-                console.print("✗ Test failed", style="error")
-
-            else:
-                console.print("✓ Test passed")
-
-        except subprocess.TimeoutExpired:
-            self.log_error(provider, model, extension,
-                           "Test timed out after 30 seconds")
-            console.print("✗ Test timed out", style="error")
-        except Exception as e:
-            self.log_error(provider, model, extension, str(e))
-            console.print("✗ Test failed with unexpected error", style="error")
-        finally:
-            os.unlink(temp_path)
-
-    def _run_serial(self, provider: str, model: str, extension: str) -> None:
-        prompts = EXTENSION_PROMPTS.get(extension, [])
-        for prompt in prompts:
-            self.evaluate(provider, model, extension, prompt)
-            time.sleep(2)  # brief pause between tests
-
-    def _run_conversational(self, provider: str, model: str, extension: str) -> None:
-        conv = CONV_EXTENSION_PROMPTS.get(extension, [])
-        for t in conv.topics:
-            self.evaluate(
-                provider, model, extension, t.initial_prompt, t.follow_ups
-            )
-            time.sleep(2)  # brief pause between tests
-
-    def test_extension(self, provider: str, model: str, extension: str) -> None:
-        """Test all prompts for a given extension."""
-        console.rule(f"Testing extension: {extension}")
-
-        if self.workflow == Workflow.CONVERSATIONAL:
-            return self._run_conversational(provider, model, extension)
-
-        return self._run_serial(provider, model, extension)
-
 
 def parse_provider_model(ctx: typer.Context, provider_models: List[str]) -> List[
     tuple[str, str]]:
@@ -177,13 +66,6 @@ def main(
                 help="Provider and model in format 'provider:model' or 'provider:model1,model2'"
             )
         ] = None,
-        workflow: Annotated[
-            Workflow,
-            typer.Option(
-                '--workflow', '-w',
-                help="Workflow type: serial or conversational"
-            )
-        ] = Workflow.SERIAL,
         verbose: Annotated[
             bool,
             typer.Option('--verbose', '-v', help="Enable verbose output")
@@ -194,15 +76,14 @@ def main(
     
     Example usage:
     
-    python main.py  # Uses default: databricks:goose with serial workflow
+    python main.py  # Uses default: databricks:goose
     python main.py -pm anthropic:claude
     python main.py -pm anthropic:claude,claude2
     python main.py -pm anthropic:claude -pm databricks:goose
-    python main.py --workflow conversational  # Use conversational workflow
     """
     console.print("Starting Goose CLI Integration Tests", style="bold")
 
-    runner = Bench(workflow)
+    runner = Bench()
 
     # Use default if no provider-models specified
     if not provider_models:
diff --git a/crates/goose-bench/Cargo.toml b/crates/goose-bench/Cargo.toml
new file mode 100644
index 000000000..69189bf20
--- /dev/null
+++ b/crates/goose-bench/Cargo.toml
@@ -0,0 +1,21 @@
+[package]
+name = "goose-bench"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+license.workspace = true
+repository.workspace = true
+description.workspace = true
+
+
+[dependencies]
+anyhow = "1.0"
+paste = "1.0"
+ctor = "0.2.7"
+
+[target.'cfg(target_os = "windows")'.dependencies]
+winapi = { version = "0.3", features = ["wincred"] }
+
+#[[bench]]
+#name = "tokenization_benchmark"
+#harness = false
diff --git a/crates/goose-bench/src/eval_suites/evaluation.rs b/crates/goose-bench/src/eval_suites/evaluation.rs
new file mode 100644
index 000000000..31532e882
--- /dev/null
+++ b/crates/goose-bench/src/eval_suites/evaluation.rs
@@ -0,0 +1,36 @@
+use anyhow::Result;
+
+
+pub type Model = (String, String);
+pub type Extension = String;
+
+#[derive(Debug)]
+pub enum EvaluationMetric {
+    Integer(i64),
+    Float(f64),
+    String(String),
+    Boolean(bool),
+}
+
+#[derive(Debug)]
+pub struct EvaluationReport {
+    metrics: Vec<EvaluationMetric>,
+}
+
+impl Default for EvaluationReport {
+    fn default() -> Self {
+        Self { metrics: vec![] }
+    }
+}
+
+impl EvaluationReport {
+    pub fn new(metrics: Vec<EvaluationMetric>) -> Self {
+        EvaluationReport { metrics }
+    }
+}
+
+pub trait Evaluation: Send + Sync {
+    fn run(&self) -> Result<EvaluationReport>;
+    fn models(&self) -> Vec<Model>;
+    fn extensions(&self) -> Vec<Extension>;
+}
diff --git a/crates/goose-bench/src/eval_suites/factory.rs b/crates/goose-bench/src/eval_suites/factory.rs
new file mode 100644
index 000000000..9bcb233da
--- /dev/null
+++ b/crates/goose-bench/src/eval_suites/factory.rs
@@ -0,0 +1,60 @@
+use std::collections::HashMap;
+use std::sync::{OnceLock, RwLock};
+
+pub use super::Evaluation;
+
+type EvaluationConstructor = Box<dyn Fn() -> Box<dyn Evaluation> + Send + Sync>;
+
+// Global registry of evaluation constructors; the RwLock provides interior mutability behind the static.
+static EVALUATION_REGISTRY: OnceLock<RwLock<HashMap<&'static str, EvaluationConstructor>>> = OnceLock::new();
+
+/// Returns the global registry, initializing it to an empty map on first use.
+fn registry() -> &'static RwLock<HashMap<&'static str, EvaluationConstructor>> {
+    EVALUATION_REGISTRY.get_or_init(|| RwLock::new(HashMap::new()))
+}
+
+/// Registers `constructor` under the name `version`, replacing any earlier entry; a poisoned lock is silently ignored.
+pub fn register_evaluation(
+    version: &'static str,
+    constructor: impl Fn() -> Box<dyn Evaluation> + Send + Sync + 'static,
+) {
+    let registry = registry();
+    if let Ok(mut map) = registry.write() {
+        map.insert(version, Box::new(constructor));
+    }
+}
+
+pub struct EvaluationFactory;
+
+impl EvaluationFactory {
+    pub fn create(version: &str) -> Option<Box<dyn Evaluation>> {
+        let registry = registry();
+        let map = registry
+            .read()
+            .expect("Failed to read the benchmark evaluation registry.");
+        let constructor = map.get(version)?;
+        Some(constructor())
+    }
+
+    pub fn available_evaluations() -> Vec<&'static str> {
+        registry()
+            .read()
+            .map(|map| map.keys().copied().collect())
+            .unwrap_or_default()
+    }
+}
+
+#[macro_export]
+macro_rules! register_evaluation {
+    ($version:expr, $evaluation_type:ty) => {
+        paste::paste! {
+            #[ctor::ctor]
+            #[allow(non_snake_case)]
+            fn [<__register_evaluation_ $version>]() {
+                $crate::eval_suites::factory::register_evaluation($version, || {
+                    Box::new(<$evaluation_type>::new())
+                });
+            }
+        }
+    };
+}
diff --git a/crates/goose-bench/src/eval_suites/flappy_bird.rs b/crates/goose-bench/src/eval_suites/flappy_bird.rs
new file mode 100644
index 000000000..eb75818b3
--- /dev/null
+++ b/crates/goose-bench/src/eval_suites/flappy_bird.rs
@@ -0,0 +1,29 @@
+use crate::eval_suites::{Evaluation, Extension, Model};
+use crate::eval_suites::evaluation::EvaluationReport;
+use crate::register_evaluation;
+
+pub struct FlappyBird {}
+
+impl FlappyBird {
+    fn new() -> FlappyBird {
+        FlappyBird {}
+    }
+}
+
+impl Evaluation for FlappyBird {
+    fn run(&self) -> anyhow::Result<EvaluationReport> {
+        let mut metrics = Vec::new();
+
+        Ok(EvaluationReport::new(metrics))
+    }
+
+    fn models(&self) -> Vec<Model> {
+        todo!()
+    }
+
+    fn extensions(&self) -> Vec<Extension> {
+        todo!()
+    }
+}
+
+register_evaluation!("flappy_bird", FlappyBird);
diff --git a/crates/goose-bench/src/eval_suites/mod.rs b/crates/goose-bench/src/eval_suites/mod.rs
new file mode 100644
index 000000000..1f7d2992d
--- /dev/null
+++ b/crates/goose-bench/src/eval_suites/mod.rs
@@ -0,0 +1,5 @@
+mod factory;
+mod flappy_bird;
+mod evaluation;
+pub use evaluation::*;
+pub use factory::{register_evaluation, EvaluationFactory};
diff --git a/crates/goose-bench/src/lib.rs b/crates/goose-bench/src/lib.rs
new file mode 100644
index 000000000..0c41da7e8
--- /dev/null
+++ b/crates/goose-bench/src/lib.rs
@@ -0,0 +1 @@
+pub mod eval_suites;
\ No newline at end of file
diff --git a/crates/goose-cli/Cargo.toml b/crates/goose-cli/Cargo.toml
index 41cb85a60..62dfd85a6 100644
--- a/crates/goose-cli/Cargo.toml
+++ b/crates/goose-cli/Cargo.toml
@@ -13,6 +13,7 @@ path = "src/main.rs"
 
 [dependencies]
 goose = { path = "../goose" }
+goose-bench = { path = "../goose-bench" }
 goose-mcp = { path = "../goose-mcp" }
 mcp-client = { path = "../mcp-client" }
 mcp-server = { path = "../mcp-server" }
diff --git a/crates/goose-cli/src/commands/bench.rs b/crates/goose-cli/src/commands/bench.rs
new file mode 100644
index 000000000..3cb1825cf
--- /dev/null
+++ b/crates/goose-cli/src/commands/bench.rs
@@ -0,0 +1,44 @@
+use goose::message::Message;
+use crate::session::build_session;
+use goose_bench::eval_suites::{EvaluationFactory, EvaluationReport};
+
+// use std::error::Error;
+// TODO: build a custom run function that constructs the agent from a session, then uses a custom loop to collect and return the agent's messages.
+async fn foo(ext) {
+    let extension = Vec::new(); // todo
+    let name = None;
+    let mut session = build_session(name, false, extension, ext).await;
+    let _ = session.headless_start(prompt).await;
+}
+
+pub async fn headless_start(&mut self, initial_message: String) -> anyhow::Result<()> {
+    self.messages.push(Message::user().with_text(&initial_message));
+    self.process_agent_response().await?;
+    Ok(())
+}
+
+pub async fn run_benchmark() {
+    let mut all_reports: Vec<EvaluationReport> = vec![];
+
+    for eval in EvaluationFactory::available_evaluations() {
+        let evaluation = match EvaluationFactory::create(&eval) {
+            Some(evaluation) => evaluation,
+            None => continue,
+        };
+
+        for (provider, model) in evaluation.models() {
+            for ext in evaluation.extensions() {
+                let report = match evaluation.run() {
+                    Ok(report) => report,
+                    _ => continue,
+                };
+
+                // print report?
+                all_reports.push(report);
+            }
+        }
+    }
+
+    // let summary = report_summary(all_reports)?
+    // print summary?
+}
diff --git a/crates/goose-cli/src/commands/mod.rs b/crates/goose-cli/src/commands/mod.rs
index e9ed50ce5..7302d695e 100644
--- a/crates/goose-cli/src/commands/mod.rs
+++ b/crates/goose-cli/src/commands/mod.rs
@@ -1,3 +1,4 @@
 pub mod agent_version;
 pub mod configure;
 pub mod mcp;
+pub mod bench;
diff --git a/crates/goose-cli/src/main.rs b/crates/goose-cli/src/main.rs
index 5443e142c..ecd0c1215 100644
--- a/crates/goose-cli/src/main.rs
+++ b/crates/goose-cli/src/main.rs
@@ -9,6 +9,7 @@ use goose_cli::commands::mcp::run_server;
 use goose_cli::logging::setup_logging;
 use goose_cli::session::build_session;
 use std::io::{self, Read};
+use goose_cli::commands::bench::run_benchmark;
 
 #[derive(Parser)]
 #[command(author, version, display_name = "", about, long_about = None)]
@@ -140,6 +141,9 @@ enum Command {
 
     /// List available agent versions
     Agents(AgentCommand),
+
+    /// Run benchmark suite
+    Bench {},
 }
 
 #[derive(clap::ValueEnum, Clone, Debug)]
@@ -207,6 +211,10 @@ async fn main() -> Result<()> {
             cmd.run()?;
             return Ok(());
         }
+        Some(Command::Bench {}) => {
+            run_benchmark().await;
+            return Ok(());
+        }
         None => {
             Cli::command().print_help()?;
             println!();

From 49dc81fd95ec6f6b5477a69c34695aab394a182a Mon Sep 17 00:00:00 2001
From: Marcelle Bonterre <marcelle@squareup.com>
Date: Tue, 18 Feb 2025 22:41:10 -0500
Subject: [PATCH 04/10] removed py benchmark proj

---
 .../workflows/install-and-run-goose.yml       |  33 --
 benchmark/goosebench/.gitignore               | 172 ----------
 benchmark/goosebench/archive/test.sh          | 209 ------------
 benchmark/goosebench/bin/.python3@3.11.pkg    |   1 -
 benchmark/goosebench/bin/README.hermit.md     |   7 -
 benchmark/goosebench/bin/activate-hermit      |  21 --
 benchmark/goosebench/bin/activate-hermit.fish |  24 --
 benchmark/goosebench/bin/hermit               |  43 ---
 benchmark/goosebench/bin/hermit.hcl           |   2 -
 benchmark/goosebench/bin/pip                  |   1 -
 benchmark/goosebench/bin/pip3                 |   1 -
 benchmark/goosebench/bin/pip3.11              |   1 -
 benchmark/goosebench/bin/pydoc3               |   1 -
 benchmark/goosebench/bin/pydoc3.11            |   1 -
 benchmark/goosebench/bin/python               |   1 -
 benchmark/goosebench/bin/python3              |   1 -
 benchmark/goosebench/bin/python3-config       |   1 -
 benchmark/goosebench/bin/python3.11           |   1 -
 benchmark/goosebench/bin/python3.11-config    |   1 -
 benchmark/goosebench/config.yaml              |   7 -
 benchmark/goosebench/goosebench/__init__.py   |   0
 benchmark/goosebench/goosebench/bench.py      |  74 -----
 .../goosebench/extensions/__init__.py         |   0
 .../computercontroller_tool/__init__.py       |   0
 .../automation_script.py                      |  29 --
 .../computercontroller_tool/cache.py          |  29 --
 .../computer_control.py                       |  29 --
 .../computercontroller_tool/web_scrape.py     |  29 --
 .../computercontroller_tool/web_search.py     |  29 --
 .../extensions/developer_tool/__init__.py     |   0
 .../extensions/developer_tool/list_windows.py |  29 --
 .../developer_tool/screen_capture.py          |  29 --
 .../extensions/developer_tool/shell.py        |  29 --
 .../extensions/developer_tool/text_editor.py  |  29 --
 .../extensions/memory_tool/__init__.py        |   0
 .../extensions/memory_tool/remember_memory.py |  29 --
 .../memory_tool/remove_memory_category.py     |  29 --
 .../memory_tool/remove_specific_memory.py     |  29 --
 .../memory_tool/retrieve_memories.py          |  29 --
 .../extensions/not_used/__init__.py           |   0
 .../not_used/google_drive_tool/__init__.py    |   0
 .../google_drive_tool/google_drive_read.py    |  29 --
 .../google_drive_tool/google_drive_search.py  |  29 --
 .../not_used/jetbrains_tool/__init__.py       |   0
 .../not_used/jetbrains_tool/jetbrains.py      |  29 --
 .../extensions/tutorial_tool/__init__.py      |   0
 .../extensions/tutorial_tool/tutorial.py      |  29 --
 benchmark/goosebench/goosebench/main.py       | 118 -------
 benchmark/goosebench/poetry.lock              | 310 ------------------
 benchmark/goosebench/pyproject.toml           |  28 --
 50 files changed, 1552 deletions(-)
 delete mode 100644 benchmark/goosebench/.github/workflows/install-and-run-goose.yml
 delete mode 100644 benchmark/goosebench/.gitignore
 delete mode 100755 benchmark/goosebench/archive/test.sh
 delete mode 120000 benchmark/goosebench/bin/.python3@3.11.pkg
 delete mode 100644 benchmark/goosebench/bin/README.hermit.md
 delete mode 100755 benchmark/goosebench/bin/activate-hermit
 delete mode 100755 benchmark/goosebench/bin/activate-hermit.fish
 delete mode 100755 benchmark/goosebench/bin/hermit
 delete mode 100644 benchmark/goosebench/bin/hermit.hcl
 delete mode 120000 benchmark/goosebench/bin/pip
 delete mode 120000 benchmark/goosebench/bin/pip3
 delete mode 120000 benchmark/goosebench/bin/pip3.11
 delete mode 120000 benchmark/goosebench/bin/pydoc3
 delete mode 120000 benchmark/goosebench/bin/pydoc3.11
 delete mode 120000 benchmark/goosebench/bin/python
 delete mode 120000 benchmark/goosebench/bin/python3
 delete mode 120000 benchmark/goosebench/bin/python3-config
 delete mode 120000 benchmark/goosebench/bin/python3.11
 delete mode 120000 benchmark/goosebench/bin/python3.11-config
 delete mode 100644 benchmark/goosebench/config.yaml
 delete mode 100644 benchmark/goosebench/goosebench/__init__.py
 delete mode 100644 benchmark/goosebench/goosebench/bench.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/__init__.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/computercontroller_tool/__init__.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/computercontroller_tool/automation_script.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/computercontroller_tool/cache.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/computercontroller_tool/computer_control.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_scrape.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_search.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/developer_tool/__init__.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/developer_tool/list_windows.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/developer_tool/screen_capture.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/developer_tool/shell.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/developer_tool/text_editor.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/memory_tool/__init__.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/memory_tool/remember_memory.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/memory_tool/remove_memory_category.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/memory_tool/remove_specific_memory.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/memory_tool/retrieve_memories.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/not_used/__init__.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/__init__.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_read.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_search.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/__init__.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/jetbrains.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/tutorial_tool/__init__.py
 delete mode 100644 benchmark/goosebench/goosebench/extensions/tutorial_tool/tutorial.py
 delete mode 100755 benchmark/goosebench/goosebench/main.py
 delete mode 100644 benchmark/goosebench/poetry.lock
 delete mode 100644 benchmark/goosebench/pyproject.toml

diff --git a/benchmark/goosebench/.github/workflows/install-and-run-goose.yml b/benchmark/goosebench/.github/workflows/install-and-run-goose.yml
deleted file mode 100644
index 4b13b97e6..000000000
--- a/benchmark/goosebench/.github/workflows/install-and-run-goose.yml
+++ /dev/null
@@ -1,33 +0,0 @@
-name: Install and Run Goose
-
-on: 
-  push:
-    branches:
-      - main  # Or your preferred branch
-
-jobs:
-  install-and-run-goose:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout Repository
-        uses: actions/checkout@v2
-
-      - name: Set up Goose Environment
-        run: |
-          echo "GOOSE_BIN_DIR=\$HOME/.local/bin" >> $GITHUB_ENV
-          echo "CONFIGURE=false" >> $GITHUB_ENV
-          echo "DATABRICKS_HOST=https://block-lakehouse-production.cloud.databricks.com" >> $GITHUB_ENV
-
-      - name: Install Goose
-        run: |
-          curl -fsSL https://github.com/block/goose/releases/download/stable/download_cli.sh | bash
-
-      - name: Configure Goose
-        run: |
-          mkdir -p ~/.config/goose/
-          cp $GITHUB_WORKSPACE/test.sh ~/test.sh
-          alias goose=$HOME/.local/bin/goose
-
-      - name: Run Goose Command
-        run: |
-            ./test.sh -p databricks -m goose
diff --git a/benchmark/goosebench/.gitignore b/benchmark/goosebench/.gitignore
deleted file mode 100644
index c4d4e4a7a..000000000
--- a/benchmark/goosebench/.gitignore
+++ /dev/null
@@ -1,172 +0,0 @@
-.playpen/*.log
-.hermit/
-*.ipynb
-.idea/
-.vscode/
-.goose/
-run_slack_app-backup.sh
-temp_creds.json
-temp_merged_policy.json
-token_google_docs.json
-creds-*.json
-assume_role_policy.json
-finetune_data/content/
-finetune_data/datasets/
-insights/messages/
-insights/env.sh
-src/qai_server/slack_app/modes/generated_images/
-src/qai_server/ingest/token_google_docs.json
-src/qai_server/ingest/content/
-src/qai_server/ingest/ingest_project/**/content/
-src/qai_server/ingest/.env
-src/qai_server/evals/messages/
-src/qai_server/evals/annotations_*.jsonl
-projects/ingest_docs/ingest_docs/content/
-projects/ingest_docs/content/
-projects/ingest_docs/notion_example*.json
-run_gdoc.sh
-
-# Datafiles
-*.csv
-*.gz
-*.h5
-*.pkl
-*.pk
-*.html
-*.log
-*.db
-*.db-journal
-
-
-## From: https://github.com/github/gitignore/blob/main/Python.gitignore
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-cover/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-.python-version
-.pdm.toml
-.pdm-python
-.pdm-build/
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
\ No newline at end of file
diff --git a/benchmark/goosebench/archive/test.sh b/benchmark/goosebench/archive/test.sh
deleted file mode 100755
index b53752e8d..000000000
--- a/benchmark/goosebench/archive/test.sh
+++ /dev/null
@@ -1,209 +0,0 @@
-#!/bin/bash
-
-# NOTE: MacOS ships with Bash 3.2 by default, which does NOT support declare -A for associative arrays.
-# This script uses standard Bash arrays to remain compatible.
-
-# Color codes for output
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-NC='\033[0m' # No Color
-BOLD='\033[1m'
-
-# Initialize error log array
-ERROR_LOG=()
-
-#---------------------------------------------------------------------------#
-# EXTENSIONS
-#---------------------------------------------------------------------------#
-# We'll define each extension in an array of prompts. Then we define an array
-# of extension names, so we can iterate over them.
-#---------------------------------------------------------------------------#
-EXTENSIONS=(developer computercontroller google_drive memory)
-
-developer_prompts=(
-  "List the contents of the current directory."
-  "Create a new file called test.txt with the content 'Hello, World!'"
-  "Read the contents of test.txt"
-)
-
-computercontroller_prompts=(
-    "What are the headlines on hackernews? Organize the list into categories."
-    "Make a ding sound"
-)
-
-google_drive_prompts=(
-  "List the files in my Google Drive."
-  "Search for documents containing 'meeting notes'"
-)
-
-memory_prompts=(
-  "Save this fact: The capital of France is Paris."
-  "What is the capital of France?"
-)
-
-
-#---------------------------------------------------------------------------#
-# LOGGING FUNCTION
-#---------------------------------------------------------------------------#
-log_error() {
-  local provider=$1
-  local model=$2
-  local extension=$3
-  local error=$4
-  ERROR_LOG+=("${RED}[ERROR]${NC} Provider: $provider, Model: $model, Extension: $extension\n$error\n")
-}
-
-#---------------------------------------------------------------------------#
-# MAIN TEST FUNCTION
-#---------------------------------------------------------------------------#
-run_test() {
-  local provider=$1
-  local model=$2
-  local extension=$3
-  local prompt=$4
-  local timeout_seconds=30
-
-  echo -e "${YELLOW}Testing:${NC} $provider/$model with $extension"
-  echo -e "${YELLOW}Prompt:${NC} $prompt"
-
-  local temp_file
-  temp_file="$(mktemp)"
-  echo "$prompt" > "$temp_file"
-
-  # Run goose with timeout
-  timeout $timeout_seconds goose run \
-    --with-builtin "$extension" \
-    -t "$(cat "$temp_file")" 2>&1 | tee test_output.log
-
-  # Check for errors
-  if [ ${PIPESTATUS[0]} -ne 0 ]; then
-    log_error "$provider" "$model" "$extension" "$(cat test_output.log)"
-    echo -e "${RED}✗ Test failed${NC}"
-  else
-    echo -e "${GREEN}✓ Test passed${NC}"
-  fi
-
-  rm -f "$temp_file" test_output.log
-}
-
-#---------------------------------------------------------------------------#
-# TESTING EXTENSION (ITERATING OVER PROMPTS)
-#---------------------------------------------------------------------------#
-test_extension() {
-  local provider=$1
-  local model=$2
-  local extension=$3
-
-  echo -e "\n${BOLD}Testing extension: $extension${NC}"
-
-  # We'll build the array name dynamically, e.g. developer_prompts, memory_prompts, etc.
-  # Then we retrieve that array's contents via indirect expansion.
-  local arr_name="${extension}_prompts[@]"
-  local prompts=("${!arr_name}")
-
-  for prompt in "${prompts[@]}"; do
-    run_test "$provider" "$model" "$extension" "$prompt"
-    sleep 2  # brief pause
-  done
-}
-
-#---------------------------------------------------------------------------#
-# USAGE FUNCTION
-#---------------------------------------------------------------------------#
-usage() {
-  echo "Usage: $0 [-p provider -m model[,model2,model3]...]..."
-  echo "  -p provider : Provider to use"
-  echo "  -m models   : Comma-separated list of models to use with the provider"
-  echo "  -h         : Show this help message"
-  echo ""
-  echo "Examples:"
-  echo "  $0                                    # Uses default: databricks/goose"
-  echo "  $0 -p anthropic -m claude             # Single provider/model"
-  echo "  $0 -p anthropic -m claude,claude2     # One provider, multiple models"
-  echo "  $0 -p anthropic -m claude -p databricks -m goose  # Multiple providers"
-  echo "  $0 -p anthropic -m claude,claude2 -p databricks -m goose,goose2  # Multiple of both"
-  exit 1
-}
-
-#---------------------------------------------------------------------------#
-# MAIN WORKFLOW
-#---------------------------------------------------------------------------#
-main() {
-  # Arrays to store provider/model combinations
-  declare -a provider_model_pairs=()
-  local current_provider=""
-
-  # Parse command line arguments
-  while [[ $# -gt 0 ]]; do
-    case "$1" in
-      -h)
-        usage
-        ;;
-      -p)
-        shift
-        if [[ -z "$1" ]]; then
-          echo "Error: -p requires a provider name"
-          usage
-        fi
-        current_provider="$1"
-        shift
-        ;;
-      -m)
-        if [[ -z "$current_provider" ]]; then
-          echo "Error: -m must follow a -p option"
-          usage
-        fi
-        shift
-        if [[ -z "$1" ]]; then
-          echo "Error: -m requires at least one model name"
-          usage
-        fi
-        # Split comma-separated models and create provider:model pairs
-        IFS=',' read -ra models <<< "$1"
-        for model in "${models[@]}"; do
-          provider_model_pairs+=("$current_provider:$model")
-        done
-        shift
-        ;;
-      *)
-        echo "Error: Unknown option $1"
-        usage
-        ;;
-    esac
-  done
-
-  # If no providers/models specified, use defaults
-  if [ ${#provider_model_pairs[@]} -eq 0 ]; then
-    provider_model_pairs=("databricks:goose")
-  fi
-
-  echo -e "${BOLD}Starting Goose CLI Integration Tests${NC}"
-
-  # Iterate through provider/model pairs
-  for pair in "${provider_model_pairs[@]}"; do
-    # Split the pair into provider and model
-    IFS=':' read -r provider model <<< "$pair"
-    
-    echo -e "\n${BOLD}Testing provider: $provider${NC}"
-    echo -e "${BOLD}Testing model: $model${NC}"
-
-    # Now test each extension for this provider/model pair
-    for extension in "${EXTENSIONS[@]}"; do
-      test_extension "$provider" "$model" "$extension"
-    done
-  done
-
-  # Print summary
-  if [ ${#ERROR_LOG[@]} -eq 0 ]; then
-    echo -e "\n${GREEN}All tests completed successfully!${NC}"
-  else
-    echo -e "\n${RED}Test Summary - Errors Found:${NC}"
-    echo -e "================================"
-    printf '%b\n' "${ERROR_LOG[@]}"
-    exit 1
-  fi
-}
-
-# Call main with all arguments
-main "$@"
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/.python3@3.11.pkg b/benchmark/goosebench/bin/.python3@3.11.pkg
deleted file mode 120000
index 383f4511d..000000000
--- a/benchmark/goosebench/bin/.python3@3.11.pkg
+++ /dev/null
@@ -1 +0,0 @@
-hermit
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/README.hermit.md b/benchmark/goosebench/bin/README.hermit.md
deleted file mode 100644
index e889550ba..000000000
--- a/benchmark/goosebench/bin/README.hermit.md
+++ /dev/null
@@ -1,7 +0,0 @@
-# Hermit environment
-
-This is a [Hermit](https://github.com/cashapp/hermit) bin directory.
-
-The symlinks in this directory are managed by Hermit and will automatically
-download and install Hermit itself as well as packages. These packages are
-local to this environment.
diff --git a/benchmark/goosebench/bin/activate-hermit b/benchmark/goosebench/bin/activate-hermit
deleted file mode 100755
index fe28214d3..000000000
--- a/benchmark/goosebench/bin/activate-hermit
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-# This file must be used with "source bin/activate-hermit" from bash or zsh.
-# You cannot run it directly
-#
-# THIS FILE IS GENERATED; DO NOT MODIFY
-
-if [ "${BASH_SOURCE-}" = "$0" ]; then
-  echo "You must source this script: \$ source $0" >&2
-  exit 33
-fi
-
-BIN_DIR="$(dirname "${BASH_SOURCE[0]:-${(%):-%x}}")"
-if "${BIN_DIR}/hermit" noop > /dev/null; then
-  eval "$("${BIN_DIR}/hermit" activate "${BIN_DIR}/..")"
-
-  if [ -n "${BASH-}" ] || [ -n "${ZSH_VERSION-}" ]; then
-      hash -r 2>/dev/null
-    fi
-
-    echo "Hermit environment $("${HERMIT_ENV}"/bin/hermit env HERMIT_ENV) activated"
-fi
diff --git a/benchmark/goosebench/bin/activate-hermit.fish b/benchmark/goosebench/bin/activate-hermit.fish
deleted file mode 100755
index 0367d2331..000000000
--- a/benchmark/goosebench/bin/activate-hermit.fish
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/env fish
-
-# This file must be sourced with "source bin/activate-hermit.fish" from Fish shell.
-# You cannot run it directly.
-#
-# THIS FILE IS GENERATED; DO NOT MODIFY
-
-if status is-interactive
-    set BIN_DIR (dirname (status --current-filename))
-
-    if "$BIN_DIR/hermit" noop > /dev/null
-        # Source the activation script generated by Hermit
-        "$BIN_DIR/hermit" activate "$BIN_DIR/.." | source
-
-        # Clear the command cache if applicable
-        functions -c > /dev/null 2>&1
-
-        # Display activation message
-        echo "Hermit environment $($HERMIT_ENV/bin/hermit env HERMIT_ENV) activated"
-    end
-else
-    echo "You must source this script: source $argv[0]" >&2
-    exit 33
-end
diff --git a/benchmark/goosebench/bin/hermit b/benchmark/goosebench/bin/hermit
deleted file mode 100755
index 6dbd60cce..000000000
--- a/benchmark/goosebench/bin/hermit
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-#
-# THIS FILE IS GENERATED; DO NOT MODIFY
-
-set -eo pipefail
-
-export HERMIT_USER_HOME=~
-
-if [ -z "${HERMIT_STATE_DIR}" ]; then
-  case "$(uname -s)" in
-  Darwin)
-    export HERMIT_STATE_DIR="${HERMIT_USER_HOME}/Library/Caches/hermit"
-    ;;
-  Linux)
-    export HERMIT_STATE_DIR="${XDG_CACHE_HOME:-${HERMIT_USER_HOME}/.cache}/hermit"
-    ;;
-  esac
-fi
-
-export HERMIT_DIST_URL="${HERMIT_DIST_URL:-https://d1abdrezunyhdp.cloudfront.net/square}"
-HERMIT_CHANNEL="$(basename "${HERMIT_DIST_URL}")"
-export HERMIT_CHANNEL
-export HERMIT_EXE=${HERMIT_EXE:-${HERMIT_STATE_DIR}/pkg/hermit@${HERMIT_CHANNEL}/hermit}
-
-if [ ! -x "${HERMIT_EXE}" ]; then
-  echo "Bootstrapping ${HERMIT_EXE} from ${HERMIT_DIST_URL}" 1>&2
-  INSTALL_SCRIPT="$(mktemp)"
-  # This value must match that of the install script
-  INSTALL_SCRIPT_SHA256="d9774f75517f9a6d9e371daae9991cdb9fbbc390101b47c3fb2f6876d9094bab"
-  if [ "${INSTALL_SCRIPT_SHA256}" = "BYPASS" ]; then
-    curl -fsSL "${HERMIT_DIST_URL}/install.sh" -o "${INSTALL_SCRIPT}"
-  else
-    # Install script is versioned by its sha256sum value
-    curl -fsSL "${HERMIT_DIST_URL}/install-${INSTALL_SCRIPT_SHA256}.sh" -o "${INSTALL_SCRIPT}"
-    # Verify install script's sha256sum
-    openssl dgst -sha256 "${INSTALL_SCRIPT}" | \
-      awk -v EXPECTED="$INSTALL_SCRIPT_SHA256" \
-      '$2!=EXPECTED {print "Install script sha256 " $2 " does not match " EXPECTED; exit 1}'
-  fi
-  /bin/bash "${INSTALL_SCRIPT}" 1>&2
-fi
-
-exec "${HERMIT_EXE}" --level=fatal exec "$0" -- "$@"
diff --git a/benchmark/goosebench/bin/hermit.hcl b/benchmark/goosebench/bin/hermit.hcl
deleted file mode 100644
index 081cbe834..000000000
--- a/benchmark/goosebench/bin/hermit.hcl
+++ /dev/null
@@ -1,2 +0,0 @@
-github-token-auth {
-}
diff --git a/benchmark/goosebench/bin/pip b/benchmark/goosebench/bin/pip
deleted file mode 120000
index b78b40b15..000000000
--- a/benchmark/goosebench/bin/pip
+++ /dev/null
@@ -1 +0,0 @@
-.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/pip3 b/benchmark/goosebench/bin/pip3
deleted file mode 120000
index b78b40b15..000000000
--- a/benchmark/goosebench/bin/pip3
+++ /dev/null
@@ -1 +0,0 @@
-.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/pip3.11 b/benchmark/goosebench/bin/pip3.11
deleted file mode 120000
index b78b40b15..000000000
--- a/benchmark/goosebench/bin/pip3.11
+++ /dev/null
@@ -1 +0,0 @@
-.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/pydoc3 b/benchmark/goosebench/bin/pydoc3
deleted file mode 120000
index b78b40b15..000000000
--- a/benchmark/goosebench/bin/pydoc3
+++ /dev/null
@@ -1 +0,0 @@
-.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/pydoc3.11 b/benchmark/goosebench/bin/pydoc3.11
deleted file mode 120000
index b78b40b15..000000000
--- a/benchmark/goosebench/bin/pydoc3.11
+++ /dev/null
@@ -1 +0,0 @@
-.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/python b/benchmark/goosebench/bin/python
deleted file mode 120000
index b78b40b15..000000000
--- a/benchmark/goosebench/bin/python
+++ /dev/null
@@ -1 +0,0 @@
-.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/python3 b/benchmark/goosebench/bin/python3
deleted file mode 120000
index b78b40b15..000000000
--- a/benchmark/goosebench/bin/python3
+++ /dev/null
@@ -1 +0,0 @@
-.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/python3-config b/benchmark/goosebench/bin/python3-config
deleted file mode 120000
index b78b40b15..000000000
--- a/benchmark/goosebench/bin/python3-config
+++ /dev/null
@@ -1 +0,0 @@
-.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/python3.11 b/benchmark/goosebench/bin/python3.11
deleted file mode 120000
index b78b40b15..000000000
--- a/benchmark/goosebench/bin/python3.11
+++ /dev/null
@@ -1 +0,0 @@
-.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/bin/python3.11-config b/benchmark/goosebench/bin/python3.11-config
deleted file mode 120000
index b78b40b15..000000000
--- a/benchmark/goosebench/bin/python3.11-config
+++ /dev/null
@@ -1 +0,0 @@
-.python3@3.11.pkg
\ No newline at end of file
diff --git a/benchmark/goosebench/config.yaml b/benchmark/goosebench/config.yaml
deleted file mode 100644
index 50c26b306..000000000
--- a/benchmark/goosebench/config.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
-GOOSE_MODEL: goose
-GOOSE_PROVIDER: databricks
-extensions:
-  developer:
-    enabled: true
-    name: developer
-    type: builtin
diff --git a/benchmark/goosebench/goosebench/__init__.py b/benchmark/goosebench/goosebench/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/benchmark/goosebench/goosebench/bench.py b/benchmark/goosebench/goosebench/bench.py
deleted file mode 100644
index 7566ea43e..000000000
--- a/benchmark/goosebench/goosebench/bench.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import os
-import subprocess
-import tempfile
-import time
-from typing import Optional, List
-
-from goosebench.main import console, EXTENSION_PROMPTS
-
-
-class Bench:
-    def __init__(self):
-        self.error_log = []
-
-    def log_error(self, provider: str, model: str, extension: str, error: str) -> None:
-        """Log an error message."""
-        self.error_log.append(
-            f"Provider: {provider}, Model: {model}, Extension: {extension}\n{error}\n"
-        )
-
-    def evaluate(self,
-                 provider: str,
-                 model: str,
-                 extension: str,
-                 prompt: str,
-                 follow_ups: Optional[List[str]] = None) -> None:
-        """Run a single test with the given parameters using pexpect."""
-        console.print(f"Testing: {provider}/{model} with {extension}", style="info")
-        console.print(f"Prompt: {prompt}", style="info")
-
-        follow_ups = follow_ups or []
-
-        # Create temporary file for prompt
-        with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp:
-            temp.write(prompt)
-            temp_path = temp.name
-
-        try:
-            # Run goose with timeout
-            cmd = ['goose', 'run', '--with-builtin', extension, '-t', prompt]
-            result = subprocess.run(
-                cmd,
-                capture_output=True,
-                text=True,
-                timeout=30
-            )
-
-            if result.returncode != 0:
-                self.log_error(provider, model, extension,
-                               result.stdout + result.stderr)
-                console.print("✗ Test failed", style="error")
-
-            else:
-                console.print("✓ Test passed")
-
-        except subprocess.TimeoutExpired:
-            self.log_error(provider, model, extension,
-                           "Test timed out after 30 seconds")
-            console.print("✗ Test timed out", style="error")
-        except Exception as e:
-            self.log_error(provider, model, extension, str(e))
-            console.print("✗ Test failed with unexpected error", style="error")
-        finally:
-            os.unlink(temp_path)
-
-    def _run_serial(self, provider: str, model: str, extension: str) -> None:
-        prompts = EXTENSION_PROMPTS.get(extension, [])
-        for prompt in prompts:
-            self.evaluate(provider, model, extension, prompt)
-            time.sleep(2)  # brief pause between tests
-
-    def test_extension(self, provider: str, model: str, extension: str) -> None:
-        """Test all prompts for a given extension."""
-        console.rule(f"Testing extension: {extension}")
-        return self._run_serial(provider, model, extension)
diff --git a/benchmark/goosebench/goosebench/extensions/__init__.py b/benchmark/goosebench/goosebench/extensions/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/automation_script.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/automation_script.py
deleted file mode 100644
index 9c6944ec8..000000000
--- a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/automation_script.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""Test cases for the automation script tool."""
-
-# Prompts that should trigger valid automation script tool usage
-valid_prompts = [
-    "Create a shell script to sort unique lines in a file",
-    "Write a Ruby script to process some text data",
-    "Make a script to extract the second column from a CSV",
-    "Create a script to find pattern matches in a file",
-    "Write a shell script to process log files",
-    "Create a Ruby script for text manipulation",
-    "Make a script to analyze data in a text file",
-    "Write a script to format JSON data",
-    "Create a script to clean up file names",
-    "Write a script to extract specific data from files",
-]
-
-# Prompts that should not trigger automation script tool usage based on tool description
-invalid_prompts = [
-    "Create a complex application with multiple files",
-    "Write a script that requires external dependencies",
-    "Create a script that needs a database",
-    "Write a GUI application",
-    "Create a web server application",
-    "Write a script that needs special system access",
-    "Create a script that requires third-party libraries",
-    "Write a script that needs network services",
-    "Create a distributed processing script",
-    "Write a script that requires system installation",
-]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/cache.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/cache.py
deleted file mode 100644
index be78ad8c8..000000000
--- a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/cache.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""Test cases for the cache tool."""
-
-# Prompts that should trigger valid cache tool usage
-valid_prompts = [
-    "List all cached files",
-    "Show me what's in the cache",
-    "View the content of this cached file",
-    "Delete this specific cached file",
-    "Clear all cached data",
-    "Show the contents of a cached file",
-    "Remove this file from cache",
-    "List the cache directory contents",
-    "View a cached text file",
-    "Delete everything from the cache",
-]
-
-# Prompts that should not trigger cache tool usage based on tool description
-invalid_prompts = [
-    "Modify a cached file directly",
-    "Search within cached files",
-    "Compress the cache directory",
-    "Move cached files to another location",
-    "Change cache directory permissions",
-    "Reorganize cached files",
-    "Filter cache by file type",
-    "Sort cached files by size",
-    "Archive old cached files",
-    "Backup the cache directory",
-]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/computer_control.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/computer_control.py
deleted file mode 100644
index 3d17f19cd..000000000
--- a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/computer_control.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""Test cases for the computer control tool."""
-
-# Prompts that should trigger valid computer control tool usage
-valid_prompts = [
-    "Launch Safari and open a specific URL",
-    "Use AppleScript to automate Mail app",
-    "Click a button in the current application",
-    "Fill out a form in Safari",
-    "Control system volume using AppleScript",
-    "Organize files in a folder",
-    "Add an event to Calendar",
-    "Send an email using Mail app",
-    "Manage iTunes playlist",
-    "Automate document processing in Pages",
-]
-
-# Prompts that should not trigger computer control tool usage based on tool description
-invalid_prompts = [
-    "Control applications that don't support AppleScript",
-    "Perform actions requiring root access",
-    "Modify system files directly",
-    "Access restricted system areas",
-    "Control non-Apple applications without AppleScript support",
-    "Perform actions requiring kernel modifications",
-    "Execute privileged system commands",
-    "Modify protected system settings",
-    "Access hardware directly",
-    "Control low-level system functions",
-]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_scrape.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_scrape.py
deleted file mode 100644
index ebc7eed5a..000000000
--- a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_scrape.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""Test cases for the web scrape tool."""
-
-# Prompts that should trigger valid web scrape tool usage
-valid_prompts = [
-    "Fetch the content from https://example.com",
-    "Download the HTML from this webpage",
-    "Get JSON data from this API endpoint",
-    "Save this image from the web",
-    "Scrape text content from this URL",
-    "Download this webpage as text",
-    "Get the JSON response from this API",
-    "Save this binary file from the web",
-    "Fetch and cache this webpage",
-    "Download this document as text",
-]
-
-# Prompts that should not trigger web scrape tool usage based on tool description
-invalid_prompts = [
-    "Scrape a complex web application with dynamic content",
-    "Extract data from a JavaScript-heavy website",
-    "Scrape content that requires login",
-    "Download content from multiple pages at once",
-    "Extract data from a site with anti-scraping measures",
-    "Scrape content that requires user interaction",
-    "Download content from a protected API",
-    "Extract data from pages requiring authentication",
-    "Scrape content from multiple URLs simultaneously",
-    "Download data from a site requiring cookies",
-]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_search.py b/benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_search.py
deleted file mode 100644
index 556ad0ea6..000000000
--- a/benchmark/goosebench/goosebench/extensions/computercontroller_tool/web_search.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""Test cases for the web search tool."""
-
-# Prompts that should trigger valid web search tool usage
-valid_prompts = [
-    "Search for information about 'Tesla'",
-    "Look up what 'Bitcoin' is",
-    "Find details about 'SpaceX'",
-    "Search for 'Python' programming language",
-    "What is 'Docker'?",
-    "Look up the company 'Microsoft'",
-    "Search for information about 'Linux'",
-    "Find out about 'AWS'",
-    "What is 'Kubernetes'?",
-    "Search for 'React' framework",
-]
-
-# Prompts that should not trigger web search tool usage based on tool description
-invalid_prompts = [
-    "Search for multiple words at once",
-    "Look up a complex query with multiple terms",
-    "Search for a long phrase",
-    "Find results for this entire sentence",
-    "Search for 'word1 word2 word3'",
-    "Look up multiple topics at once",
-    "Search for a paragraph of text",
-    "Find results for multiple questions",
-    "Search for a list of items",
-    "Look up several different topics",
-]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/extensions/developer_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/developer_tool/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/benchmark/goosebench/goosebench/extensions/developer_tool/list_windows.py b/benchmark/goosebench/goosebench/extensions/developer_tool/list_windows.py
deleted file mode 100644
index 94906613f..000000000
--- a/benchmark/goosebench/goosebench/extensions/developer_tool/list_windows.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""Test cases for the list windows tool."""
-
-# Prompts that should trigger valid list windows tool usage
-valid_prompts = [
-    "Show me all available windows",
-    "List the windows that can be captured",
-    "What windows are currently open?",
-    "Display available window titles",
-    "Get a list of windows for screen capture",
-    "Show window titles that I can screenshot",
-    "What windows can I take screenshots of?",
-    "List all window titles",
-    "Show me what windows are available for capture",
-    "Get available window names",
-]
-
-# Prompts that should not trigger list windows tool usage based on tool description
-invalid_prompts = [
-    "Close all windows",
-    "Minimize the current window",
-    "Maximize the browser window",
-    "Move window to another display",
-    "Resize the current window",
-    "Change window focus",
-    "Arrange windows on screen",
-    "Hide inactive windows",
-    "Show desktop",
-    "Switch between windows",
-]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/extensions/developer_tool/screen_capture.py b/benchmark/goosebench/goosebench/extensions/developer_tool/screen_capture.py
deleted file mode 100644
index 0daef846f..000000000
--- a/benchmark/goosebench/goosebench/extensions/developer_tool/screen_capture.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""Test cases for the screen capture tool."""
-
-# Prompts that should trigger valid screen capture tool usage
-valid_prompts = [
-    "Take a screenshot of the main display",
-    "Capture the window titled 'Terminal'",
-    "Screenshot the current display",
-    "Take a picture of display 0",
-    "Capture a screenshot of the browser window",
-    "Take a screenshot of the active window",
-    "Capture display 1",
-    "Screenshot the window named 'Settings'",
-    "Take a capture of the main screen",
-    "Screenshot the specified window",
-]
-
-# Prompts that should not trigger screen capture tool usage based on tool description
-invalid_prompts = [
-    "Capture multiple windows at once",
-    "Take a screenshot of all displays",
-    "Record a video of the screen",
-    "Capture a region of the screen",
-    "Take a partial screenshot",
-    "Screenshot a specific area",
-    "Capture screen with mouse cursor",
-    "Take a timed screenshot",
-    "Screenshot with specific dimensions",
-    "Capture screen without window decorations",
-]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/extensions/developer_tool/shell.py b/benchmark/goosebench/goosebench/extensions/developer_tool/shell.py
deleted file mode 100644
index 916f0cfd4..000000000
--- a/benchmark/goosebench/goosebench/extensions/developer_tool/shell.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""Test cases for the shell tool."""
-
-# Prompts that should trigger valid shell tool usage
-valid_prompts = [
-    "Run the command 'ls' to list files",
-    "Execute 'pwd' to show current directory",
-    "Use ripgrep to search for files containing 'example'",
-    "Find all Python files using 'rg --files | rg .py'",
-    "Search for the string 'class Example' in files using ripgrep",
-    "Show the contents of a file using cat",
-    "Count lines in a file using wc -l",
-    "Check disk space with df -h",
-    "List processes with ps",
-    "Create a directory with mkdir test",
-]
-
-# Prompts that should not trigger shell tool usage based on tool description
-invalid_prompts = [
-    "Run a command that will produce gigabytes of output",
-    "Start a long-running server without backgrounding it",
-    "Use find to recursively search for files",
-    "Use ls -R to list all files recursively",
-    "Execute a command that will run indefinitely",
-    "Run a command that streams continuous output",
-    "Use grep recursively to search files",
-    "Start a process that needs to be manually terminated",
-    "Run a command that generates unlimited output",
-    "Execute ls -la on the entire filesystem",
-]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/extensions/developer_tool/text_editor.py b/benchmark/goosebench/goosebench/extensions/developer_tool/text_editor.py
deleted file mode 100644
index 689548c01..000000000
--- a/benchmark/goosebench/goosebench/extensions/developer_tool/text_editor.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""Test cases for the text editor tool."""
-
-# Prompts that should trigger valid text editor tool usage
-valid_prompts = [
-    "View the contents of file.txt",
-    "Show me what's in config.py",
-    "Create a new file called test.txt with 'Hello World' content",
-    "Write 'print(\"hello\")' to script.py",
-    "Replace the string 'old_version' with 'new_version' in config.txt",
-    "Change 'debug=True' to 'debug=False' in settings.py",
-    "Undo the last edit made to main.py",
-    "Revert the previous change in config.json",
-    "Write this JSON content to data.json",
-    "Update the version number in package.json",
-]
-
-# Prompts that should not trigger text editor tool usage based on tool description
-invalid_prompts = [
-    "Edit multiple sections of the file at once",
-    "Replace all occurrences of a string in the file",
-    "Make changes to multiple files simultaneously",
-    "Modify a file that's larger than 400KB",
-    "Edit a file with more than 400,000 characters",
-    "Replace a string that appears multiple times in the file",
-    "Make partial updates to specific sections without full file content",
-    "Edit binary files",
-    "Modify files without providing full path",
-    "Replace text without exact string match",
-]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/extensions/memory_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/memory_tool/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/benchmark/goosebench/goosebench/extensions/memory_tool/remember_memory.py b/benchmark/goosebench/goosebench/extensions/memory_tool/remember_memory.py
deleted file mode 100644
index 5be419786..000000000
--- a/benchmark/goosebench/goosebench/extensions/memory_tool/remember_memory.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""Test cases for the remember memory tool."""
-
-# Prompts that should trigger valid remember memory tool usage
-valid_prompts = [
-    "Remember this development preference in the 'development' category",
-    "Store this setting globally with tags #config #setup",
-    "Save this workflow detail locally in 'workflow' category",
-    "Remember my name and email in the 'personal' category globally",
-    "Store project configuration locally with #settings tag",
-    "Save this formatting preference in development category",
-    "Remember this shortcut in 'keyboard' category with #shortcuts tag",
-    "Store build instructions locally in 'build' category",
-    "Save API credentials globally in 'credentials' category",
-    "Remember git configuration in 'git' category with #config tag",
-]
-
-# Prompts that should not trigger remember memory tool usage based on tool description
-invalid_prompts = [
-    "Save this without specifying a category",
-    "Store this without indicating global or local scope",
-    "Remember this with invalid tags format",
-    "Save empty content in a category",
-    "Store this in multiple categories at once",
-    "Remember this with system-level access",
-    "Save this in a protected category",
-    "Store this with special file permissions",
-    "Remember this in a non-existent directory",
-    "Save this with binary content",
-]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/extensions/memory_tool/remove_memory_category.py b/benchmark/goosebench/goosebench/extensions/memory_tool/remove_memory_category.py
deleted file mode 100644
index fb29aebfe..000000000
--- a/benchmark/goosebench/goosebench/extensions/memory_tool/remove_memory_category.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""Test cases for the remove memory category tool."""
-
-# Prompts that should trigger valid remove memory category tool usage
-valid_prompts = [
-    "Delete all memories in the 'development' category",
-    "Clear the 'workflow' category from global storage",
-    "Remove all local project settings",
-    "Delete everything in the 'personal' category",
-    "Clear all global memories",
-    "Remove all local memories",
-    "Delete the 'build' category",
-    "Clear project configuration category",
-    "Remove the 'git' category memories",
-    "Delete all items in 'credentials' category",
-]
-
-# Prompts that should not trigger remove memory category tool usage based on tool description
-invalid_prompts = [
-    "Delete memories across multiple categories",
-    "Remove memories without specifying scope",
-    "Clear memories by date range",
-    "Delete memories by content",
-    "Remove memories with specific tags",
-    "Clear memories by partial category match",
-    "Delete memories selectively",
-    "Remove memories by size",
-    "Clear recently modified memories",
-    "Delete memories by author",
-]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/extensions/memory_tool/remove_specific_memory.py b/benchmark/goosebench/goosebench/extensions/memory_tool/remove_specific_memory.py
deleted file mode 100644
index 502a43895..000000000
--- a/benchmark/goosebench/goosebench/extensions/memory_tool/remove_specific_memory.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""Test cases for the remove specific memory tool."""
-
-# Prompts that should trigger valid remove specific memory tool usage
-valid_prompts = [
-    "Delete the memory about code formatting from development category",
-    "Remove the git configuration memory from global storage",
-    "Delete project API key from credentials category",
-    "Remove my email setting from personal category",
-    "Delete the build instruction memory from local storage",
-    "Remove specific workflow step from workflow category",
-    "Delete keyboard shortcut memory from shortcuts category",
-    "Remove specific project setting from local config",
-    "Delete specific credential from global storage",
-    "Remove particular preference from settings category",
-]
-
-# Prompts that should not trigger remove specific memory tool usage based on tool description
-invalid_prompts = [
-    "Delete multiple memories at once",
-    "Remove memories by pattern matching",
-    "Delete memories without exact content",
-    "Remove memories by tag only",
-    "Delete memories by date",
-    "Remove partial memory content",
-    "Delete memories by regex",
-    "Remove memories without category",
-    "Delete memories by approximate match",
-    "Remove memories without scope",
-]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/extensions/memory_tool/retrieve_memories.py b/benchmark/goosebench/goosebench/extensions/memory_tool/retrieve_memories.py
deleted file mode 100644
index 59b187f8e..000000000
--- a/benchmark/goosebench/goosebench/extensions/memory_tool/retrieve_memories.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""Test cases for the retrieve memories tool."""
-
-# Prompts that should trigger valid retrieve memories tool usage
-valid_prompts = [
-    "Show all memories in the 'development' category",
-    "Get my stored preferences from global memory",
-    "Retrieve local project settings",
-    "Show me what's stored in the 'workflow' category",
-    "Get all global memories",
-    "Retrieve everything from local storage",
-    "Show memories tagged with #config",
-    "Get all items from 'personal' category",
-    "Retrieve project-specific memories",
-    "Show what's saved in the 'build' category",
-]
-
-# Prompts that should not trigger retrieve memories tool usage based on tool description
-invalid_prompts = [
-    "Search across multiple categories at once",
-    "Find memories without specifying scope",
-    "Get memories with complex search criteria",
-    "Retrieve memories by date range",
-    "Search memories by content",
-    "Get memories by partial category match",
-    "Retrieve memories with regex patterns",
-    "Find memories by size",
-    "Get memories modified recently",
-    "Search memories by author",
-]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/extensions/not_used/__init__.py b/benchmark/goosebench/goosebench/extensions/not_used/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_read.py b/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_read.py
deleted file mode 100644
index 4f81b9ec2..000000000
--- a/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_read.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""Test cases for the Google Drive read tool."""
-
-# Prompts that should trigger valid read tool usage
-valid_prompts = [
-    "Read the file with URI gdrive:///abc123",
-    "Show me the contents of gdrive:///xyz789",
-    "Get the text from gdrive:///doc456",
-    "Read this Google Doc gdrive:///123abc",
-    "Show the contents of spreadsheet gdrive:///789xyz",
-    "Get the text of presentation gdrive:///456def",
-    "Read file gdrive:///def123 and include images",
-    "Show me gdrive:///789abc without images",
-    "Get the content of document gdrive:///xyz456",
-    "Read text file gdrive:///123xyz",
-]
-
-# Prompts that should not trigger read tool usage based on tool description
-invalid_prompts = [
-    "Edit the file gdrive:///abc123",
-    "Write to document gdrive:///xyz789",
-    "Modify spreadsheet gdrive:///123def",
-    "Update presentation gdrive:///def789",
-    "Delete file gdrive:///789xyz",
-    "Create new document gdrive:///456abc",
-    "Share file gdrive:///xyz123",
-    "Move document gdrive:///789def",
-    "Copy file gdrive:///abc789",
-    "Rename document gdrive:///def456",
-]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_search.py b/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_search.py
deleted file mode 100644
index 3a093b5f5..000000000
--- a/benchmark/goosebench/goosebench/extensions/not_used/google_drive_tool/google_drive_search.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""Test cases for the Google Drive search tool."""
-
-# Prompts that should trigger valid search tool usage
-valid_prompts = [
-    "Search for files named 'budget'",
-    "Find documents containing 'report'",
-    "Look for files with 'presentation' in the name",
-    "Search my drive for 'meeting notes'",
-    "Find files named 'project plan'",
-    "Search for 'invoice' in my files",
-    "Look up documents named 'proposal'",
-    "Find spreadsheets with 'data' in the name",
-    "Search for files containing 'schedule'",
-    "Find documents with 'summary' in the title",
-]
-
-# Prompts that should not trigger search tool usage based on tool description
-invalid_prompts = [
-    "Search for files modified in the last week",
-    "Find files larger than 1MB",
-    "Search for files shared with me",
-    "Look for files in a specific folder",
-    "Find files by type",
-    "Search for files by owner",
-    "Look for recently modified files",
-    "Find files with specific permissions",
-    "Search for files by date",
-    "Find files in trash",
-]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/jetbrains.py b/benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/jetbrains.py
deleted file mode 100644
index b5cf13ef8..000000000
--- a/benchmark/goosebench/goosebench/extensions/not_used/jetbrains_tool/jetbrains.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""Test cases for the JetBrains IDE integration tools."""
-
-# Prompts that should trigger valid JetBrains tool usage
-valid_prompts = [
-    "Open the current file in the IDE",
-    "Navigate to line 42 in the active file",
-    "Find usages of this class",
-    "Go to the definition of this method",
-    "Show documentation for this symbol",
-    "Run the current test file",
-    "Debug this application",
-    "Show project structure",
-    "Open recent files",
-    "Search everywhere in the project",
-]
-
-# Prompts that should not trigger JetBrains tool usage based on tool description
-invalid_prompts = [
-    "Create a new IDE instance",
-    "Modify IDE settings",
-    "Install new plugins",
-    "Change IDE theme",
-    "Update the IDE version",
-    "Configure version control",
-    "Modify IDE keymap",
-    "Change project settings",
-    "Install new IDE features",
-    "Uninstall IDE components",
-]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/extensions/tutorial_tool/__init__.py b/benchmark/goosebench/goosebench/extensions/tutorial_tool/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/benchmark/goosebench/goosebench/extensions/tutorial_tool/tutorial.py b/benchmark/goosebench/goosebench/extensions/tutorial_tool/tutorial.py
deleted file mode 100644
index f208d6a76..000000000
--- a/benchmark/goosebench/goosebench/extensions/tutorial_tool/tutorial.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""Test cases for the load tutorial tool."""
-
-# Prompts that should trigger valid load tutorial tool usage
-valid_prompts = [
-    "Show me the getting-started tutorial",
-    "Load the developer-mcp tutorial",
-    "I need help getting started, show the tutorial",
-    "Can you load the tutorial about development?",
-    "Show me how to use Goose with the tutorial",
-    "Load the beginner's guide tutorial",
-    "I'm new here, can you show me the introduction tutorial?",
-    "Display the tutorial for developers",
-    "Show the tutorial about MCP development",
-    "Load the basic usage tutorial",
-]
-
-# Prompts that should not trigger load tutorial tool usage based on tool description
-invalid_prompts = [
-    "Create a new tutorial",
-    "Edit the existing tutorial",
-    "Delete this tutorial",
-    "Modify tutorial content",
-    "Save this as a tutorial",
-    "Update the tutorial text",
-    "Remove old tutorials",
-    "Change tutorial format",
-    "Add new tutorial section",
-    "Merge multiple tutorials",
-]
\ No newline at end of file
diff --git a/benchmark/goosebench/goosebench/main.py b/benchmark/goosebench/goosebench/main.py
deleted file mode 100755
index d281412b3..000000000
--- a/benchmark/goosebench/goosebench/main.py
+++ /dev/null
@@ -1,118 +0,0 @@
-#!/usr/bin/env python3
-from typing import List, Optional
-
-import typer
-from rich.console import Console
-from rich.theme import Theme
-from typing_extensions import Annotated
-
-from goosebench.bench import Bench
-
-# Initialize typer app and rich console
-app = typer.Typer(help="Goose CLI Integration Tests")
-console = Console(theme=Theme({
-    "info": "cyan",
-    "warning": "yellow",
-    "error": "red",
-    "success": "green"
-}))
-
-# Extension configurations
-EXTENSIONS = ['developer', 'computercontroller', 'google_drive', 'memory']
-
-EXTENSION_PROMPTS = {
-    'developer': [
-        "List the contents of the current directory.",
-        "Create a new file called test.txt with the content 'Hello, World!'",
-        "Read the contents of test.txt"
-    ],
-    'computercontroller': [
-        "What are the headlines on hackernews? Organize the list into categories.",
-        "Make a ding sound"
-    ],
-    'google_drive': [
-        "List the files in my Google Drive.",
-        "Search for documents containing 'meeting notes'"
-    ],
-    'memory': [
-        "Save this fact: The capital of France is Paris.",
-        "What is the capital of France?"
-    ]
-}
-
-
-def parse_provider_model(ctx: typer.Context, provider_models: List[str]) -> List[
-    tuple[str, str]]:
-    """Parse provider:model strings into tuples."""
-    result = []
-    for pm in provider_models:
-        try:
-            provider, models = pm.split(':')
-            for model in models.split(','):
-                result.append((provider.strip(), model.strip()))
-        except ValueError:
-            raise typer.BadParameter(
-                f"Invalid format: {pm}. Use format 'provider:model' or 'provider:model1,model2'"
-            )
-    return result
-
-
-@app.command()
-def main(
-        provider_models: Annotated[
-            Optional[List[str]],
-            typer.Option(
-                '--provider-model', '-pm',
-                help="Provider and model in format 'provider:model' or 'provider:model1,model2'"
-            )
-        ] = None,
-        verbose: Annotated[
-            bool,
-            typer.Option('--verbose', '-v', help="Enable verbose output")
-        ] = False,
-):
-    """
-    Run Goose CLI Integration Tests.
-    
-    Example usage:
-    
-    python main.py  # Uses default: databricks:goose
-    python main.py -pm anthropic:claude
-    python main.py -pm anthropic:claude,claude2
-    python main.py -pm anthropic:claude -pm databricks:goose
-    """
-    console.print("Starting Goose CLI Integration Tests", style="bold")
-
-    runner = Bench()
-
-    # Use default if no provider-models specified
-    if not provider_models:
-        provider_models = ['databricks:goose']
-
-    # Parse provider-model pairs
-    try:
-        provider_model_pairs = parse_provider_model(typer.Context, provider_models)
-    except typer.BadParameter as e:
-        console.print(f"Error: {str(e)}", style="error")
-        raise typer.Exit(1)
-
-    for provider, model in provider_model_pairs:
-        console.rule(f"Testing provider: {provider}")
-        console.print(f"Testing model: {model}", style="bold")
-
-        for extension in EXTENSIONS:
-            runner.test_extension(provider, model, extension)
-
-    # Print summary
-    if not runner.error_log:
-        console.print("\nAll tests completed successfully!", style="success")
-    else:
-        console.print("\nTest Summary - Errors Found:", style="error")
-        console.rule("Errors")
-        for error in runner.error_log:
-            console.print(error, style="error")
-        raise typer.Exit(1)
-
-
-if __name__ == "__main__":
-    app()
diff --git a/benchmark/goosebench/poetry.lock b/benchmark/goosebench/poetry.lock
deleted file mode 100644
index 7141b3575..000000000
--- a/benchmark/goosebench/poetry.lock
+++ /dev/null
@@ -1,310 +0,0 @@
-# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand.
-
-[[package]]
-name = "click"
-version = "8.1.8"
-description = "Composable command line interface toolkit"
-optional = false
-python-versions = ">=3.7"
-groups = ["main"]
-files = [
-    {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"},
-    {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"},
-]
-
-[package.dependencies]
-colorama = {version = "*", markers = "platform_system == \"Windows\""}
-
-[package.source]
-type = "legacy"
-url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
-reference = "artifactory"
-
-[[package]]
-name = "colorama"
-version = "0.4.6"
-description = "Cross-platform colored terminal text."
-optional = false
-python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
-groups = ["main", "dev"]
-files = [
-    {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
-    {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
-]
-markers = {main = "platform_system == \"Windows\"", dev = "sys_platform == \"win32\""}
-
-[package.source]
-type = "legacy"
-url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
-reference = "artifactory"
-
-[[package]]
-name = "iniconfig"
-version = "2.0.0"
-description = "brain-dead simple config-ini parsing"
-optional = false
-python-versions = ">=3.7"
-groups = ["dev"]
-files = [
-    {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"},
-    {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
-]
-
-[package.source]
-type = "legacy"
-url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
-reference = "artifactory"
-
-[[package]]
-name = "markdown-it-py"
-version = "3.0.0"
-description = "Python port of markdown-it. Markdown parsing, done right!"
-optional = false
-python-versions = ">=3.8"
-groups = ["main"]
-files = [
-    {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"},
-    {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"},
-]
-
-[package.dependencies]
-mdurl = ">=0.1,<1.0"
-
-[package.extras]
-benchmarking = ["psutil", "pytest", "pytest-benchmark"]
-code-style = ["pre-commit (>=3.0,<4.0)"]
-compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"]
-linkify = ["linkify-it-py (>=1,<3)"]
-plugins = ["mdit-py-plugins"]
-profiling = ["gprof2dot"]
-rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"]
-testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"]
-
-[package.source]
-type = "legacy"
-url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
-reference = "artifactory"
-
-[[package]]
-name = "mdurl"
-version = "0.1.2"
-description = "Markdown URL utilities"
-optional = false
-python-versions = ">=3.7"
-groups = ["main"]
-files = [
-    {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"},
-    {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
-]
-
-[package.source]
-type = "legacy"
-url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
-reference = "artifactory"
-
-[[package]]
-name = "packaging"
-version = "24.2"
-description = "Core utilities for Python packages"
-optional = false
-python-versions = ">=3.8"
-groups = ["dev"]
-files = [
-    {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"},
-    {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"},
-]
-
-[package.source]
-type = "legacy"
-url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
-reference = "artifactory"
-
-[[package]]
-name = "pexpect"
-version = "4.9.0"
-description = "Pexpect allows easy control of interactive console applications."
-optional = false
-python-versions = "*"
-groups = ["main"]
-files = [
-    {file = "pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523"},
-    {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"},
-]
-
-[package.dependencies]
-ptyprocess = ">=0.5"
-
-[package.source]
-type = "legacy"
-url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
-reference = "artifactory"
-
-[[package]]
-name = "pluggy"
-version = "1.5.0"
-description = "plugin and hook calling mechanisms for python"
-optional = false
-python-versions = ">=3.8"
-groups = ["dev"]
-files = [
-    {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"},
-    {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"},
-]
-
-[package.extras]
-dev = ["pre-commit", "tox"]
-testing = ["pytest", "pytest-benchmark"]
-
-[package.source]
-type = "legacy"
-url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
-reference = "artifactory"
-
-[[package]]
-name = "ptyprocess"
-version = "0.7.0"
-description = "Run a subprocess in a pseudo terminal"
-optional = false
-python-versions = "*"
-groups = ["main"]
-files = [
-    {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"},
-    {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"},
-]
-
-[package.source]
-type = "legacy"
-url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
-reference = "artifactory"
-
-[[package]]
-name = "pygments"
-version = "2.19.1"
-description = "Pygments is a syntax highlighting package written in Python."
-optional = false
-python-versions = ">=3.8"
-groups = ["main"]
-files = [
-    {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"},
-    {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"},
-]
-
-[package.extras]
-windows-terminal = ["colorama (>=0.4.6)"]
-
-[package.source]
-type = "legacy"
-url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
-reference = "artifactory"
-
-[[package]]
-name = "pytest"
-version = "8.3.4"
-description = "pytest: simple powerful testing with Python"
-optional = false
-python-versions = ">=3.8"
-groups = ["dev"]
-files = [
-    {file = "pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"},
-    {file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"},
-]
-
-[package.dependencies]
-colorama = {version = "*", markers = "sys_platform == \"win32\""}
-iniconfig = "*"
-packaging = "*"
-pluggy = ">=1.5,<2"
-
-[package.extras]
-dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
-
-[package.source]
-type = "legacy"
-url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
-reference = "artifactory"
-
-[[package]]
-name = "rich"
-version = "13.9.4"
-description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
-optional = false
-python-versions = ">=3.8.0"
-groups = ["main"]
-files = [
-    {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"},
-    {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"},
-]
-
-[package.dependencies]
-markdown-it-py = ">=2.2.0"
-pygments = ">=2.13.0,<3.0.0"
-
-[package.extras]
-jupyter = ["ipywidgets (>=7.5.1,<9)"]
-
-[package.source]
-type = "legacy"
-url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
-reference = "artifactory"
-
-[[package]]
-name = "shellingham"
-version = "1.5.4"
-description = "Tool to Detect Surrounding Shell"
-optional = false
-python-versions = ">=3.7"
-groups = ["main"]
-files = [
-    {file = "shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686"},
-    {file = "shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de"},
-]
-
-[package.source]
-type = "legacy"
-url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
-reference = "artifactory"
-
-[[package]]
-name = "typer"
-version = "0.15.1"
-description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
-optional = false
-python-versions = ">=3.7"
-groups = ["main"]
-files = [
-    {file = "typer-0.15.1-py3-none-any.whl", hash = "sha256:7994fb7b8155b64d3402518560648446072864beefd44aa2dc36972a5972e847"},
-    {file = "typer-0.15.1.tar.gz", hash = "sha256:a0588c0a7fa68a1978a069818657778f86abe6ff5ea6abf472f940a08bfe4f0a"},
-]
-
-[package.dependencies]
-click = ">=8.0.0"
-rich = ">=10.11.0"
-shellingham = ">=1.3.0"
-typing-extensions = ">=3.7.4.3"
-
-[package.source]
-type = "legacy"
-url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
-reference = "artifactory"
-
-[[package]]
-name = "typing-extensions"
-version = "4.12.2"
-description = "Backported and Experimental Type Hints for Python 3.8+"
-optional = false
-python-versions = ">=3.8"
-groups = ["main"]
-files = [
-    {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"},
-    {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
-]
-
-[package.source]
-type = "legacy"
-url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
-reference = "artifactory"
-
-[metadata]
-lock-version = "2.1"
-python-versions = ">=3.11"
-content-hash = "58e6c7676973e0793089a2eff7d2dc54f11c5760cd7b0c43ecd5143588ffa046"
diff --git a/benchmark/goosebench/pyproject.toml b/benchmark/goosebench/pyproject.toml
deleted file mode 100644
index 00f2cecce..000000000
--- a/benchmark/goosebench/pyproject.toml
+++ /dev/null
@@ -1,28 +0,0 @@
-[project]
-name = "goose-monitoring-job"
-version = "0.1.0"
-description = ""
-readme = "README.md"
-authors = [
-    { name = "Your Name", email = "you@example.com" }
-]
-
-[[tool.poetry.source]]
-name = "artifactory"
-priority = "primary"
-url = "https://artifactory.global.square/artifactory/api/pypi/block-pypi/simple"
-
-
-[tool.poetry.dependencies]
-python = ">=3.11"
-pexpect = "^4.9.0"
-typer = "^0.15.1"
-rich = "^13.9.4"
-
-
-[tool.poetry.group.dev.dependencies]
-pytest = "^8.3.4"
-
-[build-system]
-requires = ["poetry-core>=2.0.0,<3.0.0"]
-build-backend = "poetry.core.masonry.api"

From 60a6100d042343ac1c3da8c50f2b624c905c3b37 Mon Sep 17 00:00:00 2001
From: Marcelle Bonterre <marcelle@squareup.com>
Date: Wed, 19 Feb 2025 15:37:17 -0500
Subject: [PATCH 05/10] add cli opt to specify which eval suite to run +
 support nested suites

---
 crates/goose-bench/Cargo.toml                 |  6 +-
 .../core/complex_tasks/flappy_bird.rs         | 21 ++++++
 .../src/eval_suites/core/complex_tasks/mod.rs |  1 +
 .../goose-bench/src/eval_suites/core/mod.rs   |  1 +
 .../goose-bench/src/eval_suites/evaluation.rs | 16 +++--
 crates/goose-bench/src/eval_suites/factory.rs | 41 +++++++-----
 .../src/eval_suites/flappy_bird.rs            | 29 ---------
 crates/goose-bench/src/eval_suites/mod.rs     |  5 +-
 crates/goose-cli/Cargo.toml                   |  1 +
 crates/goose-cli/src/commands/bench.rs        | 64 ++++++++++---------
 crates/goose-cli/src/main.rs                  | 45 ++++++++-----
 crates/goose-cli/src/session/mod.rs           |  4 ++
 12 files changed, 131 insertions(+), 103 deletions(-)
 create mode 100644 crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs
 create mode 100644 crates/goose-bench/src/eval_suites/core/complex_tasks/mod.rs
 create mode 100644 crates/goose-bench/src/eval_suites/core/mod.rs
 delete mode 100644 crates/goose-bench/src/eval_suites/flappy_bird.rs

diff --git a/crates/goose-bench/Cargo.toml b/crates/goose-bench/Cargo.toml
index 69189bf20..78fac6b22 100644
--- a/crates/goose-bench/Cargo.toml
+++ b/crates/goose-bench/Cargo.toml
@@ -12,10 +12,8 @@ description.workspace = true
 anyhow = "1.0"
 paste = "1.0"
 ctor = "0.2.7"
+goose = { path = "../goose" }
+async-trait = "0.1.86"
 
 [target.'cfg(target_os = "windows")'.dependencies]
 winapi = { version = "0.3", features = ["wincred"] }
-
-#[[bench]]
-#name = "tokenization_benchmark"
-#harness = false
diff --git a/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs b/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs
new file mode 100644
index 000000000..3b0fd2a09
--- /dev/null
+++ b/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs
@@ -0,0 +1,21 @@
+use crate::eval_suites::evaluation::EvaluationReport;
+use crate::eval_suites::{BenchAgent, Evaluation};
+use crate::register_evaluation;
+use async_trait::async_trait;
+
+pub struct FlappyBird {}
+
+impl FlappyBird {
+    pub fn new() -> Self {FlappyBird {}}
+}
+
+#[async_trait]
+impl Evaluation for FlappyBird {
+    async fn run(&self, mut agent: Box<dyn BenchAgent>) -> anyhow::Result<EvaluationReport> {
+        let metrics = Vec::new();
+        let _ = agent.prompt("What can you do?".to_string()).await;
+        Ok(EvaluationReport::new(metrics))
+    }
+}
+
+register_evaluation!("core", FlappyBird);
\ No newline at end of file
diff --git a/crates/goose-bench/src/eval_suites/core/complex_tasks/mod.rs b/crates/goose-bench/src/eval_suites/core/complex_tasks/mod.rs
new file mode 100644
index 000000000..024794fe8
--- /dev/null
+++ b/crates/goose-bench/src/eval_suites/core/complex_tasks/mod.rs
@@ -0,0 +1 @@
+mod flappy_bird;
\ No newline at end of file
diff --git a/crates/goose-bench/src/eval_suites/core/mod.rs b/crates/goose-bench/src/eval_suites/core/mod.rs
new file mode 100644
index 000000000..e47d12dd9
--- /dev/null
+++ b/crates/goose-bench/src/eval_suites/core/mod.rs
@@ -0,0 +1 @@
+mod complex_tasks;
\ No newline at end of file
diff --git a/crates/goose-bench/src/eval_suites/evaluation.rs b/crates/goose-bench/src/eval_suites/evaluation.rs
index 31532e882..97b44f593 100644
--- a/crates/goose-bench/src/eval_suites/evaluation.rs
+++ b/crates/goose-bench/src/eval_suites/evaluation.rs
@@ -1,5 +1,6 @@
 use anyhow::Result;
-
+use async_trait::async_trait;
+use goose::message::Message;
 
 pub type Model = (String, String);
 pub type Extension = String;
@@ -12,7 +13,6 @@ pub enum EvaluationMetric {
     Boolean(bool),
 }
 
-#[derive(Debug)]
 pub struct EvaluationReport {
     metrics: Vec<EvaluationMetric>,
 }
@@ -29,8 +29,12 @@ impl EvaluationReport {
     }
 }
 
-pub trait Evaluation: Send + Sync {
-    fn run(&self) -> Result<EvaluationReport>;
-    fn models(&self) -> Vec<Model>;
-    fn extensions(&self) -> Vec<Extension>;
+#[async_trait]
+pub trait BenchAgent: Send + Sync {
+    async fn prompt(&mut self, p: String) -> Result<Vec<Message>>;
 }
+
+#[async_trait]
+pub trait Evaluation: Send + Sync {
+    async fn run(&self, agent: Box<dyn BenchAgent>) -> Result<EvaluationReport>;
+}
\ No newline at end of file
diff --git a/crates/goose-bench/src/eval_suites/factory.rs b/crates/goose-bench/src/eval_suites/factory.rs
index 9bcb233da..ec7763a07 100644
--- a/crates/goose-bench/src/eval_suites/factory.rs
+++ b/crates/goose-bench/src/eval_suites/factory.rs
@@ -1,39 +1,46 @@
+pub use super::Evaluation;
 use std::collections::HashMap;
 use std::sync::{OnceLock, RwLock};
 
-pub use super::Evaluation;
-
-type EvaluationConstructor = Box<dyn Fn() -> Box<dyn Evaluation> + Send + Sync>;
+type EvaluationConstructor = fn() -> Box<dyn Evaluation>;
 
 // Use std::sync::RwLock for interior mutability
-static EVALUATION_REGISTRY: OnceLock<RwLock<HashMap<&'static str, EvaluationConstructor>>> = OnceLock::new();
+static EVALUATION_REGISTRY: OnceLock<RwLock<HashMap<&'static str, Vec<EvaluationConstructor>>>> = OnceLock::new();
 
 /// Initialize the registry if it hasn't been initialized
-fn registry() -> &'static RwLock<HashMap<&'static str, EvaluationConstructor>> {
+fn registry() -> &'static RwLock<HashMap<&'static str, Vec<EvaluationConstructor>>> {
     EVALUATION_REGISTRY.get_or_init(|| RwLock::new(HashMap::new()))
 }
 
 /// Register a new evaluation version
 pub fn register_evaluation(
-    version: &'static str,
-    constructor: impl Fn() -> Box<dyn Evaluation> + Send + Sync + 'static,
+    suite_name: &'static str,
+    constructor: fn() -> Box<dyn Evaluation>,
 ) {
     let registry = registry();
     if let Ok(mut map) = registry.write() {
-        map.insert(version, Box::new(constructor));
+        map.entry(suite_name)
+            .or_insert_with(Vec::new)
+            .push(constructor);
     }
 }
 
-pub struct EvaluationFactory;
+pub struct EvaluationSuiteFactory;
 
-impl EvaluationFactory {
-    pub fn create(version: &str) -> Option<Box<dyn Evaluation>> {
+impl EvaluationSuiteFactory {
+    pub fn create(suite_name: &str) -> Option<Vec<Box<dyn Evaluation>>> {
         let registry = registry();
         let map = registry
             .read()
             .expect("Failed to read the benchmark evaluation registry.");
-        let constructor = map.get(version)?;
-        Some(constructor())
+
+        let constructors = map.get(suite_name)?;
+        let instances = constructors
+            .iter()
+            .map(|&constructor| constructor())
+            .collect::<Vec<_>>();
+
+        Some(instances)
     }
 
     pub fn available_evaluations() -> Vec<&'static str> {
@@ -46,15 +53,15 @@ impl EvaluationFactory {
 
 #[macro_export]
 macro_rules! register_evaluation {
-    ($version:expr, $evaluation_type:ty) => {
+    ($suite_name:expr, $evaluation_type:ty) => {
         paste::paste! {
             #[ctor::ctor]
             #[allow(non_snake_case)]
-            fn [<__register_evaluation_ $version>]() {
-                $crate::eval_suites::factory::register_evaluation($version, || {
+            fn [<__register_evaluation_ $suite_name>]() {
+                $crate::eval_suites::factory::register_evaluation($suite_name, || {
                     Box::new(<$evaluation_type>::new())
                 });
             }
         }
     };
-}
+}
\ No newline at end of file
diff --git a/crates/goose-bench/src/eval_suites/flappy_bird.rs b/crates/goose-bench/src/eval_suites/flappy_bird.rs
deleted file mode 100644
index eb75818b3..000000000
--- a/crates/goose-bench/src/eval_suites/flappy_bird.rs
+++ /dev/null
@@ -1,29 +0,0 @@
-use crate::eval_suites::{Evaluation, Extension, Model};
-use crate::eval_suites::evaluation::EvaluationReport;
-use crate::register_evaluation;
-
-pub struct FlappyBird {}
-
-impl FlappyBird {
-    fn new() -> FlappyBird {
-        FlappyBird {}
-    }
-}
-
-impl Evaluation for FlappyBird {
-    fn run(&self) -> anyhow::Result<EvaluationReport> {
-        let mut metrics = Vec::new();
-
-        Ok(EvaluationReport::new(metrics))
-    }
-
-    fn models(&self) -> Vec<Model> {
-        todo!()
-    }
-
-    fn extensions(&self) -> Vec<Extension> {
-        todo!()
-    }
-}
-
-register_evaluation!("flappy_bird", FlappyBird);
diff --git a/crates/goose-bench/src/eval_suites/mod.rs b/crates/goose-bench/src/eval_suites/mod.rs
index 1f7d2992d..17a975c57 100644
--- a/crates/goose-bench/src/eval_suites/mod.rs
+++ b/crates/goose-bench/src/eval_suites/mod.rs
@@ -1,5 +1,6 @@
 mod factory;
-mod flappy_bird;
 mod evaluation;
+mod core;
+
 pub use evaluation::*;
-pub use factory::{register_evaluation, EvaluationFactory};
+pub use factory::{register_evaluation, EvaluationSuiteFactory};
diff --git a/crates/goose-cli/Cargo.toml b/crates/goose-cli/Cargo.toml
index 62dfd85a6..a4554fba2 100644
--- a/crates/goose-cli/Cargo.toml
+++ b/crates/goose-cli/Cargo.toml
@@ -48,6 +48,7 @@ chrono = "0.4"
 tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt", "json", "time"] }
 tracing-appender = "0.2"
 once_cell = "1.20.2"
+async-trait = "0.1.86"
 
 [target.'cfg(target_os = "windows")'.dependencies]
 winapi = { version = "0.3", features = ["wincred"] }
diff --git a/crates/goose-cli/src/commands/bench.rs b/crates/goose-cli/src/commands/bench.rs
index 3cb1825cf..eb06088e0 100644
--- a/crates/goose-cli/src/commands/bench.rs
+++ b/crates/goose-cli/src/commands/bench.rs
@@ -1,44 +1,50 @@
-use goose::message::Message;
 use crate::session::build_session;
-use goose_bench::eval_suites::{EvaluationFactory, EvaluationReport};
-
+use crate::Session;
+use async_trait::async_trait;
+use goose::message::Message;
+use goose_bench::eval_suites::{BenchAgent, EvaluationReport, EvaluationSuiteFactory};
 // use std::error::Error;
-// build custom run-func that constructs agent from session, then uses custom loop to manage collecting and returning agent messages.
-async fn foo(ext) {
-    let extension = Vec::new(); // todo
-    let name = None;
-    let mut session = build_session(name, false, extension, ext).await;
-    let _ = session.headless_start(prompt).await;
-}
 
-pub async fn headless_start(&mut self, initial_message: String) -> anyhow::Result<()> {
-    self.messages.push(Message::user().with_text(&initial_message));
-    self.process_agent_response().await?;
-    Ok(())
+// cli flag for suite_name [done]
+// default suite_name called core [done]
+// pass session messages in to run [done]
+// eval suite = suite_name / eval_name / test_file_name [done]
+// use session config expecting external proc to manage swapping out config
+
+
+#[async_trait]
+impl BenchAgent for Session {
+    async fn prompt(&mut self, p: String) -> anyhow::Result<Vec<Message>> {
+        self.headless_start(p).await?;
+        Ok(self.message_history())
+    }
 }
 
-pub async fn run_benchmark() {
+pub async fn run_benchmark(suites: Vec<String>) {
     let mut all_reports: Vec<EvaluationReport> = vec![];
 
-    for eval in EvaluationFactory::available_evaluations() {
-        let evaluation = match EvaluationFactory::create(&eval) {
-            Some(evaluation) => evaluation,
+    let suites = EvaluationSuiteFactory::available_evaluations()
+        .into_iter()
+        .filter(|&s| suites.contains(&s.to_string()))
+        .collect::<Vec<_>>();
+
+    for suite in suites {
+        let evaluations = match EvaluationSuiteFactory::create(&suite) {
+            Some(evaluations) => evaluations,
             None => continue,
         };
+        for evaluation in evaluations {
+            let session = build_session(None, false, Vec::new(), Vec::new()).await;
+            let report = match evaluation.run(Box::new(session)).await {
+                Ok(report) => report,
+                _ => continue,
+            };
 
-        for (provider, model) in evaluation.models() {
-            for ext in evaluation.extensions() {
-                let report = match evaluation.run() {
-                    Ok(report) => report,
-                    _ => continue,
-                };
-
-                // print report?
-                all_reports.push(report);
-            }
+            // print report?
+            all_reports.push(report);
         }
     }
 
     // let summary = report_summary(all_reports)?
     // print summary?
-}
+}
\ No newline at end of file
diff --git a/crates/goose-cli/src/main.rs b/crates/goose-cli/src/main.rs
index ecd0c1215..893dde906 100644
--- a/crates/goose-cli/src/main.rs
+++ b/crates/goose-cli/src/main.rs
@@ -4,12 +4,12 @@ use clap::{CommandFactory, Parser, Subcommand};
 use console::style;
 use goose::config::Config;
 use goose_cli::commands::agent_version::AgentCommand;
+use goose_cli::commands::bench::run_benchmark;
 use goose_cli::commands::configure::handle_configure;
 use goose_cli::commands::mcp::run_server;
 use goose_cli::logging::setup_logging;
 use goose_cli::session::build_session;
 use std::io::{self, Read};
-use goose_cli::commands::bench::run_benchmark;
 
 #[derive(Parser)]
 #[command(author, version, display_name = "", about, long_about = None)]
@@ -143,7 +143,17 @@ enum Command {
     Agents(AgentCommand),
 
     /// Run benchmark suite
-    Bench {},
+    Bench {
+        #[arg(
+            short = 's',
+            long = "suites",
+            value_name = "BENCH_SUITE_NAME",
+            help = "Run this list of bench-suites.",
+            long_help = "Specify a comma-separated list of evaluation-suite names to be run.",
+            value_delimiter = ','
+        )]
+        suites: Vec<String>,
+    },
 }
 
 #[derive(clap::ValueEnum, Clone, Debug)]
@@ -166,24 +176,24 @@ async fn main() -> Result<()> {
             let _ = run_server(&name).await;
         }
         Some(Command::Session {
-            name,
-            resume,
-            extension,
-            builtin,
-        }) => {
+                 name,
+                 resume,
+                 extension,
+                 builtin,
+             }) => {
             let mut session = build_session(name, resume, extension, builtin).await;
             setup_logging(session.session_file().file_stem().and_then(|s| s.to_str()))?;
             let _ = session.start().await;
             return Ok(());
         }
         Some(Command::Run {
-            instructions,
-            input_text,
-            name,
-            resume,
-            extension,
-            builtin,
-        }) => {
+                 instructions,
+                 input_text,
+                 name,
+                 resume,
+                 extension,
+                 builtin,
+             }) => {
             // Validate that we have some input source
             if instructions.is_none() && input_text.is_none() {
                 eprintln!("Error: Must provide either --instructions or --text");
@@ -211,8 +221,11 @@ async fn main() -> Result<()> {
             cmd.run()?;
             return Ok(());
         }
-        Some(Command::Bench {}) => {
-            run_benchmark().await;
+        Some(Command::Bench {
+            suites,
+             }) => {
+            let suites = if suites.is_empty() { vec!["core".to_string()] } else { suites };
+            run_benchmark(suites).await;
             return Ok(());
         }
         None => {
diff --git a/crates/goose-cli/src/session/mod.rs b/crates/goose-cli/src/session/mod.rs
index 2a65ef058..37acf739f 100644
--- a/crates/goose-cli/src/session/mod.rs
+++ b/crates/goose-cli/src/session/mod.rs
@@ -299,4 +299,8 @@ impl Session {
     pub fn session_file(&self) -> PathBuf {
         self.session_file.clone()
     }
+
+    pub fn message_history(&self) -> Vec<Message> {
+        self.messages.clone()
+    }
 }

From e85a8954b4518d52ce83d9e674e208d7c2caa366 Mon Sep 17 00:00:00 2001
From: Marcelle Bonterre <marcelle@squareup.com>
Date: Wed, 19 Feb 2025 15:39:08 -0500
Subject: [PATCH 06/10] fmt

---
 .../core/complex_tasks/flappy_bird.rs         |  6 ++--
 .../src/eval_suites/core/complex_tasks/mod.rs |  2 +-
 .../goose-bench/src/eval_suites/core/mod.rs   |  2 +-
 .../goose-bench/src/eval_suites/evaluation.rs |  2 +-
 crates/goose-bench/src/eval_suites/factory.rs | 10 +++---
 crates/goose-bench/src/eval_suites/mod.rs     |  4 +--
 crates/goose-bench/src/lib.rs                 |  2 +-
 crates/goose-cli/src/commands/bench.rs        |  3 +-
 crates/goose-cli/src/commands/mod.rs          |  2 +-
 crates/goose-cli/src/main.rs                  | 34 ++++++++++---------
 10 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs b/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs
index 3b0fd2a09..1c43ed16b 100644
--- a/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs
+++ b/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs
@@ -6,7 +6,9 @@ use async_trait::async_trait;
 pub struct FlappyBird {}
 
 impl FlappyBird {
-    pub fn new() -> Self {FlappyBird {}}
+    pub fn new() -> Self {
+        FlappyBird {}
+    }
 }
 
 #[async_trait]
@@ -18,4 +20,4 @@ impl Evaluation for FlappyBird {
     }
 }
 
-register_evaluation!("core", FlappyBird);
\ No newline at end of file
+register_evaluation!("core", FlappyBird);
diff --git a/crates/goose-bench/src/eval_suites/core/complex_tasks/mod.rs b/crates/goose-bench/src/eval_suites/core/complex_tasks/mod.rs
index 024794fe8..c09f5510f 100644
--- a/crates/goose-bench/src/eval_suites/core/complex_tasks/mod.rs
+++ b/crates/goose-bench/src/eval_suites/core/complex_tasks/mod.rs
@@ -1 +1 @@
-mod flappy_bird;
\ No newline at end of file
+mod flappy_bird;
diff --git a/crates/goose-bench/src/eval_suites/core/mod.rs b/crates/goose-bench/src/eval_suites/core/mod.rs
index e47d12dd9..a1efebf95 100644
--- a/crates/goose-bench/src/eval_suites/core/mod.rs
+++ b/crates/goose-bench/src/eval_suites/core/mod.rs
@@ -1 +1 @@
-mod complex_tasks;
\ No newline at end of file
+mod complex_tasks;
diff --git a/crates/goose-bench/src/eval_suites/evaluation.rs b/crates/goose-bench/src/eval_suites/evaluation.rs
index 97b44f593..1589cc542 100644
--- a/crates/goose-bench/src/eval_suites/evaluation.rs
+++ b/crates/goose-bench/src/eval_suites/evaluation.rs
@@ -37,4 +37,4 @@ pub trait BenchAgent: Send + Sync {
 #[async_trait]
 pub trait Evaluation: Send + Sync {
     async fn run(&self, agent: Box<dyn BenchAgent>) -> Result<EvaluationReport>;
-}
\ No newline at end of file
+}
diff --git a/crates/goose-bench/src/eval_suites/factory.rs b/crates/goose-bench/src/eval_suites/factory.rs
index ec7763a07..0f361ac44 100644
--- a/crates/goose-bench/src/eval_suites/factory.rs
+++ b/crates/goose-bench/src/eval_suites/factory.rs
@@ -5,7 +5,8 @@ use std::sync::{OnceLock, RwLock};
 type EvaluationConstructor = fn() -> Box<dyn Evaluation>;
 
 // Use std::sync::RwLock for interior mutability
-static EVALUATION_REGISTRY: OnceLock<RwLock<HashMap<&'static str, Vec<EvaluationConstructor>>>> = OnceLock::new();
+static EVALUATION_REGISTRY: OnceLock<RwLock<HashMap<&'static str, Vec<EvaluationConstructor>>>> =
+    OnceLock::new();
 
 /// Initialize the registry if it hasn't been initialized
 fn registry() -> &'static RwLock<HashMap<&'static str, Vec<EvaluationConstructor>>> {
@@ -13,10 +14,7 @@ fn registry() -> &'static RwLock<HashMap<&'static str, Vec<EvaluationConstructor
 }
 
 /// Register a new evaluation version
-pub fn register_evaluation(
-    suite_name: &'static str,
-    constructor: fn() -> Box<dyn Evaluation>,
-) {
+pub fn register_evaluation(suite_name: &'static str, constructor: fn() -> Box<dyn Evaluation>) {
     let registry = registry();
     if let Ok(mut map) = registry.write() {
         map.entry(suite_name)
@@ -64,4 +62,4 @@ macro_rules! register_evaluation {
             }
         }
     };
-}
\ No newline at end of file
+}
diff --git a/crates/goose-bench/src/eval_suites/mod.rs b/crates/goose-bench/src/eval_suites/mod.rs
index 17a975c57..82404e34b 100644
--- a/crates/goose-bench/src/eval_suites/mod.rs
+++ b/crates/goose-bench/src/eval_suites/mod.rs
@@ -1,6 +1,6 @@
-mod factory;
-mod evaluation;
 mod core;
+mod evaluation;
+mod factory;
 
 pub use evaluation::*;
 pub use factory::{register_evaluation, EvaluationSuiteFactory};
diff --git a/crates/goose-bench/src/lib.rs b/crates/goose-bench/src/lib.rs
index 0c41da7e8..2881661ea 100644
--- a/crates/goose-bench/src/lib.rs
+++ b/crates/goose-bench/src/lib.rs
@@ -1 +1 @@
-pub mod eval_suites;
\ No newline at end of file
+pub mod eval_suites;
diff --git a/crates/goose-cli/src/commands/bench.rs b/crates/goose-cli/src/commands/bench.rs
index eb06088e0..756d58921 100644
--- a/crates/goose-cli/src/commands/bench.rs
+++ b/crates/goose-cli/src/commands/bench.rs
@@ -11,7 +11,6 @@ use goose_bench::eval_suites::{BenchAgent, EvaluationReport, EvaluationSuiteFact
 // eval suite = suite_name / eval_name / test_file_name [done]
 // use session config expecting external proc to manage swapping out config
 
-
 #[async_trait]
 impl BenchAgent for Session {
     async fn prompt(&mut self, p: String) -> anyhow::Result<Vec<Message>> {
@@ -47,4 +46,4 @@ pub async fn run_benchmark(suites: Vec<String>) {
 
     // let summary = report_summary(all_reports)?
     // print summary?
-}
\ No newline at end of file
+}
diff --git a/crates/goose-cli/src/commands/mod.rs b/crates/goose-cli/src/commands/mod.rs
index 7302d695e..b702cddea 100644
--- a/crates/goose-cli/src/commands/mod.rs
+++ b/crates/goose-cli/src/commands/mod.rs
@@ -1,4 +1,4 @@
 pub mod agent_version;
+pub mod bench;
 pub mod configure;
 pub mod mcp;
-pub mod bench;
diff --git a/crates/goose-cli/src/main.rs b/crates/goose-cli/src/main.rs
index 893dde906..e53883b83 100644
--- a/crates/goose-cli/src/main.rs
+++ b/crates/goose-cli/src/main.rs
@@ -176,24 +176,24 @@ async fn main() -> Result<()> {
             let _ = run_server(&name).await;
         }
         Some(Command::Session {
-                 name,
-                 resume,
-                 extension,
-                 builtin,
-             }) => {
+            name,
+            resume,
+            extension,
+            builtin,
+        }) => {
             let mut session = build_session(name, resume, extension, builtin).await;
             setup_logging(session.session_file().file_stem().and_then(|s| s.to_str()))?;
             let _ = session.start().await;
             return Ok(());
         }
         Some(Command::Run {
-                 instructions,
-                 input_text,
-                 name,
-                 resume,
-                 extension,
-                 builtin,
-             }) => {
+            instructions,
+            input_text,
+            name,
+            resume,
+            extension,
+            builtin,
+        }) => {
             // Validate that we have some input source
             if instructions.is_none() && input_text.is_none() {
                 eprintln!("Error: Must provide either --instructions or --text");
@@ -221,10 +221,12 @@ async fn main() -> Result<()> {
             cmd.run()?;
             return Ok(());
         }
-        Some(Command::Bench {
-            suites,
-             }) => {
-            let suites = if suites.is_empty() { vec!["core".to_string()] } else { suites };
+        Some(Command::Bench { suites }) => {
+            let suites = if suites.is_empty() {
+                vec!["core".to_string()]
+            } else {
+                suites
+            };
             run_benchmark(suites).await;
             return Ok(());
         }

From 826050fb8d37826d9d050aab15af31f6f0f4e965 Mon Sep 17 00:00:00 2001
From: Marcelle Bonterre <marcelle@squareup.com>
Date: Wed, 19 Feb 2025 15:41:48 -0500
Subject: [PATCH 07/10] remove to-do list comments

---
 crates/goose-cli/src/commands/bench.rs | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/crates/goose-cli/src/commands/bench.rs b/crates/goose-cli/src/commands/bench.rs
index 756d58921..a4a7c5696 100644
--- a/crates/goose-cli/src/commands/bench.rs
+++ b/crates/goose-cli/src/commands/bench.rs
@@ -3,13 +3,6 @@ use crate::Session;
 use async_trait::async_trait;
 use goose::message::Message;
 use goose_bench::eval_suites::{BenchAgent, EvaluationReport, EvaluationSuiteFactory};
-// use std::error::Error;
-
-// cli flag for suite_name [done]
-// default suite_name called core [done]
-// pass session messages in to run [done]
-// eval suite = suite_name / eval_name / test_file_name [done]
-// use session config expecting external proc to manage swapping out config
 
 #[async_trait]
 impl BenchAgent for Session {

From 4cb274030b7a67ddf85e9330acd355fc17845596 Mon Sep 17 00:00:00 2001
From: Marcelle Bonterre <marcelle@squareup.com>
Date: Wed, 19 Feb 2025 16:03:54 -0500
Subject: [PATCH 08/10] remove report struct

---
 .../core/complex_tasks/flappy_bird.rs           |  7 +++----
 .../goose-bench/src/eval_suites/evaluation.rs   | 17 +----------------
 crates/goose-cli/src/commands/bench.rs          | 12 ++----------
 3 files changed, 6 insertions(+), 30 deletions(-)

diff --git a/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs b/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs
index 1c43ed16b..61d345355 100644
--- a/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs
+++ b/crates/goose-bench/src/eval_suites/core/complex_tasks/flappy_bird.rs
@@ -1,5 +1,4 @@
-use crate::eval_suites::evaluation::EvaluationReport;
-use crate::eval_suites::{BenchAgent, Evaluation};
+use crate::eval_suites::{BenchAgent, Evaluation, EvaluationMetric};
 use crate::register_evaluation;
 use async_trait::async_trait;
 
@@ -13,10 +12,10 @@ impl FlappyBird {
 
 #[async_trait]
 impl Evaluation for FlappyBird {
-    async fn run(&self, mut agent: Box<dyn BenchAgent>) -> anyhow::Result<EvaluationReport> {
+    async fn run(&self, mut agent: Box<dyn BenchAgent>) -> anyhow::Result<Vec<EvaluationMetric>> {
         let metrics = Vec::new();
         let _ = agent.prompt("What can you do?".to_string()).await;
-        Ok(EvaluationReport::new(metrics))
+        Ok(metrics)
     }
 }
 
diff --git a/crates/goose-bench/src/eval_suites/evaluation.rs b/crates/goose-bench/src/eval_suites/evaluation.rs
index 1589cc542..18ba9cbbe 100644
--- a/crates/goose-bench/src/eval_suites/evaluation.rs
+++ b/crates/goose-bench/src/eval_suites/evaluation.rs
@@ -13,21 +13,6 @@ pub enum EvaluationMetric {
     Boolean(bool),
 }
 
-pub struct EvaluationReport {
-    metrics: Vec<EvaluationMetric>,
-}
-
-impl Default for EvaluationReport {
-    fn default() -> Self {
-        Self { metrics: vec![] }
-    }
-}
-
-impl EvaluationReport {
-    pub fn new(metrics: Vec<EvaluationMetric>) -> Self {
-        EvaluationReport { metrics }
-    }
-}
 
 #[async_trait]
 pub trait BenchAgent: Send + Sync {
@@ -36,5 +21,5 @@ pub trait BenchAgent: Send + Sync {
 
 #[async_trait]
 pub trait Evaluation: Send + Sync {
-    async fn run(&self, agent: Box<dyn BenchAgent>) -> Result<EvaluationReport>;
+    async fn run(&self, agent: Box<dyn BenchAgent>) -> Result<Vec<EvaluationMetric>>;
 }
diff --git a/crates/goose-cli/src/commands/bench.rs b/crates/goose-cli/src/commands/bench.rs
index a4a7c5696..51371d280 100644
--- a/crates/goose-cli/src/commands/bench.rs
+++ b/crates/goose-cli/src/commands/bench.rs
@@ -2,7 +2,7 @@ use crate::session::build_session;
 use crate::Session;
 use async_trait::async_trait;
 use goose::message::Message;
-use goose_bench::eval_suites::{BenchAgent, EvaluationReport, EvaluationSuiteFactory};
+use goose_bench::eval_suites::{BenchAgent, EvaluationSuiteFactory};
 
 #[async_trait]
 impl BenchAgent for Session {
@@ -13,8 +13,6 @@ impl BenchAgent for Session {
 }
 
 pub async fn run_benchmark(suites: Vec<String>) {
-    let mut all_reports: Vec<EvaluationReport> = vec![];
-
     let suites = EvaluationSuiteFactory::available_evaluations()
         .into_iter()
         .filter(|&s| suites.contains(&s.to_string()))
@@ -27,16 +25,10 @@ pub async fn run_benchmark(suites: Vec<String>) {
         };
         for evaluation in evaluations {
             let session = build_session(None, false, Vec::new(), Vec::new()).await;
-            let report = match evaluation.run(Box::new(session)).await {
+            let _ = match evaluation.run(Box::new(session)).await {
                 Ok(report) => report,
                 _ => continue,
             };
-
-            // print report?
-            all_reports.push(report);
         }
     }
-
-    // let summary = report_summary(all_reports)?
-    // print summary?
 }

From 64826298de883a39879c874a7a7753bef44d20fa Mon Sep 17 00:00:00 2001
From: Marcelle Bonterre <marcelle@squareup.com>
Date: Wed, 19 Feb 2025 16:04:21 -0500
Subject: [PATCH 09/10] fmt

---
 crates/goose-bench/src/eval_suites/evaluation.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/crates/goose-bench/src/eval_suites/evaluation.rs b/crates/goose-bench/src/eval_suites/evaluation.rs
index 18ba9cbbe..87890b772 100644
--- a/crates/goose-bench/src/eval_suites/evaluation.rs
+++ b/crates/goose-bench/src/eval_suites/evaluation.rs
@@ -13,7 +13,6 @@ pub enum EvaluationMetric {
     Boolean(bool),
 }
 
-
 #[async_trait]
 pub trait BenchAgent: Send + Sync {
     async fn prompt(&mut self, p: String) -> Result<Vec<Message>>;

From 91511aefdfd2cf5f44bda28e0253e712cada7036 Mon Sep 17 00:00:00 2001
From: Marcelle Bonterre <marcelle@squareup.com>
Date: Wed, 19 Feb 2025 16:10:27 -0500
Subject: [PATCH 10/10] clippy

---
 crates/goose-cli/src/commands/bench.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/goose-cli/src/commands/bench.rs b/crates/goose-cli/src/commands/bench.rs
index 51371d280..59be398f9 100644
--- a/crates/goose-cli/src/commands/bench.rs
+++ b/crates/goose-cli/src/commands/bench.rs
@@ -19,7 +19,7 @@ pub async fn run_benchmark(suites: Vec<String>) {
         .collect::<Vec<_>>();
 
     for suite in suites {
-        let evaluations = match EvaluationSuiteFactory::create(&suite) {
+        let evaluations = match EvaluationSuiteFactory::create(suite) {
             Some(evaluations) => evaluations,
             None => continue,
         };