From 05f33ee3a72c1b6ae6c67948046c0d0e886f4436 Mon Sep 17 00:00:00 2001 From: Devin Date: Wed, 18 Dec 2024 17:59:53 +0000 Subject: [PATCH] feat: Add SWE-bench benchmarking integration (#415) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Erkin Alp Güney --- requirements.txt | 43 ++------- src/benchmark/swebench/__init__.py | 18 ++++ src/benchmark/swebench/dataset.py | 38 ++++++++ src/benchmark/swebench/evaluator.py | 139 ++++++++++++++++++++++++++++ src/benchmark/swebench/reporter.py | 53 +++++++++++ src/benchmark/swebench/swebench.py | 62 +++++++++++++ tests/benchmark/__init__.py | 1 + tests/benchmark/conftest.py | 28 ++++++ tests/benchmark/test_swebench.py | 136 +++++++++++++++++++++++++++ 9 files changed, 485 insertions(+), 33 deletions(-) create mode 100644 src/benchmark/swebench/__init__.py create mode 100644 src/benchmark/swebench/dataset.py create mode 100644 src/benchmark/swebench/evaluator.py create mode 100644 src/benchmark/swebench/reporter.py create mode 100644 src/benchmark/swebench/swebench.py create mode 100644 tests/benchmark/__init__.py create mode 100644 tests/benchmark/conftest.py create mode 100644 tests/benchmark/test_swebench.py diff --git a/requirements.txt b/requirements.txt index 91666960..68d1690e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,33 +1,10 @@ -flask -flask-cors -toml -urllib3 -requests -colorama -fastlogging -Jinja2 -mistletoe -markdownify -pdfminer.six -playwright -pytest-playwright -tiktoken -ollama -openai -anthropic -google-generativeai -sqlmodel -keybert -GitPython -netlify-py -Markdown -xhtml2pdf -mistralai -Flask-SocketIO -eventlet -groq -duckduckgo-search -orjson -gevent -gevent-websocket -curl_cffi +# Core dependencies +datasets>=2.0.0 +docker>=6.0.0 +pytest>=7.0.0 +pytest-asyncio>=0.21.0 +pytest-cov>=4.1.0 + +# SWE-bench dependencies +swebench>=0.1.0 +huggingface-hub>=0.19.0 diff --git a/src/benchmark/swebench/__init__.py b/src/benchmark/swebench/__init__.py new file mode 100644 index 00000000..78e3cfef --- /dev/null +++ b/src/benchmark/swebench/__init__.py @@ -0,0 +1,18 @@ +""" +SWE-bench integration module for Devika. + +This module provides integration with the SWE-bench benchmark for evaluating +code generation capabilities on real-world GitHub issues. +""" + +from .swebench import SWEBenchRunner +from .dataset import SWEBenchDataset +from .evaluator import SWEBenchEvaluator +from .reporter import SWEBenchReporter + +__all__ = [ + 'SWEBenchRunner', + 'SWEBenchDataset', + 'SWEBenchEvaluator', + 'SWEBenchReporter', +] diff --git a/src/benchmark/swebench/dataset.py b/src/benchmark/swebench/dataset.py new file mode 100644 index 00000000..d55471da --- /dev/null +++ b/src/benchmark/swebench/dataset.py @@ -0,0 +1,38 @@ +"""SWE-bench dataset loading and management.""" + +from typing import Dict, List, Optional +from datasets import load_dataset + +class SWEBenchDataset: + """Handler for SWE-bench dataset operations.""" + + def __init__(self, dataset_name: str = "princeton-nlp/SWE-bench"): + """Initialize dataset handler. + + Args: + dataset_name: HuggingFace dataset name + """ + self.dataset_name = dataset_name + self.dataset = None + + def load_instances(self, instance_ids: Optional[List[str]] = None) -> List[Dict]: + """Load benchmark instances. + + Args: + instance_ids: Optional list of specific instances to load + + Returns: + List of benchmark instances + """ + if self.dataset is None: + self.dataset = load_dataset(self.dataset_name, split='test') + + if instance_ids: + instances = [ + inst for inst in self.dataset + if inst['instance_id'] in instance_ids + ] + else: + instances = list(self.dataset) + + return instances diff --git a/src/benchmark/swebench/evaluator.py b/src/benchmark/swebench/evaluator.py new file mode 100644 index 00000000..f0787356 --- /dev/null +++ b/src/benchmark/swebench/evaluator.py @@ -0,0 +1,139 @@ +"""Docker-based evaluation harness for SWE-bench.""" + +import json +import logging +import os +import subprocess +import tempfile +from pathlib import Path +from typing import Dict, List, Optional + +logger = logging.getLogger(__name__) + +class SWEBenchEvaluator: + """Evaluator for running SWE-bench in Docker containers.""" + + def __init__(self, max_workers: int = 4, working_dir: Optional[Path] = None): + """Initialize evaluator. + + Args: + max_workers: Number of parallel workers + working_dir: Working directory for evaluation files + """ + self.max_workers = max_workers + self.working_dir = working_dir or Path(tempfile.mkdtemp(prefix='swebench_')) + self.working_dir.mkdir(parents=True, exist_ok=True) + + def evaluate_instances( + self, + instances: List[Dict], + run_id: Optional[str] = None + ) -> Dict: + """Evaluate benchmark instances. + + Args: + instances: List of benchmark instances to evaluate + run_id: Optional identifier for this evaluation run + + Returns: + Dictionary containing evaluation results + """ + results = {} + run_dir = self.working_dir / (run_id or 'default') + run_dir.mkdir(parents=True, exist_ok=True) + + # Save predictions for batch evaluation + predictions_dir = run_dir / 'predictions' + predictions_dir.mkdir(parents=True, exist_ok=True) + + for instance in instances: + try: + # Save instance prediction + instance_dir = predictions_dir / instance['instance_id'] + instance_dir.mkdir(parents=True, exist_ok=True) + with open(instance_dir / 'prediction.json', 'w') as f: + json.dump(instance, f, indent=2) + except Exception as e: + logger.error(f"Error preparing {instance['instance_id']}: {e}") + results[instance['instance_id']] = { + 'status': 'error', + 'error': f"Failed to prepare instance: {str(e)}" + } + + # Run batch evaluation using SWE-bench harness + try: + result = self._run_docker_evaluation(predictions_dir, run_id) + results.update(self._parse_evaluation_results(result)) + except Exception as e: + logger.error(f"Docker evaluation failed: {e}") + for instance in instances: + if instance['instance_id'] not in results: + results[instance['instance_id']] = { + 'status': 'error', + 'error': f"Docker evaluation failed: {str(e)}" + } + + return results + + def _run_docker_evaluation(self, predictions_dir: Path, run_id: str) -> str: + """Run Docker-based evaluation using SWE-bench harness. + + Args: + predictions_dir: Directory containing instance predictions + run_id: Identifier for this evaluation run + + Returns: + Raw evaluation output + """ + cmd = [ + 'python', '-m', 'swebench.harness.run_evaluation', + '--predictions_path', str(predictions_dir), + '--max_workers', str(self.max_workers), + '--run_id', run_id or 'default' + ] + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + check=True + ) + return result.stdout + except subprocess.CalledProcessError as e: + logger.error(f"Docker evaluation command failed: {e.output}") + raise RuntimeError(f"Docker evaluation failed: {str(e)}") + + def _parse_evaluation_results(self, output: str) -> Dict: + """Parse evaluation output to extract metrics. + + Args: + output: Raw evaluation output string + + Returns: + Dictionary containing parsed metrics per instance + """ + results = {} + try: + # Extract results from evaluation output + # Format: instance_id: {metrics} + for line in output.splitlines(): + if ':' in line: + instance_id, metrics_str = line.split(':', 1) + instance_id = instance_id.strip() + try: + metrics = json.loads(metrics_str.strip()) + results[instance_id] = { + 'status': 'success', + 'metrics': metrics + } + except json.JSONDecodeError: + results[instance_id] = { + 'status': 'error', + 'error': f"Failed to parse metrics: {metrics_str}" + } + except Exception as e: + logger.error(f"Failed to parse evaluation results: {e}") + raise RuntimeError(f"Failed to parse evaluation results: {str(e)}") + + return results diff --git a/src/benchmark/swebench/reporter.py b/src/benchmark/swebench/reporter.py new file mode 100644 index 00000000..22bc1718 --- /dev/null +++ b/src/benchmark/swebench/reporter.py @@ -0,0 +1,53 @@ +"""Results reporting for SWE-bench benchmark.""" + +import json +from pathlib import Path +from typing import Dict + +class SWEBenchReporter: + """Reporter for SWE-bench benchmark results.""" + + def generate_report(self, results: Dict) -> Dict: + """Generate benchmark report. + + Args: + results: Dictionary containing benchmark results + + Returns: + Dictionary containing formatted report + """ + report = { + 'summary': self._generate_summary(results), + 'details': results + } + return report + + def save_report(self, report: Dict, output_file: Path): + """Save benchmark report to file. + + Args: + report: Dictionary containing benchmark report + output_file: Path to save report + """ + with open(output_file, 'w') as f: + json.dump(report, f, indent=2) + + def _generate_summary(self, results: Dict) -> Dict: + """Generate summary statistics from results. + + Args: + results: Dictionary containing benchmark results + + Returns: + Dictionary containing summary statistics + """ + total = len(results) + successful = sum(1 for r in results.values() if r.get('status') == 'success') + failed = sum(1 for r in results.values() if r.get('status') == 'error') + + return { + 'total_instances': total, + 'successful': successful, + 'failed': failed, + 'success_rate': successful / total if total > 0 else 0 + } diff --git a/src/benchmark/swebench/swebench.py b/src/benchmark/swebench/swebench.py new file mode 100644 index 00000000..f773f558 --- /dev/null +++ b/src/benchmark/swebench/swebench.py @@ -0,0 +1,62 @@ +"""Main SWE-bench runner implementation.""" + +import logging +from pathlib import Path +from typing import Dict, List, Optional + +from .dataset import SWEBenchDataset +from .evaluator import SWEBenchEvaluator +from .reporter import SWEBenchReporter + +logger = logging.getLogger(__name__) + +class SWEBenchRunner: + """Main class for running SWE-bench benchmarks.""" + + def __init__( + self, + dataset_name: str = "princeton-nlp/SWE-bench", + max_workers: int = 4, + working_dir: Optional[Path] = None + ): + """Initialize SWE-bench runner. + + Args: + dataset_name: HuggingFace dataset name + max_workers: Number of parallel workers for evaluation + working_dir: Working directory for benchmark files + """ + self.dataset = SWEBenchDataset(dataset_name) + self.evaluator = SWEBenchEvaluator(max_workers=max_workers) + self.reporter = SWEBenchReporter() + self.working_dir = working_dir or Path.cwd() / "swebench_results" + self.working_dir.mkdir(parents=True, exist_ok=True) + + def run_benchmark( + self, + instance_ids: Optional[List[str]] = None, + run_id: Optional[str] = None + ) -> Dict: + """Run benchmark evaluation. + + Args: + instance_ids: Optional list of specific instances to evaluate + run_id: Optional identifier for this benchmark run + + Returns: + Dictionary containing benchmark results + """ + logger.info("Loading benchmark dataset...") + instances = self.dataset.load_instances(instance_ids) + + logger.info("Running evaluations...") + results = self.evaluator.evaluate_instances(instances, run_id) + + logger.info("Generating report...") + report = self.reporter.generate_report(results) + + # Save results + results_file = self.working_dir / f"results_{run_id or 'default'}.json" + self.reporter.save_report(report, results_file) + + return report diff --git a/tests/benchmark/__init__.py b/tests/benchmark/__init__.py new file mode 100644 index 00000000..6eb613d8 --- /dev/null +++ b/tests/benchmark/__init__.py @@ -0,0 +1 @@ +"""Benchmark test package.""" diff --git a/tests/benchmark/conftest.py b/tests/benchmark/conftest.py new file mode 100644 index 00000000..aba71946 --- /dev/null +++ b/tests/benchmark/conftest.py @@ -0,0 +1,28 @@ +"""Pytest configuration for benchmark tests.""" + +import pytest +from pathlib import Path + +@pytest.fixture +def sample_instance(): + """Sample benchmark instance for testing.""" + return { + 'instance_id': 'test_instance', + 'repo': 'test/repo', + 'issue': 'Sample issue description', + 'patch': 'Sample patch content' + } + +@pytest.fixture +def sample_results(): + """Sample benchmark results for testing.""" + return { + 'test_instance_1': { + 'status': 'success', + 'metrics': {'accuracy': 0.95} + }, + 'test_instance_2': { + 'status': 'error', + 'error': 'Test error message' + } + } diff --git a/tests/benchmark/test_swebench.py b/tests/benchmark/test_swebench.py new file mode 100644 index 00000000..415b9b9a --- /dev/null +++ b/tests/benchmark/test_swebench.py @@ -0,0 +1,136 @@ +"""Tests for SWE-bench integration.""" + +import json +import pytest +import subprocess +from pathlib import Path +from unittest.mock import patch, MagicMock +from src.benchmark.swebench import ( + SWEBenchRunner, + SWEBenchDataset, + SWEBenchEvaluator, + SWEBenchReporter +) + +def test_dataset_loading(): + """Test dataset loading functionality.""" + dataset = SWEBenchDataset("princeton-nlp/SWE-bench_Lite") + instances = dataset.load_instances() + assert isinstance(instances, list) + assert len(instances) > 0 + +def test_reporter_summary(): + """Test report generation.""" + reporter = SWEBenchReporter() + results = { + 'test1': {'status': 'success'}, + 'test2': {'status': 'error'} + } + report = reporter.generate_report(results) + assert report['summary']['total_instances'] == 2 + assert report['summary']['successful'] == 1 + assert report['summary']['failed'] == 1 + assert report['summary']['success_rate'] == 0.5 + +@pytest.fixture +def temp_working_dir(tmp_path): + """Fixture for temporary working directory.""" + return tmp_path / "swebench_test" + +@pytest.fixture +def mock_subprocess(): + """Mock subprocess for testing Docker evaluation.""" + with patch('subprocess.run') as mock_run: + mock_run.return_value = MagicMock( + stdout='Test output\nMetrics: {"accuracy": 0.95}', + returncode=0 + ) + yield mock_run + +def test_runner_initialization(temp_working_dir): + """Test runner initialization.""" + runner = SWEBenchRunner( + dataset_name="princeton-nlp/SWE-bench_Lite", + working_dir=temp_working_dir + ) + assert runner.working_dir.exists() + assert isinstance(runner.dataset, SWEBenchDataset) + assert isinstance(runner.evaluator, SWEBenchEvaluator) + assert isinstance(runner.reporter, SWEBenchReporter) + +def test_evaluator_initialization(temp_working_dir): + """Test evaluator initialization.""" + evaluator = SWEBenchEvaluator(working_dir=temp_working_dir) + assert evaluator.working_dir.exists() + assert evaluator.max_workers == 4 + +@pytest.mark.parametrize("max_workers", [1, 4, 8]) +def test_evaluator_max_workers(max_workers): + """Test evaluator with different worker counts.""" + evaluator = SWEBenchEvaluator(max_workers=max_workers) + assert evaluator.max_workers == max_workers + +def test_evaluator_docker_run(temp_working_dir, mock_subprocess, sample_instance): + """Test Docker-based evaluation.""" + evaluator = SWEBenchEvaluator(working_dir=temp_working_dir) + + # Mock successful evaluation output + mock_subprocess.return_value.stdout = ( + f"{sample_instance['instance_id']}: " + '{"success": true, "metrics": {"accuracy": 0.95}}\n' + ) + + results = evaluator.evaluate_instances([sample_instance], run_id="test_run") + + assert sample_instance['instance_id'] in results + result = results[sample_instance['instance_id']] + assert result['status'] == 'success' + assert result['metrics']['accuracy'] == 0.95 + + # Verify Docker command + mock_subprocess.assert_called_once() + cmd_args = mock_subprocess.call_args[0][0] + assert '--predictions_path' in cmd_args + assert '--max_workers' in cmd_args + assert str(evaluator.max_workers) in cmd_args + +def test_evaluator_docker_failure(temp_working_dir, mock_subprocess, sample_instance): + """Test Docker evaluation failure handling.""" + evaluator = SWEBenchEvaluator(working_dir=temp_working_dir) + + # Mock subprocess failure + error_msg = "Docker evaluation failed" + mock_subprocess.side_effect = subprocess.CalledProcessError( + 1, [], output=error_msg + ) + + results = evaluator.evaluate_instances([sample_instance]) + + assert sample_instance['instance_id'] in results + result = results[sample_instance['instance_id']] + assert result['status'] == 'error' + assert 'Docker evaluation failed' in result['error'] + +def test_evaluator_parse_results(): + """Test evaluation results parsing.""" + evaluator = SWEBenchEvaluator() + + # Test successful parsing + output = 'test_1: {"accuracy": 0.95}\ntest_2: {"accuracy": 0.85}\n' + results = evaluator._parse_evaluation_results(output) + + assert len(results) == 2 + assert results['test_1']['status'] == 'success' + assert results['test_1']['metrics']['accuracy'] == 0.95 + assert results['test_2']['metrics']['accuracy'] == 0.85 + +def test_evaluator_parse_results_failure(): + """Test evaluation results parsing failure.""" + evaluator = SWEBenchEvaluator() + + # Test invalid JSON + output = 'test_1: invalid_json\n' + results = evaluator._parse_evaluation_results(output) + + assert results['test_1']['status'] == 'error' + assert 'Failed to parse metrics' in results['test_1']['error']