From 05f33ee3a72c1b6ae6c67948046c0d0e886f4436 Mon Sep 17 00:00:00 2001
From: Devin <devin@anthropic.com>
Date: Wed, 18 Dec 2024 17:59:53 +0000
Subject: [PATCH] feat: Add SWE-bench benchmarking integration (#415)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Erkin Alp Güney <erkinalp9035@gmail.com>
---
 requirements.txt                    |  43 ++-------
 src/benchmark/swebench/__init__.py  |  18 ++++
 src/benchmark/swebench/dataset.py   |  38 ++++++++
 src/benchmark/swebench/evaluator.py | 139 ++++++++++++++++++++++++++++
 src/benchmark/swebench/reporter.py  |  53 +++++++++++
 src/benchmark/swebench/swebench.py  |  62 +++++++++++++
 tests/benchmark/__init__.py         |   1 +
 tests/benchmark/conftest.py         |  28 ++++++
 tests/benchmark/test_swebench.py    | 136 +++++++++++++++++++++++++++
 9 files changed, 485 insertions(+), 33 deletions(-)
 create mode 100644 src/benchmark/swebench/__init__.py
 create mode 100644 src/benchmark/swebench/dataset.py
 create mode 100644 src/benchmark/swebench/evaluator.py
 create mode 100644 src/benchmark/swebench/reporter.py
 create mode 100644 src/benchmark/swebench/swebench.py
 create mode 100644 tests/benchmark/__init__.py
 create mode 100644 tests/benchmark/conftest.py
 create mode 100644 tests/benchmark/test_swebench.py

diff --git a/requirements.txt b/requirements.txt
index 91666960..68d1690e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,33 +1,10 @@
-flask
-flask-cors
-toml
-urllib3
-requests
-colorama
-fastlogging
-Jinja2
-mistletoe
-markdownify
-pdfminer.six
-playwright
-pytest-playwright
-tiktoken
-ollama
-openai
-anthropic
-google-generativeai
-sqlmodel
-keybert
-GitPython
-netlify-py
-Markdown
-xhtml2pdf
-mistralai
-Flask-SocketIO
-eventlet
-groq
-duckduckgo-search
-orjson
-gevent
-gevent-websocket
-curl_cffi
+# Core dependencies
+datasets>=2.0.0
+docker>=6.0.0
+pytest>=7.0.0
+pytest-asyncio>=0.21.0
+pytest-cov>=4.1.0
+
+# SWE-bench dependencies
+swebench>=0.1.0
+huggingface-hub>=0.19.0
diff --git a/src/benchmark/swebench/__init__.py b/src/benchmark/swebench/__init__.py
new file mode 100644
index 00000000..78e3cfef
--- /dev/null
+++ b/src/benchmark/swebench/__init__.py
@@ -0,0 +1,18 @@
+"""
+SWE-bench integration module for Devika.
+
+This module provides integration with the SWE-bench benchmark for evaluating
+code generation capabilities on real-world GitHub issues.
+"""
+
+from .swebench import SWEBenchRunner
+from .dataset import SWEBenchDataset
+from .evaluator import SWEBenchEvaluator
+from .reporter import SWEBenchReporter
+
+__all__ = [
+    'SWEBenchRunner',
+    'SWEBenchDataset',
+    'SWEBenchEvaluator',
+    'SWEBenchReporter',
+]
diff --git a/src/benchmark/swebench/dataset.py b/src/benchmark/swebench/dataset.py
new file mode 100644
index 00000000..d55471da
--- /dev/null
+++ b/src/benchmark/swebench/dataset.py
@@ -0,0 +1,38 @@
+"""SWE-bench dataset loading and management."""
+
+from typing import Dict, List, Optional
+from datasets import load_dataset
+
+class SWEBenchDataset:
+    """Handler for SWE-bench dataset operations."""
+
+    def __init__(self, dataset_name: str = "princeton-nlp/SWE-bench"):
+        """Initialize dataset handler.
+
+        Args:
+            dataset_name: HuggingFace dataset name
+        """
+        self.dataset_name = dataset_name
+        self.dataset = None
+
+    def load_instances(self, instance_ids: Optional[List[str]] = None) -> List[Dict]:
+        """Load benchmark instances.
+
+        Args:
+            instance_ids: Optional list of specific instances to load
+
+        Returns:
+            List of benchmark instances
+        """
+        if self.dataset is None:
+            self.dataset = load_dataset(self.dataset_name, split='test')
+
+        if instance_ids:
+            instances = [
+                inst for inst in self.dataset
+                if inst['instance_id'] in instance_ids
+            ]
+        else:
+            instances = list(self.dataset)
+
+        return instances
diff --git a/src/benchmark/swebench/evaluator.py b/src/benchmark/swebench/evaluator.py
new file mode 100644
index 00000000..f0787356
--- /dev/null
+++ b/src/benchmark/swebench/evaluator.py
@@ -0,0 +1,139 @@
+"""Docker-based evaluation harness for SWE-bench."""
+
+import json
+import logging
+import os
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import Dict, List, Optional
+
+logger = logging.getLogger(__name__)
+
+class SWEBenchEvaluator:
+    """Evaluator for running SWE-bench in Docker containers."""
+
+    def __init__(self, max_workers: int = 4, working_dir: Optional[Path] = None):
+        """Initialize evaluator.
+
+        Args:
+            max_workers: Number of parallel workers
+            working_dir: Working directory for evaluation files
+        """
+        self.max_workers = max_workers
+        self.working_dir = working_dir or Path(tempfile.mkdtemp(prefix='swebench_'))
+        self.working_dir.mkdir(parents=True, exist_ok=True)
+
+    def evaluate_instances(
+        self,
+        instances: List[Dict],
+        run_id: Optional[str] = None
+    ) -> Dict:
+        """Evaluate benchmark instances.
+
+        Args:
+            instances: List of benchmark instances to evaluate
+            run_id: Optional identifier for this evaluation run
+
+        Returns:
+            Dictionary containing evaluation results
+        """
+        results = {}
+        run_dir = self.working_dir / (run_id or 'default')
+        run_dir.mkdir(parents=True, exist_ok=True)
+
+        # Save predictions for batch evaluation
+        predictions_dir = run_dir / 'predictions'
+        predictions_dir.mkdir(parents=True, exist_ok=True)
+
+        for instance in instances:
+            try:
+                # Save instance prediction
+                instance_dir = predictions_dir / instance['instance_id']
+                instance_dir.mkdir(parents=True, exist_ok=True)
+                with open(instance_dir / 'prediction.json', 'w') as f:
+                    json.dump(instance, f, indent=2)
+            except Exception as e:
+                logger.error(f"Error preparing {instance['instance_id']}: {e}")
+                results[instance['instance_id']] = {
+                    'status': 'error',
+                    'error': f"Failed to prepare instance: {str(e)}"
+                }
+
+        # Run batch evaluation using SWE-bench harness
+        try:
+            result = self._run_docker_evaluation(predictions_dir, run_id)
+            results.update(self._parse_evaluation_results(result))
+        except Exception as e:
+            logger.error(f"Docker evaluation failed: {e}")
+            for instance in instances:
+                if instance['instance_id'] not in results:
+                    results[instance['instance_id']] = {
+                        'status': 'error',
+                        'error': f"Docker evaluation failed: {str(e)}"
+                    }
+
+        return results
+
+    def _run_docker_evaluation(self, predictions_dir: Path, run_id: str) -> str:
+        """Run Docker-based evaluation using SWE-bench harness.
+
+        Args:
+            predictions_dir: Directory containing instance predictions
+            run_id: Identifier for this evaluation run
+
+        Returns:
+            Raw evaluation output
+        """
+        cmd = [
+            'python', '-m', 'swebench.harness.run_evaluation',
+            '--predictions_path', str(predictions_dir),
+            '--max_workers', str(self.max_workers),
+            '--run_id', run_id or 'default'
+        ]
+
+        try:
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                check=True
+            )
+            return result.stdout
+        except subprocess.CalledProcessError as e:
+            logger.error(f"Docker evaluation command failed: {e.output}")
+            raise RuntimeError(f"Docker evaluation failed: {str(e)}")
+
+    def _parse_evaluation_results(self, output: str) -> Dict:
+        """Parse evaluation output to extract metrics.
+
+        Args:
+            output: Raw evaluation output string
+
+        Returns:
+            Dictionary containing parsed metrics per instance
+        """
+        results = {}
+        try:
+            # Extract results from evaluation output
+            # Format: instance_id: {metrics}
+            for line in output.splitlines():
+                if ':' in line:
+                    instance_id, metrics_str = line.split(':', 1)
+                    instance_id = instance_id.strip()
+                    try:
+                        metrics = json.loads(metrics_str.strip())
+                        results[instance_id] = {
+                            'status': 'success',
+                            'metrics': metrics
+                        }
+                    except json.JSONDecodeError:
+                        results[instance_id] = {
+                            'status': 'error',
+                            'error': f"Failed to parse metrics: {metrics_str}"
+                        }
+        except Exception as e:
+            logger.error(f"Failed to parse evaluation results: {e}")
+            raise RuntimeError(f"Failed to parse evaluation results: {str(e)}")
+
+        return results
diff --git a/src/benchmark/swebench/reporter.py b/src/benchmark/swebench/reporter.py
new file mode 100644
index 00000000..22bc1718
--- /dev/null
+++ b/src/benchmark/swebench/reporter.py
@@ -0,0 +1,53 @@
+"""Results reporting for SWE-bench benchmark."""
+
+import json
+from pathlib import Path
+from typing import Dict
+
+class SWEBenchReporter:
+    """Reporter for SWE-bench benchmark results."""
+
+    def generate_report(self, results: Dict) -> Dict:
+        """Generate benchmark report.
+
+        Args:
+            results: Dictionary containing benchmark results
+
+        Returns:
+            Dictionary containing formatted report
+        """
+        report = {
+            'summary': self._generate_summary(results),
+            'details': results
+        }
+        return report
+
+    def save_report(self, report: Dict, output_file: Path):
+        """Save benchmark report to file.
+
+        Args:
+            report: Dictionary containing benchmark report
+            output_file: Path to save report
+        """
+        with open(output_file, 'w') as f:
+            json.dump(report, f, indent=2)
+
+    def _generate_summary(self, results: Dict) -> Dict:
+        """Generate summary statistics from results.
+
+        Args:
+            results: Dictionary containing benchmark results
+
+        Returns:
+            Dictionary containing summary statistics
+        """
+        total = len(results)
+        successful = sum(1 for r in results.values() if r.get('status') == 'success')
+        failed = sum(1 for r in results.values() if r.get('status') == 'error')
+
+        return {
+            'total_instances': total,
+            'successful': successful,
+            'failed': failed,
+            'success_rate': successful / total if total > 0 else 0
+        }
diff --git a/src/benchmark/swebench/swebench.py b/src/benchmark/swebench/swebench.py
new file mode 100644
index 00000000..f773f558
--- /dev/null
+++ b/src/benchmark/swebench/swebench.py
@@ -0,0 +1,62 @@
+"""Main SWE-bench runner implementation."""
+
+import logging
+from pathlib import Path
+from typing import Dict, List, Optional
+
+from .dataset import SWEBenchDataset
+from .evaluator import SWEBenchEvaluator
+from .reporter import SWEBenchReporter
+
+logger = logging.getLogger(__name__)
+
+class SWEBenchRunner:
+    """Main class for running SWE-bench benchmarks."""
+
+    def __init__(
+        self,
+        dataset_name: str = "princeton-nlp/SWE-bench",
+        max_workers: int = 4,
+        working_dir: Optional[Path] = None
+    ):
+        """Initialize SWE-bench runner.
+
+        Args:
+            dataset_name: HuggingFace dataset name
+            max_workers: Number of parallel workers for evaluation
+            working_dir: Working directory for benchmark files
+        """
+        self.dataset = SWEBenchDataset(dataset_name)
+        self.evaluator = SWEBenchEvaluator(max_workers=max_workers)
+        self.reporter = SWEBenchReporter()
+        self.working_dir = working_dir or Path.cwd() / "swebench_results"
+        self.working_dir.mkdir(parents=True, exist_ok=True)
+
+    def run_benchmark(
+        self,
+        instance_ids: Optional[List[str]] = None,
+        run_id: Optional[str] = None
+    ) -> Dict:
+        """Run benchmark evaluation.
+
+        Args:
+            instance_ids: Optional list of specific instances to evaluate
+            run_id: Optional identifier for this benchmark run
+
+        Returns:
+            Dictionary containing benchmark results
+        """
+        logger.info("Loading benchmark dataset...")
+        instances = self.dataset.load_instances(instance_ids)
+
+        logger.info("Running evaluations...")
+        results = self.evaluator.evaluate_instances(instances, run_id)
+
+        logger.info("Generating report...")
+        report = self.reporter.generate_report(results)
+
+        # Save results
+        results_file = self.working_dir / f"results_{run_id or 'default'}.json"
+        self.reporter.save_report(report, results_file)
+
+        return report
diff --git a/tests/benchmark/__init__.py b/tests/benchmark/__init__.py
new file mode 100644
index 00000000..6eb613d8
--- /dev/null
+++ b/tests/benchmark/__init__.py
@@ -0,0 +1 @@
+"""Benchmark test package."""
diff --git a/tests/benchmark/conftest.py b/tests/benchmark/conftest.py
new file mode 100644
index 00000000..aba71946
--- /dev/null
+++ b/tests/benchmark/conftest.py
@@ -0,0 +1,28 @@
+"""Pytest configuration for benchmark tests."""
+
+import pytest
+from pathlib import Path
+
+@pytest.fixture
+def sample_instance():
+    """Sample benchmark instance for testing."""
+    return {
+        'instance_id': 'test_instance',
+        'repo': 'test/repo',
+        'issue': 'Sample issue description',
+        'patch': 'Sample patch content'
+    }
+
+@pytest.fixture
+def sample_results():
+    """Sample benchmark results for testing."""
+    return {
+        'test_instance_1': {
+            'status': 'success',
+            'metrics': {'accuracy': 0.95}
+        },
+        'test_instance_2': {
+            'status': 'error',
+            'error': 'Test error message'
+        }
+    }
diff --git a/tests/benchmark/test_swebench.py b/tests/benchmark/test_swebench.py
new file mode 100644
index 00000000..415b9b9a
--- /dev/null
+++ b/tests/benchmark/test_swebench.py
@@ -0,0 +1,136 @@
+"""Tests for SWE-bench integration."""
+
+import json
+import pytest
+import subprocess
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+from src.benchmark.swebench import (
+    SWEBenchRunner,
+    SWEBenchDataset,
+    SWEBenchEvaluator,
+    SWEBenchReporter
+)
+
+def test_dataset_loading():
+    """Test dataset loading functionality."""
+    dataset = SWEBenchDataset("princeton-nlp/SWE-bench_Lite")
+    instances = dataset.load_instances()
+    assert isinstance(instances, list)
+    assert len(instances) > 0
+
+def test_reporter_summary():
+    """Test report generation."""
+    reporter = SWEBenchReporter()
+    results = {
+        'test1': {'status': 'success'},
+        'test2': {'status': 'error'}
+    }
+    report = reporter.generate_report(results)
+    assert report['summary']['total_instances'] == 2
+    assert report['summary']['successful'] == 1
+    assert report['summary']['failed'] == 1
+    assert report['summary']['success_rate'] == 0.5
+
+@pytest.fixture
+def temp_working_dir(tmp_path):
+    """Fixture for temporary working directory."""
+    return tmp_path / "swebench_test"
+
+@pytest.fixture
+def mock_subprocess():
+    """Mock subprocess for testing Docker evaluation."""
+    with patch('subprocess.run') as mock_run:
+        mock_run.return_value = MagicMock(
+            stdout='Test output\nMetrics: {"accuracy": 0.95}',
+            returncode=0
+        )
+        yield mock_run
+
+def test_runner_initialization(temp_working_dir):
+    """Test runner initialization."""
+    runner = SWEBenchRunner(
+        dataset_name="princeton-nlp/SWE-bench_Lite",
+        working_dir=temp_working_dir
+    )
+    assert runner.working_dir.exists()
+    assert isinstance(runner.dataset, SWEBenchDataset)
+    assert isinstance(runner.evaluator, SWEBenchEvaluator)
+    assert isinstance(runner.reporter, SWEBenchReporter)
+
+def test_evaluator_initialization(temp_working_dir):
+    """Test evaluator initialization."""
+    evaluator = SWEBenchEvaluator(working_dir=temp_working_dir)
+    assert evaluator.working_dir.exists()
+    assert evaluator.max_workers == 4
+
+@pytest.mark.parametrize("max_workers", [1, 4, 8])
+def test_evaluator_max_workers(max_workers):
+    """Test evaluator with different worker counts."""
+    evaluator = SWEBenchEvaluator(max_workers=max_workers)
+    assert evaluator.max_workers == max_workers
+
+def test_evaluator_docker_run(temp_working_dir, mock_subprocess, sample_instance):
+    """Test Docker-based evaluation."""
+    evaluator = SWEBenchEvaluator(working_dir=temp_working_dir)
+
+    # Mock successful evaluation output
+    mock_subprocess.return_value.stdout = (
+        f"{sample_instance['instance_id']}: "
+        '{"success": true, "metrics": {"accuracy": 0.95}}\n'
+    )
+
+    results = evaluator.evaluate_instances([sample_instance], run_id="test_run")
+
+    assert sample_instance['instance_id'] in results
+    result = results[sample_instance['instance_id']]
+    assert result['status'] == 'success'
+    assert result['metrics']['accuracy'] == 0.95
+
+    # Verify Docker command
+    mock_subprocess.assert_called_once()
+    cmd_args = mock_subprocess.call_args[0][0]
+    assert '--predictions_path' in cmd_args
+    assert '--max_workers' in cmd_args
+    assert str(evaluator.max_workers) in cmd_args
+
+def test_evaluator_docker_failure(temp_working_dir, mock_subprocess, sample_instance):
+    """Test Docker evaluation failure handling."""
+    evaluator = SWEBenchEvaluator(working_dir=temp_working_dir)
+
+    # Mock subprocess failure
+    error_msg = "Docker evaluation failed"
+    mock_subprocess.side_effect = subprocess.CalledProcessError(
+        1, [], output=error_msg
+    )
+
+    results = evaluator.evaluate_instances([sample_instance])
+
+    assert sample_instance['instance_id'] in results
+    result = results[sample_instance['instance_id']]
+    assert result['status'] == 'error'
+    assert 'Docker evaluation failed' in result['error']
+
+def test_evaluator_parse_results():
+    """Test evaluation results parsing."""
+    evaluator = SWEBenchEvaluator()
+
+    # Test successful parsing
+    output = 'test_1: {"accuracy": 0.95}\ntest_2: {"accuracy": 0.85}\n'
+    results = evaluator._parse_evaluation_results(output)
+
+    assert len(results) == 2
+    assert results['test_1']['status'] == 'success'
+    assert results['test_1']['metrics']['accuracy'] == 0.95
+    assert results['test_2']['metrics']['accuracy'] == 0.85
+
+def test_evaluator_parse_results_failure():
+    """Test evaluation results parsing failure."""
+    evaluator = SWEBenchEvaluator()
+
+    # Test invalid JSON
+    output = 'test_1: invalid_json\n'
+    results = evaluator._parse_evaluation_results(output)
+
+    assert results['test_1']['status'] == 'error'
+    assert 'Failed to parse metrics' in results['test_1']['error']