feat: Add SWE-bench benchmarking integration (stitionai#415)
Co-Authored-By: Erkin Alp Güney <[email protected]>
devin-kuokka and erkinalp committed Dec 18, 2024
1 parent 3b98ed3 commit 05f33ee
Showing 9 changed files with 485 additions and 33 deletions.
43 changes: 10 additions & 33 deletions requirements.txt
@@ -1,33 +1,10 @@
-flask
-flask-cors
-toml
-urllib3
-requests
-colorama
-fastlogging
-Jinja2
-mistletoe
-markdownify
-pdfminer.six
-playwright
-pytest-playwright
-tiktoken
-ollama
-openai
-anthropic
-google-generativeai
-sqlmodel
-keybert
-GitPython
-netlify-py
-Markdown
-xhtml2pdf
-mistralai
-Flask-SocketIO
-eventlet
-groq
-duckduckgo-search
-orjson
-gevent
-gevent-websocket
-curl_cffi
+# Core dependencies
+datasets>=2.0.0
+docker>=6.0.0
+pytest>=7.0.0
+pytest-asyncio>=0.21.0
+pytest-cov>=4.1.0
+
+# SWE-bench dependencies
+swebench>=0.1.0
+huggingface-hub>=0.19.0
18 changes: 18 additions & 0 deletions src/benchmark/swebench/__init__.py
@@ -0,0 +1,18 @@
"""
SWE-bench integration module for Devika.
This module provides integration with the SWE-bench benchmark for evaluating
code generation capabilities on real-world GitHub issues.
"""

from .swebench import SWEBenchRunner
from .dataset import SWEBenchDataset
from .evaluator import SWEBenchEvaluator
from .reporter import SWEBenchReporter

__all__ = [
'SWEBenchRunner',
'SWEBenchDataset',
'SWEBenchEvaluator',
'SWEBenchReporter',
]
38 changes: 38 additions & 0 deletions src/benchmark/swebench/dataset.py
@@ -0,0 +1,38 @@
"""SWE-bench dataset loading and management."""

from typing import Dict, List, Optional
from datasets import load_dataset

class SWEBenchDataset:
"""Handler for SWE-bench dataset operations."""

def __init__(self, dataset_name: str = "princeton-nlp/SWE-bench"):
"""Initialize dataset handler.
Args:
dataset_name: HuggingFace dataset name
"""
self.dataset_name = dataset_name
self.dataset = None

def load_instances(self, instance_ids: Optional[List[str]] = None) -> List[Dict]:
"""Load benchmark instances.
Args:
instance_ids: Optional list of specific instances to load
Returns:
List of benchmark instances
"""
if self.dataset is None:
self.dataset = load_dataset(self.dataset_name, split='test')

if instance_ids:
instances = [
inst for inst in self.dataset
if inst['instance_id'] in instance_ids
]
else:
instances = list(self.dataset)

return instances
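
A minimal usage sketch for the dataset handler, assuming network access to the HuggingFace Hub; the Lite dataset name and the instance id below are illustrative, not part of this commit:

from src.benchmark.swebench.dataset import SWEBenchDataset  # import path assumes the repo root is on sys.path

# Point the handler at the (assumed) smaller Lite split and pull a single instance.
dataset = SWEBenchDataset(dataset_name="princeton-nlp/SWE-bench_Lite")
instances = dataset.load_instances(instance_ids=["astropy__astropy-12907"])  # illustrative id
for inst in instances:
    print(inst["instance_id"], inst["repo"])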
139 changes: 139 additions & 0 deletions src/benchmark/swebench/evaluator.py
@@ -0,0 +1,139 @@
"""Docker-based evaluation harness for SWE-bench."""

import json
import logging
import os
import subprocess
import tempfile
from pathlib import Path
from typing import Dict, List, Optional

logger = logging.getLogger(__name__)

class SWEBenchEvaluator:
"""Evaluator for running SWE-bench in Docker containers."""

def __init__(self, max_workers: int = 4, working_dir: Optional[Path] = None):
"""Initialize evaluator.
Args:
max_workers: Number of parallel workers
working_dir: Working directory for evaluation files
"""
self.max_workers = max_workers
self.working_dir = working_dir or Path(tempfile.mkdtemp(prefix='swebench_'))
self.working_dir.mkdir(parents=True, exist_ok=True)

def evaluate_instances(
self,
instances: List[Dict],
run_id: Optional[str] = None
) -> Dict:
"""Evaluate benchmark instances.
Args:
instances: List of benchmark instances to evaluate
run_id: Optional identifier for this evaluation run
Returns:
Dictionary containing evaluation results
"""
results = {}
run_dir = self.working_dir / (run_id or 'default')
run_dir.mkdir(parents=True, exist_ok=True)

# Save predictions for batch evaluation
predictions_dir = run_dir / 'predictions'
predictions_dir.mkdir(parents=True, exist_ok=True)

for instance in instances:
try:
# Save instance prediction
instance_dir = predictions_dir / instance['instance_id']
instance_dir.mkdir(parents=True, exist_ok=True)
with open(instance_dir / 'prediction.json', 'w') as f:
json.dump(instance, f, indent=2)
except Exception as e:
logger.error(f"Error preparing {instance['instance_id']}: {e}")
results[instance['instance_id']] = {
'status': 'error',
'error': f"Failed to prepare instance: {str(e)}"
}

# Run batch evaluation using SWE-bench harness
try:
result = self._run_docker_evaluation(predictions_dir, run_id)
results.update(self._parse_evaluation_results(result))
except Exception as e:
logger.error(f"Docker evaluation failed: {e}")
for instance in instances:
if instance['instance_id'] not in results:
results[instance['instance_id']] = {
'status': 'error',
'error': f"Docker evaluation failed: {str(e)}"
}

return results

    def _run_docker_evaluation(self, predictions_dir: Path, run_id: Optional[str]) -> str:
"""Run Docker-based evaluation using SWE-bench harness.
Args:
predictions_dir: Directory containing instance predictions
run_id: Identifier for this evaluation run
Returns:
Raw evaluation output
"""
cmd = [
'python', '-m', 'swebench.harness.run_evaluation',
'--predictions_path', str(predictions_dir),
'--max_workers', str(self.max_workers),
'--run_id', run_id or 'default'
]

try:
result = subprocess.run(
cmd,
capture_output=True,
text=True,
check=True
)
return result.stdout
        except subprocess.CalledProcessError as e:
            logger.error(f"Docker evaluation command failed: {e.stderr or e.output}")
            raise RuntimeError(f"Docker evaluation failed: {str(e)}") from e

def _parse_evaluation_results(self, output: str) -> Dict:
"""Parse evaluation output to extract metrics.
Args:
output: Raw evaluation output string
Returns:
Dictionary containing parsed metrics per instance
"""
results = {}
try:
# Extract results from evaluation output
# Format: instance_id: {metrics}
for line in output.splitlines():
if ':' in line:
instance_id, metrics_str = line.split(':', 1)
instance_id = instance_id.strip()
try:
metrics = json.loads(metrics_str.strip())
results[instance_id] = {
'status': 'success',
'metrics': metrics
}
except json.JSONDecodeError:
results[instance_id] = {
'status': 'error',
'error': f"Failed to parse metrics: {metrics_str}"
}
except Exception as e:
logger.error(f"Failed to parse evaluation results: {e}")
raise RuntimeError(f"Failed to parse evaluation results: {str(e)}")

return results
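
For illustration, the "instance_id: {json}" line format that _parse_evaluation_results expects can be exercised directly; the sample output string below is made up, and the real harness output may differ:

from src.benchmark.swebench.evaluator import SWEBenchEvaluator

evaluator = SWEBenchEvaluator(max_workers=1)
sample_output = 'sympy__sympy-20590: {"resolved": true, "tests_passed": 12}'  # hypothetical harness line
parsed = evaluator._parse_evaluation_results(sample_output)
# parsed == {'sympy__sympy-20590': {'status': 'success',
#                                   'metrics': {'resolved': True, 'tests_passed': 12}}}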
53 changes: 53 additions & 0 deletions src/benchmark/swebench/reporter.py
@@ -0,0 +1,53 @@
"""Results reporting for SWE-bench benchmark."""

import json
from pathlib import Path
from typing import Dict

class SWEBenchReporter:
"""Reporter for SWE-bench benchmark results."""

def generate_report(self, results: Dict) -> Dict:
"""Generate benchmark report.
Args:
results: Dictionary containing benchmark results
Returns:
Dictionary containing formatted report
"""
report = {
'summary': self._generate_summary(results),
'details': results
}
return report

def save_report(self, report: Dict, output_file: Path):
"""Save benchmark report to file.
Args:
report: Dictionary containing benchmark report
output_file: Path to save report
"""
with open(output_file, 'w') as f:
json.dump(report, f, indent=2)

def _generate_summary(self, results: Dict) -> Dict:
"""Generate summary statistics from results.
Args:
results: Dictionary containing benchmark results
Returns:
Dictionary containing summary statistics
"""
total = len(results)
successful = sum(1 for r in results.values() if r.get('status') == 'success')
failed = sum(1 for r in results.values() if r.get('status') == 'error')

return {
'total_instances': total,
'successful': successful,
'failed': failed,
'success_rate': successful / total if total > 0 else 0
}
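
A short worked example of the summary arithmetic, using values that mirror the test fixtures added later in this commit:

from src.benchmark.swebench.reporter import SWEBenchReporter

reporter = SWEBenchReporter()
report = reporter.generate_report({
    'test_instance_1': {'status': 'success', 'metrics': {'accuracy': 0.95}},
    'test_instance_2': {'status': 'error', 'error': 'Test error message'},
})
# report['summary'] == {'total_instances': 2, 'successful': 1,
#                       'failed': 1, 'success_rate': 0.5}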
62 changes: 62 additions & 0 deletions src/benchmark/swebench/swebench.py
@@ -0,0 +1,62 @@
"""Main SWE-bench runner implementation."""

import logging
from pathlib import Path
from typing import Dict, List, Optional

from .dataset import SWEBenchDataset
from .evaluator import SWEBenchEvaluator
from .reporter import SWEBenchReporter

logger = logging.getLogger(__name__)

class SWEBenchRunner:
"""Main class for running SWE-bench benchmarks."""

def __init__(
self,
dataset_name: str = "princeton-nlp/SWE-bench",
max_workers: int = 4,
working_dir: Optional[Path] = None
):
"""Initialize SWE-bench runner.
Args:
dataset_name: HuggingFace dataset name
max_workers: Number of parallel workers for evaluation
working_dir: Working directory for benchmark files
"""
self.dataset = SWEBenchDataset(dataset_name)
self.evaluator = SWEBenchEvaluator(max_workers=max_workers)
self.reporter = SWEBenchReporter()
self.working_dir = working_dir or Path.cwd() / "swebench_results"
self.working_dir.mkdir(parents=True, exist_ok=True)

def run_benchmark(
self,
instance_ids: Optional[List[str]] = None,
run_id: Optional[str] = None
) -> Dict:
"""Run benchmark evaluation.
Args:
instance_ids: Optional list of specific instances to evaluate
run_id: Optional identifier for this benchmark run
Returns:
Dictionary containing benchmark results
"""
logger.info("Loading benchmark dataset...")
instances = self.dataset.load_instances(instance_ids)

logger.info("Running evaluations...")
results = self.evaluator.evaluate_instances(instances, run_id)

logger.info("Generating report...")
report = self.reporter.generate_report(results)

# Save results
results_file = self.working_dir / f"results_{run_id or 'default'}.json"
self.reporter.save_report(report, results_file)

return report
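
A minimal end-to-end sketch, assuming the package is importable as src.benchmark.swebench and that Docker and the swebench harness are installed; the dataset name, instance id, and run id are illustrative:

import logging
from pathlib import Path

from src.benchmark.swebench import SWEBenchRunner  # import path depends on how the repo is packaged

logging.basicConfig(level=logging.INFO)

runner = SWEBenchRunner(
    dataset_name="princeton-nlp/SWE-bench_Lite",  # assumed smaller split
    max_workers=2,
    working_dir=Path("./swebench_results"),
)
report = runner.run_benchmark(
    instance_ids=["django__django-11099"],  # illustrative instance id
    run_id="smoke_test",
)
print(report["summary"])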
1 change: 1 addition & 0 deletions tests/benchmark/__init__.py
@@ -0,0 +1 @@
"""Benchmark test package."""
28 changes: 28 additions & 0 deletions tests/benchmark/conftest.py
@@ -0,0 +1,28 @@
"""Pytest configuration for benchmark tests."""

import pytest
from pathlib import Path

@pytest.fixture
def sample_instance():
"""Sample benchmark instance for testing."""
return {
'instance_id': 'test_instance',
'repo': 'test/repo',
'issue': 'Sample issue description',
'patch': 'Sample patch content'
}

@pytest.fixture
def sample_results():
"""Sample benchmark results for testing."""
return {
'test_instance_1': {
'status': 'success',
'metrics': {'accuracy': 0.95}
},
'test_instance_2': {
'status': 'error',
'error': 'Test error message'
}
}
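
Pytest discovers these fixtures automatically for tests in this package; a hypothetical test module using them might look like this:

# e.g. tests/benchmark/test_reporter.py (hypothetical file name)
from src.benchmark.swebench.reporter import SWEBenchReporter

def test_summary_counts(sample_results):
    report = SWEBenchReporter().generate_report(sample_results)
    assert report['summary']['total_instances'] == 2
    assert report['summary']['successful'] == 1
    assert report['summary']['failed'] == 1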
