feat: Add SWE-bench benchmarking integration (stitionai#415)
Co-Authored-By: Erkin Alp Güney <[email protected]>
commit 05f33ee (1 parent: 3b98ed3)
Showing 9 changed files with 485 additions and 33 deletions.
@@ -1,33 +1,10 @@
-flask
-flask-cors
-toml
-urllib3
-requests
-colorama
-fastlogging
-Jinja2
-mistletoe
-markdownify
-pdfminer.six
-playwright
-pytest-playwright
-tiktoken
-ollama
-openai
-anthropic
-google-generativeai
-sqlmodel
-keybert
-GitPython
-netlify-py
-Markdown
-xhtml2pdf
-mistralai
-Flask-SocketIO
-eventlet
-groq
-duckduckgo-search
-orjson
-gevent
-gevent-websocket
-curl_cffi
+# Core dependencies
+datasets>=2.0.0
+docker>=6.0.0
+pytest>=7.0.0
+pytest-asyncio>=0.21.0
+pytest-cov>=4.1.0
+
+# SWE-bench dependencies
+swebench>=0.1.0
+huggingface-hub>=0.19.0
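The updated requirements.txt swaps the application dependencies out for the benchmark toolchain. A small sanity check like the sketch below (not part of the commit) can confirm the pinned minimums resolve in the active environment; it only uses the package names listed above.

```python
# Sanity-check sketch, not part of the commit: report which of the new
# benchmark dependencies are installed and at what version.
from importlib.metadata import PackageNotFoundError, version

REQUIRED = {
    "datasets": "2.0.0",
    "docker": "6.0.0",
    "pytest": "7.0.0",
    "pytest-asyncio": "0.21.0",
    "pytest-cov": "4.1.0",
    "swebench": "0.1.0",
    "huggingface-hub": "0.19.0",
}

for name, minimum in REQUIRED.items():
    try:
        print(f"{name}: installed {version(name)} (requires >= {minimum})")
    except PackageNotFoundError:
        print(f"{name}: MISSING (requires >= {minimum})")
```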
@@ -0,0 +1,18 @@
"""
SWE-bench integration module for Devika.

This module provides integration with the SWE-bench benchmark for evaluating
code generation capabilities on real-world GitHub issues.
"""

from .swebench import SWEBenchRunner
from .dataset import SWEBenchDataset
from .evaluator import SWEBenchEvaluator
from .reporter import SWEBenchReporter

__all__ = [
    'SWEBenchRunner',
    'SWEBenchDataset',
    'SWEBenchEvaluator',
    'SWEBenchReporter',
]
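The package `__init__.py` above re-exports the four classes as the public API. The package's location in the repository is not visible in this diff, so the import path below is an assumption used for illustration in the sketches that follow.

```python
# Assumed import path: the diff does not show the package directory, so
# 'benchmarks.swebench' is a placeholder used throughout these sketches.
from benchmarks.swebench import (
    SWEBenchDataset,
    SWEBenchEvaluator,
    SWEBenchReporter,
    SWEBenchRunner,
)
```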
@@ -0,0 +1,38 @@
"""SWE-bench dataset loading and management."""

from typing import Dict, List, Optional
from datasets import load_dataset

class SWEBenchDataset:
    """Handler for SWE-bench dataset operations."""

    def __init__(self, dataset_name: str = "princeton-nlp/SWE-bench"):
        """Initialize dataset handler.

        Args:
            dataset_name: HuggingFace dataset name
        """
        self.dataset_name = dataset_name
        self.dataset = None

    def load_instances(self, instance_ids: Optional[List[str]] = None) -> List[Dict]:
        """Load benchmark instances.

        Args:
            instance_ids: Optional list of specific instances to load

        Returns:
            List of benchmark instances
        """
        if self.dataset is None:
            self.dataset = load_dataset(self.dataset_name, split='test')

        if instance_ids:
            instances = [
                inst for inst in self.dataset
                if inst['instance_id'] in instance_ids
            ]
        else:
            instances = list(self.dataset)

        return instances
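A short usage sketch for the dataset handler above. The instance id is an example; the first call to `load_instances` downloads the `princeton-nlp/SWE-bench` test split from the Hugging Face hub, which is sizeable.

```python
# Illustrative usage of SWEBenchDataset; the instance id is an example and
# the first call downloads the full test split from the Hugging Face hub.
from benchmarks.swebench.dataset import SWEBenchDataset  # assumed package path

dataset = SWEBenchDataset()
instances = dataset.load_instances(instance_ids=["astropy__astropy-12907"])
for inst in instances:
    print(inst["instance_id"], inst["repo"])
```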
@@ -0,0 +1,139 @@
"""Docker-based evaluation harness for SWE-bench."""

import json
import logging
import os
import subprocess
import tempfile
from pathlib import Path
from typing import Dict, List, Optional

logger = logging.getLogger(__name__)

class SWEBenchEvaluator:
    """Evaluator for running SWE-bench in Docker containers."""

    def __init__(self, max_workers: int = 4, working_dir: Optional[Path] = None):
        """Initialize evaluator.

        Args:
            max_workers: Number of parallel workers
            working_dir: Working directory for evaluation files
        """
        self.max_workers = max_workers
        self.working_dir = working_dir or Path(tempfile.mkdtemp(prefix='swebench_'))
        self.working_dir.mkdir(parents=True, exist_ok=True)

    def evaluate_instances(
        self,
        instances: List[Dict],
        run_id: Optional[str] = None
    ) -> Dict:
        """Evaluate benchmark instances.

        Args:
            instances: List of benchmark instances to evaluate
            run_id: Optional identifier for this evaluation run

        Returns:
            Dictionary containing evaluation results
        """
        results = {}
        run_dir = self.working_dir / (run_id or 'default')
        run_dir.mkdir(parents=True, exist_ok=True)

        # Save predictions for batch evaluation
        predictions_dir = run_dir / 'predictions'
        predictions_dir.mkdir(parents=True, exist_ok=True)

        for instance in instances:
            try:
                # Save instance prediction
                instance_dir = predictions_dir / instance['instance_id']
                instance_dir.mkdir(parents=True, exist_ok=True)
                with open(instance_dir / 'prediction.json', 'w') as f:
                    json.dump(instance, f, indent=2)
            except Exception as e:
                logger.error(f"Error preparing {instance['instance_id']}: {e}")
                results[instance['instance_id']] = {
                    'status': 'error',
                    'error': f"Failed to prepare instance: {str(e)}"
                }

        # Run batch evaluation using SWE-bench harness
        try:
            result = self._run_docker_evaluation(predictions_dir, run_id)
            results.update(self._parse_evaluation_results(result))
        except Exception as e:
            logger.error(f"Docker evaluation failed: {e}")
            for instance in instances:
                if instance['instance_id'] not in results:
                    results[instance['instance_id']] = {
                        'status': 'error',
                        'error': f"Docker evaluation failed: {str(e)}"
                    }

        return results

    def _run_docker_evaluation(self, predictions_dir: Path, run_id: str) -> str:
        """Run Docker-based evaluation using SWE-bench harness.

        Args:
            predictions_dir: Directory containing instance predictions
            run_id: Identifier for this evaluation run

        Returns:
            Raw evaluation output
        """
        cmd = [
            'python', '-m', 'swebench.harness.run_evaluation',
            '--predictions_path', str(predictions_dir),
            '--max_workers', str(self.max_workers),
            '--run_id', run_id or 'default'
        ]

        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                check=True
            )
            return result.stdout
        except subprocess.CalledProcessError as e:
            logger.error(f"Docker evaluation command failed: {e.output}")
            raise RuntimeError(f"Docker evaluation failed: {str(e)}")

    def _parse_evaluation_results(self, output: str) -> Dict:
        """Parse evaluation output to extract metrics.

        Args:
            output: Raw evaluation output string

        Returns:
            Dictionary containing parsed metrics per instance
        """
        results = {}
        try:
            # Extract results from evaluation output
            # Format: instance_id: {metrics}
            for line in output.splitlines():
                if ':' in line:
                    instance_id, metrics_str = line.split(':', 1)
                    instance_id = instance_id.strip()
                    try:
                        metrics = json.loads(metrics_str.strip())
                        results[instance_id] = {
                            'status': 'success',
                            'metrics': metrics
                        }
                    except json.JSONDecodeError:
                        results[instance_id] = {
                            'status': 'error',
                            'error': f"Failed to parse metrics: {metrics_str}"
                        }
        except Exception as e:
            logger.error(f"Failed to parse evaluation results: {e}")
            raise RuntimeError(f"Failed to parse evaluation results: {str(e)}")

        return results
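A sketch of how the evaluator above might be driven directly. It shells out to `python -m swebench.harness.run_evaluation`, so a real run needs Docker and the `swebench` package installed; the prediction dict here is illustrative, and only `instance_id` is required by the wrapper code itself.

```python
# Illustrative call into SWEBenchEvaluator; a real run requires Docker and
# swebench. Prediction fields other than 'instance_id' are examples.
from pathlib import Path

from benchmarks.swebench.evaluator import SWEBenchEvaluator  # assumed package path

evaluator = SWEBenchEvaluator(max_workers=2, working_dir=Path("/tmp/swebench_demo"))
predictions = [
    {
        "instance_id": "astropy__astropy-12907",       # example SWE-bench instance
        "model_patch": "diff --git a/... (truncated)",  # patch produced by the agent
    }
]
results = evaluator.evaluate_instances(predictions, run_id="demo")
print(results.get("astropy__astropy-12907"))
```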
@@ -0,0 +1,53 @@
"""Results reporting for SWE-bench benchmark."""

import json
from pathlib import Path
from typing import Dict

class SWEBenchReporter:
    """Reporter for SWE-bench benchmark results."""

    def generate_report(self, results: Dict) -> Dict:
        """Generate benchmark report.

        Args:
            results: Dictionary containing benchmark results

        Returns:
            Dictionary containing formatted report
        """
        report = {
            'summary': self._generate_summary(results),
            'details': results
        }
        return report

    def save_report(self, report: Dict, output_file: Path):
        """Save benchmark report to file.

        Args:
            report: Dictionary containing benchmark report
            output_file: Path to save report
        """
        with open(output_file, 'w') as f:
            json.dump(report, f, indent=2)

    def _generate_summary(self, results: Dict) -> Dict:
        """Generate summary statistics from results.

        Args:
            results: Dictionary containing benchmark results

        Returns:
            Dictionary containing summary statistics
        """
        total = len(results)
        successful = sum(1 for r in results.values() if r.get('status') == 'success')
        failed = sum(1 for r in results.values() if r.get('status') == 'error')

        return {
            'total_instances': total,
            'successful': successful,
            'failed': failed,
            'success_rate': successful / total if total > 0 else 0
        }
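A quick sketch of the reporter above, fed results shaped like the fixtures in conftest.py further down in this commit; only the import path is an assumption.

```python
# Sketch of SWEBenchReporter usage; the import path is an assumption.
from pathlib import Path

from benchmarks.swebench.reporter import SWEBenchReporter  # assumed package path

reporter = SWEBenchReporter()
report = reporter.generate_report({
    "test_instance_1": {"status": "success", "metrics": {"accuracy": 0.95}},
    "test_instance_2": {"status": "error", "error": "Test error message"},
})
print(report["summary"])
# {'total_instances': 2, 'successful': 1, 'failed': 1, 'success_rate': 0.5}
reporter.save_report(report, Path("report.json"))
```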
@@ -0,0 +1,62 @@
"""Main SWE-bench runner implementation."""

import logging
from pathlib import Path
from typing import Dict, List, Optional

from .dataset import SWEBenchDataset
from .evaluator import SWEBenchEvaluator
from .reporter import SWEBenchReporter

logger = logging.getLogger(__name__)

class SWEBenchRunner:
    """Main class for running SWE-bench benchmarks."""

    def __init__(
        self,
        dataset_name: str = "princeton-nlp/SWE-bench",
        max_workers: int = 4,
        working_dir: Optional[Path] = None
    ):
        """Initialize SWE-bench runner.

        Args:
            dataset_name: HuggingFace dataset name
            max_workers: Number of parallel workers for evaluation
            working_dir: Working directory for benchmark files
        """
        self.dataset = SWEBenchDataset(dataset_name)
        self.evaluator = SWEBenchEvaluator(max_workers=max_workers)
        self.reporter = SWEBenchReporter()
        self.working_dir = working_dir or Path.cwd() / "swebench_results"
        self.working_dir.mkdir(parents=True, exist_ok=True)

    def run_benchmark(
        self,
        instance_ids: Optional[List[str]] = None,
        run_id: Optional[str] = None
    ) -> Dict:
        """Run benchmark evaluation.

        Args:
            instance_ids: Optional list of specific instances to evaluate
            run_id: Optional identifier for this benchmark run

        Returns:
            Dictionary containing benchmark results
        """
        logger.info("Loading benchmark dataset...")
        instances = self.dataset.load_instances(instance_ids)

        logger.info("Running evaluations...")
        results = self.evaluator.evaluate_instances(instances, run_id)

        logger.info("Generating report...")
        report = self.reporter.generate_report(results)

        # Save results
        results_file = self.working_dir / f"results_{run_id or 'default'}.json"
        self.reporter.save_report(report, results_file)

        return report
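An end-to-end sketch of the runner above. The import path and instance id are illustrative; a real run downloads the dataset, invokes the Docker-based harness, and writes `results_<run_id>.json` under the working directory.

```python
# End-to-end sketch; import path and instance id are illustrative.
import logging

from benchmarks.swebench import SWEBenchRunner  # assumed package path

logging.basicConfig(level=logging.INFO)

runner = SWEBenchRunner(max_workers=2)
report = runner.run_benchmark(
    instance_ids=["astropy__astropy-12907"],  # example SWE-bench instance
    run_id="smoke_test",
)
print(report["summary"])
```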
@@ -0,0 +1 @@
"""Benchmark test package."""
@@ -0,0 +1,28 @@
"""Pytest configuration for benchmark tests."""

import pytest
from pathlib import Path

@pytest.fixture
def sample_instance():
    """Sample benchmark instance for testing."""
    return {
        'instance_id': 'test_instance',
        'repo': 'test/repo',
        'issue': 'Sample issue description',
        'patch': 'Sample patch content'
    }

@pytest.fixture
def sample_results():
    """Sample benchmark results for testing."""
    return {
        'test_instance_1': {
            'status': 'success',
            'metrics': {'accuracy': 0.95}
        },
        'test_instance_2': {
            'status': 'error',
            'error': 'Test error message'
        }
    }
(The diff for the ninth changed file did not render on this page.)
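That unrendered file plausibly holds the benchmark tests themselves; a test consuming the `sample_results` fixture above might look like the following sketch, which is illustrative and not part of the commit.

```python
# Illustrative test, not part of the commit; the import path is an assumption.
from benchmarks.swebench.reporter import SWEBenchReporter  # assumed package path

def test_generate_report_summary(sample_results):
    report = SWEBenchReporter().generate_report(sample_results)
    summary = report['summary']
    assert summary['total_instances'] == 2
    assert summary['successful'] == 1
    assert summary['failed'] == 1
    assert summary['success_rate'] == 0.5
```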