Add a new benchmark ENAMEL for evaluating the efficiency of LLM-generated code #260

Open
wants to merge 29 commits into base: main

Changes from 1 commit

Commits (29):
4eadff5  Add a new benchmark ENAMEL  (q-rz, Jun 30, 2024)
d23b938  Add a new benchmark ENAMEL  (q-rz, Jul 1, 2024)
72e50d3  Add a new benchmark ENAMEL  (q-rz, Jul 1, 2024)
3d4c275  Add a new benchmark ENAMEL  (q-rz, Jul 1, 2024)
7847bb0  Add a new benchmark ENAMEL  (q-rz, Jul 1, 2024)
48c2f1c  Add a new benchmark ENAMEL  (q-rz, Jul 1, 2024)
02c43e9  Add a new benchmark ENAMEL  (q-rz, Jul 1, 2024)
d1e10b9  Add a new benchmark ENAMEL  (q-rz, Jul 18, 2024)
86f3902  Add a new benchmark ENAMEL  (q-rz, Jul 18, 2024)
4caa5dd  Add a new benchmark ENAMEL  (q-rz, Jul 18, 2024)
027afcb  Add a new benchmark ENAMEL  (q-rz, Jul 18, 2024)
eb43103  Add a new benchmark ENAMEL  (q-rz, Jul 18, 2024)
f3e86ac  Add a new benchmark ENAMEL  (q-rz, Jul 21, 2024)
5aafc0e  Add a new benchmark ENAMEL  (q-rz, Jul 21, 2024)
afb8471  Add a new benchmark ENAMEL  (q-rz, Jul 21, 2024)
8cf92a6  Add a new benchmark ENAMEL  (q-rz, Jul 21, 2024)
71c69f6  Add a new benchmark ENAMEL  (q-rz, Jul 21, 2024)
32265c8  Add a new benchmark ENAMEL  (q-rz, Jul 21, 2024)
bef7566  Add a new benchmark ENAMEL  (q-rz, Jul 21, 2024)
bf3348f  Add a new benchmark ENAMEL  (q-rz, Jul 21, 2024)
93c47cc  Add a new benchmark ENAMEL  (q-rz, Jul 22, 2024)
6b6163d  Add a new benchmark ENAMEL  (q-rz, Jul 22, 2024)
fd7694f  Add a new benchmark ENAMEL  (q-rz, Jul 22, 2024)
cd0810c  Add a new benchmark ENAMEL  (q-rz, Jul 22, 2024)
de62d20  Add a new benchmark ENAMEL  (q-rz, Jul 22, 2024)
80f4e14  Add a new benchmark ENAMEL  (q-rz, Jul 22, 2024)
7109433  Add a new benchmark ENAMEL  (q-rz, Jul 22, 2024)
1553acf  Merge branch 'bigcode-project:main' into main  (q-rz, Jul 22, 2024)
cca02b2  Update README.md  (q-rz, Jul 22, 2024)
Add a new benchmark ENAMEL
q-rz committed Jul 1, 2024
commit 02c43e9a6540d836afce1e7abd18034d414bb349
12 changes: 12 additions & 0 deletions bigcode_eval/tasks/custom_metrics/enamel_eval.py
@@ -1 +1,13 @@
# TODO: eff@k

def evaluate_all(generations, references, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec):
    pass  # TODO

def might_catch_timeout_signal(generation):
    pass  # TODO

might_catch_timeout_signal.WARNING = """\
We have detected that the generated code samples use `try ... except` within a loop, which might catch \
our timeout signal and cause an infinite loop. Since resolving this rare issue via `multiprocessing` would \
significantly slow down the evaluation process for our large-scale inputs, we have decided not to resolve \
this issue. If this issue does happen, please consider removing the corresponding code samples."""
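
The two stubs above only reserve the interface; the warning string describes the intended heuristic: flag generated samples whose loops contain a `try ... except` that could swallow the evaluator's timeout signal. As a rough illustration only (an AST-based sketch under assumed behavior, not the implementation this PR later adds), such a check could look like:

import ast

def might_catch_timeout_signal_sketch(generation: str) -> bool:
    """Hypothetical heuristic: True if any for/while body contains a try/except.

    A broad `except` inside a loop can catch the timeout exception raised by the
    evaluator and keep looping, which is the failure mode the WARNING describes.
    """
    try:
        tree = ast.parse(generation)
    except SyntaxError:
        return False  # syntactically invalid samples fail elsewhere anyway
    for node in ast.walk(tree):
        if isinstance(node, (ast.For, ast.While, ast.AsyncFor)):
            if any(isinstance(child, ast.Try) for child in ast.walk(node)):
                return True
    return False

A plain string scan would be cheaper, which matters for the large-scale inputs the warning mentions; the sketch uses `ast` only because it is easier to reason about.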
30 changes: 17 additions & 13 deletions bigcode_eval/tasks/enamel.py
@@ -7,11 +7,6 @@
Homepage: https://github.com/q-rz/enamel
"""

from warnings import warn
import numpy as np
from bigcode_eval.tasks.humaneval import GeneralHumanEval
from bigcode_eval.custom_metrics.enamel_eval import

_CITATION = """
@article{qiu2024enamel,
title={How efficient is {LLM}-generated code? A rigorous \& high-standard benchmark},
@@ -22,6 +17,12 @@
"""


from warnings import warn
import numpy as np
from bigcode_eval.tasks.humaneval import GeneralHumanEval
from bigcode_eval.tasks.custom_metrics.enamel_eval import evaluate_all, might_catch_timeout_signal


class GeneralENAMEL(GeneralHumanEval):
    """A task represents an entire benchmark including its dataset, problems,
    answers, generation settings and evaluation methods.
@@ -31,17 +32,18 @@ class GeneralENAMEL(GeneralHumanEval):
    DATASET_NAME = "ENAMEL_HumanEval"

    def __init__(self, subset, # list of problem IDs
            hardness=[0., 3., 3., 4.], memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl",
            strip_prompt=True, k=[1, 10, 100], num_workers=16,
            hardness=[0., 3., 3., 4.], n_reps=6, memory_giga=4., timeout_factor=2., tolerence_sec=0.01, tests_path="cache/eval~tests.pkl",
            strip_prompt=True, k=[1, 10, 100],
    ):
        super().__init__(strip_prompt=strip_prompt, k=k, num_workers=num_workers, timeout=None) # each problem has a different time limit
        super().__init__(strip_prompt=strip_prompt, k=k, num_workers=1, timeout=None) # each problem has a different time limit
        if isinstance(subset, list):
            self.subset = subset
        else:
            assert subset in self.DATASET_SUBSETS, f"unknown subset {repr(subset)}"
            self.subset = self.DATASET_SUBSETS[subset]
        self.dataset[self.__name__] = self.dataset["ENAMEL_HumanEval"].iloc[np.array(self.subset), :] # TODO
        self.hardness = hardness
        self.n_reps = n_reps
        self.memory_giga = memory_giga
        self.timeout_factor = timeout_factor
        self.tolerence_sec = tolerence_sec
@@ -60,7 +62,7 @@ def get_reference(self, doc):
        sample from the test dataset
        :return: str
        """
        return ""
        return "" # TODO: include tests

    def postprocess_generation(self, generation, idx):
        """
@@ -73,22 +75,24 @@ def postprocess_generation(self, generation, idx):
        """
        prompt = self.get_prompt(self.get_dataset()[idx])
        generation = self._stop_at_stop_token(generation, self.stop_words)
        if (not self.warned_dead_loop) and might_catch_timeout_signal(generation):
            warn(might_catch_timeout_signal.WARNING)
        return prompt + "\n    pass\n" + generation # this should work whether or not the generation contains the prompt

    def process_results(self, generations, references):
        # TODO: define how the evaluation score is computed from list of \
        # generations and reference solutions
        """
        Takes the list of LM generations and evaluates them against ground truth references,
        returning the metric for the generations as in {"metric_name": result}.
        We encourage directly loading the metric from the `evaluate` library to keep the code concise.
        :param generations: list(list(str))
            list of lists containing generations
        :param references: list(str)
            list of str containing references
        :return: dict[str: float]
        """
        return {}
        return evaluate_all(
            generations, references, k=self.k, hardness=self.hardness, n_reps=self.n_reps,
            memory_giga=self.memory_giga, timeout_factor=self.timeout_factor, tolerence_sec=self.tolerence_sec,
        )


def create_task(name, subset):
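
The diff excerpt stops at the `create_task` header. Other task families in this harness (for example MultiPL-E) implement such factories by returning a dynamically defined subclass bound to a fixed configuration; a hedged sketch of that pattern for ENAMEL, with the class body assumed rather than taken from this PR, would be:

def create_task(name, subset):
    # Sketch only: bind a fixed problem subset to a concrete task class so the
    # harness can register several ENAMEL variants under different names.
    class ENAMEL(GeneralENAMEL):
        __name__ = name       # read back via self.__name__ in GeneralENAMEL.__init__
        __qualname__ = name

        def __init__(self, **kwargs):
            super().__init__(subset=subset, **kwargs)

    return ENAMEL

The hypothetical `__name__` attribute here matches the `self.dataset[self.__name__]` lookup in `__init__` above, but the actual PR may key the dataset differently.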