Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feat] update preprocessing with new params #1

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions evals/humanevalx/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,20 +41,27 @@ def postprocess_generation(sample, generation_mode="completion"):
return sample


def process_test(sample, problems, dataset_type, language_type, generation_mode):
def process_test(sample, problems, dataset_type, language_type, generation_mode, with_prompt):
sample["generation"] = cleanup_code(sample["generation"], dataset_type, language_type)
if dataset_type == "humanevalx":
task_id = sample["task_id"]
prompt = problems[task_id]["prompt"]
test = problems[task_id]["test"]
code = sample["generation"]

if not with_prompt:
prompt = ''
# Pre-process for different languages
if language_type == "python":
test_setup = "\n".join(IMPORT_HELPER["python"]) + "\n"
test_string = test_setup + prompt + code + "\n" + test + "\n"
elif language_type == "cpp":
test_set_up = ""
funcname = re.search('using namespace std;\n(.*?){\s', prompt)
if funcname:
funcname = funcname.group()[21:].strip()
if funcname in code:
code = code[code.find('{')+1:]
for s in IMPORT_HELPER["cpp"]:
if s not in prompt:
test_set_up += s + "\n"
Expand All @@ -66,6 +73,9 @@ def process_test(sample, problems, dataset_type, language_type, generation_mode)
elif language_type == "go":
import_string = problems[task_id]["import"]
prompt = prompt.replace(import_string, "")
# when the prompt is not used, drop any text preceding the first "func "
if "func " in code and not with_prompt:
code = code[code.find("func "):]
test = problems[task_id]["test"]
test_setup = problems[task_id]["test_setup"]
other_pkgs = []
Expand Down Expand Up @@ -106,6 +116,7 @@ def evaluate_functional_correctness(
dataset_type: str = "humanevalx",
generation_mode: str = "completion",
test_groundtruth: bool = False,
with_prompt: bool = True,
):
if log_path is None:
log_path = os.path.join(output_path, "evaluation.log")
Expand All @@ -127,7 +138,7 @@ def evaluate_functional_correctness(

if output_path is not None:
os.makedirs(output_path, exist_ok=True)

with ThreadPoolExecutor(max_workers=n_workers) as executor:

futures = []
Expand All @@ -152,7 +163,7 @@ def evaluate_functional_correctness(
sample["prompt"] = problems[task_id]["prompt"]
sample["prompt"] = problems[task_id]["prompt"]
sample = postprocess_generation(sample, generation_mode)
sample["test_code"] = process_test(sample, problems, dataset_type, language_type, generation_mode)
sample["test_code"] = process_test(sample, problems, dataset_type, language_type, generation_mode, with_prompt)
if sample["test_code"] is None:
continue
if "completion_id" in sample:
Expand Down
27 changes: 10 additions & 17 deletions evals/humanevalx/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,21 +387,8 @@ def cleanup_code(
code = first_block(code, stop_words)
elif dataset_type == "humanevalx":
if language_type.lower() == "python":
code_splits = code.split("\n")
is_empty_line = False
ind_empty_line = None
for i, line in enumerate(code_splits):
if len(line.strip()) > 0 and line[0] != ' ' and line[0] != '\t':
is_empty_line = True
ind_empty_line = i
break
if is_empty_line:
code = "\n".join(code_splits[:ind_empty_line])
else:
end_words = ["\ndef", "\nclass", "\n#", "\nassert", '\n"""', "\nprint", "\nif", "\n\n\n"]
for w in end_words:
if w in code:
code = code[:code.rfind(w)]
Comment on lines -390 to -404
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

此处是否可以保留?因为 code-evaluator 应该是可以脱离 opencompass 单独存在的。

如果 opencompass 中处理了一遍后,在 code-evaluator 中再处理一遍是否会有问题?没问题的话,我倾向于保留这段逻辑。

# already processed in Opencompass
pass
elif language_type.lower() == "java":
main_pos = code.find("public static void main")
if main_pos != -1:
Expand All @@ -410,12 +397,18 @@ def cleanup_code(
code = code[:code.rfind('}')] + '}'
if code.count('{') + 1 == code.count('}'):
code += "\n}"
if "public class" in code:
code = code[:code.find("public class")]
elif language_type.lower() == "go":
if "\nfunc main(" in code:
code = code[:code.rfind("func main(")]
pattern = re.compile("func main\((.*?)\n}", re.DOTALL)
code = pattern.sub('', code)
if "package main" in code:
code = code[code.find("package main"):]
if '}' in code:
code = code[:code.rfind('}')] + '}'
elif language_type.lower() == "cpp":
if "using namespace std;" not in code:
code = "using namespace std;\n"+code
code = extract_block(code)
if "\nint main()" in code:
code = code[:code.rfind("int main()")]
Expand Down
6 changes: 5 additions & 1 deletion scripts/eval_humanevalx.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ OUTPUT_DIR=outputs/humanevalx-${LANGUAGE}
TMP_DIR=tmp

OPTIND=3
while getopts "n:o:t:l:" OPT;
while getopts "n:o:t:p:l:" OPT;
do
case $OPT in
n)
Expand All @@ -24,6 +24,9 @@ do
t)
TMP_DIR="$OPTARG"
;;
p)
WITH_PROMPT=$OPTARG
;;
\?)
echo "Invalid option -$OPTARG" >&2
exit 1
Expand Down Expand Up @@ -77,6 +80,7 @@ CMD="python ./evals/humanevalx/evaluation.py \
--n_workers $NUM_WORKERS \
--tmp_dir $TMP_DIR \
--problem_file $DATA_PATH \
--with_prompt $WITH_PROMPT \
--timeout $TIMEOUT"

echo "Running CMD: " $CMD
Expand Down
10 changes: 7 additions & 3 deletions server.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def check_datasets(dataset):
else:
raise NotImplementedError(f"{dataset} not implemented...")

def make_cmd(eval_filepath, dataset, ip_address):
def make_cmd(eval_filepath, dataset, ip_address, with_prompt):
if 'humanevalx' in dataset:
dataset, language = dataset.split("/")
result_dir = f"outputs/{ip_address}-{dataset}-{language}"
Expand All @@ -45,6 +45,7 @@ def make_cmd(eval_filepath, dataset, ip_address):
eval_filepath,
language,
"-n", '8',
"-p", f"{with_prompt}",
"-o", result_dir,
"-t", tmp_dir], result_dir

Expand All @@ -56,15 +57,18 @@ def _eval(single_request):

dataset = single_request.form.get('dataset')
ip_address = single_request.remote_addr

if 'With-Prompt' in single_request.headers:
with_prompt = single_request.headers['With-Prompt']
else:
with_prompt = True
try:
check_datasets(dataset)
except ValueError as e:
return {'message':f'dataset name ({dataset}) is wrong.', 'exception': e}, 400
except NotImplementedError as e:
return {'message':f'Dataset({dataset}) not supported.', 'exception': e}, 400

cmd_items, result_dir = make_cmd(eval_filepath, dataset, ip_address)
cmd_items, result_dir = make_cmd(eval_filepath, dataset, ip_address, with_prompt)
print("RUN CMD : " + " ".join(cmd_items))

result = subprocess.run(cmd_items, capture_output=True, text=True)
Expand Down