forked from huggingface/lm-evaluation-harness
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a new task GPQA (the part CoT and generative) (EleutherAI#1482)
* Add new tasks of GPQA * Add README * Remove unused functions * Remove unused functions * Linters * Add flexible match * update * Remove duplicate function * Linter * update * Update lm_eval/filters/extraction.py Co-authored-by: Hailey Schoelkopf <[email protected]> * register multi_choice_regex * Update * run precommit --------- Co-authored-by: Hailey Schoelkopf <[email protected]> Co-authored-by: haileyschoelkopf <[email protected]>
- Loading branch information
1 parent
8a875e9
commit 01108ac
Showing
23 changed files
with
466 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import yaml | ||
from tqdm import tqdm | ||
|
||
|
||
def main() -> None:
    """Write one task YAML per GPQA subset into the current working directory.

    For every subset a file named ``gpqa_<subset>_cot_n_shot.yaml`` is
    created. It starts with a "generated by" header comment followed by the
    ``include``, ``task`` and ``dataset_name`` keys. A file that already
    exists under the same name is overwritten.
    """
    subset = ["extended", "diamond", "main"]
    setting = "cot_n_shot"
    for task in tqdm(subset):
        file_name = f"gpqa_{task}_{setting}.yaml"
        # Mode "w" truncates any existing file, so re-running the generator
        # simply refreshes the configs; no "already exists" handling is needed.
        with open(file_name, "w", encoding="utf-8") as f:
            f.write("# Generated by _generate_configs.py\n")
            yaml.dump(
                {
                    "include": f"_gpqa_{setting}_yaml",
                    "task": f"gpqa_{task}_{setting}",
                    "dataset_name": f"gpqa_{task}",
                },
                f,
            )


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
dataset_path: Idavidrein/gpqa | ||
group: gpqa | ||
output_type: generate_until | ||
process_docs: !function utils.process_docs | ||
training_split: train | ||
# The Hugging Face dataset only provides a train split, so it is reused for validation. | ||
validation_split: train | ||
test_split: null | ||
description: "Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n" | ||
doc_to_text: "Question: {{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nLet's think step by step: " | ||
doc_to_target: answer | ||
filter_list: | ||
- name: "strict-match" | ||
filter: | ||
- function: "regex" | ||
regex_pattern: "(?<=The answer is )(.*)(?=.)" | ||
- function: "take_first" | ||
- name: "flexible-extract" | ||
filter: | ||
- function: "multi_choice_regex" | ||
group_select: -1 | ||
ignore_case: true | ||
ignore_punctuation: true | ||
regex_pattern: "(\\([A-Z]\\))" | ||
- function: "take_first" | ||
generation_kwargs: | ||
until: | ||
- "</s>" | ||
do_sample: false | ||
temperature: 0.0 | ||
metric_list: | ||
- metric: exact_match | ||
aggregation: mean | ||
higher_is_better: true | ||
ignore_case: true | ||
ignore_punctuation: true | ||
metadata: | ||
version: 1.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Generated by _generate_configs.py | ||
dataset_name: gpqa_diamond | ||
include: _gpqa_cot_n_shot_yaml | ||
task: gpqa_diamond_cot_n_shot |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Generated by _generate_configs.py | ||
dataset_name: gpqa_extended | ||
include: _gpqa_cot_n_shot_yaml | ||
task: gpqa_extended_cot_n_shot |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Generated by _generate_configs.py | ||
dataset_name: gpqa_main | ||
include: _gpqa_cot_n_shot_yaml | ||
task: gpqa_main_cot_n_shot |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import random | ||
import re | ||
|
||
import datasets | ||
|
||
|
||
def preprocess(text):
    """Normalize a raw answer string before it is shown as a choice.

    Args:
        text: The raw answer text, or ``None`` when the field is missing.

    Returns:
        ``" "`` when ``text`` is ``None``. Otherwise the stripped text with
        ``" [title]"`` rewritten to ``". "``, every remaining ``[...]`` span
        removed, and each pair of adjacent spaces replaced by a single space
        (one pass).
    """
    if text is None:
        return " "
    text = text.strip()
    text = text.replace(" [title]", ". ")
    # Non-greedy so that each bracketed span is removed on its own.
    text = re.sub(r"\[.*?\]", "", text)
    # The substitutions above can leave two spaces side by side; squeeze them.
    text = text.replace("  ", " ")
    return text
|
||
|
||
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Attach shuffled answer choices and the gold letter to every document.

    Each document gains ``choice1`` .. ``choice4``, a ``choices`` list in the
    same order, and ``answer`` formatted as ``"(A)"`` .. ``"(D)"``.
    """

    def _process_doc(doc):
        # The three distractors come first and the gold answer last; the
        # shuffle below decides the order actually presented.
        answer_fields = (
            "Incorrect Answer 1",
            "Incorrect Answer 2",
            "Incorrect Answer 3",
            "Correct Answer",
        )
        options = [preprocess(doc[field]) for field in answer_fields]

        random.shuffle(options)
        # Find the gold answer again by its cleaned text.
        gold_position = options.index(preprocess(doc["Correct Answer"]))

        processed = {
            f"choice{number}": option
            for number, option in enumerate(options, start=1)
        }
        processed["choices"] = list(options)
        processed["answer"] = f"({chr(65 + gold_position)})"
        return processed

    return dataset.map(_process_doc)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import yaml | ||
from tqdm import tqdm | ||
|
||
|
||
def main() -> None:
    """Generate the zero-shot CoT task YAML for each GPQA subset.

    Files are written to the current working directory as
    ``gpqa_<subset>_cot_zeroshot.yaml``. Each one starts with a "generated by"
    header comment and then lists the ``include``, ``task`` and
    ``dataset_name`` keys. Existing files of the same name are overwritten.
    """
    subset = ["extended", "diamond", "main"]
    setting = "cot_zeroshot"
    for task in tqdm(subset):
        file_name = f"gpqa_{task}_{setting}.yaml"
        # Opening with "w" replaces whatever is already on disk, which is the
        # desired behaviour when regenerating configs.
        with open(file_name, "w", encoding="utf-8") as f:
            f.write("# Generated by _generate_configs.py\n")
            yaml.dump(
                {
                    "include": f"_gpqa_{setting}_yaml",
                    "task": f"gpqa_{task}_{setting}",
                    "dataset_name": f"gpqa_{task}",
                },
                f,
            )


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
dataset_path: Idavidrein/gpqa | ||
group: gpqa | ||
output_type: generate_until | ||
process_docs: !function utils.process_docs | ||
training_split: train | ||
# The Hugging Face dataset only provides a train split, so it is reused for validation. | ||
validation_split: train | ||
test_split: null | ||
doc_to_text: "What is the correct answer to this question:{{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nLet's think step by step: " | ||
doc_to_target: answer | ||
filter_list: | ||
- name: "strict-match" | ||
filter: | ||
- function: "regex" | ||
regex_pattern: "(?<=The answer is )(.*)(?=.)" | ||
- function: "take_first" | ||
- name: "flexible-extract" | ||
filter: | ||
- function: "multi_choice_regex" | ||
group_select: -1 | ||
ignore_case: true | ||
ignore_punctuation: true | ||
regex_pattern: "(\\([A-Z]\\))" | ||
- function: "take_first" | ||
generation_kwargs: | ||
until: | ||
- "</s>" | ||
do_sample: false | ||
temperature: 0.0 | ||
num_fewshot: 0 | ||
metric_list: | ||
- metric: exact_match | ||
aggregation: mean | ||
higher_is_better: true | ||
ignore_case: true | ||
ignore_punctuation: true | ||
metadata: | ||
version: 1.0 |
4 changes: 4 additions & 0 deletions
4
lm_eval/tasks/gpqa/cot_zeroshot/gpqa_diamond_cot_zeroshot.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Generated by _generate_configs.py | ||
dataset_name: gpqa_diamond | ||
include: _gpqa_cot_zeroshot_yaml | ||
task: gpqa_diamond_cot_zeroshot |
4 changes: 4 additions & 0 deletions
4
lm_eval/tasks/gpqa/cot_zeroshot/gpqa_extended_cot_zeroshot.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Generated by _generate_configs.py | ||
dataset_name: gpqa_extended | ||
include: _gpqa_cot_zeroshot_yaml | ||
task: gpqa_extended_cot_zeroshot |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Generated by _generate_configs.py | ||
dataset_name: gpqa_main | ||
include: _gpqa_cot_zeroshot_yaml | ||
task: gpqa_main_cot_zeroshot |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import random | ||
import re | ||
|
||
import datasets | ||
|
||
|
||
def preprocess(text):
    """Clean up a raw answer string so it can be used as a choice.

    Args:
        text: The raw answer text, or ``None`` when the field is missing.

    Returns:
        ``" "`` for ``None`` input. Otherwise the text with surrounding
        whitespace stripped, ``" [title]"`` replaced by ``". "``, all other
        ``[...]`` spans deleted, and adjacent space pairs reduced to one
        space in a single pass.
    """
    if text is None:
        return " "
    text = text.strip()
    text = text.replace(" [title]", ". ")
    # Non-greedy match keeps separate bracketed spans from being merged.
    text = re.sub(r"\[.*?\]", "", text)
    # Removing markup may leave a double space behind; collapse it.
    text = text.replace("  ", " ")
    return text
|
||
|
||
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Map every document to its shuffled choices and gold answer letter.

    The mapped output carries ``choice1`` .. ``choice4``, the same values as
    a ``choices`` list, and ``answer`` rendered as ``"(A)"`` .. ``"(D)"``.
    """

    def _process_doc(doc):
        # Cleaned candidates: the three wrong answers, then the right one.
        candidates = [
            preprocess(doc[column])
            for column in (
                "Incorrect Answer 1",
                "Incorrect Answer 2",
                "Incorrect Answer 3",
                "Correct Answer",
            )
        ]

        random.shuffle(candidates)
        # Locate the gold answer after shuffling via its cleaned text.
        gold_idx = candidates.index(preprocess(doc["Correct Answer"]))
        first, second, third, fourth = candidates

        return {
            "choice1": first,
            "choice2": second,
            "choice3": third,
            "choice4": fourth,
            "choices": [first, second, third, fourth],
            "answer": f"({chr(65 + gold_idx)})",
        }

    return dataset.map(_process_doc)
Oops, something went wrong.