From 56a4e7943fca2959aff06bb96a3fe5ec63255aaa Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 15 Jul 2024 21:24:49 +0700 Subject: [PATCH] formatting (#2104) --- lm_eval/api/metrics.py | 2 +- lm_eval/api/task.py | 2 +- lm_eval/filters/extraction.py | 2 +- lm_eval/tasks/afrimgsm/README.md | 8 +- lm_eval/tasks/afrimgsm/run.sh | 2 +- lm_eval/tasks/afrimgsm/utils.py | 106 +++++++++++------- lm_eval/tasks/afrimmlu/README.md | 8 +- .../afrimmlu/direct/afrimmlu_common_yaml | 12 +- .../afrimmlu/direct/afrimmlu_direct_eng.yaml | 1 - .../afrimmlu/direct/afrimmlu_direct_ewe.yaml | 1 - .../afrimmlu/direct/afrimmlu_direct_fra.yaml | 2 +- .../afrimmlu/direct/afrimmlu_direct_hau.yaml | 2 +- .../afrimmlu/direct/afrimmlu_direct_ibo.yaml | 2 +- .../afrimmlu/direct/afrimmlu_direct_kin.yaml | 2 +- .../afrimmlu/direct/afrimmlu_direct_lin.yaml | 2 +- .../afrimmlu/direct/afrimmlu_direct_lug.yaml | 2 +- .../afrimmlu/direct/afrimmlu_direct_orm.yaml | 2 +- .../afrimmlu/direct/afrimmlu_direct_sna.yaml | 2 +- .../afrimmlu/direct/afrimmlu_direct_sot.yaml | 2 +- .../afrimmlu/direct/afrimmlu_direct_swa.yaml | 2 +- .../afrimmlu/direct/afrimmlu_direct_twi.yaml | 2 +- .../afrimmlu/direct/afrimmlu_direct_wol.yaml | 2 +- .../afrimmlu/direct/afrimmlu_direct_xho.yaml | 2 +- .../afrimmlu/direct/afrimmlu_direct_yor.yaml | 2 +- .../afrimmlu/direct/afrimmlu_direct_zul.yaml | 2 +- lm_eval/tasks/afrimmlu/direct/utils.py | 29 +++-- lm_eval/tasks/afrimmlu/fewshot.sh | 2 +- .../translate/afrimmlu_common_translate_yaml | 12 +- .../translate/afrimmlu_translate_eng.yaml | 1 - .../translate/afrimmlu_translate_ewe.yaml | 1 - .../translate/afrimmlu_translate_fra.yaml | 2 +- .../translate/afrimmlu_translate_hau.yaml | 2 +- .../translate/afrimmlu_translate_ibo.yaml | 2 +- .../translate/afrimmlu_translate_kin.yaml | 2 +- .../translate/afrimmlu_translate_lin.yaml | 2 +- .../translate/afrimmlu_translate_lug.yaml | 2 +- .../translate/afrimmlu_translate_orm.yaml | 2 +- .../translate/afrimmlu_translate_sna.yaml | 2 +- .../translate/afrimmlu_translate_sot.yaml | 2 +- .../translate/afrimmlu_translate_swa.yaml | 2 +- .../translate/afrimmlu_translate_twi.yaml | 2 +- .../translate/afrimmlu_translate_wol.yaml | 2 +- .../translate/afrimmlu_translate_xho.yaml | 2 +- .../translate/afrimmlu_translate_yor.yaml | 2 +- .../translate/afrimmlu_translate_zul.yaml | 2 +- lm_eval/tasks/afrimmlu/translate/utils.py | 24 ++-- lm_eval/tasks/afrimmlu/utils.py | 24 ++-- lm_eval/tasks/afrixnli/README.md | 10 +- .../afrixnli/anli prompt/en-direct/utils.py | 6 +- .../translate/afrixnli_translate_amh.yaml | 1 - .../afrixnli/anli prompt/translate/utils.py | 6 +- .../tasks/afrixnli/lai prompt/direct/utils.py | 15 +-- .../afrixnli/lai prompt/translate/utils.py | 15 +-- lm_eval/tasks/afrixnli/utils.py | 85 +++++++++----- .../med_concepts_qa/_med_concepts_qa_atc.yaml | 2 +- .../_med_concepts_qa_icd10proc.yaml | 2 +- .../_med_concepts_qa_icd9cm.yaml | 2 +- .../_med_concepts_qa_icd9proc.yaml | 2 +- 58 files changed, 234 insertions(+), 209 deletions(-) diff --git a/lm_eval/api/metrics.py b/lm_eval/api/metrics.py index 5ea6b221fd..1daf3847de 100644 --- a/lm_eval/api/metrics.py +++ b/lm_eval/api/metrics.py @@ -565,4 +565,4 @@ def aggregate_subtask_metrics(metrics, sizes, weight_by_size=True): assert len(metrics) == len(sizes) - return sum([metric * size for metric, size in zip(metrics, sizes)]) / sum(sizes) \ No newline at end of file + return sum([metric * size for metric, size in zip(metrics, sizes)]) / sum(sizes) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index ec9caccfba..030e6857f8 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -1665,4 +1665,4 @@ def count_bytes(cls, doc) -> int: @classmethod def count_words(cls, doc) -> int: """Downstream tasks with custom word boundaries should override this!""" - return len(re.split(r"\s+", doc)) \ No newline at end of file + return len(re.split(r"\s+", doc)) diff --git a/lm_eval/filters/extraction.py b/lm_eval/filters/extraction.py index 4f087451d9..41dc6208ce 100644 --- a/lm_eval/filters/extraction.py +++ b/lm_eval/filters/extraction.py @@ -181,4 +181,4 @@ def filter_ignores(st): filtered.append(match) filtered_resps.append(filtered) - return filtered_resps \ No newline at end of file + return filtered_resps diff --git a/lm_eval/tasks/afrimgsm/README.md b/lm_eval/tasks/afrimgsm/README.md index 8f9d4619fb..cca14d968d 100644 --- a/lm_eval/tasks/afrimgsm/README.md +++ b/lm_eval/tasks/afrimgsm/README.md @@ -5,8 +5,8 @@ IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models https://arxiv.org/pdf/2406.03368 -IrokoBench is a human-translated benchmark dataset for 16 typologically diverse -low-resource African languages covering three tasks: natural language inference (AfriXNLI), +IrokoBench is a human-translated benchmark dataset for 16 typologically diverse +low-resource African languages covering three tasks: natural language inference (AfriXNLI), mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU). @@ -14,13 +14,13 @@ mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU ``` @misc{adelani2024irokobenchnewbenchmarkafrican, - title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models}, + title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models}, author={David Ifeoluwa Adelani and Jessica Ojo and Israel Abebe Azime and Jian Yun Zhuang and Jesujoba O. Alabi and Xuanli He and Millicent Ochieng and Sara Hooker and Andiswa Bukula and En-Shiun Annie Lee and Chiamaka Chukwuneke and Happy Buzaaba and Blessing Sibanda and Godson Kalipe and Jonathan Mukiibi and Salomon Kabongo and Foutse Yuehgoh and Mmasibidi Setaka and Lolwethu Ndolela and Nkiruka Odu and Rooweither Mabuya and Shamsuddeen Hassan Muhammad and Salomey Osei and Sokhar Samb and Tadesse Kebede Guge and Pontus Stenetorp}, year={2024}, eprint={2406.03368}, archivePrefix={arXiv}, primaryClass={cs.CL}, - url={https://arxiv.org/abs/2406.03368}, + url={https://arxiv.org/abs/2406.03368}, } ``` diff --git a/lm_eval/tasks/afrimgsm/run.sh b/lm_eval/tasks/afrimgsm/run.sh index 370ffd9a8a..075500be33 100644 --- a/lm_eval/tasks/afrimgsm/run.sh +++ b/lm_eval/tasks/afrimgsm/run.sh @@ -3,4 +3,4 @@ lm_eval --model hf \ --device cuda:0 \ --batch_size 1 \ --verbosity DEBUG \ - --limit 5 \ No newline at end of file + --limit 5 diff --git a/lm_eval/tasks/afrimgsm/utils.py b/lm_eval/tasks/afrimgsm/utils.py index 8899e8868f..0dd336f8b3 100644 --- a/lm_eval/tasks/afrimgsm/utils.py +++ b/lm_eval/tasks/afrimgsm/utils.py @@ -2,51 +2,74 @@ import yaml -languages = ['eng', 'amh', 'ibo', 'fra', 'sna', 'lin', 'wol', 'ewe', 'lug', 'xho', 'kin', 'twi', 'zul', 'orm', 'yor', - 'hau', 'sot', 'swa'] - -languages_REGEX = {"eng": "The answer is (\\-?[0-9\\.\\,]+)", - "amh": "መልሱ (\\-?[0-9\\.\\,]+)", - "ibo": "Azịza ya bụ (\\-?[0-9\\.\\,]+)", - 'fra': "La réponse est(\\-?[0-9\\.\\,]+)", - 'sna': "Mhinduro kumubvunzo ndi (\\-?[0-9\\.\\,]+)", - 'lin': "Eyano ezali (\\-?[0-9\\.\\,]+)", - 'wol': "Tontu li (\\-?[0-9\\.\\,]+)", - 'ewe': "ŋuɖoɖoae nye (\\-?[0-9\\.\\,]+)", - 'lug': "Ansa eri (\\-?[0-9\\.\\,]+)", - 'xho': "Impendulo ngu (\\-?[0-9\\.\\,]+)", - 'kin': "Igisubizo ni (\\-?[0-9\\.\\,]+)", - 'twi': "Ne nnyiano yɛ (\\-?[0-9\\.\\,]+)", - 'zul': "Impendulo ithi (\\-?[0-9\\.\\,]+)", - 'orm': "Deebiin isaa (\\-?[0-9\\.\\,]+)", - 'yor': "Ìdáhùn náà ni (\\-?[0-9\\.\\,]+)", - 'hau': "Amsar ita ce (\\-?[0-9\\.\\,]+)", - 'sot': "Karabo ke (\\-?[0-9\\.\\,]+)", - 'swa': "Jibu ni (\\-?[0-9\\.\\,]+)", - } + +languages = [ + "eng", + "amh", + "ibo", + "fra", + "sna", + "lin", + "wol", + "ewe", + "lug", + "xho", + "kin", + "twi", + "zul", + "orm", + "yor", + "hau", + "sot", + "swa", +] + +languages_REGEX = { + "eng": "The answer is (\\-?[0-9\\.\\,]+)", + "amh": "መልሱ (\\-?[0-9\\.\\,]+)", + "ibo": "Azịza ya bụ (\\-?[0-9\\.\\,]+)", + "fra": "La réponse est(\\-?[0-9\\.\\,]+)", + "sna": "Mhinduro kumubvunzo ndi (\\-?[0-9\\.\\,]+)", + "lin": "Eyano ezali (\\-?[0-9\\.\\,]+)", + "wol": "Tontu li (\\-?[0-9\\.\\,]+)", + "ewe": "ŋuɖoɖoae nye (\\-?[0-9\\.\\,]+)", + "lug": "Ansa eri (\\-?[0-9\\.\\,]+)", + "xho": "Impendulo ngu (\\-?[0-9\\.\\,]+)", + "kin": "Igisubizo ni (\\-?[0-9\\.\\,]+)", + "twi": "Ne nnyiano yɛ (\\-?[0-9\\.\\,]+)", + "zul": "Impendulo ithi (\\-?[0-9\\.\\,]+)", + "orm": "Deebiin isaa (\\-?[0-9\\.\\,]+)", + "yor": "Ìdáhùn náà ni (\\-?[0-9\\.\\,]+)", + "hau": "Amsar ita ce (\\-?[0-9\\.\\,]+)", + "sot": "Karabo ke (\\-?[0-9\\.\\,]+)", + "swa": "Jibu ni (\\-?[0-9\\.\\,]+)", +} LANGUAGES = {} for lang in languages: - if lang == 'amh': + if lang == "amh": LANGUAGES[lang] = { # English "QUESTION": "ጥያቄ:", "ANSWER": "በቅደም ተከተል መልስ:", "DIRECT": "Answer:", - "REGEX": languages_REGEX[lang]} - elif lang == 'yor': + "REGEX": languages_REGEX[lang], + } + elif lang == "yor": LANGUAGES[lang] = { # English "QUESTION": "Ìbéèrè:", "ANSWER": "Ìdáhùn lẹ́sẹsẹ:", "DIRECT": "Answer:", - "REGEX": languages_REGEX[lang]} + "REGEX": languages_REGEX[lang], + } else: LANGUAGES[lang] = { # English "QUESTION": "Question:", "ANSWER": "Step-by-Step Answer:", "DIRECT": "Answer:", - "REGEX": languages_REGEX[lang]} + "REGEX": languages_REGEX[lang], + } def add_regex_pattern(regex_pattern): @@ -93,13 +116,12 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: err = [] for lang in LANGUAGES.keys(): try: - yaml_template = "cot_yaml" filter_list = {} DELIMITER = None if mode == "direct": - ANSWER = LANGUAGES['eng']["DIRECT"] - QUESTION = LANGUAGES['eng']["QUESTION"] + ANSWER = LANGUAGES["eng"]["DIRECT"] + QUESTION = LANGUAGES["eng"]["QUESTION"] REGEX = None task_name = f"afrimgsm_direct_{lang}" yaml_template = "direct_yaml" @@ -122,8 +144,8 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: QUESTION = LANGUAGES["eng"]["QUESTION"] task_name = f"afrimgsm_en_cot_{lang}" elif mode == "translate-direct": - ANSWER = LANGUAGES['eng']["DIRECT"] - QUESTION = LANGUAGES['eng']["QUESTION"] + ANSWER = LANGUAGES["eng"]["DIRECT"] + QUESTION = LANGUAGES["eng"]["QUESTION"] REGEX = None task_name = f"afrimgsm_translate_direct_{lang}" yaml_template = "translate_direct_yaml" @@ -131,7 +153,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: file_name = f"{task_name}.yaml" ANSWER_TO_SKIP = len(LANGUAGES[lang]["ANSWER"]) + 1 with open( - f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8" + f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8" ) as f: f.write("# Generated by utils.py\n") yaml.dump( @@ -140,15 +162,15 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: "dataset_name": lang, "task": f"{task_name}", "doc_to_text": f"""{{% if answer is not none %}}""" - f"""{{{{question+"\\n{ANSWER}"}}}}""" - f"""{{% else %}}""" - f"""{{{{"{QUESTION} "+question+"\\n{ANSWER}"}}}}""" - f"""{{% endif %}}""", + f"""{{{{question+"\\n{ANSWER}"}}}}""" + f"""{{% else %}}""" + f"""{{{{"{QUESTION} "+question+"\\n{ANSWER}"}}}}""" + f"""{{% endif %}}""", "doc_to_target": f"""{{% if answer is not none %}}""" - f"""{{{{answer[{ANSWER_TO_SKIP}:]}}}}""" - f"""{{% else %}}""" - f"""{{{{answer_number|string}}}}""" - f"""{{% endif %}}""", + f"""{{{{answer[{ANSWER_TO_SKIP}:]}}}}""" + f"""{{% else %}}""" + f"""{{{{answer_number|string}}}}""" + f"""{{% endif %}}""", **filter_list, "generation_kwargs": { "until": [QUESTION, "", "<|im_end|>"], @@ -194,4 +216,4 @@ def main() -> None: if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/lm_eval/tasks/afrimmlu/README.md b/lm_eval/tasks/afrimmlu/README.md index b512937811..f7f7ed4d82 100644 --- a/lm_eval/tasks/afrimmlu/README.md +++ b/lm_eval/tasks/afrimmlu/README.md @@ -5,8 +5,8 @@ IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models https://arxiv.org/pdf/2406.03368 -IrokoBench is a human-translated benchmark dataset for 16 typologically diverse -low-resource African languages covering three tasks: natural language inference (AfriXNLI), +IrokoBench is a human-translated benchmark dataset for 16 typologically diverse +low-resource African languages covering three tasks: natural language inference (AfriXNLI), mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU). @@ -14,13 +14,13 @@ mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU ``` @misc{adelani2024irokobenchnewbenchmarkafrican, - title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models}, + title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models}, author={David Ifeoluwa Adelani and Jessica Ojo and Israel Abebe Azime and Jian Yun Zhuang and Jesujoba O. Alabi and Xuanli He and Millicent Ochieng and Sara Hooker and Andiswa Bukula and En-Shiun Annie Lee and Chiamaka Chukwuneke and Happy Buzaaba and Blessing Sibanda and Godson Kalipe and Jonathan Mukiibi and Salomon Kabongo and Foutse Yuehgoh and Mmasibidi Setaka and Lolwethu Ndolela and Nkiruka Odu and Rooweither Mabuya and Shamsuddeen Hassan Muhammad and Salomey Osei and Sokhar Samb and Tadesse Kebede Guge and Pontus Stenetorp}, year={2024}, eprint={2406.03368}, archivePrefix={arXiv}, primaryClass={cs.CL}, - url={https://arxiv.org/abs/2406.03368}, + url={https://arxiv.org/abs/2406.03368}, } ``` diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_common_yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_common_yaml index 8c42b8f58a..47d16d95dd 100644 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_common_yaml +++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_common_yaml @@ -9,18 +9,18 @@ output_type: multiple_choice validation_split: validation test_split: test fewshot_split: validation -doc_to_text: !function utils.doc_to_text +doc_to_text: !function utils.doc_to_text doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}" doc_to_choice: !function utils.doc_to_choice should_decontaminate: true doc_to_decontamination_query: "Question: {{question}}\nAnswer:" metric_list: - - metric: f1 - aggregation: !function utils.weighted_f1_score + - metric: f1 + aggregation: !function utils.weighted_f1_score # aggregation: mean - average: weighted - hf_evaluate: true - higher_is_better: True + average: weighted + hf_evaluate: true + higher_is_better: True ignore_case: true ignore_punctuation: true regexes_to_ignore: diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml index f2add8a5ff..a1e647cdf1 100644 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml +++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml @@ -1,4 +1,3 @@ dataset_name: eng include: afrimmlu_common_yaml task: afrimmlu_direct_eng - diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml index e0b00c08ab..1cc45ddc0e 100644 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml +++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml @@ -1,4 +1,3 @@ dataset_name: ewe include: afrimmlu_common_yaml task: afrimmlu_direct_ewe - diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_fra.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_fra.yaml index 6e3383b387..e6adb6c8aa 100644 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_fra.yaml +++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_fra.yaml @@ -1,3 +1,3 @@ dataset_name: fra include: afrimmlu_common_yaml -task: afrimmlu_direct_fra \ No newline at end of file +task: afrimmlu_direct_fra diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_hau.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_hau.yaml index a28b083c4d..9cc9a1ae7a 100644 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_hau.yaml +++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_hau.yaml @@ -1,3 +1,3 @@ dataset_name: hau include: afrimmlu_common_yaml -task: afrimmlu_direct_hau \ No newline at end of file +task: afrimmlu_direct_hau diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml index f9a863f237..6abb2c4a46 100644 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml +++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml @@ -1,3 +1,3 @@ dataset_name: ibo include: afrimmlu_common_yaml -task: afrimmlu_direct_ibo \ No newline at end of file +task: afrimmlu_direct_ibo diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml index bfd008d4a3..2f81f709c4 100644 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml +++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml @@ -1,3 +1,3 @@ dataset_name: kin include: afrimmlu_common_yaml -task: afrimmlu_direct_kin \ No newline at end of file +task: afrimmlu_direct_kin diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lin.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lin.yaml index a541b655fb..55363ed937 100644 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lin.yaml +++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lin.yaml @@ -1,3 +1,3 @@ dataset_name: lin include: afrimmlu_common_yaml -task: afrimmlu_direct_lin \ No newline at end of file +task: afrimmlu_direct_lin diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lug.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lug.yaml index b6d0be0505..0d484427ed 100644 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lug.yaml +++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lug.yaml @@ -1,3 +1,3 @@ dataset_name: lug include: afrimmlu_common_yaml -task: afrimmlu_direct_lug \ No newline at end of file +task: afrimmlu_direct_lug diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_orm.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_orm.yaml index 124bbe59ba..763eb8a75f 100644 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_orm.yaml +++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_orm.yaml @@ -1,3 +1,3 @@ dataset_name: orm include: afrimmlu_common_yaml -task: afrimmlu_direct_orm \ No newline at end of file +task: afrimmlu_direct_orm diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sna.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sna.yaml index 9d75eb8570..ed9e69af39 100644 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sna.yaml +++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sna.yaml @@ -1,3 +1,3 @@ dataset_name: sna include: afrimmlu_common_yaml -task: afrimmlu_direct_sna \ No newline at end of file +task: afrimmlu_direct_sna diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sot.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sot.yaml index fba23339e5..acdba0fdcc 100644 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sot.yaml +++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sot.yaml @@ -1,3 +1,3 @@ dataset_name: sot include: afrimmlu_common_yaml -task: afrimmlu_direct_sot \ No newline at end of file +task: afrimmlu_direct_sot diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_swa.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_swa.yaml index 099ecb0d7d..c1aa82b0b1 100644 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_swa.yaml +++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_swa.yaml @@ -1,3 +1,3 @@ dataset_name: swa include: afrimmlu_common_yaml -task: afrimmlu_direct_swa \ No newline at end of file +task: afrimmlu_direct_swa diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_twi.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_twi.yaml index 2da2e627c7..2695d4a156 100644 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_twi.yaml +++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_twi.yaml @@ -1,3 +1,3 @@ dataset_name: twi include: afrimmlu_common_yaml -task: afrimmlu_direct_twi \ No newline at end of file +task: afrimmlu_direct_twi diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_wol.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_wol.yaml index 938247ab22..027f837637 100644 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_wol.yaml +++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_wol.yaml @@ -1,3 +1,3 @@ dataset_name: wol include: afrimmlu_common_yaml -task: afrimmlu_direct_wol \ No newline at end of file +task: afrimmlu_direct_wol diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_xho.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_xho.yaml index bcaa7a6229..8e0c12972d 100644 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_xho.yaml +++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_xho.yaml @@ -1,3 +1,3 @@ dataset_name: xho include: afrimmlu_common_yaml -task: afrimmlu_direct_xho \ No newline at end of file +task: afrimmlu_direct_xho diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_yor.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_yor.yaml index a83c8454f2..2a9f7645c2 100644 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_yor.yaml +++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_yor.yaml @@ -1,3 +1,3 @@ dataset_name: yor include: afrimmlu_common_yaml -task: afrimmlu_direct_yor \ No newline at end of file +task: afrimmlu_direct_yor diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_zul.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_zul.yaml index a597d3bbde..9d8d3b415b 100644 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_zul.yaml +++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_zul.yaml @@ -1,3 +1,3 @@ dataset_name: zul include: afrimmlu_common_yaml -task: afrimmlu_direct_zul \ No newline at end of file +task: afrimmlu_direct_zul diff --git a/lm_eval/tasks/afrimmlu/direct/utils.py b/lm_eval/tasks/afrimmlu/direct/utils.py index 2250b4b3ec..6d30579d79 100644 --- a/lm_eval/tasks/afrimmlu/direct/utils.py +++ b/lm_eval/tasks/afrimmlu/direct/utils.py @@ -1,9 +1,4 @@ -import re -import sys -import unicodedata - from sklearn.metrics import f1_score -from lm_eval.filters.extraction import RegexFilter def doc_to_choice(doc): @@ -12,9 +7,9 @@ def doc_to_choice(doc): def doc_to_text(doc): - output = """You are a highly knowledgeable and intelligent artificial intelligence + output = """You are a highly knowledgeable and intelligent artificial intelligence model answers multiple-choice questions about {subject} - + Question: {question} Choices: @@ -22,16 +17,18 @@ def doc_to_text(doc): B: {choice2} C: {choice3} D: {choice4} - + Answer: """ - + choices = eval(doc["choices"]) - text = output.format(subject=doc['subject'], - question=doc['question'], - choice1=choices[0], - choice2=choices[1], - choice3=choices[2], - choice4=choices[3]) + text = output.format( + subject=doc["subject"], + question=doc["question"], + choice1=choices[0], + choice2=choices[1], + choice3=choices[2], + choice4=choices[3], + ) return text @@ -40,4 +37,4 @@ def weighted_f1_score(items): golds = unzipped_list[0] preds = unzipped_list[1] fscore = f1_score(golds, preds, average="weighted") - return fscore \ No newline at end of file + return fscore diff --git a/lm_eval/tasks/afrimmlu/fewshot.sh b/lm_eval/tasks/afrimmlu/fewshot.sh index 42902dddc2..c69c48d7df 100644 --- a/lm_eval/tasks/afrimmlu/fewshot.sh +++ b/lm_eval/tasks/afrimmlu/fewshot.sh @@ -5,4 +5,4 @@ lm_eval --model hf \ --batch_size 1 \ --num_fewshot 0 \ --verbosity DEBUG \ - --wandb_args project=afrimmlu \ No newline at end of file + --wandb_args project=afrimmlu diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_common_translate_yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_common_translate_yaml index 247a677923..2bc87d5aa7 100644 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_common_translate_yaml +++ b/lm_eval/tasks/afrimmlu/translate/afrimmlu_common_translate_yaml @@ -6,18 +6,18 @@ dataset_path: masakhane/afrimmlu-translate-test dataset_name: null output_type: multiple_choice test_split: test -doc_to_text: !function utils.doc_to_text +doc_to_text: !function utils.doc_to_text doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}" doc_to_choice: !function utils.doc_to_choice should_decontaminate: true doc_to_decontamination_query: "Question: {{question}}\nAnswer:" metric_list: - - metric: f1 - aggregation: !function utils.weighted_f1_score + - metric: f1 + aggregation: !function utils.weighted_f1_score # aggregation: mean - average: weighted - hf_evaluate: true - higher_is_better: True + average: weighted + hf_evaluate: true + higher_is_better: True ignore_case: true ignore_punctuation: true regexes_to_ignore: diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_eng.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_eng.yaml index 30c5007881..0be98beedd 100644 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_eng.yaml +++ b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_eng.yaml @@ -1,4 +1,3 @@ dataset_name: eng include: afrimmlu_common_translate_yaml task: afrimmlu_translate_eng - diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ewe.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ewe.yaml index 3c764f9c98..624342b91f 100644 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ewe.yaml +++ b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ewe.yaml @@ -1,4 +1,3 @@ dataset_name: ewe include: afrimmlu_common_translate_yaml task: afrimmlu_translate_ewe - diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_fra.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_fra.yaml index 5401255c62..c4fd7e1fc7 100644 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_fra.yaml +++ b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_fra.yaml @@ -1,3 +1,3 @@ dataset_name: fra include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_fra \ No newline at end of file +task: afrimmlu_translate_fra diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_hau.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_hau.yaml index 590baa6e27..aaeb415fa2 100644 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_hau.yaml +++ b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_hau.yaml @@ -1,3 +1,3 @@ dataset_name: hau include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_hau \ No newline at end of file +task: afrimmlu_translate_hau diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ibo.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ibo.yaml index 25c306d185..93fb24e8c3 100644 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ibo.yaml +++ b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ibo.yaml @@ -1,3 +1,3 @@ dataset_name: ibo include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_ibo \ No newline at end of file +task: afrimmlu_translate_ibo diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_kin.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_kin.yaml index ef917724a3..f39f666840 100644 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_kin.yaml +++ b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_kin.yaml @@ -1,3 +1,3 @@ dataset_name: kin include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_kin \ No newline at end of file +task: afrimmlu_translate_kin diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lin.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lin.yaml index a07db09029..c935ee4738 100644 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lin.yaml +++ b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lin.yaml @@ -1,3 +1,3 @@ dataset_name: lin include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_lin \ No newline at end of file +task: afrimmlu_translate_lin diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lug.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lug.yaml index f23a9472ec..72e4bce011 100644 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lug.yaml +++ b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lug.yaml @@ -1,3 +1,3 @@ dataset_name: lug include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_lug \ No newline at end of file +task: afrimmlu_translate_lug diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_orm.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_orm.yaml index bee6ff8435..3ff9024994 100644 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_orm.yaml +++ b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_orm.yaml @@ -1,3 +1,3 @@ dataset_name: orm include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_orm \ No newline at end of file +task: afrimmlu_translate_orm diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sna.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sna.yaml index ca99078ddd..9979740a9b 100644 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sna.yaml +++ b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sna.yaml @@ -1,3 +1,3 @@ dataset_name: sna include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_sna \ No newline at end of file +task: afrimmlu_translate_sna diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sot.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sot.yaml index 3c617bc638..deb2b9b81d 100644 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sot.yaml +++ b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sot.yaml @@ -1,3 +1,3 @@ dataset_name: sot include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_sot \ No newline at end of file +task: afrimmlu_translate_sot diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_swa.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_swa.yaml index a4baad77aa..e58d90bc69 100644 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_swa.yaml +++ b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_swa.yaml @@ -1,3 +1,3 @@ dataset_name: swa include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_swa \ No newline at end of file +task: afrimmlu_translate_swa diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_twi.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_twi.yaml index 16436ff38a..51a2d26ae0 100644 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_twi.yaml +++ b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_twi.yaml @@ -1,3 +1,3 @@ dataset_name: twi include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_twi \ No newline at end of file +task: afrimmlu_translate_twi diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_wol.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_wol.yaml index 025139a168..006b684782 100644 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_wol.yaml +++ b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_wol.yaml @@ -1,3 +1,3 @@ dataset_name: wol include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_wol \ No newline at end of file +task: afrimmlu_translate_wol diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_xho.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_xho.yaml index 4404ab6152..c0bdf4471b 100644 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_xho.yaml +++ b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_xho.yaml @@ -1,3 +1,3 @@ dataset_name: xho include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_xho \ No newline at end of file +task: afrimmlu_translate_xho diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_yor.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_yor.yaml index 48152ab2b6..0e7ba6005b 100644 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_yor.yaml +++ b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_yor.yaml @@ -1,3 +1,3 @@ dataset_name: yor include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_yor \ No newline at end of file +task: afrimmlu_translate_yor diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_zul.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_zul.yaml index b4fc8dfbec..a18d251cc8 100644 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_zul.yaml +++ b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_zul.yaml @@ -1,3 +1,3 @@ dataset_name: zul include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_zul \ No newline at end of file +task: afrimmlu_translate_zul diff --git a/lm_eval/tasks/afrimmlu/translate/utils.py b/lm_eval/tasks/afrimmlu/translate/utils.py index 6054cf31d4..f53fb68f11 100644 --- a/lm_eval/tasks/afrimmlu/translate/utils.py +++ b/lm_eval/tasks/afrimmlu/translate/utils.py @@ -7,9 +7,9 @@ def doc_to_choice(doc): def doc_to_text(doc): - output = """You are a highly knowledgeable and intelligent artificial intelligence + output = """You are a highly knowledgeable and intelligent artificial intelligence model answers multiple-choice questions about '{subject}' - + Question: '''{question}''' Choices: @@ -17,16 +17,18 @@ def doc_to_text(doc): B: ''{choice2}''' C: ''{choice3}''' D: ''{choice4}''' - + Answer: """ - + choices = eval(doc["choices"]) - text = output.format(subject=doc['subject'], - question=doc['question'], - choice1=choices[0], - choice2=choices[1], - choice3=choices[2], - choice4=choices[3]) + text = output.format( + subject=doc["subject"], + question=doc["question"], + choice1=choices[0], + choice2=choices[1], + choice3=choices[2], + choice4=choices[3], + ) return text @@ -35,4 +37,4 @@ def weighted_f1_score(items): golds = unzipped_list[0] preds = unzipped_list[1] fscore = f1_score(golds, preds, average="weighted") - return fscore \ No newline at end of file + return fscore diff --git a/lm_eval/tasks/afrimmlu/utils.py b/lm_eval/tasks/afrimmlu/utils.py index 6054cf31d4..f53fb68f11 100644 --- a/lm_eval/tasks/afrimmlu/utils.py +++ b/lm_eval/tasks/afrimmlu/utils.py @@ -7,9 +7,9 @@ def doc_to_choice(doc): def doc_to_text(doc): - output = """You are a highly knowledgeable and intelligent artificial intelligence + output = """You are a highly knowledgeable and intelligent artificial intelligence model answers multiple-choice questions about '{subject}' - + Question: '''{question}''' Choices: @@ -17,16 +17,18 @@ def doc_to_text(doc): B: ''{choice2}''' C: ''{choice3}''' D: ''{choice4}''' - + Answer: """ - + choices = eval(doc["choices"]) - text = output.format(subject=doc['subject'], - question=doc['question'], - choice1=choices[0], - choice2=choices[1], - choice3=choices[2], - choice4=choices[3]) + text = output.format( + subject=doc["subject"], + question=doc["question"], + choice1=choices[0], + choice2=choices[1], + choice3=choices[2], + choice4=choices[3], + ) return text @@ -35,4 +37,4 @@ def weighted_f1_score(items): golds = unzipped_list[0] preds = unzipped_list[1] fscore = f1_score(golds, preds, average="weighted") - return fscore \ No newline at end of file + return fscore diff --git a/lm_eval/tasks/afrixnli/README.md b/lm_eval/tasks/afrixnli/README.md index cf8e0ff22b..65b0272bc6 100644 --- a/lm_eval/tasks/afrixnli/README.md +++ b/lm_eval/tasks/afrixnli/README.md @@ -5,8 +5,8 @@ IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models https://arxiv.org/pdf/2406.03368 -IrokoBench is a human-translated benchmark dataset for 16 typologically diverse -low-resource African languages covering three tasks: natural language inference (AfriXNLI), +IrokoBench is a human-translated benchmark dataset for 16 typologically diverse +low-resource African languages covering three tasks: natural language inference (AfriXNLI), mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU). @@ -14,13 +14,13 @@ mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU ``` @misc{adelani2024irokobenchnewbenchmarkafrican, - title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models}, + title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models}, author={David Ifeoluwa Adelani and Jessica Ojo and Israel Abebe Azime and Jian Yun Zhuang and Jesujoba O. Alabi and Xuanli He and Millicent Ochieng and Sara Hooker and Andiswa Bukula and En-Shiun Annie Lee and Chiamaka Chukwuneke and Happy Buzaaba and Blessing Sibanda and Godson Kalipe and Jonathan Mukiibi and Salomon Kabongo and Foutse Yuehgoh and Mmasibidi Setaka and Lolwethu Ndolela and Nkiruka Odu and Rooweither Mabuya and Shamsuddeen Hassan Muhammad and Salomey Osei and Sokhar Samb and Tadesse Kebede Guge and Pontus Stenetorp}, year={2024}, eprint={2406.03368}, archivePrefix={arXiv}, primaryClass={cs.CL}, - url={https://arxiv.org/abs/2406.03368}, + url={https://arxiv.org/abs/2406.03368}, } ``` @@ -30,7 +30,7 @@ mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU * `afrixnli`: All afrixnli tasks * `afrixnli_en_direct`: afrixnli_en_direct evaluates models performance using the anli prompt on the curated dataset -* `afrixnli_native_direct`: afrixnli_native_direct evaluates models performance using the anli prompt translated to the +* `afrixnli_native_direct`: afrixnli_native_direct evaluates models performance using the anli prompt translated to the respective languages on the curated dataset * `afrixnli_translate`: afrixnli_translate evaluates models using the anli prompt in translate-test setting * `afrixnli_manual_direct`: afrixnli_manual_direct evaluates models performance using Lai's prompt on the curated dataset diff --git a/lm_eval/tasks/afrixnli/anli prompt/en-direct/utils.py b/lm_eval/tasks/afrixnli/anli prompt/en-direct/utils.py index c4beaf8ec8..17df7ca963 100644 --- a/lm_eval/tasks/afrixnli/anli prompt/en-direct/utils.py +++ b/lm_eval/tasks/afrixnli/anli prompt/en-direct/utils.py @@ -2,11 +2,7 @@ def doc_to_target(doc): - replacements = { - 0: 'True', - 1: 'Neither', - 2: 'False' - } + replacements = {0: "True", 1: "Neither", 2: "False"} return replacements[doc["label"]] diff --git a/lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_amh.yaml b/lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_amh.yaml index 785ffe997f..94fb2bdcb6 100644 --- a/lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_amh.yaml +++ b/lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_amh.yaml @@ -2,4 +2,3 @@ dataset_name: amh include: afrixnli_translate_yaml task: afrixnli_translate_amh - diff --git a/lm_eval/tasks/afrixnli/anli prompt/translate/utils.py b/lm_eval/tasks/afrixnli/anli prompt/translate/utils.py index c4beaf8ec8..17df7ca963 100644 --- a/lm_eval/tasks/afrixnli/anli prompt/translate/utils.py +++ b/lm_eval/tasks/afrixnli/anli prompt/translate/utils.py @@ -2,11 +2,7 @@ def doc_to_target(doc): - replacements = { - 0: 'True', - 1: 'Neither', - 2: 'False' - } + replacements = {0: "True", 1: "Neither", 2: "False"} return replacements[doc["label"]] diff --git a/lm_eval/tasks/afrixnli/lai prompt/direct/utils.py b/lm_eval/tasks/afrixnli/lai prompt/direct/utils.py index e8e3fb5882..8f472503c6 100644 --- a/lm_eval/tasks/afrixnli/lai prompt/direct/utils.py +++ b/lm_eval/tasks/afrixnli/lai prompt/direct/utils.py @@ -2,25 +2,20 @@ def doc_to_text(doc): - output = """Please identify whether the premise entails or contradicts the hypothesis in the following premise + output = """Please identify whether the premise entails or contradicts the hypothesis in the following premise and hypothesis. The answer should be exact entailment, contradiction, or neutral. - + Premise: {premise} Hypothesis: {hypothesis} - + Is it entailment, contradiction, or neutral?""" - text = output.format(premise=doc['premise'], - hypothesis=doc['hypothesis']) + text = output.format(premise=doc["premise"], hypothesis=doc["hypothesis"]) return text def doc_to_target(doc): - replacements = { - 0: 'entailment', - 1: 'neutral', - 2: 'contradiction' - } + replacements = {0: "entailment", 1: "neutral", 2: "contradiction"} return replacements[doc["label"]] diff --git a/lm_eval/tasks/afrixnli/lai prompt/translate/utils.py b/lm_eval/tasks/afrixnli/lai prompt/translate/utils.py index e8e3fb5882..8f472503c6 100644 --- a/lm_eval/tasks/afrixnli/lai prompt/translate/utils.py +++ b/lm_eval/tasks/afrixnli/lai prompt/translate/utils.py @@ -2,25 +2,20 @@ def doc_to_text(doc): - output = """Please identify whether the premise entails or contradicts the hypothesis in the following premise + output = """Please identify whether the premise entails or contradicts the hypothesis in the following premise and hypothesis. The answer should be exact entailment, contradiction, or neutral. - + Premise: {premise} Hypothesis: {hypothesis} - + Is it entailment, contradiction, or neutral?""" - text = output.format(premise=doc['premise'], - hypothesis=doc['hypothesis']) + text = output.format(premise=doc["premise"], hypothesis=doc["hypothesis"]) return text def doc_to_target(doc): - replacements = { - 0: 'entailment', - 1: 'neutral', - 2: 'contradiction' - } + replacements = {0: "entailment", 1: "neutral", 2: "contradiction"} return replacements[doc["label"]] diff --git a/lm_eval/tasks/afrixnli/utils.py b/lm_eval/tasks/afrixnli/utils.py index 088f8fe66a..905a72b001 100644 --- a/lm_eval/tasks/afrixnli/utils.py +++ b/lm_eval/tasks/afrixnli/utils.py @@ -1,6 +1,7 @@ -import yaml import argparse +import yaml + class FunctionTag: def __init__(self, value): @@ -12,110 +13,110 @@ def __init__(self, value): "QUESTION_WORD": "ትክክል", "ENTAILMENT_LABEL": "አዎ", "NEUTRAL_LABEL": "እንዲሁም", - "CONTRADICTION_LABEL": "አይ" + "CONTRADICTION_LABEL": "አይ", }, "eng": { "QUESTION_WORD": "Right", "ENTAILMENT_LABEL": "Yes", "NEUTRAL_LABEL": "Also", - "CONTRADICTION_LABEL": "No" + "CONTRADICTION_LABEL": "No", }, "ewe": { "QUESTION_WORD": "Esɔ gbe", "ENTAILMENT_LABEL": "Ɛ̃", "NEUTRAL_LABEL": "Hã", - "CONTRADICTION_LABEL": "Ao" + "CONTRADICTION_LABEL": "Ao", }, "fra": { "QUESTION_WORD": "correct", "ENTAILMENT_LABEL": "Oui", "NEUTRAL_LABEL": "Aussi", - "CONTRADICTION_LABEL": "Non" + "CONTRADICTION_LABEL": "Non", }, "hau": { "QUESTION_WORD": "Daidai", "ENTAILMENT_LABEL": "Ee", "NEUTRAL_LABEL": "Haka kuma", - "CONTRADICTION_LABEL": "A'a" + "CONTRADICTION_LABEL": "A'a", }, "ibo": { "QUESTION_WORD": "Ziri ezi", "ENTAILMENT_LABEL": "Éè", "NEUTRAL_LABEL": "Ọzọkwa", - "CONTRADICTION_LABEL": "Mba" + "CONTRADICTION_LABEL": "Mba", }, "kin": { "QUESTION_WORD": "Nibyo", "ENTAILMENT_LABEL": "Yego", "NEUTRAL_LABEL": "Na none", - "CONTRADICTION_LABEL": "Oya" + "CONTRADICTION_LABEL": "Oya", }, "lin": { "QUESTION_WORD": "Malamu", "ENTAILMENT_LABEL": "Iyo", "NEUTRAL_LABEL": "Lisusu", - "CONTRADICTION_LABEL": "Te" + "CONTRADICTION_LABEL": "Te", }, "lug": { "QUESTION_WORD": "Kituufu", "ENTAILMENT_LABEL": "Yee", "NEUTRAL_LABEL": "N’ekirala", - "CONTRADICTION_LABEL": "Nedda" + "CONTRADICTION_LABEL": "Nedda", }, "orm": { "QUESTION_WORD": "Sirrii", "ENTAILMENT_LABEL": "Eeyyee", "NEUTRAL_LABEL": "Akkasumas", - "CONTRADICTION_LABEL": "Lakki" + "CONTRADICTION_LABEL": "Lakki", }, "sna": { "QUESTION_WORD": "Chokwadi", "ENTAILMENT_LABEL": "Hongu", "NEUTRAL_LABEL": "Uye", - "CONTRADICTION_LABEL": "Kwete" + "CONTRADICTION_LABEL": "Kwete", }, "sot": { "QUESTION_WORD": "Nepile", "ENTAILMENT_LABEL": "E", "NEUTRAL_LABEL": "Hape", - "CONTRADICTION_LABEL": "Tjhe" + "CONTRADICTION_LABEL": "Tjhe", }, "swa": { "QUESTION_WORD": "Sahihi", "ENTAILMENT_LABEL": "Ndiyo", "NEUTRAL_LABEL": "Pia", - "CONTRADICTION_LABEL": "Hapana" + "CONTRADICTION_LABEL": "Hapana", }, "twi": { "QUESTION_WORD": "Nifa", "ENTAILMENT_LABEL": "Aane", "NEUTRAL_LABEL": "Anaasɛ", - "CONTRADICTION_LABEL": "Daabi" + "CONTRADICTION_LABEL": "Daabi", }, "wol": { "QUESTION_WORD": "Dëgg", "ENTAILMENT_LABEL": "Waaw", "NEUTRAL_LABEL": "Itam", - "CONTRADICTION_LABEL": "Déet" + "CONTRADICTION_LABEL": "Déet", }, "xho": { "QUESTION_WORD": "Ichanekile", "ENTAILMENT_LABEL": "Ewe", "NEUTRAL_LABEL": "Kananjalo", - "CONTRADICTION_LABEL": "Hayi" + "CONTRADICTION_LABEL": "Hayi", }, "yor": { "QUESTION_WORD": "Òótọ́", "ENTAILMENT_LABEL": "Bẹ́ẹ̀ni", "NEUTRAL_LABEL": "Àti pé", - "CONTRADICTION_LABEL": "Rárá" + "CONTRADICTION_LABEL": "Rárá", }, "zul": { "QUESTION_WORD": "Kulungile", "ENTAILMENT_LABEL": "Yebo", "NEUTRAL_LABEL": "Futhi", - "CONTRADICTION_LABEL": "Cha" - } + "CONTRADICTION_LABEL": "Cha", + }, } @@ -127,8 +128,26 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: :param overwrite: Whether to overwrite files if they already exist. """ err = [] - languages = ['eng', 'amh', 'ibo', 'fra', 'sna', 'wol', 'ewe', 'lin', 'lug', 'xho', 'kin', 'twi', 'zul', 'orm', - 'yor', 'hau', 'sot', 'swa'] + languages = [ + "eng", + "amh", + "ibo", + "fra", + "sna", + "wol", + "ewe", + "lin", + "lug", + "xho", + "kin", + "twi", + "zul", + "orm", + "yor", + "hau", + "sot", + "swa", + ] for lang in languages: try: if mode == "native-direct": @@ -141,7 +160,9 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: task_name = f"afrixnli_native_direct_{lang}" yaml_template = "afrixnli_native_direct_yaml" with open( - f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8" + f"{output_dir}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", ) as f: f.write("# Generated by utils.py\n") yaml.dump( @@ -150,10 +171,10 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: "task": task_name, "dataset_name": lang, "doc_to_choice": f"{{{{[" - f"""premise+\", {QUESTION_WORD}? {ENTAILMENT_LABEL}, \"+hypothesis,""" - f"""premise+\", {QUESTION_WORD}? {NEUTRAL_LABEL}, \"+hypothesis,""" - f"""premise+\", {QUESTION_WORD}? {CONTRADICTION_LABEL}, \"+hypothesis""" - f"]}}}}", + f"""premise+\", {QUESTION_WORD}? {ENTAILMENT_LABEL}, \"+hypothesis,""" + f"""premise+\", {QUESTION_WORD}? {NEUTRAL_LABEL}, \"+hypothesis,""" + f"""premise+\", {QUESTION_WORD}? {CONTRADICTION_LABEL}, \"+hypothesis""" + f"]}}}}", }, f, allow_unicode=True, @@ -163,14 +184,16 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: task_name = f"afrixnli_{mode}_{lang}" yaml_template = f"afrixnli_{mode}_yaml" with open( - f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8" + f"{output_dir}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", ) as f: f.write("# Generated by utils.py\n") yaml.dump( { "include": yaml_template, "task": task_name, - "dataset_name": lang + "dataset_name": lang, }, f, allow_unicode=True, @@ -195,7 +218,9 @@ def main() -> None: help="Overwrite files if they already exist", ) parser.add_argument( - "--output-dir", default="./manual/translate", help="Directory to write yaml files to" + "--output-dir", + default="./manual/translate", + help="Directory to write yaml files to", ) parser.add_argument( "--mode", diff --git a/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_atc.yaml b/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_atc.yaml index b457ae582f..2e34ad36c2 100644 --- a/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_atc.yaml +++ b/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_atc.yaml @@ -3,4 +3,4 @@ task: - med_concepts_qa_atc_tasks aggregate_metric_list: - metric: acc - aggregation: mean \ No newline at end of file + aggregation: mean diff --git a/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd10proc.yaml b/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd10proc.yaml index 777a5ce6ee..407ea4088d 100644 --- a/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd10proc.yaml +++ b/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd10proc.yaml @@ -3,4 +3,4 @@ task: - med_concepts_qa_icd10proc_tasks aggregate_metric_list: - metric: acc - aggregation: mean \ No newline at end of file + aggregation: mean diff --git a/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd9cm.yaml b/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd9cm.yaml index d5671bf035..b12ea811ff 100644 --- a/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd9cm.yaml +++ b/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd9cm.yaml @@ -3,4 +3,4 @@ task: - med_concepts_qa_icd9cm_tasks aggregate_metric_list: - metric: acc - aggregation: mean \ No newline at end of file + aggregation: mean diff --git a/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd9proc.yaml b/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd9proc.yaml index 4487772fa1..94fc034eb2 100644 --- a/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd9proc.yaml +++ b/lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd9proc.yaml @@ -3,4 +3,4 @@ task: - med_concepts_qa_icd9proc_tasks aggregate_metric_list: - metric: acc - aggregation: mean \ No newline at end of file + aggregation: mean