formatting (EleutherAI#2104)

OpenLLM-France · Jul 15, 2024 · 56a4e79 · 56a4e79
1 parent 9884ad6
commit 56a4e79
Show file tree

Hide file tree

Showing 58 changed files with 234 additions and 209 deletions.
diff --git a/lm_eval/api/metrics.py b/lm_eval/api/metrics.py
@@ -565,4 +565,4 @@ def aggregate_subtask_metrics(metrics, sizes, weight_by_size=True):
 
     assert len(metrics) == len(sizes)
 
-    return sum([metric * size for metric, size in zip(metrics, sizes)]) / sum(sizes)
+    return sum([metric * size for metric, size in zip(metrics, sizes)]) / sum(sizes)
diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py
@@ -1665,4 +1665,4 @@ def count_bytes(cls, doc) -> int:
     @classmethod
     def count_words(cls, doc) -> int:
         """Downstream tasks with custom word boundaries should override this!"""
-        return len(re.split(r"\s+", doc))
+        return len(re.split(r"\s+", doc))
diff --git a/lm_eval/filters/extraction.py b/lm_eval/filters/extraction.py
@@ -181,4 +181,4 @@ def filter_ignores(st):
                 filtered.append(match)
             filtered_resps.append(filtered)
 
-        return filtered_resps
+        return filtered_resps
diff --git a/lm_eval/tasks/afrimgsm/README.md b/lm_eval/tasks/afrimgsm/README.md
@@ -5,22 +5,22 @@
 IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models
 https://arxiv.org/pdf/2406.03368
 
-IrokoBench is a human-translated benchmark dataset for 16 typologically diverse 
-low-resource African languages covering three tasks: natural language inference (AfriXNLI), 
+IrokoBench is a human-translated benchmark dataset for 16 typologically diverse
+low-resource African languages covering three tasks: natural language inference (AfriXNLI),
 mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU).
 
 
 ### Citation
 
 ```
 @misc{adelani2024irokobenchnewbenchmarkafrican,
-      title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models}, 
+      title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models},
       author={David Ifeoluwa Adelani and Jessica Ojo and Israel Abebe Azime and Jian Yun Zhuang and Jesujoba O. Alabi and Xuanli He and Millicent Ochieng and Sara Hooker and Andiswa Bukula and En-Shiun Annie Lee and Chiamaka Chukwuneke and Happy Buzaaba and Blessing Sibanda and Godson Kalipe and Jonathan Mukiibi and Salomon Kabongo and Foutse Yuehgoh and Mmasibidi Setaka and Lolwethu Ndolela and Nkiruka Odu and Rooweither Mabuya and Shamsuddeen Hassan Muhammad and Salomey Osei and Sokhar Samb and Tadesse Kebede Guge and Pontus Stenetorp},
       year={2024},
       eprint={2406.03368},
       archivePrefix={arXiv},
       primaryClass={cs.CL},
-      url={https://arxiv.org/abs/2406.03368}, 
+      url={https://arxiv.org/abs/2406.03368},
 }
 ```
 

diff --git a/lm_eval/tasks/afrimgsm/run.sh b/lm_eval/tasks/afrimgsm/run.sh
@@ -3,4 +3,4 @@ lm_eval --model hf   \
         --device cuda:0     \
         --batch_size 1  \
         --verbosity DEBUG \
-        --limit 5
+        --limit 5
diff --git a/lm_eval/tasks/afrimgsm/utils.py b/lm_eval/tasks/afrimgsm/utils.py
@@ -2,51 +2,74 @@
 
 import yaml
 
-languages = ['eng', 'amh', 'ibo', 'fra', 'sna', 'lin', 'wol', 'ewe', 'lug', 'xho', 'kin', 'twi', 'zul', 'orm', 'yor',
-             'hau', 'sot', 'swa']
-
-languages_REGEX = {"eng": "The answer is (\\-?[0-9\\.\\,]+)",
-                   "amh": "መልሱ (\\-?[0-9\\.\\,]+)",
-                   "ibo": "Azịza ya bụ (\\-?[0-9\\.\\,]+)",
-                   'fra': "La réponse est(\\-?[0-9\\.\\,]+)",
-                   'sna': "Mhinduro kumubvunzo ndi (\\-?[0-9\\.\\,]+)",
-                   'lin': "Eyano ezali (\\-?[0-9\\.\\,]+)",
-                   'wol': "Tontu li (\\-?[0-9\\.\\,]+)",
-                   'ewe': "ŋuɖoɖoae nye (\\-?[0-9\\.\\,]+)",
-                   'lug': "Ansa eri (\\-?[0-9\\.\\,]+)",
-                   'xho': "Impendulo ngu (\\-?[0-9\\.\\,]+)",
-                   'kin': "Igisubizo ni (\\-?[0-9\\.\\,]+)",
-                   'twi': "Ne nnyiano yɛ (\\-?[0-9\\.\\,]+)",
-                   'zul': "Impendulo ithi (\\-?[0-9\\.\\,]+)",
-                   'orm': "Deebiin isaa (\\-?[0-9\\.\\,]+)",
-                   'yor': "Ìdáhùn náà ni (\\-?[0-9\\.\\,]+)",
-                   'hau': "Amsar ita ce (\\-?[0-9\\.\\,]+)",
-                   'sot': "Karabo ke (\\-?[0-9\\.\\,]+)",
-                   'swa': "Jibu ni (\\-?[0-9\\.\\,]+)",
-                   }
+
+languages = [
+    "eng",
+    "amh",
+    "ibo",
+    "fra",
+    "sna",
+    "lin",
+    "wol",
+    "ewe",
+    "lug",
+    "xho",
+    "kin",
+    "twi",
+    "zul",
+    "orm",
+    "yor",
+    "hau",
+    "sot",
+    "swa",
+]
+
+languages_REGEX = {
+    "eng": "The answer is (\\-?[0-9\\.\\,]+)",
+    "amh": "መልሱ (\\-?[0-9\\.\\,]+)",
+    "ibo": "Azịza ya bụ (\\-?[0-9\\.\\,]+)",
+    "fra": "La réponse est(\\-?[0-9\\.\\,]+)",
+    "sna": "Mhinduro kumubvunzo ndi (\\-?[0-9\\.\\,]+)",
+    "lin": "Eyano ezali (\\-?[0-9\\.\\,]+)",
+    "wol": "Tontu li (\\-?[0-9\\.\\,]+)",
+    "ewe": "ŋuɖoɖoae nye (\\-?[0-9\\.\\,]+)",
+    "lug": "Ansa eri (\\-?[0-9\\.\\,]+)",
+    "xho": "Impendulo ngu (\\-?[0-9\\.\\,]+)",
+    "kin": "Igisubizo ni (\\-?[0-9\\.\\,]+)",
+    "twi": "Ne nnyiano yɛ (\\-?[0-9\\.\\,]+)",
+    "zul": "Impendulo ithi (\\-?[0-9\\.\\,]+)",
+    "orm": "Deebiin isaa (\\-?[0-9\\.\\,]+)",
+    "yor": "Ìdáhùn náà ni (\\-?[0-9\\.\\,]+)",
+    "hau": "Amsar ita ce (\\-?[0-9\\.\\,]+)",
+    "sot": "Karabo ke (\\-?[0-9\\.\\,]+)",
+    "swa": "Jibu ni (\\-?[0-9\\.\\,]+)",
+}
 
 LANGUAGES = {}
 
 for lang in languages:
-    if lang == 'amh':
+    if lang == "amh":
         LANGUAGES[lang] = {  # Amharic
             "QUESTION": "ጥያቄ:",
             "ANSWER": "በቅደም ተከተል መልስ:",
             "DIRECT": "Answer:",
-            "REGEX": languages_REGEX[lang]}
-    elif lang == 'yor':
+            "REGEX": languages_REGEX[lang],
+        }
+    elif lang == "yor":
         LANGUAGES[lang] = {  # Yoruba
             "QUESTION": "Ìbéèrè:",
             "ANSWER": "Ìdáhùn lẹ́sẹsẹ:",
             "DIRECT": "Answer:",
-            "REGEX": languages_REGEX[lang]}
+            "REGEX": languages_REGEX[lang],
+        }
 
     else:
         LANGUAGES[lang] = {  # English
             "QUESTION": "Question:",
             "ANSWER": "Step-by-Step Answer:",
             "DIRECT": "Answer:",
-            "REGEX": languages_REGEX[lang]}
+            "REGEX": languages_REGEX[lang],
+        }
 
 
 def add_regex_pattern(regex_pattern):
@@ -93,13 +116,12 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
     err = []
     for lang in LANGUAGES.keys():
         try:
-
             yaml_template = "cot_yaml"
             filter_list = {}
             DELIMITER = None
             if mode == "direct":
-                ANSWER = LANGUAGES['eng']["DIRECT"]
-                QUESTION = LANGUAGES['eng']["QUESTION"]
+                ANSWER = LANGUAGES["eng"]["DIRECT"]
+                QUESTION = LANGUAGES["eng"]["QUESTION"]
                 REGEX = None
                 task_name = f"afrimgsm_direct_{lang}"
                 yaml_template = "direct_yaml"
@@ -122,16 +144,16 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
                 QUESTION = LANGUAGES["eng"]["QUESTION"]
                 task_name = f"afrimgsm_en_cot_{lang}"
             elif mode == "translate-direct":
-                ANSWER = LANGUAGES['eng']["DIRECT"]
-                QUESTION = LANGUAGES['eng']["QUESTION"]
+                ANSWER = LANGUAGES["eng"]["DIRECT"]
+                QUESTION = LANGUAGES["eng"]["QUESTION"]
                 REGEX = None
                 task_name = f"afrimgsm_translate_direct_{lang}"
                 yaml_template = "translate_direct_yaml"
 
             file_name = f"{task_name}.yaml"
             ANSWER_TO_SKIP = len(LANGUAGES[lang]["ANSWER"]) + 1
             with open(
-                    f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
+                f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
             ) as f:
                 f.write("# Generated by utils.py\n")
                 yaml.dump(
@@ -140,15 +162,15 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
                         "dataset_name": lang,
                         "task": f"{task_name}",
                         "doc_to_text": f"""{{% if answer is not none %}}"""
-                                       f"""{{{{question+"\\n{ANSWER}"}}}}"""
-                                       f"""{{% else %}}"""
-                                       f"""{{{{"{QUESTION} "+question+"\\n{ANSWER}"}}}}"""
-                                       f"""{{% endif %}}""",
+                        f"""{{{{question+"\\n{ANSWER}"}}}}"""
+                        f"""{{% else %}}"""
+                        f"""{{{{"{QUESTION} "+question+"\\n{ANSWER}"}}}}"""
+                        f"""{{% endif %}}""",
                         "doc_to_target": f"""{{% if answer is not none %}}"""
-                                         f"""{{{{answer[{ANSWER_TO_SKIP}:]}}}}"""
-                                         f"""{{% else %}}"""
-                                         f"""{{{{answer_number|string}}}}"""
-                                         f"""{{% endif %}}""",
+                        f"""{{{{answer[{ANSWER_TO_SKIP}:]}}}}"""
+                        f"""{{% else %}}"""
+                        f"""{{{{answer_number|string}}}}"""
+                        f"""{{% endif %}}""",
                         **filter_list,
                         "generation_kwargs": {
                             "until": [QUESTION, "</s>", "<|im_end|>"],
@@ -194,4 +216,4 @@ def main() -> None:
 
 
 if __name__ == "__main__":
-    main()
+    main()
diff --git a/lm_eval/tasks/afrimmlu/README.md b/lm_eval/tasks/afrimmlu/README.md
@@ -5,22 +5,22 @@
 IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models
 https://arxiv.org/pdf/2406.03368
 
-IrokoBench is a human-translated benchmark dataset for 16 typologically diverse 
-low-resource African languages covering three tasks: natural language inference (AfriXNLI), 
+IrokoBench is a human-translated benchmark dataset for 16 typologically diverse
+low-resource African languages covering three tasks: natural language inference (AfriXNLI),
 mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU).
 
 
 ### Citation
 
 ```
 @misc{adelani2024irokobenchnewbenchmarkafrican,
-      title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models}, 
+      title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models},
       author={David Ifeoluwa Adelani and Jessica Ojo and Israel Abebe Azime and Jian Yun Zhuang and Jesujoba O. Alabi and Xuanli He and Millicent Ochieng and Sara Hooker and Andiswa Bukula and En-Shiun Annie Lee and Chiamaka Chukwuneke and Happy Buzaaba and Blessing Sibanda and Godson Kalipe and Jonathan Mukiibi and Salomon Kabongo and Foutse Yuehgoh and Mmasibidi Setaka and Lolwethu Ndolela and Nkiruka Odu and Rooweither Mabuya and Shamsuddeen Hassan Muhammad and Salomey Osei and Sokhar Samb and Tadesse Kebede Guge and Pontus Stenetorp},
       year={2024},
       eprint={2406.03368},
       archivePrefix={arXiv},
       primaryClass={cs.CL},
-      url={https://arxiv.org/abs/2406.03368}, 
+      url={https://arxiv.org/abs/2406.03368},
 }
 ```
 

diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_common_yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_common_yaml
@@ -9,18 +9,18 @@ output_type: multiple_choice
 validation_split: validation
 test_split: test
 fewshot_split: validation
-doc_to_text: !function utils.doc_to_text 
+doc_to_text: !function utils.doc_to_text
 doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}"
 doc_to_choice: !function utils.doc_to_choice
 should_decontaminate: true
 doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
 metric_list:
-  - metric: f1 
-    aggregation: !function utils.weighted_f1_score 
+  - metric: f1
+    aggregation: !function utils.weighted_f1_score
     # aggregation: mean
-    average: weighted 
-    hf_evaluate: true 
-    higher_is_better: True 
+    average: weighted
+    hf_evaluate: true
+    higher_is_better: True
     ignore_case: true
     ignore_punctuation: true
     regexes_to_ignore:

diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml
@@ -1,4 +1,3 @@
 dataset_name: eng
 include: afrimmlu_common_yaml
 task: afrimmlu_direct_eng
-
diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml
@@ -1,4 +1,3 @@
 dataset_name: ewe
 include: afrimmlu_common_yaml
 task: afrimmlu_direct_ewe
-
diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_fra.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_fra.yaml
@@ -1,3 +1,3 @@
 dataset_name: fra
 include: afrimmlu_common_yaml
-task: afrimmlu_direct_fra
+task: afrimmlu_direct_fra
diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_hau.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_hau.yaml
@@ -1,3 +1,3 @@
 dataset_name: hau
 include: afrimmlu_common_yaml
-task: afrimmlu_direct_hau
+task: afrimmlu_direct_hau
diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml
@@ -1,3 +1,3 @@
 dataset_name: ibo
 include: afrimmlu_common_yaml
-task: afrimmlu_direct_ibo
+task: afrimmlu_direct_ibo
diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml
@@ -1,3 +1,3 @@
 dataset_name: kin
 include: afrimmlu_common_yaml
-task: afrimmlu_direct_kin
+task: afrimmlu_direct_kin
diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lin.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lin.yaml
@@ -1,3 +1,3 @@
 dataset_name: lin
 include: afrimmlu_common_yaml
-task: afrimmlu_direct_lin
+task: afrimmlu_direct_lin
diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lug.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lug.yaml
@@ -1,3 +1,3 @@
 dataset_name: lug
 include: afrimmlu_common_yaml
-task: afrimmlu_direct_lug
+task: afrimmlu_direct_lug
diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_orm.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_orm.yaml
@@ -1,3 +1,3 @@
 dataset_name: orm
 include: afrimmlu_common_yaml
-task: afrimmlu_direct_orm
+task: afrimmlu_direct_orm
diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sna.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sna.yaml
@@ -1,3 +1,3 @@
 dataset_name: sna
 include: afrimmlu_common_yaml
-task: afrimmlu_direct_sna
+task: afrimmlu_direct_sna
diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sot.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sot.yaml
@@ -1,3 +1,3 @@
 dataset_name: sot
 include: afrimmlu_common_yaml
-task: afrimmlu_direct_sot
+task: afrimmlu_direct_sot
diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_swa.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_swa.yaml
@@ -1,3 +1,3 @@
 dataset_name: swa
 include: afrimmlu_common_yaml
-task: afrimmlu_direct_swa
+task: afrimmlu_direct_swa
diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_twi.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_twi.yaml
@@ -1,3 +1,3 @@
 dataset_name: twi
 include: afrimmlu_common_yaml
-task: afrimmlu_direct_twi
+task: afrimmlu_direct_twi
diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_wol.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_wol.yaml
@@ -1,3 +1,3 @@
 dataset_name: wol
 include: afrimmlu_common_yaml
-task: afrimmlu_direct_wol
+task: afrimmlu_direct_wol
diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_xho.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_xho.yaml
@@ -1,3 +1,3 @@
 dataset_name: xho
 include: afrimmlu_common_yaml
-task: afrimmlu_direct_xho
+task: afrimmlu_direct_xho
diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_yor.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_yor.yaml
@@ -1,3 +1,3 @@
 dataset_name: yor
 include: afrimmlu_common_yaml
-task: afrimmlu_direct_yor
+task: afrimmlu_direct_yor
diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_zul.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_zul.yaml
@@ -1,3 +1,3 @@
 dataset_name: zul
 include: afrimmlu_common_yaml
-task: afrimmlu_direct_zul
+task: afrimmlu_direct_zul
Original file line number	Diff line number	Diff line change
Expand Up		@@ -565,4 +565,4 @@ def aggregate_subtask_metrics(metrics, sizes, weight_by_size=True):

		assert len(metrics) == len(sizes)

		return sum([metric * size for metric, size in zip(metrics, sizes)]) / sum(sizes)
		return sum([metric * size for metric, size in zip(metrics, sizes)]) / sum(sizes)