From c2c8e2384166edcaf5f46084311334981234422c Mon Sep 17 00:00:00 2001 From: Julen Etxaniz Date: Mon, 1 Apr 2024 20:37:26 +0200 Subject: [PATCH] Add Latxa paper evaluation tasks for Basque (#1654) * add basqueglue * add eus_exams * add eus_proficiency * add eus_reading * add eus_trivia * run pre-commit --- lm_eval/tasks/basqueglue/README.md | 72 +++++++++++++++++ lm_eval/tasks/basqueglue/bec.yaml | 16 ++++ lm_eval/tasks/basqueglue/bhtc.yaml | 16 ++++ lm_eval/tasks/basqueglue/coref.yaml | 16 ++++ lm_eval/tasks/basqueglue/qnli.yaml | 16 ++++ lm_eval/tasks/basqueglue/utils.py | 78 +++++++++++++++++++ lm_eval/tasks/basqueglue/vaxx.yaml | 16 ++++ lm_eval/tasks/basqueglue/wic.yaml | 17 ++++ lm_eval/tasks/eus_exams/README.md | 49 ++++++++++++ lm_eval/tasks/eus_exams/configs.py | 67 ++++++++++++++++ lm_eval/tasks/eus_exams/eus_exams | 18 +++++ lm_eval/tasks/eus_exams/eus_exams_es | 4 + .../eus_exams_es_ejadministrativo.yaml | 4 + .../eus_exams/eus_exams_es_ejauxiliar.yaml | 4 + .../eus_exams/eus_exams_es_ejsubalterno.yaml | 4 + .../eus_exams/eus_exams_es_ejtecnico.yaml | 4 + .../eus_exams_es_opeayuntamientovitoria.yaml | 4 + .../eus_exams/eus_exams_es_opebilbao.yaml | 4 + .../eus_exams/eus_exams_es_opeehuadmin.yaml | 4 + .../eus_exams/eus_exams_es_opeehuaux.yaml | 4 + .../eus_exams/eus_exams_es_opeehubiblio.yaml | 4 + .../eus_exams/eus_exams_es_opeehuderecho.yaml | 4 + .../eus_exams_es_opeehueconomicas.yaml | 4 + .../eus_exams_es_opeehuempresariales.yaml | 4 + .../eus_exams_es_opeehusubalterno.yaml | 4 + .../eus_exams/eus_exams_es_opeehutecnico.yaml | 4 + .../eus_exams_es_opeehutecnicob.yaml | 4 + .../eus_exams/eus_exams_es_opeosakiadmin.yaml | 4 + .../eus_exams/eus_exams_es_opeosakiaux.yaml | 4 + .../eus_exams_es_opeosakiauxenf.yaml | 4 + .../eus_exams_es_opeosakicelador.yaml | 4 + .../eus_exams/eus_exams_es_opeosakienf.yaml | 4 + .../eus_exams_es_opeosakijuridico.yaml | 4 + .../eus_exams_es_opeosakioperario.yaml | 4 + .../eus_exams_es_opeosakitecnico.yaml | 4 + 
.../eus_exams_es_opeosakivarios.yaml | 4 + .../eus_exams/eus_exams_es_osakidetza1c.yaml | 4 + .../eus_exams/eus_exams_es_osakidetza2c.yaml | 4 + .../eus_exams/eus_exams_es_osakidetza3c.yaml | 4 + .../eus_exams/eus_exams_es_osakidetza4c.yaml | 4 + .../eus_exams/eus_exams_es_osakidetza5c.yaml | 4 + .../eus_exams/eus_exams_es_osakidetza6c.yaml | 4 + .../eus_exams/eus_exams_es_osakidetza7c.yaml | 4 + .../eus_exams/eus_exams_es_osakidetza8c.yaml | 4 + .../eus_exams/eus_exams_es_osakidetza9c.yaml | 4 + lm_eval/tasks/eus_exams/eus_exams_eu | 4 + .../eus_exams_eu_ejadministrari.yaml | 4 + .../eus_exams/eus_exams_eu_ejlaguntza.yaml | 4 + .../eus_exams/eus_exams_eu_ejlaguntzaile.yaml | 4 + .../eus_exams/eus_exams_eu_ejteknikari.yaml | 4 + .../eus_exams/eus_exams_eu_opebilbaoeu.yaml | 4 + .../eus_exams/eus_exams_eu_opeehuadmineu.yaml | 4 + .../eus_exams/eus_exams_eu_opeehuauxeu.yaml | 4 + .../eus_exams_eu_opeehubiblioeu.yaml | 4 + .../eus_exams_eu_opeehuderechoeu.yaml | 4 + .../eus_exams_eu_opeehueconomicaseu.yaml | 4 + .../eus_exams_eu_opeehuempresarialeseu.yaml | 4 + .../eus_exams_eu_opeehusubalternoeu.yaml | 4 + .../eus_exams_eu_opeehutecnicoeu.yaml | 4 + .../eus_exams_eu_opeehuteknikarib.yaml | 4 + .../eus_exams_eu_opegasteizkoudala.yaml | 4 + .../eus_exams_eu_opeosakiadmineu.yaml | 4 + .../eus_exams_eu_opeosakiauxenfeu.yaml | 4 + .../eus_exams/eus_exams_eu_opeosakiauxeu.yaml | 4 + .../eus_exams_eu_opeosakiceladoreu.yaml | 4 + .../eus_exams/eus_exams_eu_opeosakienfeu.yaml | 4 + .../eus_exams_eu_opeosakioperarioeu.yaml | 4 + .../eus_exams_eu_opeosakitecnicoeu.yaml | 4 + .../eus_exams_eu_opeosakivarioseu.yaml | 4 + .../eus_exams/eus_exams_eu_osakidetza1e.yaml | 4 + .../eus_exams/eus_exams_eu_osakidetza2e.yaml | 4 + .../eus_exams/eus_exams_eu_osakidetza3e.yaml | 4 + .../eus_exams/eus_exams_eu_osakidetza5e.yaml | 4 + .../eus_exams/eus_exams_eu_osakidetza6e.yaml | 4 + .../eus_exams/eus_exams_eu_osakidetza7e.yaml | 4 + lm_eval/tasks/eus_exams/utils.py | 15 ++++ 
lm_eval/tasks/eus_proficiency/README.md | 48 ++++++++++++ .../eus_proficiency/eus_proficiency.yaml | 16 ++++ lm_eval/tasks/eus_reading/README.md | 48 ++++++++++++ lm_eval/tasks/eus_reading/eus_reading.yaml | 16 ++++ lm_eval/tasks/eus_reading/utils.py | 41 ++++++++++ lm_eval/tasks/eus_trivia/README.md | 54 +++++++++++++ lm_eval/tasks/eus_trivia/eus_trivia.yaml | 16 ++++ lm_eval/tasks/eus_trivia/utils.py | 41 ++++++++++ lm_eval/tasks/super_glue/record/util.py | 1 + 85 files changed, 933 insertions(+) create mode 100644 lm_eval/tasks/basqueglue/README.md create mode 100644 lm_eval/tasks/basqueglue/bec.yaml create mode 100644 lm_eval/tasks/basqueglue/bhtc.yaml create mode 100644 lm_eval/tasks/basqueglue/coref.yaml create mode 100644 lm_eval/tasks/basqueglue/qnli.yaml create mode 100644 lm_eval/tasks/basqueglue/utils.py create mode 100644 lm_eval/tasks/basqueglue/vaxx.yaml create mode 100644 lm_eval/tasks/basqueglue/wic.yaml create mode 100644 lm_eval/tasks/eus_exams/README.md create mode 100644 lm_eval/tasks/eus_exams/configs.py create mode 100644 lm_eval/tasks/eus_exams/eus_exams create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_ejadministrativo.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_ejauxiliar.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_ejsubalterno.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_ejtecnico.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_opeayuntamientovitoria.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_opebilbao.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_opeehuadmin.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_opeehuaux.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_opeehubiblio.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_opeehuderecho.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_opeehueconomicas.yaml create mode 100644 
lm_eval/tasks/eus_exams/eus_exams_es_opeehuempresariales.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_opeehusubalterno.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_opeehutecnico.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_opeehutecnicob.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_opeosakiadmin.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_opeosakiaux.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_opeosakiauxenf.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_opeosakicelador.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_opeosakienf.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_opeosakijuridico.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_opeosakioperario.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_opeosakitecnico.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_opeosakivarios.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_osakidetza1c.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_osakidetza2c.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_osakidetza3c.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_osakidetza4c.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_osakidetza5c.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_osakidetza6c.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_osakidetza7c.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_osakidetza8c.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_es_osakidetza9c.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_ejadministrari.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_ejlaguntza.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_ejlaguntzaile.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_ejteknikari.yaml create 
mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_opebilbaoeu.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_opeehuadmineu.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_opeehuauxeu.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_opeehubiblioeu.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_opeehuderechoeu.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_opeehueconomicaseu.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_opeehuempresarialeseu.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_opeehusubalternoeu.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_opeehutecnicoeu.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_opeehuteknikarib.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_opegasteizkoudala.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiadmineu.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiauxenfeu.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiauxeu.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiceladoreu.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_opeosakienfeu.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_opeosakioperarioeu.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_opeosakitecnicoeu.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_opeosakivarioseu.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza1e.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza2e.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza3e.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza5e.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza6e.yaml create mode 100644 lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza7e.yaml create mode 100644 lm_eval/tasks/eus_exams/utils.py create mode 100644 
lm_eval/tasks/eus_proficiency/README.md create mode 100644 lm_eval/tasks/eus_proficiency/eus_proficiency.yaml create mode 100644 lm_eval/tasks/eus_reading/README.md create mode 100644 lm_eval/tasks/eus_reading/eus_reading.yaml create mode 100644 lm_eval/tasks/eus_reading/utils.py create mode 100644 lm_eval/tasks/eus_trivia/README.md create mode 100644 lm_eval/tasks/eus_trivia/eus_trivia.yaml create mode 100644 lm_eval/tasks/eus_trivia/utils.py diff --git a/lm_eval/tasks/basqueglue/README.md b/lm_eval/tasks/basqueglue/README.md new file mode 100644 index 0000000000..04583b1dad --- /dev/null +++ b/lm_eval/tasks/basqueglue/README.md @@ -0,0 +1,72 @@ +# BasqueGLUE + +### Paper + +Title: `BasqueGLUE: A Natural Language Understanding Benchmark for Basque` + +Abstract: `https://aclanthology.org/2022.lrec-1.172/` + +Natural Language Understanding (NLU) technology has improved significantly over the last few years and multitask benchmarks such as GLUE are key to evaluate this improvement in a robust and general way. These benchmarks take into account a wide and diverse set of NLU tasks that require some form of language understanding, beyond the detection of superficial, textual clues. However, they are costly to develop and language-dependent, and therefore they are only available for a small number of languages. In this paper, we present BasqueGLUE, the first NLU benchmark for Basque, a less-resourced language, which has been elaborated from previously existing datasets and following similar criteria to those used for the construction of GLUE and SuperGLUE. We also report the evaluation of two state-of-the-art language models for Basque on BasqueGLUE, thus providing a strong baseline to compare upon. BasqueGLUE is freely available under an open license. 
+ +Homepage: `https://github.com/orai-nlp/BasqueGLUE` + +Title: `Latxa: An Open Language Model and Evaluation Suite for Basque` + +Abstract: `https://arxiv.org/abs/2403.20266` + +The use of BasqueGLUE for evaluating the performance of decoder models in Basque is presented in this paper. + +Homepage: `https://github.com/hitz-zentroa/latxa` + +### Citation + +``` +@InProceedings{urbizu2022basqueglue, + author = {Urbizu, Gorka and San Vicente, IƱaki and Saralegi, Xabier and Agerri, Rodrigo and Soroa, Aitor}, + title = {BasqueGLUE: A Natural Language Understanding Benchmark for Basque}, + booktitle = {Proceedings of the Language Resources and Evaluation Conference}, + month = {June}, + year = {2022}, + address = {Marseille, France}, + publisher = {European Language Resources Association}, + pages = {1603--1612}, + url = {https://aclanthology.org/2022.lrec-1.172} +} + +@misc{etxaniz2024latxa, + title={Latxa: An Open Language Model and Evaluation Suite for Basque}, + author={Julen Etxaniz and Oscar Sainz and Naiara Perez and Itziar Aldabe and German Rigau and Eneko Agirre and Aitor Ormazabal and Mikel Artetxe and Aitor Soroa}, + year={2024}, + eprint={2403.20266}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +### Groups and Tasks + +#### Groups + +* `basque-glue`: First version of the implementation + +#### Tasks + +* `bhtc_v2`: Topic classification of news extracts with 12 categories. +* `bec`: Sentiment analysis on tweets about the campaign for the 2016 Basque elections. +* `vaxx_stance`: Stance detection on tweets around the anti-vaccine movement. +* `qnlieu`: Q&A NLI as in [glue/qnli](../glue/qnli). +* `wiceu`: Word-in-Context as in [super_glue/wic](../super_glue/wic). +* `epec_koref_bin`: Coreference detection as in [super_glue/wsc](../super_glue/wsc). + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? 
+ * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/lm_eval/tasks/basqueglue/bec.yaml b/lm_eval/tasks/basqueglue/bec.yaml new file mode 100644 index 0000000000..a078300f0f --- /dev/null +++ b/lm_eval/tasks/basqueglue/bec.yaml @@ -0,0 +1,16 @@ +group: basque-glue +task: bec2016eu +dataset_path: orai-nlp/basqueGLUE +dataset_name: bec +output_type: multiple_choice +validation_split: validation +test_split: test +doc_to_text: "Testua: {{text}}\nGaldera: Nolako jarrera agertzen du aurreko testuak?\nErantzuna:" +doc_to_target: label +doc_to_choice: ['negatiboa', 'neutrala', 'positiboa'] +metric_list: + - metric: f1 + aggregation: !function utils.micro_f1_score + higher_is_better: true +metadata: + - version: 1.0 diff --git a/lm_eval/tasks/basqueglue/bhtc.yaml b/lm_eval/tasks/basqueglue/bhtc.yaml new file mode 100644 index 0000000000..b069d62f4d --- /dev/null +++ b/lm_eval/tasks/basqueglue/bhtc.yaml @@ -0,0 +1,16 @@ +group: basque-glue +task: bhtc_v2 +dataset_path: orai-nlp/basqueGLUE +dataset_name: bhtc +output_type: multiple_choice +validation_split: validation +test_split: test +doc_to_text: "Testua: {{text}}\nGaldera: Zein da aurreko testuaren gaia?\nErantzuna:" +doc_to_target: label +doc_to_choice: ['Ekonomia', 'Euskal Herria', 'Euskara', 'Gizartea', 'Historia', 'Ingurumena', 'Iritzia', 'Komunikazioa', 'Kultura', 'Nazioartea', 'Politika', 'Zientzia'] +metric_list: + - metric: f1 + aggregation: !function utils.micro_f1_score + higher_is_better: true 
+metadata: + - version: 1.0 diff --git a/lm_eval/tasks/basqueglue/coref.yaml b/lm_eval/tasks/basqueglue/coref.yaml new file mode 100644 index 0000000000..721691ab43 --- /dev/null +++ b/lm_eval/tasks/basqueglue/coref.yaml @@ -0,0 +1,16 @@ +group: basque-glue +task: epec_koref_bin +dataset_path: orai-nlp/basqueGLUE +dataset_name: coref +output_type: multiple_choice +validation_split: validation +test_split: test +doc_to_text: !function utils.coref_doc_to_text +doc_to_target: label +doc_to_choice: ['ez', 'bai'] +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + - version: 1.0 diff --git a/lm_eval/tasks/basqueglue/qnli.yaml b/lm_eval/tasks/basqueglue/qnli.yaml new file mode 100644 index 0000000000..f3cfe84c16 --- /dev/null +++ b/lm_eval/tasks/basqueglue/qnli.yaml @@ -0,0 +1,16 @@ +group: basque-glue +task: qnlieu +dataset_path: orai-nlp/basqueGLUE +dataset_name: qnli +output_type: multiple_choice +validation_split: validation +test_split: test +doc_to_text: "{{question}}\n{{sentence}}\nGaldera: aurreko galderari erantzuten al dio emandako testuak?\nErantzuna:" +doc_to_target: label +doc_to_choice: ['bai', 'ez'] +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + - version: 1.0 diff --git a/lm_eval/tasks/basqueglue/utils.py b/lm_eval/tasks/basqueglue/utils.py new file mode 100644 index 0000000000..401375f709 --- /dev/null +++ b/lm_eval/tasks/basqueglue/utils.py @@ -0,0 +1,78 @@ +import html +import re + +from datasets import load_metric + + +def general_detokenize(string): + string = re.sub(r"\s+([.,;:!?)])", r"\1", string) + string = re.sub(r"(\s+|^)\(\s+([^)]+)\s+\)", r"\1(\2)", string) + string = re.sub(r"(\s+|^)\[\s+([^)]+)\s+\]", r"\1[\2]", string) + string = re.sub(r'(\s+|^)"\s+([^"]+)\s+"', r'\1"\2"', string) + string = re.sub(r"(\s+|^)'\s+([^']+)\s+'", r"\1'\2'", string) + return string + + +def process_doc(string): + string = html.unescape(string) + string = general_detokenize(string) + 
return string + + +def process_wic_docs(dataset): + def _helper(doc): + # there's some issues with the encoding on this one + doc["sentence1"] = ( + process_doc(doc["sentence1"]).encode("latin-1").decode("utf-8") + ) + doc["sentence2"] = ( + process_doc(doc["sentence2"]).encode("latin-1").decode("utf-8") + ) + return doc + + return dataset.map(_helper) + + +def coref_doc_to_text(x): + def _span_in_context(span_index, span_text): + span_start = span_index + span_end = span_start + len(span_text.split(" ")) - 1 + tokens[span_start] = f"*{tokens[span_start]}" + tokens[span_end] = f"{tokens[span_end]}*" + + tokens = x["text"].split(" ") + _span_in_context(x["span1_index"], x["span1_text"]) + _span_in_context( + x["span2_index"] - 1, x["span2_text"] + ) # span1_index is 0-based but span2_index is 1-based ?? + context = process_doc(" ".join(tokens)) + span_1 = process_doc(x["span1_text"]) + span_2 = process_doc(x["span2_text"]) + text = ( + f"Testua: {context}\n" + + f'Galdera: Aurreko testuan, "*{span_1}*" eta "*{span_2}*" gauza bera dira?\n' + + "Erantzuna:" + ) + return text + + +# Measure F1 as in the benchmark repo: https://github.com/orai-nlp/BasqueGLUE/blob/main/eval_basqueglue.py + + +def micro_f1_score(items): + f1_metric = load_metric("f1") + golds, preds = list(zip(*items)) + f1_score = f1_metric.compute(references=golds, predictions=preds, average="micro")[ + "f1" + ] + return f1_score + + +def vaxx_f1_score(items): + f1_metric = load_metric("f1") + golds, preds = list(zip(*items)) + f1_class = f1_metric.compute( + references=golds, predictions=preds, labels=[0, 2], average=None + )["f1"] + f1_score = sum(f1_class) / len(f1_class) + return f1_score diff --git a/lm_eval/tasks/basqueglue/vaxx.yaml b/lm_eval/tasks/basqueglue/vaxx.yaml new file mode 100644 index 0000000000..f66f530dad --- /dev/null +++ b/lm_eval/tasks/basqueglue/vaxx.yaml @@ -0,0 +1,16 @@ +group: basque-glue +task: vaxx_stance +dataset_path: orai-nlp/basqueGLUE +dataset_name: vaxx +output_type: 
multiple_choice +validation_split: validation +test_split: test +doc_to_text: "Testua: {{text}}\nGaldera: Nolako jarrera agertzen du aurreko testuak txertoei buruz?\nErantzuna:" +doc_to_target: label +doc_to_choice: ['aurka', 'neutrala', 'alde'] +metric_list: + - metric: f1 + aggregation: !function utils.vaxx_f1_score + higher_is_better: true +metadata: + - version: 1.0 diff --git a/lm_eval/tasks/basqueglue/wic.yaml b/lm_eval/tasks/basqueglue/wic.yaml new file mode 100644 index 0000000000..7ec2681ac2 --- /dev/null +++ b/lm_eval/tasks/basqueglue/wic.yaml @@ -0,0 +1,17 @@ +group: basque-glue +task: wiceu +dataset_path: orai-nlp/basqueGLUE +dataset_name: wic +output_type: multiple_choice +validation_split: validation +test_split: test +process_docs: !function utils.process_wic_docs +doc_to_text: "1. esaldia: {{sentence1}}\n2. esaldia: {{sentence2}}\nGaldera: Aurreko bi esaldietan, \"{{word}}\" hitzak esanahi berdina du?\nErantzuna:" +doc_to_target: label +doc_to_choice: ['ez', 'bai'] +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + - version: 1.0 diff --git a/lm_eval/tasks/eus_exams/README.md b/lm_eval/tasks/eus_exams/README.md new file mode 100644 index 0000000000..f44c5f04ed --- /dev/null +++ b/lm_eval/tasks/eus_exams/README.md @@ -0,0 +1,49 @@ +# EusExams + +### Paper + +Title: Latxa: An Open Language Model and Evaluation Suite for Basque + +Abstract: https://arxiv.org/abs/2403.20266 + +EusExams is a collection of tests designed to prepare individuals for Public Service examinations conducted by several Basque institutions, including the public health system Osakidetza, the Basque Government, the City Councils of Bilbao and Gasteiz, and the University of the Basque Country (UPV/EHU). Within each of these groups, there are different exams for public positions, such as administrative and assistant roles. Each multiple-choice question contains 2 to 4 choices (3.90 on average) and one correct answer. 
The dataset is mostly parallel with 16k questions in Basque and 18k in Spanish. + +Homepage: https://github.com/hitz-zentroa/latxa + + +### Citation + +``` +@misc{etxaniz2024latxa, + title={Latxa: An Open Language Model and Evaluation Suite for Basque}, + author={Julen Etxaniz and Oscar Sainz and Naiara Perez and Itziar Aldabe and German Rigau and Eneko Agirre and Aitor Ormazabal and Mikel Artetxe and Aitor Soroa}, + year={2024}, + eprint={2403.20266}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +### Groups and Tasks + +#### Groups + +* `eus_exams_eu`: The Basque version of the exams. +* `eus_exams_es`: The Spanish version of the exams. + +#### Tasks + +Basque and Spanish versions of the exams are available as separate tasks starting with `eus_exams_eu` and `eus_exams_es` respectively. + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? 
diff --git a/lm_eval/tasks/eus_exams/configs.py b/lm_eval/tasks/eus_exams/configs.py new file mode 100644 index 0000000000..993faa9f5d --- /dev/null +++ b/lm_eval/tasks/eus_exams/configs.py @@ -0,0 +1,67 @@ +import argparse +import json + +import requests +import yaml + + +# get configs from huggingface datasets server by doing a request +response = requests.get( + "https://datasets-server.huggingface.co/splits?dataset=HiTZ%2FEusExams", timeout=5 +) +response_json = json.loads(response.text) +CONFIGS = [split["config"] for split in response_json["splits"]] + + +def gen_config_yamls(output_dir: str, overwrite: bool) -> None: + """ + Generate a yaml file for each config. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. + """ + err = [] + for config in CONFIGS: + file_name = f"eus_exams_{config}.yaml" + try: + with open(f"{output_dir}/{file_name}", "w" if overwrite else "x") as f: + f.write("# Generated by utils.py\n") + yaml.dump( + { + "include": "eus_exams_es" + if "eus_exams_es" in config + else "eus_exams_eu", + "dataset_name": config, + "task": f"eus_exams_{config}", + }, + f, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate config-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=False, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", default=".", help="Directory to write yaml files to" + ) + args = parser.parse_args() + + gen_config_yamls(output_dir=args.output_dir, overwrite=args.overwrite) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/eus_exams/eus_exams b/lm_eval/tasks/eus_exams/eus_exams new file mode 100644 
index 0000000000..d1d2af7314 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams @@ -0,0 +1,18 @@ +dataset_path: HiTZ/EusExams +dataset_name: null +validation_split: null +test_split: test +fewshot_split: test +process_docs: !function utils.process_docs +output_type: multiple_choice +doc_to_choice: ["A", "B", "C", "D"] +doc_to_target: answer +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 0.0 diff --git a/lm_eval/tasks/eus_exams/eus_exams_es b/lm_eval/tasks/eus_exams/eus_exams_es new file mode 100644 index 0000000000..2588660694 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es @@ -0,0 +1,4 @@ +include: eus_exams +group: + - eus_exams_es +doc_to_text: "Pregunta: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nD: {{candidates[3]}}\nRespuesta:" diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_ejadministrativo.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_ejadministrativo.yaml new file mode 100644 index 0000000000..22b93ed6b7 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_ejadministrativo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_ejadministrativo +include: eus_exams_es +task: eus_exams_es_ejadministrativo diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_ejauxiliar.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_ejauxiliar.yaml new file mode 100644 index 0000000000..b6cbe975fd --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_ejauxiliar.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_ejauxiliar +include: eus_exams_es +task: eus_exams_es_ejauxiliar diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_ejsubalterno.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_ejsubalterno.yaml new file mode 100644 index 0000000000..0adfba26dd --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_ejsubalterno.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_ejsubalterno 
+include: eus_exams_es +task: eus_exams_es_ejsubalterno diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_ejtecnico.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_ejtecnico.yaml new file mode 100644 index 0000000000..0d0b011c9a --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_ejtecnico.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_ejtecnico +include: eus_exams_es +task: eus_exams_es_ejtecnico diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_opeayuntamientovitoria.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_opeayuntamientovitoria.yaml new file mode 100644 index 0000000000..0d43c0b416 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opeayuntamientovitoria.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opeayuntamientovitoria +include: eus_exams_es +task: eus_exams_es_opeayuntamientovitoria diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_opebilbao.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_opebilbao.yaml new file mode 100644 index 0000000000..5cb33cbbdd --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opebilbao.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opebilbao +include: eus_exams_es +task: eus_exams_es_opebilbao diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_opeehuadmin.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_opeehuadmin.yaml new file mode 100644 index 0000000000..e9cacfbd3f --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opeehuadmin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opeehuadmin +include: eus_exams_es +task: eus_exams_es_opeehuadmin diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_opeehuaux.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_opeehuaux.yaml new file mode 100644 index 0000000000..88316e512a --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opeehuaux.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opeehuaux +include: eus_exams_es +task: eus_exams_es_opeehuaux diff --git 
a/lm_eval/tasks/eus_exams/eus_exams_es_opeehubiblio.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_opeehubiblio.yaml new file mode 100644 index 0000000000..728d7cfb0e --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opeehubiblio.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opeehubiblio +include: eus_exams_es +task: eus_exams_es_opeehubiblio diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_opeehuderecho.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_opeehuderecho.yaml new file mode 100644 index 0000000000..13e1d9de4b --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opeehuderecho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opeehuderecho +include: eus_exams_es +task: eus_exams_es_opeehuderecho diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_opeehueconomicas.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_opeehueconomicas.yaml new file mode 100644 index 0000000000..6625c9ce5a --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opeehueconomicas.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opeehueconomicas +include: eus_exams_es +task: eus_exams_es_opeehueconomicas diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_opeehuempresariales.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_opeehuempresariales.yaml new file mode 100644 index 0000000000..4f61d3a143 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opeehuempresariales.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opeehuempresariales +include: eus_exams_es +task: eus_exams_es_opeehuempresariales diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_opeehusubalterno.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_opeehusubalterno.yaml new file mode 100644 index 0000000000..96cc86b402 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opeehusubalterno.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opeehusubalterno +include: eus_exams_es +task: eus_exams_es_opeehusubalterno diff --git 
a/lm_eval/tasks/eus_exams/eus_exams_es_opeehutecnico.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_opeehutecnico.yaml new file mode 100644 index 0000000000..0641fc2e77 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opeehutecnico.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opeehutecnico +include: eus_exams_es +task: eus_exams_es_opeehutecnico diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_opeehutecnicob.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_opeehutecnicob.yaml new file mode 100644 index 0000000000..a338a1ab0d --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opeehutecnicob.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opeehutecnicob +include: eus_exams_es +task: eus_exams_es_opeehutecnicob diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_opeosakiadmin.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_opeosakiadmin.yaml new file mode 100644 index 0000000000..85c771cdb3 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opeosakiadmin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opeosakiadmin +include: eus_exams_es +task: eus_exams_es_opeosakiadmin diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_opeosakiaux.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_opeosakiaux.yaml new file mode 100644 index 0000000000..2d61825b0b --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opeosakiaux.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opeosakiaux +include: eus_exams_es +task: eus_exams_es_opeosakiaux diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_opeosakiauxenf.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_opeosakiauxenf.yaml new file mode 100644 index 0000000000..08fe0ed6c0 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opeosakiauxenf.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opeosakiauxenf +include: eus_exams_es +task: eus_exams_es_opeosakiauxenf diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_opeosakicelador.yaml 
b/lm_eval/tasks/eus_exams/eus_exams_es_opeosakicelador.yaml new file mode 100644 index 0000000000..2a61b6878e --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opeosakicelador.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opeosakicelador +include: eus_exams_es +task: eus_exams_es_opeosakicelador diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_opeosakienf.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_opeosakienf.yaml new file mode 100644 index 0000000000..e4749cac11 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opeosakienf.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opeosakienf +include: eus_exams_es +task: eus_exams_es_opeosakienf diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_opeosakijuridico.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_opeosakijuridico.yaml new file mode 100644 index 0000000000..a62dc8001f --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opeosakijuridico.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opeosakijuridico +include: eus_exams_es +task: eus_exams_es_opeosakijuridico diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_opeosakioperario.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_opeosakioperario.yaml new file mode 100644 index 0000000000..df72481742 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opeosakioperario.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opeosakioperario +include: eus_exams_es +task: eus_exams_es_opeosakioperario diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_opeosakitecnico.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_opeosakitecnico.yaml new file mode 100644 index 0000000000..4b5b397b88 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opeosakitecnico.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opeosakitecnico +include: eus_exams_es +task: eus_exams_es_opeosakitecnico diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_opeosakivarios.yaml 
b/lm_eval/tasks/eus_exams/eus_exams_es_opeosakivarios.yaml new file mode 100644 index 0000000000..fe98dc76aa --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_opeosakivarios.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_opeosakivarios +include: eus_exams_es +task: eus_exams_es_opeosakivarios diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza1c.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza1c.yaml new file mode 100644 index 0000000000..080f99fcf2 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza1c.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_osakidetza1c +include: eus_exams_es +task: eus_exams_es_osakidetza1c diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza2c.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza2c.yaml new file mode 100644 index 0000000000..4ee8ab46c6 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza2c.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_osakidetza2c +include: eus_exams_es +task: eus_exams_es_osakidetza2c diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza3c.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza3c.yaml new file mode 100644 index 0000000000..2974a11d79 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza3c.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_osakidetza3c +include: eus_exams_es +task: eus_exams_es_osakidetza3c diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza4c.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza4c.yaml new file mode 100644 index 0000000000..faa6c4b46c --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza4c.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_osakidetza4c +include: eus_exams_es +task: eus_exams_es_osakidetza4c diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza5c.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza5c.yaml new file mode 100644 index 
0000000000..153ce3add3 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza5c.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_osakidetza5c +include: eus_exams_es +task: eus_exams_es_osakidetza5c diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza6c.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza6c.yaml new file mode 100644 index 0000000000..d94ef2b9f4 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza6c.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_osakidetza6c +include: eus_exams_es +task: eus_exams_es_osakidetza6c diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza7c.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza7c.yaml new file mode 100644 index 0000000000..1fc30ce353 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza7c.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_osakidetza7c +include: eus_exams_es +task: eus_exams_es_osakidetza7c diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza8c.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza8c.yaml new file mode 100644 index 0000000000..38f7ee3c39 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza8c.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_osakidetza8c +include: eus_exams_es +task: eus_exams_es_osakidetza8c diff --git a/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza9c.yaml b/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza9c.yaml new file mode 100644 index 0000000000..7b23ff6707 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_es_osakidetza9c.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: es_osakidetza9c +include: eus_exams_es +task: eus_exams_es_osakidetza9c diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu b/lm_eval/tasks/eus_exams/eus_exams_eu new file mode 100644 index 0000000000..95b82388df --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu @@ -0,0 +1,4 @@ +include: eus_exams +group: + - eus_exams_eu 
+doc_to_text: "Galdera: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nD: {{candidates[3]}}\nErantzuna:" diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_ejadministrari.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_ejadministrari.yaml new file mode 100644 index 0000000000..f5630ddb05 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_ejadministrari.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_ejadministrari +include: eus_exams_eu +task: eus_exams_eu_ejadministrari diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_ejlaguntza.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_ejlaguntza.yaml new file mode 100644 index 0000000000..cf2806c1e6 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_ejlaguntza.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_ejlaguntza +include: eus_exams_eu +task: eus_exams_eu_ejlaguntza diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_ejlaguntzaile.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_ejlaguntzaile.yaml new file mode 100644 index 0000000000..1d713a3244 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_ejlaguntzaile.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_ejlaguntzaile +include: eus_exams_eu +task: eus_exams_eu_ejlaguntzaile diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_ejteknikari.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_ejteknikari.yaml new file mode 100644 index 0000000000..7b528b9d4c --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_ejteknikari.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_ejteknikari +include: eus_exams_eu +task: eus_exams_eu_ejteknikari diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_opebilbaoeu.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_opebilbaoeu.yaml new file mode 100644 index 0000000000..d15dbc6101 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_opebilbaoeu.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_opebilbaoeu +include: eus_exams_eu +task: 
eus_exams_eu_opebilbaoeu diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_opeehuadmineu.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_opeehuadmineu.yaml new file mode 100644 index 0000000000..85b9c90477 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_opeehuadmineu.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_opeehuadmineu +include: eus_exams_eu +task: eus_exams_eu_opeehuadmineu diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_opeehuauxeu.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_opeehuauxeu.yaml new file mode 100644 index 0000000000..e720824863 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_opeehuauxeu.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_opeehuauxeu +include: eus_exams_eu +task: eus_exams_eu_opeehuauxeu diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_opeehubiblioeu.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_opeehubiblioeu.yaml new file mode 100644 index 0000000000..0ff2ab853f --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_opeehubiblioeu.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_opeehubiblioeu +include: eus_exams_eu +task: eus_exams_eu_opeehubiblioeu diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_opeehuderechoeu.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_opeehuderechoeu.yaml new file mode 100644 index 0000000000..bef6b65245 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_opeehuderechoeu.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_opeehuderechoeu +include: eus_exams_eu +task: eus_exams_eu_opeehuderechoeu diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_opeehueconomicaseu.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_opeehueconomicaseu.yaml new file mode 100644 index 0000000000..713f332341 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_opeehueconomicaseu.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_opeehueconomicaseu +include: eus_exams_eu +task: eus_exams_eu_opeehueconomicaseu diff --git 
a/lm_eval/tasks/eus_exams/eus_exams_eu_opeehuempresarialeseu.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_opeehuempresarialeseu.yaml new file mode 100644 index 0000000000..8dddd9bc76 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_opeehuempresarialeseu.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_opeehuempresarialeseu +include: eus_exams_eu +task: eus_exams_eu_opeehuempresarialeseu diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_opeehusubalternoeu.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_opeehusubalternoeu.yaml new file mode 100644 index 0000000000..b02a451dd9 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_opeehusubalternoeu.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_opeehusubalternoeu +include: eus_exams_eu +task: eus_exams_eu_opeehusubalternoeu diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_opeehutecnicoeu.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_opeehutecnicoeu.yaml new file mode 100644 index 0000000000..3792e12aa0 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_opeehutecnicoeu.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_opeehutecnicoeu +include: eus_exams_eu +task: eus_exams_eu_opeehutecnicoeu diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_opeehuteknikarib.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_opeehuteknikarib.yaml new file mode 100644 index 0000000000..b9f5cc612a --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_opeehuteknikarib.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_opeehuteknikarib +include: eus_exams_eu +task: eus_exams_eu_opeehuteknikarib diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_opegasteizkoudala.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_opegasteizkoudala.yaml new file mode 100644 index 0000000000..e9211f39a1 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_opegasteizkoudala.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_opegasteizkoudala +include: eus_exams_eu +task: 
eus_exams_eu_opegasteizkoudala diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiadmineu.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiadmineu.yaml new file mode 100644 index 0000000000..cf19e09941 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiadmineu.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_opeosakiadmineu +include: eus_exams_eu +task: eus_exams_eu_opeosakiadmineu diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiauxenfeu.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiauxenfeu.yaml new file mode 100644 index 0000000000..719039915a --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiauxenfeu.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_opeosakiauxenfeu +include: eus_exams_eu +task: eus_exams_eu_opeosakiauxenfeu diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiauxeu.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiauxeu.yaml new file mode 100644 index 0000000000..9d0891886c --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiauxeu.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_opeosakiauxeu +include: eus_exams_eu +task: eus_exams_eu_opeosakiauxeu diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiceladoreu.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiceladoreu.yaml new file mode 100644 index 0000000000..af82c87bdf --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiceladoreu.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_opeosakiceladoreu +include: eus_exams_eu +task: eus_exams_eu_opeosakiceladoreu diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakienfeu.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakienfeu.yaml new file mode 100644 index 0000000000..10b853e399 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakienfeu.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_opeosakienfeu +include: eus_exams_eu +task: eus_exams_eu_opeosakienfeu diff 
--git a/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakioperarioeu.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakioperarioeu.yaml new file mode 100644 index 0000000000..8f3cf7c490 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakioperarioeu.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_opeosakioperarioeu +include: eus_exams_eu +task: eus_exams_eu_opeosakioperarioeu diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakitecnicoeu.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakitecnicoeu.yaml new file mode 100644 index 0000000000..f44e4994e3 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakitecnicoeu.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_opeosakitecnicoeu +include: eus_exams_eu +task: eus_exams_eu_opeosakitecnicoeu diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakivarioseu.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakivarioseu.yaml new file mode 100644 index 0000000000..11801ddec6 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_opeosakivarioseu.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_opeosakivarioseu +include: eus_exams_eu +task: eus_exams_eu_opeosakivarioseu diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza1e.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza1e.yaml new file mode 100644 index 0000000000..cc71350719 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza1e.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_osakidetza1e +include: eus_exams_eu +task: eus_exams_eu_osakidetza1e diff --git a/lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza2e.yaml b/lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza2e.yaml new file mode 100644 index 0000000000..218dc87cb8 --- /dev/null +++ b/lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza2e.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eu_osakidetza2e +include: eus_exams_eu +task: eus_exams_eu_osakidetza2e diff --git 
def process_docs(dataset: datasets.Dataset):
    """Drop EusExams documents that lack a usable answer or candidate set.

    A document is kept only when its "answer" field indexes one of the four
    choices (0-3) and its "candidates" list is not four empty strings.

    Args:
        dataset: The raw HF dataset of exam questions.

    Returns:
        The dataset restricted to valid multiple-choice examples.
    """

    def _keep(doc: dict) -> bool:
        # Answer must point at one of the four choices A-D.
        answer_ok = doc["answer"] in (0, 1, 2, 3)
        # Reject rows whose four candidate strings are all empty.
        candidates_ok = doc["candidates"] != ["", "", "", ""]
        return answer_ok and candidates_ok

    return dataset.filter(_keep)
If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/lm_eval/tasks/eus_proficiency/eus_proficiency.yaml b/lm_eval/tasks/eus_proficiency/eus_proficiency.yaml new file mode 100644 index 0000000000..18cf5d2ab3 --- /dev/null +++ b/lm_eval/tasks/eus_proficiency/eus_proficiency.yaml @@ -0,0 +1,16 @@ +dataset_path: HiTZ/EusProficiency +dataset_name: default +task: eus_proficiency +doc_to_text: "Galdera: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nD: {{candidates[3]}}\nErantzuna:" +doc_to_choice: ["A", "B", "C", "D"] +validation_split: null +test_split: test +fewshot_split: test +output_type: multiple_choice +doc_to_target: answer +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 0.0 diff --git a/lm_eval/tasks/eus_reading/README.md b/lm_eval/tasks/eus_reading/README.md new file mode 100644 index 0000000000..2542e9b509 --- /dev/null +++ b/lm_eval/tasks/eus_reading/README.md @@ -0,0 +1,48 @@ +# EusReading + +### Paper + +Title: Latxa: An Open Language Model and Evaluation Suite for Basque + +Abstract: https://arxiv.org/abs/2403.20266 + +EusReading consists of 352 reading comprehension exercises (irakurmena) sourced from the set of past EGA exams from 1998 to 2008. Each test generally has 10 multiple-choice questions, with 4 choices and a single correct answer. These exercises are more challenging than Belebele due to the complexity and length of the input texts. As a result, EusReading is useful to measure long context understanding of models. 
+ +Homepage: https://github.com/hitz-zentroa/latxa + + +### Citation + +``` +@misc{etxaniz2024latxa, + title={Latxa: An Open Language Model and Evaluation Suite for Basque}, + author={Julen Etxaniz and Oscar Sainz and Naiara Perez and Itziar Aldabe and German Rigau and Eneko Agirre and Aitor Ormazabal and Mikel Artetxe and Aitor Soroa}, + year={2024}, + eprint={2403.20266}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +### Groups and Tasks + +#### Groups + +There are no groups. + +#### Tasks + +* `eus_reading`: EusReading consists of 352 reading comprehension exercises (irakurmena) sourced from the set of past EGA exams from 1998 to 2008. + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? 
from typing import List


# Choice labels; EusReading items carry at most four candidates (A-D).
letters = ["A", "B", "C", "D"]


def doc_to_text_context(doc) -> str:
    """Format a reading-comprehension document as a Basque prompt string.

    Args:
        doc (dict): Document with "context", "question" and "candidates" keys.

    Returns:
        str: Prompt containing the passage ("Pasartea"), the question
        ("Galdera"), one lettered line per candidate, and the answer cue
        ("Erantzuna:").

    Raises:
        ValueError: If the number of candidates is not between 2 and 4.
    """
    candidates = doc["candidates"]
    num_choices = len(candidates)
    # Guard both ends: <2 is not a multiple-choice item, and >len(letters)
    # would previously truncate silently, dropping candidates from the prompt
    # and misaligning the answer index.
    if not 2 <= num_choices <= len(letters):
        raise ValueError("Invalid number of candidates")
    formatted_choices = "\n".join(
        f"{letter}: {candidate}"
        for letter, candidate in zip(letters[:num_choices], candidates)
    )
    return f"Pasartea: {doc['context']}\n\nGaldera: {doc['question']}\n{formatted_choices}\nErantzuna:"


def doc_to_choice(doc) -> List[str]:
    """Return the letter labels valid for this document's candidates.

    Args:
        doc (dict): Document with a "candidates" key.

    Returns:
        List[str]: A prefix of ["A", "B", "C", "D"] matching the number of
        candidates, so choice labels stay aligned with the answer index.

    Raises:
        ValueError: If the number of candidates is not between 2 and 4.
    """
    num_choices = len(doc["candidates"])
    if not 2 <= num_choices <= len(letters):
        raise ValueError("Invalid number of candidates")
    return letters[:num_choices]
+ +Homepage: https://github.com/hitz-zentroa/latxa + + +### Citation + +``` +@misc{etxaniz2024latxa, + title={Latxa: An Open Language Model and Evaluation Suite for Basque}, + author={Julen Etxaniz and Oscar Sainz and Naiara Perez and Itziar Aldabe and German Rigau and Eneko Agirre and Aitor Ormazabal and Mikel Artetxe and Aitor Soroa}, + year={2024}, + eprint={2403.20266}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +### Groups and Tasks + +#### Groups + +There are no groups. + +#### Tasks + +* `eus_trivia`: EusTrivia consists of 1,715 trivia questions from multiple online sources. + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? 
from typing import List


# Choice labels; EusTrivia items carry two, three or four candidates (A-D).
letters = ["A", "B", "C", "D"]


def doc_to_text(doc) -> str:
    """Format a trivia document as a Basque multiple-choice prompt string.

    Args:
        doc (dict): Document with "question" and "candidates" keys.

    Returns:
        str: Prompt containing the question ("Galdera"), one lettered line
        per candidate, and the answer cue ("Erantzuna:").

    Raises:
        ValueError: If the number of candidates is not between 2 and 4.
    """
    candidates = doc["candidates"]
    num_choices = len(candidates)
    # Guard both ends: <2 is not a multiple-choice item, and >len(letters)
    # would previously truncate silently, dropping candidates from the prompt
    # and misaligning the answer index.
    if not 2 <= num_choices <= len(letters):
        raise ValueError("Invalid number of candidates")
    formatted_choices = "\n".join(
        f"{letter}: {candidate}"
        for letter, candidate in zip(letters[:num_choices], candidates)
    )
    return f"Galdera: {doc['question']}\n{formatted_choices}\nErantzuna:"


def doc_to_choice(doc) -> List[str]:
    """Return the letter labels valid for this document's candidates.

    Args:
        doc (dict): Document with a "candidates" key.

    Returns:
        List[str]: A prefix of ["A", "B", "C", "D"] matching the number of
        candidates, so choice labels stay aligned with the answer index.

    Raises:
        ValueError: If the number of candidates is not between 2 and 4.
    """
    num_choices = len(doc["candidates"])
    if not 2 <= num_choices <= len(letters):
        raise ValueError("Invalid number of candidates")
    return letters[:num_choices]