diff --git a/lm_eval/tasks/README.md b/lm_eval/tasks/README.md
index df17137f9b..f68066af9f 100644
--- a/lm_eval/tasks/README.md
+++ b/lm_eval/tasks/README.md
@@ -52,7 +52,8 @@
 | [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean |
 | [lambada](lambada/README.md) | Tasks designed to predict the endings of text passages, testing language prediction skills. | English |
 | [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English |
-| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. | German, English, Spanish, French, Italian |
+| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset (legacy version); users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian |
+| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset; users should prefer this version over the legacy `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
 | [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
 | [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
 | [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |
diff --git a/lm_eval/tasks/lambada_multilingual_stablelm/README.md b/lm_eval/tasks/lambada_multilingual_stablelm/README.md
new file mode 100644
index 0000000000..546ac11686
--- /dev/null
+++ b/lm_eval/tasks/lambada_multilingual_stablelm/README.md
@@ -0,0 +1,56 @@
+# LAMBADA
+
+### Paper
+The LAMBADA dataset: Word prediction requiring a broad discourse context
+https://arxiv.org/pdf/1606.06031.pdf
+
+LAMBADA is a dataset to evaluate the capabilities of computational models for text
+understanding by means of a word prediction task. LAMBADA is a collection of narrative
+passages sharing the characteristic that human subjects are able to guess their last
+word if they are exposed to the whole passage, but not if they only see the last
+sentence preceding the target word. To succeed on LAMBADA, computational models
+cannot simply rely on local context, but must be able to keep track of information
+in the broader discourse.
+
+Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI
+
+### Citation
+
+@misc{paperno2016lambada,
+  author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel},
+  title={The LAMBADA dataset},
+  DOI={10.5281/zenodo.2630551},
+  publisher={Zenodo},
+  year={2016},
+  month={Aug}
+}
+
+@article{bellagente2024stable,
+  title={Stable LM 2 1.6B Technical Report},
+  author={Bellagente, Marco and Tow, Jonathan and Mahan, Dakota and Phung, Duy and Zhuravinskyi, Maksym and Adithyan, Reshinth and Baicoianu, James and Brooks, Ben and Cooper, Nathan and Datta, Ashish and others},
+  journal={arXiv preprint arXiv:2402.17834},
+  year={2024}
+}
+
+### Groups and Tasks
+
+#### Groups
+
+* `lambada_multilingual_stablelm`: Evaluates all `lambada_openai_mt_stablelm_X` tasks.
+
+#### Tasks
+
+* `lambada_openai_mt_stablelm_{en, fr, de, it, es, nl, pt}`: Machine-translated versions of OpenAI's LAMBADA variant, as reported in the "Stable LM 2 1.6B Technical Report" (Bellagente et al., 2024).
+
+### Checklist
+
+* [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+(This task is novel to the Evaluation Harness, and has been checked against v0.3.0 of the harness.)
+
+
+If other tasks on this dataset are already supported:
+* [x] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
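The group and task names registered above can be run like any other harness task. The following is a rough sketch only: it assumes the `lm_eval.simple_evaluate` Python entry point of recent harness releases, that passing the group name expands to all seven language tasks, and a placeholder Hugging Face checkpoint that is not part of this PR.

```python
# Sketch only: the checkpoint name and batch size are placeholders, not part of this PR.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",                                      # Hugging Face transformers backend
    model_args="pretrained=EleutherAI/pythia-160m",  # any causal LM checkpoint
    tasks=["lambada_multilingual_stablelm"],         # group name covering the seven language tasks
    batch_size=8,
)
# Aggregated perplexity and accuracy for each language task are reported
# under results["results"].
```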
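The YAML configurations that follow drive each task through two Jinja templates: `doc_to_text` keeps every space-separated token of the passage except the last, and `doc_to_target` is the final token with a leading space, which the model is scored on via loglikelihood (perplexity and accuracy). A minimal Python sketch of the equivalent split, on an invented example passage:

```python
def lambada_split(text: str) -> tuple[str, str]:
    """Mirror the doc_to_text / doc_to_target templates: context is every
    space-separated token except the last; target is the last token with a
    leading space."""
    tokens = text.split(" ")
    context = " ".join(tokens[:-1])
    target = " " + tokens[-1]
    return context, target


# Invented example passage, used for illustration only.
passage = "He stared at the envelope for a long time before whispering her name"
context, target = lambada_split(passage)
print(repr(context))  # ends with '...before whispering her'
print(repr(target))   # ' name': the continuation the model must assign high likelihood
```

Splitting on a literal space, rather than arbitrary whitespace, matches the templates exactly, so the reconstructed context preserves the original spacing of the passage.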
diff --git a/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_de.yaml b/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_de.yaml
new file mode 100644
index 0000000000..e1aa61edb4
--- /dev/null
+++ b/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_de.yaml
@@ -0,0 +1,3 @@
+include: lambada_mt_stablelm_en.yaml
+task: lambada_openai_mt_stablelm_de
+dataset_name: de
diff --git a/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_en.yaml b/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_en.yaml
new file mode 100644
index 0000000000..a6e6041db5
--- /dev/null
+++ b/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_en.yaml
@@ -0,0 +1,20 @@
+group:
+  - lambada_multilingual_stablelm
+task: lambada_openai_mt_stablelm_en
+dataset_path: marcob/lambada_multilingual
+dataset_name: en
+output_type: loglikelihood
+test_split: test
+doc_to_text: "{{text.split(' ')[:-1]|join(' ')}}"
+doc_to_target: "{{' '+text.split(' ')[-1]}}"
+should_decontaminate: true
+doc_to_decontamination_query: "{{text}}"
+metric_list:
+  - metric: perplexity
+    aggregation: perplexity
+    higher_is_better: false
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
diff --git a/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_es.yaml b/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_es.yaml
new file mode 100644
index 0000000000..e7e75944a8
--- /dev/null
+++ b/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_es.yaml
@@ -0,0 +1,3 @@
+include: lambada_mt_stablelm_en.yaml
+task: lambada_openai_mt_stablelm_es
+dataset_name: es
diff --git a/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_fr.yaml b/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_fr.yaml
new file mode 100644
index 0000000000..37768ac60b
--- /dev/null
+++ b/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_fr.yaml
@@ -0,0 +1,3 @@
+include: lambada_mt_stablelm_en.yaml
+task: lambada_openai_mt_stablelm_fr
+dataset_name: fr
diff --git a/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_it.yaml b/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_it.yaml
new file mode 100644
index 0000000000..76d920f3ee
--- /dev/null
+++ b/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_it.yaml
@@ -0,0 +1,3 @@
+include: lambada_mt_stablelm_en.yaml
+task: lambada_openai_mt_stablelm_it
+dataset_name: it
diff --git a/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_nl.yaml b/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_nl.yaml
new file mode 100644
index 0000000000..fa96c8c63e
--- /dev/null
+++ b/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_nl.yaml
@@ -0,0 +1,3 @@
+include: lambada_mt_stablelm_en.yaml
+task: lambada_openai_mt_stablelm_nl
+dataset_name: nl
diff --git a/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_pt.yaml b/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_pt.yaml
new file mode 100644
index 0000000000..9399d5e7a8
--- /dev/null
+++ b/lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_pt.yaml
@@ -0,0 +1,3 @@
+include: lambada_mt_stablelm_en.yaml
+task: lambada_openai_mt_stablelm_pt
+dataset_name: pt
diff --git a/lm_eval/tasks/nq_open/README.md b/lm_eval/tasks/nq_open/README.md
index 4a58f90c81..01792089a6 100644
--- a/lm_eval/tasks/nq_open/README.md
+++ b/lm_eval/tasks/nq_open/README.md
@@ -14,24 +14,6 @@ Derived from the Natural Questions dataset, introduced in https://storage.google
 ### Citation
 
 ```
-@inproceedings{lee-etal-2019-latent,
-    title = "Latent Retrieval for Weakly Supervised Open Domain Question Answering",
-    author = "Lee, Kenton and
-      Chang, Ming-Wei and
-      Toutanova, Kristina",
-    editor = "Korhonen, Anna and
-      Traum, David and
-      M{\`a}rquez, Llu{\'\i}s",
-    booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
-    month = jul,
-    year = "2019",
-    address = "Florence, Italy",
-    publisher = "Association for Computational Linguistics",
-    url = "https://aclanthology.org/P19-1612",
-    doi = "10.18653/v1/P19-1612",
-    pages = "6086--6096",
-    abstract = "Recent work on open domain question answering (QA) assumes strong supervision of the supporting evidence and/or assumes a blackbox information retrieval (IR) system to retrieve evidence candidates. We argue that both are suboptimal, since gold evidence is not always available, and QA is fundamentally different from IR. We show for the first time that it is possible to jointly learn the retriever and reader from question-answer string pairs and without any IR system. In this setting, evidence retrieval from all of Wikipedia is treated as a latent variable. Since this is impractical to learn from scratch, we pre-train the retriever with an Inverse Cloze Task. We evaluate on open versions of five QA datasets. On datasets where the questioner already knows the answer, a traditional IR system such as BM25 is sufficient. On datasets where a user is genuinely seeking an answer, we show that learned retrieval is crucial, outperforming BM25 by up to 19 points in exact match.",
-}
 @article{47761,
  title = {Natural Questions: a Benchmark for Question Answering Research},
  author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov},