From 47add1afa15121e0a80afb83d51ef9e869d3b559 Mon Sep 17 00:00:00 2001 From: Oligou Date: Tue, 17 Dec 2024 15:43:51 +0100 Subject: [PATCH] Add french leaderboard --- lm_eval/tasks/leaderboard-french/README.md | 312 +++ .../bbh_mc/_fewshot_template_yaml | 16 + .../bbh_mc/_leaderboard_bbh.yaml | 28 + .../compr\303\251hension_de_la_date.yaml" | 58 + .../compr\303\251hension_des_sports.yaml" | 19 + .../bbh_mc/comptage_d'objets.yaml | 21 + ...d\303\251duction_logique_cinq_objets.yaml" | 52 + ...d\303\251duction_logique_sept_objets.yaml" | 52 + ...\303\251duction_logique_trois_objets.yaml" | 52 + .../d\303\251sambigu\303\257sation_qa.yaml" | 51 + ...on_d'erreurs_de_traduction_sailantes.yaml" | 72 + .../expressions_bool\303\251ennes.yaml" | 15 + .../formes_g\303\251om\303\251triques.yaml" | 86 + .../leaderboard-french/bbh_mc/hyperbate.yaml | 36 + .../bbh_mc/jugement_causal.yaml | 65 + .../leaderboard-french/bbh_mc/naviguer.yaml | 41 + .../bbh_mc/pingouins_sur_une_table.yaml | 85 + ...nement_sur_les_objets_color\303\251s.yaml" | 118 ++ .../bbh_mc/recommandation_de_film.yaml | 56 + .../leaderboard-french/bbh_mc/scarcasmes.yaml | 40 + .../bbh_mc/sophismes_formels.yaml | 60 + ...s_m\303\251lang\303\251s_cinq_objets.yaml" | 48 + ...s_m\303\251lang\303\251s_sept_objets.yaml" | 48 + ..._m\303\251lang\303\251s_trois_objets.yaml" | 48 + .../bbh_mc/s\303\251quences_temporelles.yaml" | 100 + .../bbh_mc/toile_de_mensonges.yaml | 23 + .../gpqa/_leaderboard_gpqa.yaml | 9 + .../leaderboard-french/gpqa/_template_yaml | 19 + .../gpqa/gpqa_diamond_zeroshot.yaml | 4 + .../gpqa/gpqa_extended_zeroshot.yaml | 4 + .../gpqa/gpqa_main_zeroshot.yaml | 4 + .../tasks/leaderboard-french/gpqa/utils.py | 38 + .../_leaderboard_instruction_following.yaml | 3 + .../leaderboard-french/ifeval/ifeval.yaml | 31 + .../leaderboard-french/ifeval/instructions.py | 1633 ++++++++++++++++ .../ifeval/instructions_registry.py | 168 ++ .../ifeval/instructions_util.py | 1679 +++++++++++++++++ 
.../tasks/leaderboard-french/ifeval/utils.py | 217 +++ .../tasks/leaderboard-french/leaderboard.yaml | 32 + .../math/_leaderboard_math.yaml | 12 + .../leaderboard-french/math/_template_yaml | 26 + .../leaderboard-french/math/math_algebra.yaml | 3 + .../math/math_counting_and_prob.yaml | 3 + .../math/math_geometry.yaml | 3 + .../math/math_intermediate_algebra.yaml | 3 + .../math/math_num_theory.yaml | 3 + .../math/math_prealgebra.yaml | 3 + .../math/math_precalculus.yaml | 3 + .../tasks/leaderboard-french/math/utils.py | 331 ++++ .../leaderboard-french/mmlu_pro/mmlu_pro.yaml | 17 + .../leaderboard-french/mmlu_pro/utils.py | 16 + .../tasks/leaderboard-french/musr/_musr.yaml | 9 + .../leaderboard-french/musr/_template_yaml | 11 + .../musr/musr_murder_mysteries.yaml | 3 + .../musr/musr_object_placements.yaml | 3 + .../musr/musr_team_allocation.yaml | 3 + .../tasks/leaderboard-french/musr/utils.py | 26 + 57 files changed, 5921 insertions(+) create mode 100644 lm_eval/tasks/leaderboard-french/README.md create mode 100644 lm_eval/tasks/leaderboard-french/bbh_mc/_fewshot_template_yaml create mode 100644 lm_eval/tasks/leaderboard-french/bbh_mc/_leaderboard_bbh.yaml create mode 100644 "lm_eval/tasks/leaderboard-french/bbh_mc/compr\303\251hension_de_la_date.yaml" create mode 100644 "lm_eval/tasks/leaderboard-french/bbh_mc/compr\303\251hension_des_sports.yaml" create mode 100644 lm_eval/tasks/leaderboard-french/bbh_mc/comptage_d'objets.yaml create mode 100644 "lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251duction_logique_cinq_objets.yaml" create mode 100644 "lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251duction_logique_sept_objets.yaml" create mode 100644 "lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251duction_logique_trois_objets.yaml" create mode 100644 "lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251sambigu\303\257sation_qa.yaml" create mode 100644 "lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251tection_d'erreurs_de_traduction_sailantes.yaml" create mode 100644 
"lm_eval/tasks/leaderboard-french/bbh_mc/expressions_bool\303\251ennes.yaml" create mode 100644 "lm_eval/tasks/leaderboard-french/bbh_mc/formes_g\303\251om\303\251triques.yaml" create mode 100644 lm_eval/tasks/leaderboard-french/bbh_mc/hyperbate.yaml create mode 100644 lm_eval/tasks/leaderboard-french/bbh_mc/jugement_causal.yaml create mode 100644 lm_eval/tasks/leaderboard-french/bbh_mc/naviguer.yaml create mode 100644 lm_eval/tasks/leaderboard-french/bbh_mc/pingouins_sur_une_table.yaml create mode 100644 "lm_eval/tasks/leaderboard-french/bbh_mc/raisonnement_sur_les_objets_color\303\251s.yaml" create mode 100644 lm_eval/tasks/leaderboard-french/bbh_mc/recommandation_de_film.yaml create mode 100644 lm_eval/tasks/leaderboard-french/bbh_mc/scarcasmes.yaml create mode 100644 lm_eval/tasks/leaderboard-french/bbh_mc/sophismes_formels.yaml create mode 100644 "lm_eval/tasks/leaderboard-french/bbh_mc/suivi_objets_m\303\251lang\303\251s_cinq_objets.yaml" create mode 100644 "lm_eval/tasks/leaderboard-french/bbh_mc/suivi_objets_m\303\251lang\303\251s_sept_objets.yaml" create mode 100644 "lm_eval/tasks/leaderboard-french/bbh_mc/suivi_objets_m\303\251lang\303\251s_trois_objets.yaml" create mode 100644 "lm_eval/tasks/leaderboard-french/bbh_mc/s\303\251quences_temporelles.yaml" create mode 100644 lm_eval/tasks/leaderboard-french/bbh_mc/toile_de_mensonges.yaml create mode 100644 lm_eval/tasks/leaderboard-french/gpqa/_leaderboard_gpqa.yaml create mode 100644 lm_eval/tasks/leaderboard-french/gpqa/_template_yaml create mode 100644 lm_eval/tasks/leaderboard-french/gpqa/gpqa_diamond_zeroshot.yaml create mode 100644 lm_eval/tasks/leaderboard-french/gpqa/gpqa_extended_zeroshot.yaml create mode 100644 lm_eval/tasks/leaderboard-french/gpqa/gpqa_main_zeroshot.yaml create mode 100644 lm_eval/tasks/leaderboard-french/gpqa/utils.py create mode 100644 lm_eval/tasks/leaderboard-french/ifeval/_leaderboard_instruction_following.yaml create mode 100644 
lm_eval/tasks/leaderboard-french/ifeval/ifeval.yaml create mode 100644 lm_eval/tasks/leaderboard-french/ifeval/instructions.py create mode 100644 lm_eval/tasks/leaderboard-french/ifeval/instructions_registry.py create mode 100644 lm_eval/tasks/leaderboard-french/ifeval/instructions_util.py create mode 100644 lm_eval/tasks/leaderboard-french/ifeval/utils.py create mode 100644 lm_eval/tasks/leaderboard-french/leaderboard.yaml create mode 100644 lm_eval/tasks/leaderboard-french/math/_leaderboard_math.yaml create mode 100644 lm_eval/tasks/leaderboard-french/math/_template_yaml create mode 100644 lm_eval/tasks/leaderboard-french/math/math_algebra.yaml create mode 100644 lm_eval/tasks/leaderboard-french/math/math_counting_and_prob.yaml create mode 100644 lm_eval/tasks/leaderboard-french/math/math_geometry.yaml create mode 100644 lm_eval/tasks/leaderboard-french/math/math_intermediate_algebra.yaml create mode 100644 lm_eval/tasks/leaderboard-french/math/math_num_theory.yaml create mode 100644 lm_eval/tasks/leaderboard-french/math/math_prealgebra.yaml create mode 100644 lm_eval/tasks/leaderboard-french/math/math_precalculus.yaml create mode 100644 lm_eval/tasks/leaderboard-french/math/utils.py create mode 100644 lm_eval/tasks/leaderboard-french/mmlu_pro/mmlu_pro.yaml create mode 100644 lm_eval/tasks/leaderboard-french/mmlu_pro/utils.py create mode 100644 lm_eval/tasks/leaderboard-french/musr/_musr.yaml create mode 100644 lm_eval/tasks/leaderboard-french/musr/_template_yaml create mode 100644 lm_eval/tasks/leaderboard-french/musr/musr_murder_mysteries.yaml create mode 100644 lm_eval/tasks/leaderboard-french/musr/musr_object_placements.yaml create mode 100644 lm_eval/tasks/leaderboard-french/musr/musr_team_allocation.yaml create mode 100644 lm_eval/tasks/leaderboard-french/musr/utils.py diff --git a/lm_eval/tasks/leaderboard-french/README.md b/lm_eval/tasks/leaderboard-french/README.md new file mode 100644 index 0000000000..0344bdd355 --- /dev/null +++ 
b/lm_eval/tasks/leaderboard-french/README.md @@ -0,0 +1,312 @@ +# Leaderboard evaluations +Our goal with this group is to diversify the stable evaluation sets by extending them to a new category of models, namely multilingual models, and in the process create a version of the evaluations that is unchanging through time and will power the Open LLM French Leaderboard on HuggingFace. + +As we want to evaluate models across capabilities, the list currently contains: +- BBH-fr (3-shots, multichoice) +- GPQA-fr (0-shot, multichoice) +- mmmlu-fr (5-shots, multichoice) +- Musr-fr (0-shot, multichoice) +- ifeval-fr (0-shot, generative) +- Math-lvl-5-fr (4-shots, generative, minerva version) + + +Details on the choice of these evals can be found [here](https://huggingface.co/le-leadboard)! + +## BigBenchHard (BBH) + +A suite of 23 challenging BIG-Bench tasks which we call BIG-Bench Hard (BBH). +These are the tasks for which prior language model evaluations did not +outperform the average human-rater. + +### Paper + +Title: Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them + +BIG-Bench (Srivastava et al., 2022) is a diverse evaluation suite that focuses on tasks believed to be beyond the capabilities of current language models. Language models have already made good progress on this benchmark, with the best model in the BIG-Bench paper outperforming average reported human-rater results on 65% of the BIG-Bench tasks via few-shot prompting. But on what tasks do language models fall short of average human-rater performance, and are those tasks actually unsolvable by current language models? +In this work, we focus on a suite of 23 challenging BIG-Bench tasks which we call BIG-Bench Hard (BBH). These are the tasks for which prior language model evaluations did not outperform the average human-rater. 
We find that applying chain-of-thought (CoT) prompting to BBH tasks enables PaLM to surpass the average human-rater performance on 10 of the 23 tasks, and Codex (code-davinci-002) to surpass the average human-rater performance on 17 of the 23 tasks. Since many tasks in BBH require multi-step reasoning, few-shot prompting without CoT, as done in the BIG-Bench evaluations (Srivastava et al., 2022), substantially underestimates the best performance and capabilities of language models, which is better captured via CoT prompting. As further analysis, we explore the interaction between CoT and model scale on BBH, finding that CoT enables emergent task performance on several BBH tasks with otherwise flat scaling curves. + + +- paper: https://huggingface.co/papers/2210.09261 +- Homepage: https://github.com/suzgunmirac/BIG-Bench-Hard + +### Citation + +``` +@article{suzgun2022challenging, + title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them}, + author={Suzgun, Mirac and Scales, Nathan and Sch{\"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and and Wei, Jason}, + journal={arXiv preprint arXiv:2210.09261}, + year={2022} +} +``` + +### Groups + +- `leaderboard_bbh_fr` + +### Tasks + +- `leaderboard_bbh_boolean_expressions_fr` +- `leaderboard_bbh_causal_judgement_fr` +- `leaderboard_bbh_date_understanding_fr` +- `leaderboard_bbh_disambiguation_qa_fr` +- `leaderboard_bbh_dyck_languages_fr` +- `leaderboard_bbh_formal_fallacies_fr` +- `leaderboard_bbh_geometric_shapes_fr` +- `leaderboard_bbh_hyperbaton_fr` +- `leaderboard_bbh_logical_deduction_five_objects_fr` +- `leaderboard_bbh_logical_deduction_seven_objects_fr` +- `leaderboard_bbh_logical_deduction_three_objects_fr` +- `leaderboard_bbh_movie_recommendation_fr` +- `leaderboard_bbh_multistep_arithmetic_two_fr` +- `leaderboard_bbh_navigate_fr` +- `leaderboard_bbh_object_counting_fr` +- 
`leaderboard_bbh_penguins_in_a_table_fr` +- `leaderboard_bbh_reasoning_about_colored_objects_fr` +- `leaderboard_bbh_ruin_names_fr` +- `leaderboard_bbh_salient_translation_error_detection_fr` +- `leaderboard_bbh_snarks_fr` +- `leaderboard_bbh_sports_understanding_fr` +- `leaderboard_bbh_temporal_sequences_fr` +- `leaderboard_bbh_tracking_shuffled_objects_five_objects_fr` +- `leaderboard_bbh_tracking_shuffled_objects_seven_objects_fr` +- `leaderboard_bbh_tracking_shuffled_objects_three_objects_fr` +- `leaderboard_bbh_web_of_lies_fr` +- `leaderboard_bbh_word_sorting_fr` + +## GPQA + +### Paper + +Title: GPQA: A Graduate-Level Google-Proof Q&A Benchmark + +We present GPQA, a challenging dataset of 448 multiple-choice questions written +by domain experts in biology, physics, and chemistry. We ensure that the +questions are high-quality and extremely difficult: experts who have or are +pursuing PhDs in the corresponding domains reach 65% accuracy (74% when +discounting clear mistakes the experts identified in retrospect), while highly +skilled non-expert validators only reach 34% accuracy, despite spending on +average over 30 minutes with unrestricted access to the web (i.e., the +questions are “Google-proof”). The questions are also difficult for +state-of-the-art AI systems, with our strongest GPT-4–based baseline achieving +39% accuracy. If we are to use future AI systems to help us answer very hard +questions—for example, when developing new scientific knowledge—we need to +develop scalable oversight methods that enable humans to supervise their +outputs, which may be difficult even if the supervisors are themselves skilled +and knowledgeable. The difficulty of GPQA both for skilled non-experts and +frontier AI systems should enable realistic scalable oversight experiments, +which we hope can help devise ways for human experts to reliably get truthful +information from AI systems that surpass human capabilities. 
+ +- Paper: https://huggingface.co/papers/2311.12022 +- Homepage: https://github.com/idavidrein/gpqa/tree/main + +### Citation + +``` +@misc{rein2023gpqa, + title={GPQA: A Graduate-Level Google-Proof Q&A Benchmark}, + author={David Rein and Betty Li Hou and Asa Cooper Stickland and Jackson Petty and Richard Yuanzhe Pang and Julien Dirani and Julian Michael and Samuel R. Bowman}, + year={2023}, + eprint={2311.12022}, + archivePrefix={arXiv}, + primaryClass={cs.AI} +} +``` + +### Groups + +- `leaderboard_gpqa_fr` + +### Tasks + +- `leaderboard_gpqa_extended_fr` +- `leaderboard_gpqa_diamond_fr` +- `leaderboard_gpqa_main_fr` + +## IFEval + +### Paper + +Title: Instruction-Following Evaluation for Large Language Models + +One core capability of Large Language Models (LLMs) is to follow natural +language instructions. However, the evaluation of such abilities is not +standardized: Human evaluations are expensive, slow, and not objectively +reproducible, while LLM-based auto-evaluation is potentially biased or limited +by the ability of the evaluator LLM. To overcome these issues, we introduce +Instruction-Following Eval (IFEval) for large language models. IFEval is a +straightforward and easy-to-reproduce evaluation benchmark. It focuses on a set +of "verifiable instructions" such as "write in more than 400 words" and +"mention the keyword of AI at least 3 times". We identified 25 types of those +verifiable instructions and constructed around 500 prompts, with each prompt +containing one or more verifiable instructions. We show evaluation results of +two widely available LLMs on the market. 
+ +- Paper: https://huggingface.co/papers/2311.07911 +- Homepage: https://github.com/google-research/google-research/tree/master/instruction_following_eval + +### Citation + +``` +@article{zhou2023instructionfollowing, + title={Instruction-Following Evaluation for Large Language Models}, + author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou}, + journal={arXiv preprint arXiv:2311.07911}, + year={2023}, +} +``` + +### Tasks + +- `leaderboard_ifeval_fr` + +## MATH-hard + +This is the 4-shot variant of Minerva MATH, keeping only the level 5 questions. + +### Paper + +Title: Measuring Mathematical Problem Solving With the MATH Dataset + +Many intellectual endeavors require mathematical problem solving, but this +skill remains beyond the capabilities of computers. To measure this ability in +machine learning models, we introduce MATH, a new dataset of 12,500 challenging +competition mathematics problems. Each problem in MATH has a full step-by-step +solution which can be used to teach models to generate answer derivations and +explanations. + +NOTE: The few-shot examples and the generated answer extraction are based on +[Minerva](https://arxiv.org/abs/2206.14858) and exact match equivalence is +calculated using the `sympy` library. This requires additional dependencies, +which can be installed via the `lm-eval[math]` extra. 
+ +- Paper: https://huggingface.co/papers/2103.03874 +- Homepage: https://github.com/hendrycks/math + + +### Citation + +``` +@article{hendrycksmath2021, + title={Measuring Mathematical Problem Solving With the MATH Dataset}, + author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt}, + journal={NeurIPS}, + year={2021} +} +@misc{2206.14858, +Author = {Aitor Lewkowycz and Anders Andreassen and David Dohan and Ethan Dye and Henryk Michalewski and Vinay Ramasesh and Ambrose Slone and Cem Anil and Imanol Schlag and Theo Gutman-Solo and Yuhuai Wu and Behnam Neyshabur and Guy Gur-Ari and Vedant Misra}, +Title = {Solving Quantitative Reasoning Problems with Language Models}, +Year = {2022}, +Eprint = {arXiv:2206.14858}, +} +``` + +### Groups + +- `leaderboard_math_hard_fr` + +### Tasks + +- `leaderboard_math_algebra_hard_fr` +- `leaderboard_math_counting_and_prob_hard_fr` +- `leaderboard_math_geometry_hard_fr` +- `leaderboard_math_intermediate_algebra_hard_fr` +- `leaderboard_math_num_theory_hard_fr` +- `leaderboard_math_prealgebra_hard_fr` +- `leaderboard_math_precalc_hard_fr` + + +## MMMLU-fr + +### Model Card + +MMMLU-fr is the french split of the dataset openai/MMMLU + +Multilingual Massive Multitask Language Understanding (MMMLU) +The MMLU is a widely recognized benchmark of general knowledge attained by AI models. It covers a broad range of topics from 57 different categories, covering elementary-level knowledge up to advanced professional subjects like law, physics, history, and computer science. + +We translated the MMLU’s test set into 14 languages using professional human translators. Relying on human translators for this evaluation increases confidence in the accuracy of the translations, especially for low-resource languages like Yoruba. We are publishing the professional human translations and the code we use to run the evaluations. 
+ +This effort reflects our commitment to improving the multilingual capabilities of AI models, ensuring they perform accurately across languages, particularly for underrepresented communities. By prioritizing high-quality translations, we aim to make AI technology more inclusive and effective for users worldwide. + +- Paper: https://arxiv.org/abs/2009.03300 +- Homepage: https://github.com/openai/simple-evals + +### Citation + +``` +@misc{wang2024mmluprorobustchallengingmultitask, + title={MMLU-Pro: A More Robust and Challenging Multi-Task Language + Understanding Benchmark}, + author={Yubo Wang and Xueguang Ma and Ge Zhang and Yuansheng Ni and Abhranil Chandra and Shiguang Guo and Weiming Ren and Aaran Arulraj and Xuan He and Ziyan Jiang and Tianle Li and Max Ku and Kai Wang and Alex Zhuang and Rongqi Fan and Xiang Yue and Wenhu Chen}, + year={2024}, + eprint={2406.01574}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2406.01574}, +} +``` + +### Groups + +- `leaderboard_mmlu_fr` + +### Tasks + +- `leaderboard_mmlu_fr` + + +## Musr + +### Paper + +Title: MuSR: Testing the Limits of Chain-of-thought with Multistep Soft +Reasoning + +While large language models (LLMs) equipped with techniques like +chain-of-thought prompting have demonstrated impressive capabilities, they +still fall short in their ability to reason robustly in complex settings. +However, evaluating LLM reasoning is challenging because system capabilities +continue to grow while benchmark datasets for tasks like logical deduction have +remained static. We introduce MuSR, a dataset for evaluating language models on +multistep soft reasoning tasks specified in a natural language narrative. This +dataset has two crucial features. 
First, it is created through a novel +neurosymbolic synthetic-to-natural generation algorithm, enabling the +construction of complex reasoning instances that challenge GPT-4 (e.g., murder +mysteries roughly 1000 words in length) and which can be scaled further as more +capable LLMs are released. Second, our dataset instances are free text +narratives corresponding to real-world domains of reasoning; this makes it +simultaneously much more challenging than other synthetically-crafted +benchmarks while remaining realistic and tractable for human annotators to +solve with high accuracy. We evaluate a range of LLMs and prompting techniques +on this dataset and characterize the gaps that remain for techniques like +chain-of-thought to perform robust reasoning. + +- Paper: https://huggingface.co/papers/2310.16049 +- Homepage: https://zayne-sprague.github.io/MuSR/ + +### Citation + +``` +@misc{sprague2024musrtestinglimitschainofthought, + title={MuSR: Testing the Limits of Chain-of-thought with Multistep Soft + Reasoning}, + author={Zayne Sprague and Xi Ye and Kaj Bostrom and Swarat Chaudhuri and Greg Durrett}, + year={2024}, + eprint={2310.16049}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2310.16049}, +} +``` + +### Groups + +- `leaderboard_musr_fr` + +### Tasks + +- `leaderboard_musr_murder_mysteries` +- `leaderboard_musr_object_placements` +- `leaderboard_musr_team_allocation` diff --git a/lm_eval/tasks/leaderboard-french/bbh_mc/_fewshot_template_yaml b/lm_eval/tasks/leaderboard-french/bbh_mc/_fewshot_template_yaml new file mode 100644 index 0000000000..44bb854fa7 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/bbh_mc/_fewshot_template_yaml @@ -0,0 +1,16 @@ +dataset_path: le-leadboard/bbh-fr +output_type: multiple_choice +test_split: test +doc_to_text: 'Q: {{input}} + + A:' +doc_to_target: "{{target}}" +metric_list: + - metric: acc_norm + aggregation: mean + higher_is_better: true +num_fewshot: 3 +fewshot_config: + sampler: 
first_n +metadata: + version: 0.0 diff --git a/lm_eval/tasks/leaderboard-french/bbh_mc/_leaderboard_bbh.yaml b/lm_eval/tasks/leaderboard-french/bbh_mc/_leaderboard_bbh.yaml new file mode 100644 index 0000000000..3bb2a987e5 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/bbh_mc/_leaderboard_bbh.yaml @@ -0,0 +1,28 @@ +group: leaderboard_bbh_fr +task: + - leaderboard_bbh_expressions_booléennes + - leaderboard_bbh_jugement_causal + - leaderboard_bbh_compréhension_de_la_date + - leaderboard_bbh_désambiguïsation_qa + - leaderboard_bbh_sophismes_formels + - leaderboard_bbh_formes_géométriques + - leaderboard_bbh_hyperbate + - leaderboard_bbh_déduction_logique_cinq_objets + - leaderboard_bbh_déduction_logique_sept_objets + - leaderboard_bbh_déduction_logique_trois_objets + - leaderboard_bbh_recommandation_de_film + - leaderboard_bbh_naviguer + - leaderboard_bbh_comptage_d_objets + - leaderboard_bbh_pingouins_sur_une_table + - leaderboard_bbh_raisonnement_sur_les_objets_colorés + - leaderboard_bbh_sarcasmes + - leaderboard_bbh_compréhension_des_sports + - leaderboard_bbh_séquences_temporelles + - leaderboard_bbh_suivi_objets_mélangés_cinq_objets + - leaderboard_bbh_suivi_objets_mélangés_sept_objets + - leaderboard_bbh_suivi_objets_mélangés_trois_objets + - leaderboard_bbh_toile_de_mensonges +aggregate_metric_list: + - metric: acc_norm + aggregation: mean + weight_by_size: true \ No newline at end of file diff --git "a/lm_eval/tasks/leaderboard-french/bbh_mc/compr\303\251hension_de_la_date.yaml" "b/lm_eval/tasks/leaderboard-french/bbh_mc/compr\303\251hension_de_la_date.yaml" new file mode 100644 index 0000000000..e7c7c9eccb --- /dev/null +++ "b/lm_eval/tasks/leaderboard-french/bbh_mc/compr\303\251hension_de_la_date.yaml" @@ -0,0 +1,58 @@ +dataset_name: compréhension_de_la_date +description: > + Déduire la date à partir du contexte. 
+doc_to_choice: ["(A)", "(B)", "(C)", "(D)", "(E)", "(F)"] +fewshot_config: + sampler: first_n + samples: + - input: | + Aujourd'hui, c'est la veille de Noël de 1937. Quelle est la date il y a 10 jours en MM/JJ/AAAA ? + + Options : + + (A) 14/12/2026 + + (B) 14/12/1950 + + (C) 14/12/2007 + + (D) 14/12/1937 + + (E) 14/07/1938 + + (F) 14/12/1988 + target: (D) + - input: | + Demain, nous serons le 12/11/2019. Quelle est la date d'il y a un an à partir d'aujourd'hui en MM/JJ/AAAA ? + + Options : + + (A) 04/09/2018 + + (B) 11/11/2018 + + (C) 25/08/2018 + + (D) 02/11/2018 + + (E) 04/11/2018 + target: (B) + - input: | + Jane et John se sont mariés le 2 janvier 1958. Aujourd'hui, c'est leur 5e anniversaire. Quelle est la date de demain en MM/JJ/AAAA ? + + Options : + + (A) 11/01/1961 + + (B) 03/01/1963 + + (C) 18/01/1961 + + (D) 14/10/1960 + + (E) 03/01/1982 + + (F) 03/12/1960 + target: (B) +include: _fewshot_template_yaml +task: leaderboard_bbh_compréhension_de_la_date diff --git "a/lm_eval/tasks/leaderboard-french/bbh_mc/compr\303\251hension_des_sports.yaml" "b/lm_eval/tasks/leaderboard-french/bbh_mc/compr\303\251hension_des_sports.yaml" new file mode 100644 index 0000000000..c44b95d129 --- /dev/null +++ "b/lm_eval/tasks/leaderboard-french/bbh_mc/compr\303\251hension_des_sports.yaml" @@ -0,0 +1,19 @@ +dataset_name: compréhension_des_sports +description: 'Déterminez si une phrase artificiellement construite en rapport avec + le sport est plausible ou Non. + + ' +doc_to_choice: ["Oui", "Non"] +fewshot_config: + sampler: first_n + samples: + - input: La phrase suivante est-elle plausible ? "Bam Adebayo a marqué un layup inversé + lors de la finale de la Conférence Ouest." + target: 'Oui' + - input: La phrase suivante est-elle plausible ? "Santi Cazorla a marqué un touchdown." + target: 'Non' + - input: La phrase suivante est-elle plausible ? "DeMar DeRozan a été sanctionné + pour goaltending." 
+ target: 'Oui' +include: _fewshot_template_yaml +task: leaderboard_bbh_compréhension_des_sports diff --git a/lm_eval/tasks/leaderboard-french/bbh_mc/comptage_d'objets.yaml b/lm_eval/tasks/leaderboard-french/bbh_mc/comptage_d'objets.yaml new file mode 100644 index 0000000000..8dd632b265 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/bbh_mc/comptage_d'objets.yaml @@ -0,0 +1,21 @@ +dataset_name: comptage_d_objets +description: > + Questions qui impliquent d'énumérer des objets et de demander au modèle de les compter. +doc_to_choice: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18"] +fewshot_config: + sampler: first_n + samples: + - input: | + J'ai une mûre, une clarinette, une nectarine, une prune, une fraise, une banane, + une flûte, une orange et un violon. Combien de fruits ai-je ? + target: '6' + - input: | + J'ai une orange, une framboise, deux pêches, une mûre, une pomme, un raisin, + une nectarine et trois prunes. Combien de fruits ai-je ? + target: '11' + - input: | + J'ai une tête de laitue, une tête de brocoli, un oignon, une branche de céleri, + deux carottes, une gousse d'ail et une patate douce. Combien de légumes ai-je ? + target: '8' +include: _fewshot_template_yaml +task: leaderboard_bbh_comptage_d_objets diff --git "a/lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251duction_logique_cinq_objets.yaml" "b/lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251duction_logique_cinq_objets.yaml" new file mode 100644 index 0000000000..a003d848ce --- /dev/null +++ "b/lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251duction_logique_cinq_objets.yaml" @@ -0,0 +1,52 @@ +dataset_name: déduction_logique_cinq_objets +description: > + Une tâche de déduction logique qui nécessite de déduire l'ordre d'une séquence d'objets. 
+doc_to_choice: ["(A)", "(B)", "(C)", "(D)", "(E)"] +fewshot_config: + sampler: first_n + samples: + - input: | + Les paragraphes suivants décrivent chacun un ensemble de trois objets disposés + dans un ordre fixe. Les déclarations sont logiquement cohérentes dans chaque paragraphe. + Dans un tournoi de golf, il y avait trois golfeurs : Amy, Eli et Eve. Eve a terminé + devant Amy. Eli a terminé derrière Amy. + + Options : + + (A) Amy a terminé dernière + + (B) Eli a terminé dernier + + (C) Eve a terminé dernière + target: (B) + - input: | + Les paragraphes suivants décrivent chacun un ensemble de trois objets disposés + dans un ordre fixe. Les déclarations sont logiquement cohérentes dans chaque paragraphe. + Sur une étagère, il y a trois livres : un livre blanc, un livre vert et un livre + orange. Le livre vert est à droite du livre blanc. Le livre orange est tout à droite. + + Options : + + (A) Le livre blanc est tout à gauche + + (B) Le livre vert est tout à gauche + + (C) Le livre orange est tout à gauche + target: (A) + - input: | + Les paragraphes suivants décrivent chacun un ensemble de trois objets disposés + dans un ordre fixe. Les déclarations sont logiquement cohérentes dans chaque paragraphe. + Sur une étagère, il y a trois livres : un livre rouge, un livre gris et un livre blanc. + Le livre blanc est à gauche du livre gris. Le livre rouge est le deuxième en partant + de la gauche. 
+ + Options : + + (A) Le livre rouge est tout à gauche + + (B) Le livre gris est tout à gauche + + (C) Le livre blanc est tout à gauche + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_déduction_logique_cinq_objets diff --git "a/lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251duction_logique_sept_objets.yaml" "b/lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251duction_logique_sept_objets.yaml" new file mode 100644 index 0000000000..941b1ccf4d --- /dev/null +++ "b/lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251duction_logique_sept_objets.yaml" @@ -0,0 +1,52 @@ +dataset_name: déduction_logique_sept_objets +description: > + Une tâche de déduction logique qui nécessite de déduire l'ordre d'une séquence d'objets. +doc_to_choice: ["(A)", "(B)", "(C)", "(D)", "(E)", "(F)", "(G)"] +fewshot_config: + sampler: first_n + samples: + - input: | + Les paragraphes suivants décrivent chacun un ensemble de trois objets disposés + dans un ordre fixe. Les déclarations sont logiquement cohérentes dans chaque paragraphe. + Dans un tournoi de golf, il y avait trois golfeurs : Amy, Eli et Eve. Eve a terminé + devant Amy. Eli a terminé derrière Amy. + + Options : + + (A) Amy a terminé dernière + + (B) Eli a terminé dernier + + (C) Eve a terminé dernière + target: (B) + - input: | + Les paragraphes suivants décrivent chacun un ensemble de trois objets disposés + dans un ordre fixe. Les déclarations sont logiquement cohérentes dans chaque paragraphe. + Sur une étagère, il y a trois livres : un livre blanc, un livre vert et un livre + orange. Le livre vert est à droite du livre blanc. Le livre orange est tout à droite. + + Options : + + (A) Le livre blanc est tout à gauche + + (B) Le livre vert est tout à gauche + + (C) Le livre orange est tout à gauche + target: (A) + - input: | + Les paragraphes suivants décrivent chacun un ensemble de trois objets disposés + dans un ordre fixe. Les déclarations sont logiquement cohérentes dans chaque paragraphe. 
+ Sur une étagère, il y a trois livres : un livre rouge, un livre gris et un livre blanc. + Le livre blanc est à gauche du livre gris. Le livre rouge est le deuxième en partant + de la gauche. + + Options : + + (A) Le livre rouge est tout à gauche + + (B) Le livre gris est tout à gauche + + (C) Le livre blanc est tout à gauche + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_déduction_logique_sept_objets diff --git "a/lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251duction_logique_trois_objets.yaml" "b/lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251duction_logique_trois_objets.yaml" new file mode 100644 index 0000000000..c3e3372b12 --- /dev/null +++ "b/lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251duction_logique_trois_objets.yaml" @@ -0,0 +1,52 @@ +dataset_name: déduction_logique_trois_objets +description: > + Une tâche de déduction logique qui nécessite de déduire l'ordre d'une séquence d'objets. +doc_to_choice: ["(A)", "(B)", "(C)"] +fewshot_config: + sampler: first_n + samples: + - input: | + Les paragraphes suivants décrivent chacun un ensemble de trois objets disposés + dans un ordre fixe. Les déclarations sont logiquement cohérentes dans chaque paragraphe. + Dans un tournoi de golf, il y avait trois golfeurs : Amy, Eli et Eve. Eve a terminé + devant Amy. Eli a terminé derrière Amy. + + Options : + + (A) Amy a terminé dernière + + (B) Eli a terminé dernier + + (C) Eve a terminé dernière + target: (B) + - input: | + Les paragraphes suivants décrivent chacun un ensemble de trois objets disposés + dans un ordre fixe. Les déclarations sont logiquement cohérentes dans chaque paragraphe. + Sur une étagère, il y a trois livres : un livre blanc, un livre vert et un livre + orange. Le livre vert est à droite du livre blanc. Le livre orange est tout à droite. 
+ + Options : + + (A) Le livre blanc est tout à gauche + + (B) Le livre vert est tout à gauche + + (C) Le livre orange est tout à gauche + target: (A) + - input: | + Les paragraphes suivants décrivent chacun un ensemble de trois objets disposés + dans un ordre fixe. Les déclarations sont logiquement cohérentes dans chaque paragraphe. + Sur une étagère, il y a trois livres : un livre rouge, un livre gris et un livre blanc. + Le livre blanc est à gauche du livre gris. Le livre rouge est le deuxième en partant + de la gauche. + + Options : + + (A) Le livre rouge est tout à gauche + + (B) Le livre gris est tout à gauche + + (C) Le livre blanc est tout à gauche + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_déduction_logique_trois_objets diff --git "a/lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251sambigu\303\257sation_qa.yaml" "b/lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251sambigu\303\257sation_qa.yaml" new file mode 100644 index 0000000000..e6de27cbbb --- /dev/null +++ "b/lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251sambigu\303\257sation_qa.yaml" @@ -0,0 +1,51 @@ +dataset_name: désambiguïsation_qa +description: > + Clarifiez le sens des phrases avec des pronoms ambigus. +doc_to_choice: ["(A)", "(B)", "(C)"] +fewshot_config: + sampler: first_n + samples: + - input: | + Dans les phrases suivantes, expliquez l'antécédent du pronom (à quoi le pronom + fait référence), ou indiquez que c'est ambigu. + + Phrase : Le chef a dit au conseiller qu'ils ont pris un jour de congé. + + Options : + + (A) Le chef a pris un jour de congé + + (B) Le conseiller a pris un jour de congé + + (C) Ambigu + target: (A) + - input: | + Dans les phrases suivantes, expliquez l'antécédent du pronom (à quoi le pronom + fait référence), ou indiquez que c'est ambigu. + + Phrase : Le manager a envoyé un message au secrétaire, mais il n'a pas encore répondu. 
+ + Options : + + (A) Le secrétaire n'a pas encore répondu + + (B) Le manager n'a pas encore répondu + + (C) Ambigu + target: (A) + - input: | + Dans les phrases suivantes, expliquez l'antécédent du pronom (à quoi le pronom + fait référence), ou indiquez que c'est ambigu. + + Phrase : Bailey prévoit de rencontrer le directeur dans son bureau. + + Options : + + (A) Ce sera le bureau de Bailey + + (B) Ce sera le bureau du directeur + + (C) Ambigu + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_désambiguïsation_qa diff --git "a/lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251tection_d'erreurs_de_traduction_sailantes.yaml" "b/lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251tection_d'erreurs_de_traduction_sailantes.yaml" new file mode 100644 index 0000000000..39c5271f4a --- /dev/null +++ "b/lm_eval/tasks/leaderboard-french/bbh_mc/d\303\251tection_d'erreurs_de_traduction_sailantes.yaml" @@ -0,0 +1,72 @@ +dataset_name: détection_d'erreurs_de_traduction_sailantes +description: > + Détectez le type d'erreur dans une traduction anglaise d'une phrase source allemande. +doc_to_choice: ["(A)", "(B)", "(C)", "(D)", "(E)", "(F)"] +fewshot_config: + sampler: first_n + samples: + - input: | + Les traductions suivantes de l'allemand à l'anglais contiennent une erreur particulière. + Cette erreur sera l'un des types suivants : + Entités Nommées : Une entité (noms, lieux, localisations, etc.) est changée pour une autre entité. + Valeurs Numériques : Les valeurs numériques (ordinaux ou cardinaux), les dates et/ou les unités sont modifiées. + Modificateurs ou Adjectifs : Les modificateurs et adjectifs associés à un nom sont changés. + Négation ou Antonymes : Une négation est introduite ou retirée, ou des comparatifs sont changés en leurs antonymes. + Faits : Des erreurs factuelles triviales, non liées aux catégories ci-dessus, sont introduites dans les traductions. + Contenu Supprimé : Une clause significative est supprimée dans la traduction. 
+ Veuillez identifier cette erreur. + Source: In der Liste der Baudenkmale in Lenzen (Elbe) sind alle Baudenkmale der brandenburgischen Stadt Lenzen (Elbe) und ihrer Ortsteile aufgelistet. + + Traduction: Dans la liste des monuments architecturaux de Lenzen, tous les monuments architecturaux de la ville de Brandebourg de Lenzen et de ses districts sont répertoriés. + + La traduction contient une erreur concernant + + Options : + + (A) Modificateurs ou Adjectifs + + (B) Valeurs Numériques + + (C) Négation ou Antonymes + + (D) Entités Nommées + + (E) Contenu Supprimé + + (F) Faits + target: (D) + - input: | + Les traductions suivantes de l'allemand à l'anglais contiennent une erreur particulière. + Cette erreur sera l'un des types suivants : + Entités Nommées : Une entité (noms, lieux, localisations, etc.) est changée pour une autre entité. + Valeurs Numériques : Les valeurs numériques (ordinaux ou cardinaux), les dates et/ou les unités sont modifiées. + Modificateurs ou Adjectifs : Les modificateurs et adjectifs associés à un nom sont changés. + Négation ou Antonymes : Une négation est introduite ou retirée, ou des comparatifs sont changés en leurs antonymes. + Faits : Des erreurs factuelles triviales, non liées aux catégories ci-dessus, sont introduites dans les traductions. + Contenu Supprimé : Une clause significative est supprimée dans la traduction. + Veuillez identifier cette erreur. + Source: Auf dieser Seite sind die Baudenkmäler der oberbayerischen Großen Kreisstadt Landsberg am Lech zusammengestellt. + + Traduction: Sur cette page, les monuments architecturaux de la ville de Landsberg am Lech sont compilés. + + La traduction contient une erreur concernant + + Options : + + (A) Modificateurs ou Adjectifs + + (B) Valeurs Numériques + + (C) Négation ou Antonymes + + (D) Entités Nommées + + (E) Contenu Supprimé + + (F) Faits + target: (E) + - input: | + Les traductions suivantes de l'allemand à l'anglais contiennent une erreur particulière. 
+ Cette erreur sera l'un des types suivants : + Entités Nommées : Une entité (noms, lieux, localisations, etc.) est changée pour une autre entité. + Valeurs Numériques : Les valeurs numériques (ordinaux ou cardinaux diff --git "a/lm_eval/tasks/leaderboard-french/bbh_mc/expressions_bool\303\251ennes.yaml" "b/lm_eval/tasks/leaderboard-french/bbh_mc/expressions_bool\303\251ennes.yaml" new file mode 100644 index 0000000000..cc08db087b --- /dev/null +++ "b/lm_eval/tasks/leaderboard-french/bbh_mc/expressions_bool\303\251ennes.yaml" @@ -0,0 +1,15 @@ +dataset_name: expressions_booléennes +description: > + Évaluez le résultat d'une expression booléenne aléatoire. +doc_to_choice: ["Incorrect", "Vrai"] +fewshot_config: + sampler: first_n + samples: + - input: pas ( ( pas pas Vrai ) ) est + target: 'Incorrect' + - input: Vrai et Faux et pas Vrai et Vrai est + target: 'Incorrect' + - input: pas pas ( pas ( Faux ) ) est + target: 'Vrai' +include: _fewshot_template_yaml +task: leaderboard_bbh_expressions_booléennes diff --git "a/lm_eval/tasks/leaderboard-french/bbh_mc/formes_g\303\251om\303\251triques.yaml" "b/lm_eval/tasks/leaderboard-french/bbh_mc/formes_g\303\251om\303\251triques.yaml" new file mode 100644 index 0000000000..b85cf4f1d0 --- /dev/null +++ "b/lm_eval/tasks/leaderboard-french/bbh_mc/formes_g\303\251om\303\251triques.yaml" @@ -0,0 +1,86 @@ +dataset_name: formes_géométriques +description: 'Nommez des formes géométriques à partir de leurs chemins SVG. 
+ + ' +doc_to_choice: ["(A)", "(B)", "(C)", "(D)", "(E)", "(F)", "(G)", "(H)", "(I)", "(J)", "(K)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Cet élément de chemin SVG + dessine un(e) + + Options : + + (A) cercle + + (B) heptagone + + (C) hexagone + + (D) cerf-volant + + (E) ligne + + (F) octogone + + (G) pentagone + + (H) rectangle + + (I) secteur + + (J) triangle' + target: (F) + - input: 'Cet élément de chemin SVG dessine un(e) + + Options : + + (A) cercle + + (B) heptagone + + (C) hexagone + + (D) cerf-volant + + (E) ligne + + (F) octogone + + (G) pentagone + + (H) rectangle + + (I) secteur + + (J) triangle' + target: (G) + - input: 'Cet élément de chemin SVG dessine un(e) + + Options : + + (A) cercle + + (B) heptagone + + (C) hexagone + + (D) cerf-volant + + (E) ligne + + (F) octogone + + (G) pentagone + + (H) rectangle + + (I) secteur + + (J) triangle' + target: (D) +include: _fewshot_template_yaml +task: leaderboard_bbh_formes_géométriques \ No newline at end of file diff --git a/lm_eval/tasks/leaderboard-french/bbh_mc/hyperbate.yaml b/lm_eval/tasks/leaderboard-french/bbh_mc/hyperbate.yaml new file mode 100644 index 0000000000..7a870a77e5 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/bbh_mc/hyperbate.yaml @@ -0,0 +1,36 @@ +dataset_name: hyperbate +description: > + Ordonnez correctement les adjectifs dans des phrases en anglais. 
+doc_to_choice: ["(A)", "(B)"] +fewshot_config: + sampler: first_n + samples: + - input: | + Quelle phrase a le bon ordre des adjectifs : + + Options : + + (A) bateau en caoutchouc terrible + + (B) terrible bateau en caoutchouc + target: (B) + - input: | + Quelle phrase a le bon ordre des adjectifs : + + Options : + + (A) petit bateau d'exercice brésilien répugnant + + (B) brésilien répugnant bateau d'exercice petit + target: (A) + - input: | + Quelle phrase a le bon ordre des adjectifs : + + Options : + + (A) chaussure carrée merveilleuse bleu or + + (B) merveilleuse chaussure carrée bleu or + target: (B) +include: _fewshot_template_yaml +task: leaderboard_bbh_hyperbate diff --git a/lm_eval/tasks/leaderboard-french/bbh_mc/jugement_causal.yaml b/lm_eval/tasks/leaderboard-french/bbh_mc/jugement_causal.yaml new file mode 100644 index 0000000000..a89f8c2446 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/bbh_mc/jugement_causal.yaml @@ -0,0 +1,65 @@ +dataset_name: jugement_causal +description: "Répondez aux questions sur l'attribution causale." +doc_to_choice: ["Oui", "Non"] +fewshot_config: + sampler: first_n + samples: + - input: | + Comment une personne typique répondrait-elle à chacune des questions suivantes sur la causalité ? + + Frank T. avait un conflit de longue date avec son voisin à propos d'un morceau + de terrain et un jour, il a décidé de tirer sur son voisin dans le corps. + Frank T. n'avait aucune expérience avec les armes à feu, sa main a glissé sur + le canon de l'arme, et le tir est parti dans une direction imprévue. + Néanmoins, la balle a ricoché sur un gros rocher à plusieurs mètres de là et + a touché le corps du voisin, causant des blessures importantes. Frank T. a-t-il + intentionnellement tiré sur son voisin dans le corps ? + + Options : + + - Oui + + - Non + target: 'Non' + - input: | + Comment une personne typique répondrait-elle à chacune des questions suivantes sur la causalité ? 
+ + Suzy et Billy travaillent sur un projet très important pour la sécurité de notre + nation. Leur patron leur dit à tous les deux : "Assurez-vous d'être ici + exactement à 9h00. Il est absolument essentiel que vous arriviez à cette heure." + Billy et Suzy arrivent tous les deux à 9h00. Il se trouve qu'un détecteur de + mouvement avait été installé dans la pièce où ils sont arrivés. Le détecteur de + mouvement était configuré pour se déclencher si au moins une personne apparaissait + dans la pièce en même temps. Le détecteur de mouvement s'est donc déclenché. + Billy a-t-il causé le déclenchement du détecteur de mouvement ? + + Options : + + - Oui + + - Non + target: 'Oui' + - input: | + Comment une personne typique répondrait-elle à chacune des questions suivantes sur la causalité ? + + George et sa sœur Lena se retrouvent chez leurs parents pour Thanksgiving. + Tandis que George vient d'entrer en faculté de médecine, Lena est mécontente + de son mariage et a récemment perdu son emploi. Au cours de la journée, George + et Lena se disputent à plusieurs reprises. Plus tard dans l'après-midi, ils + jouent à un jeu de fléchettes. Ils se partagent les deux premiers matchs, et + le troisième est serré jusqu'à la fin. Le gagnant sera déterminé par le dernier + tir de George. S'il vise une zone de points élevés, il gagne ; s'il vise une + zone de points faibles, Lena gagne. George pense au moment difficile que Lena + traverse et il veut vraiment la laisser gagner. Il vise la zone de points + faibles. Il prépare son tir, et la fléchette atterrit dans la zone de points + faibles. Après son tir, Lena gagne le jeu et est très heureuse. George a-t-il + intentionnellement visé la zone de points faibles ? 
+ + Options : + + - Oui + + - Non + target: 'Oui' +include: _fewshot_template_yaml +task: leaderboard_bbh_jugement_causal diff --git a/lm_eval/tasks/leaderboard-french/bbh_mc/naviguer.yaml b/lm_eval/tasks/leaderboard-french/bbh_mc/naviguer.yaml new file mode 100644 index 0000000000..36b704d0c3 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/bbh_mc/naviguer.yaml @@ -0,0 +1,41 @@ +dataset_name: naviguer +description: > + Étant donné une série d'instructions de navigation, déterminez si l'on revient au point de départ. +doc_to_choice: ["Oui", "Non"] +fewshot_config: + sampler: first_n + samples: + - input: | + Si vous suivez ces instructions, revenez-vous au point de départ ? + Tournez à gauche. Faites demi-tour. Tournez à gauche. Faites 7 pas. Faites 2 pas. Faites 4 pas. + Faites 8 pas. + + Options : + + - Oui + + - Non + target: 'Non' + - input: | + Si vous suivez ces instructions, revenez-vous au point de départ ? + Faites demi-tour. Faites 1 pas. Faites 6 pas. Faites demi-tour. Faites 6 pas. Faites 9 pas. + Faites 1 pas. + + Options : + + - Oui + + - Non + target: 'Non' + - input: | + Si vous suivez ces instructions, revenez-vous au point de départ ? + Faites toujours face vers l'avant. Faites 2 pas à droite. Faites 9 pas à gauche. Faites 7 pas à droite. + + Options : + + - Oui + + - Non + target: 'Oui' +include: _fewshot_template_yaml +task: leaderboard_bbh_naviguer diff --git a/lm_eval/tasks/leaderboard-french/bbh_mc/pingouins_sur_une_table.yaml b/lm_eval/tasks/leaderboard-french/bbh_mc/pingouins_sur_une_table.yaml new file mode 100644 index 0000000000..c4ebacceb1 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/bbh_mc/pingouins_sur_une_table.yaml @@ -0,0 +1,85 @@ +dataset_name: pingouins_sur_une_table +description: > + Répondez aux questions sur une table de pingouins et leurs attributs. 
+doc_to_choice: ["(A)", "(B)", "(C)", "(D)", "(E)"] +fewshot_config: + sampler: first_n + samples: + - input: | + Voici une table où la première ligne est un en-tête et chaque ligne suivante + représente un pingouin : nom, âge, taille (cm), poids (kg) + + Louis, 7, 50, 11 + Bernard, 5, 80, 13 + Vincent, 9, 60, 11 + Gwen, 8, 70, 15 + + Par exemple : l'âge de Louis est 7 ans, le poids de Gwen est de 15 kg, la taille de Bernard est de 80 cm. + Nous ajoutons maintenant un pingouin à la table : + + James, 12, 90, 12 + + Combien de pingouins ont moins de 8 ans ? + + Options : + + (A) 1 + + (B) 2 + + (C) 3 + + (D) 4 + + (E) 5 + target: (B) + - input: | + Voici une table où la première ligne est un en-tête et chaque ligne suivante + représente un pingouin : nom, âge, taille (cm), poids (kg) + + Louis, 7, 50, 11 + Bernard, 5, 80, 13 + Vincent, 9, 60, 11 + Gwen, 8, 70, 15 + + Par exemple : l'âge de Louis est de 7 ans, le poids de Gwen est de 15 kg, la taille de Bernard est de 80 cm. + Quel est le pingouin le plus jeune ? + + Options : + + (A) Louis + + (B) Bernard + + (C) Vincent + + (D) Gwen + + (E) James + target: (B) + - input: | + Voici une table où la première ligne est un en-tête et chaque ligne suivante + représente un pingouin : nom, âge, taille (cm), poids (kg) + + Louis, 7, 50, 11 + Bernard, 5, 80, 13 + Vincent, 9, 60, 11 + Gwen, 8, 70, 15 + + Par exemple : l'âge de Louis est de 7 ans, le poids de Gwen est de 15 kg, la taille de Bernard est de 80 cm. + Quel est le nom du deuxième pingouin trié par ordre alphabétique ? 
+ + Options : + + (A) Louis + + (B) Bernard + + (C) Vincent + + (D) Gwen + + (E) James + target: (D) +include: _fewshot_template_yaml +task: leaderboard_bbh_pingouins_sur_une_table diff --git "a/lm_eval/tasks/leaderboard-french/bbh_mc/raisonnement_sur_les_objets_color\303\251s.yaml" "b/lm_eval/tasks/leaderboard-french/bbh_mc/raisonnement_sur_les_objets_color\303\251s.yaml" new file mode 100644 index 0000000000..95c8576823 --- /dev/null +++ "b/lm_eval/tasks/leaderboard-french/bbh_mc/raisonnement_sur_les_objets_color\303\251s.yaml" @@ -0,0 +1,118 @@ +dataset_name: raisonnement_sur_les_objets_colorés +description: > + Répondez à des questions extrêmement simples sur les couleurs des objets sur une surface. +doc_to_choice: ["(A)", "(B)", "(C)", "(D)", "(E)", "(F)", "(G)", "(H)", "(I)", "(J)", "(K)", "(L)", "(M)", "(N)", "(O)", "(P)", "(Q)", "(R)"] +fewshot_config: + sampler: first_n + samples: + - input: | + Sur la table de chevet, il y a un crayon rouge, une tasse violette, un porte-clés + bordeaux, un ours en peluche fuchsia, une assiette noire et une balle anti-stress bleue. + De quelle couleur est la balle anti-stress ? + + Options : + + (A) rouge + + (B) orange + + (C) jaune + + (D) vert + + (E) bleu + + (F) marron + + (G) magenta + + (H) fuchsia + + (I) mauve + + (J) sarcelle + + (K) turquoise + + (L) bordeaux + + (M) argent + + (N) or + + (O) noir + + (P) gris + + (Q) violet + + (R) rose + target: (E) + - input: | + Sur la table, vous voyez un ensemble d'objets disposés en ligne : un trombone + violet, une balle anti-stress rose, un porte-clés marron, un chargeur scrunchiephone + vert, un hand spinner mauve et un stylo bordeaux. Quelle est la couleur de l'objet + directement à droite de la balle anti-stress ? 
+ + Options : + + (A) rouge + + (B) orange + + (C) jaune + + (D) vert + + (E) bleu + + (F) marron + + (G) magenta + + (H) fuchsia + + (I) mauve + + (J) sarcelle + + (K) turquoise + + (L) bordeaux + + (M) argent + + (N) or + + (O) noir + + (P) gris + + (Q) violet + + (R) rose + target: (F) + - input: | + Sur la table de chevet, vous voyez les objets suivants disposés en ligne : une assiette + sarcelle, un porte-clés bordeaux, un chargeur scrunchiephone jaune, une tasse orange, + un carnet rose et une tasse grise. Combien d'objets non-orange voyez-vous à gauche + de l'objet sarcelle ? + + Options : + + (A) zéro + + (B) un + + (C) deux + + (D) trois + + (E) quatre + + (F) cinq + + (G) six + target: (A) +include: _fewshot_template_yaml +task: leaderboard_bbh_raisonnement_sur_les_objets_colorés diff --git a/lm_eval/tasks/leaderboard-french/bbh_mc/recommandation_de_film.yaml b/lm_eval/tasks/leaderboard-french/bbh_mc/recommandation_de_film.yaml new file mode 100644 index 0000000000..fed1f9aeba --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/bbh_mc/recommandation_de_film.yaml @@ -0,0 +1,56 @@ +dataset_name: recommandation_de_film +description: > + Recommandez des films similaires à la liste de films donnée. 
+doc_to_choice: ["(A)", "(B)", "(C)", "(D)", "(E)", "(F)"] +fewshot_config: + sampler: first_n + samples: + - input: | + Trouvez un film similaire à Star Wars Episode IV - Un nouvel espoir, Indiana + Jones et la dernière croisade, Star Wars Episode V - L'Empire contre-attaque, + The Big Lebowski : + + Options : + + (A) Tetsuo + + (B) the Ironman + + (C) The Princess Bride + + (D) The Barkley Marathons The Race That Eats Its Young + + (E) Bug + target: (C) + - input: | + Trouvez un film similaire à Twister, Le Silence des agneaux, Independence + Day, Braveheart : + + Options : + + (A) Ils tirent sur les chevaux + + (B) N'est-ce pas + + (C) Forrest Gump + + (D) The Salton Sea + + (E) Extreme Days + target: (C) + - input: | + Trouvez un film similaire à Minority Report, Total Recall, Vice-versa, + Forrest Gump : + + Options : + + (A) Phénomènes + + (B) Lilting + + (C) Catwoman + + (D) Edge of Tomorrow + target: (D) +include: _fewshot_template_yaml +task: leaderboard_bbh_recommandation_de_film diff --git a/lm_eval/tasks/leaderboard-french/bbh_mc/scarcasmes.yaml b/lm_eval/tasks/leaderboard-french/bbh_mc/scarcasmes.yaml new file mode 100644 index 0000000000..2bb0178b60 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/bbh_mc/scarcasmes.yaml @@ -0,0 +1,40 @@ +dataset_name: sarcasmes +description: > + Déterminez laquelle des deux phrases est sarcastique. Selon le dictionnaire de l'Université de Cambridge, + le sarcasme est "l'utilisation de remarques qui signifient clairement le contraire de ce qu'elles disent, + faites pour blesser les sentiments de quelqu'un ou pour critiquer quelque chose de manière humoristique." + Les phrases sarcastiques contiennent souvent des énonciations satiriques ou ironiques, + des hyperboles, des remarques ambivalentes ou spirituelles. +doc_to_choice: ["(A)", "(B)"] +fewshot_config: + sampler: first_n + samples: + - input: | + Quelle déclaration est sarcastique ? 
+ + Options : + + (A) Oui, parce que s'intéresser à des sujets et faire des recherches actives est une énorme perte de temps + + (B) Oui, parce que s'intéresser à des sujets et faire des recherches actives est une grande affaire + target: (A) + - input: | + Quelle déclaration est sarcastique ? + + Options : + + (A) Personne ne va être en désaccord avec vous là-dessus. Éviter les attaques ad hominem aide vraiment votre cause + + (B) Personne ne va être en désaccord avec vous là-dessus. Les attaques ad hominem aident vraiment votre cause + target: (B) + - input: | + Quelle déclaration est sarcastique ? + + Options : + + (A) Cohérence dans les sanctions de la ligue ? Qu'est-ce que vous pensez que cela doit être, de la politique ? + + (B) Cohérence dans les sanctions de la ligue ? Qu'est-ce que vous pensez que cela doit être, de la morale ? + target: (A) +include: _fewshot_template_yaml +task: leaderboard_bbh_sarcasmes diff --git a/lm_eval/tasks/leaderboard-french/bbh_mc/sophismes_formels.yaml b/lm_eval/tasks/leaderboard-french/bbh_mc/sophismes_formels.yaml new file mode 100644 index 0000000000..f16e1cf547 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/bbh_mc/sophismes_formels.yaml @@ -0,0 +1,60 @@ +dataset_name: sophismes_formels +description: > + Distinguez les arguments déductivement valides des sophismes formels. +doc_to_choice: ["valide", "invalidee"] +fewshot_config: + sampler: first_n + samples: + - input: | + "Il n'est pas toujours facile de voir qui est lié à qui -- et de quelle manière. + L'argument suivant porte sur cette question : Pour commencer, Lesley est un + ami proche de Fernando. De plus, être un ami proche de Fernando ou un camarade + de classe de Lowell est suffisant pour être arrière-grand-père de Leroy. Il + s'ensuit que Lesley est arrière-grand-père de Leroy." + + L'argument, étant donné les prémisses explicitement énoncées, est-il déductivement + valide ou invalidee ? 
+ + Options : + + - valide + + - invalidee + target: valide + - input: | + "Il n'est pas toujours facile de voir qui est lié à qui -- et de quelle manière. + L'argument suivant porte sur cette question : Quiconque n'est pas arrière-grand-père + de Clyde est le demi-frère de Brian. Être un ancêtre de Dana est suffisant + pour ne pas être arrière-grand-père de Clyde. Nous pouvons conclure : Tout + le monde qui est un ancêtre de Dana est aussi le demi-frère de Brian." + + L'argument, étant donné les prémisses explicitement énoncées, est-il déductivement + valide ou invalidee ? + + Options : + + - valide + + - invalidee + target: valide + - input: | + "Il n'est pas toujours facile de comprendre qui consomme quels produits. + L'argument suivant porte sur cette question : Tout utilisateur peu fréquent + de shampoing Paul Mitchell est soit un consommateur rare de shampoing Nioxin, + soit un acheteur fidèle de savon Caress, ou les deux. Aucun consommateur régulier + de savon Lush n'est un consommateur rare de shampoing Nioxin et, en même temps, + un acheteur fidèle de savon Caress. Il s'ensuit que quiconque est un utilisateur + peu fréquent de shampoing Paul Mitchell n'est pas un consommateur régulier + de savon Lush." + + L'argument, étant donné les prémisses explicitement énoncées, est-il déductivement + valide ou invalidee ? 
+ + Options : + + - valide + + - invalidee + target: invalidee +include: _fewshot_template_yaml +task: leaderboard_bbh_sophismes_formels diff --git "a/lm_eval/tasks/leaderboard-french/bbh_mc/suivi_objets_m\303\251lang\303\251s_cinq_objets.yaml" "b/lm_eval/tasks/leaderboard-french/bbh_mc/suivi_objets_m\303\251lang\303\251s_cinq_objets.yaml" new file mode 100644 index 0000000000..0e90805a4d --- /dev/null +++ "b/lm_eval/tasks/leaderboard-french/bbh_mc/suivi_objets_m\303\251lang\303\251s_cinq_objets.yaml" @@ -0,0 +1,48 @@ +dataset_name: suivi_objets_mélangés_cinq_objets +description: > + Une tâche qui nécessite de déterminer les positions finales d'un ensemble d'objets, étant donné leurs positions initiales et une description d'une séquence d'échanges. +doc_to_choice: ["(A)", "(B)", "(C)", "(D)", "(E)"] +fewshot_config: + sampler: first_n + samples: + - input: | + Alice, Bob et Claire jouent à un jeu. Au début du jeu, ils tiennent chacun une balle : Alice a une balle jaune, Bob a une balle bleue et Claire a une balle rose. + + Au fur et à mesure que le jeu progresse, des paires de joueurs échangent leurs balles. D'abord, Claire et Alice échangent leurs balles. Ensuite, Alice et Bob échangent leurs balles. Enfin, Claire et Bob échangent leurs balles. À la fin du jeu, Bob a la + + Options : + + (A) balle jaune + + (B) balle bleue + + (C) balle rose + target: (A) + - input: | + Alice, Bob et Claire jouent à un jeu. Au début du jeu, ils tiennent chacun une balle : Alice a une balle blanche, Bob a une balle violette et Claire a une balle rose. + + Au fur et à mesure que le jeu progresse, des paires de joueurs échangent leurs balles. D'abord, Bob et Alice échangent leurs balles. Ensuite, Bob et Claire échangent leurs balles. Enfin, Bob et Alice échangent leurs balles. À la fin du jeu, Alice a la + + Options : + + (A) balle blanche + + (B) balle violette + + (C) balle rose + target: (C) + - input: | + Alice, Bob et Claire sont des danseurs à une danse carrée. 
Au début d'une chanson, ils ont chacun un partenaire : Alice danse avec Lola, Bob danse avec Rodrigo, et Claire danse avec Patrick. + + Pendant la chanson, les danseurs changent souvent de partenaires. D'abord, Alice et Bob échangent leurs partenaires. Ensuite, Claire et Bob échangent leurs partenaires. Enfin, Bob et Alice échangent leurs partenaires. À la fin de la danse, Alice danse avec + + Options : + + (A) Lola + + (B) Rodrigo + + (C) Patrick + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_suivi_objets_mélangés_cinq_objets diff --git "a/lm_eval/tasks/leaderboard-french/bbh_mc/suivi_objets_m\303\251lang\303\251s_sept_objets.yaml" "b/lm_eval/tasks/leaderboard-french/bbh_mc/suivi_objets_m\303\251lang\303\251s_sept_objets.yaml" new file mode 100644 index 0000000000..8c20f93461 --- /dev/null +++ "b/lm_eval/tasks/leaderboard-french/bbh_mc/suivi_objets_m\303\251lang\303\251s_sept_objets.yaml" @@ -0,0 +1,48 @@ +dataset_name: suivi_objets_mélangés_sept_objets +description: > + Une tâche qui nécessite de déterminer les positions finales d'un ensemble d'objets, étant donné leurs positions initiales et une description d'une séquence d'échanges. +doc_to_choice: ["(A)", "(B)", "(C)", "(D)", "(E)", "(F)", "(G)"] +fewshot_config: + sampler: first_n + samples: + - input: | + Alice, Bob et Claire jouent à un jeu. Au début du jeu, ils tiennent chacun une balle : Alice a une balle jaune, Bob a une balle bleue et Claire a une balle rose. + + Au fur et à mesure que le jeu progresse, des paires de joueurs échangent leurs balles. D'abord, Claire et Alice échangent leurs balles. Ensuite, Alice et Bob échangent leurs balles. Enfin, Claire et Bob échangent leurs balles. À la fin du jeu, Bob a la + + Options : + + (A) balle jaune + + (B) balle bleue + + (C) balle rose + target: (A) + - input: | + Alice, Bob et Claire jouent à un jeu. Au début du jeu, ils tiennent chacun une balle : Alice a une balle blanche, Bob a une balle violette et Claire a une balle rose. 
+ + Au fur et à mesure que le jeu progresse, des paires de joueurs échangent leurs balles. D'abord, Bob et Alice échangent leurs balles. Ensuite, Bob et Claire échangent leurs balles. Enfin, Bob et Alice échangent leurs balles. À la fin du jeu, Alice a la + + Options : + + (A) balle blanche + + (B) balle violette + + (C) balle rose + target: (C) + - input: | + Alice, Bob et Claire sont des danseurs à une danse carrée. Au début d'une chanson, ils ont chacun un partenaire : Alice danse avec Lola, Bob danse avec Rodrigo, et Claire danse avec Patrick. + + Pendant la chanson, les danseurs changent souvent de partenaires. D'abord, Alice et Bob échangent leurs partenaires. Ensuite, Claire et Bob échangent leurs partenaires. Enfin, Bob et Alice échangent leurs partenaires. À la fin de la danse, Alice danse avec + + Options : + + (A) Lola + + (B) Rodrigo + + (C) Patrick + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_suivi_objets_mélangés_sept_objets diff --git "a/lm_eval/tasks/leaderboard-french/bbh_mc/suivi_objets_m\303\251lang\303\251s_trois_objets.yaml" "b/lm_eval/tasks/leaderboard-french/bbh_mc/suivi_objets_m\303\251lang\303\251s_trois_objets.yaml" new file mode 100644 index 0000000000..334ae35be8 --- /dev/null +++ "b/lm_eval/tasks/leaderboard-french/bbh_mc/suivi_objets_m\303\251lang\303\251s_trois_objets.yaml" @@ -0,0 +1,48 @@ +dataset_name: suivi_objets_mélangés_trois_objets +description: > + Une tâche qui nécessite de déterminer les positions finales d'un ensemble d'objets, étant donné leurs positions initiales et une description d'une séquence d'échanges. +doc_to_choice: ["(A)", "(B)", "(C)"] +fewshot_config: + sampler: first_n + samples: + - input: | + Alice, Bob et Claire jouent à un jeu. Au début du jeu, ils tiennent chacun une balle : Alice a une balle jaune, Bob a une balle bleue et Claire a une balle rose. + + Au fur et à mesure que le jeu progresse, des paires de joueurs échangent leurs balles. 
D'abord, Claire et Alice échangent leurs balles. Ensuite, Alice et Bob échangent leurs balles. Enfin, Claire et Bob échangent leurs balles. À la fin du jeu, Bob a la + + Options : + + (A) balle jaune + + (B) balle bleue + + (C) balle rose + target: (A) + - input: | + Alice, Bob et Claire jouent à un jeu. Au début du jeu, ils tiennent chacun une balle : Alice a une balle blanche, Bob a une balle violette et Claire a une balle rose. + + Au fur et à mesure que le jeu progresse, des paires de joueurs échangent leurs balles. D'abord, Bob et Alice échangent leurs balles. Ensuite, Bob et Claire échangent leurs balles. Enfin, Bob et Alice échangent leurs balles. À la fin du jeu, Alice a la + + Options : + + (A) balle blanche + + (B) balle violette + + (C) balle rose + target: (C) + - input: | + Alice, Bob et Claire sont des danseurs à une danse carrée. Au début d'une chanson, ils ont chacun un partenaire : Alice danse avec Lola, Bob danse avec Rodrigo, et Claire danse avec Patrick. + + Pendant la chanson, les danseurs changent souvent de partenaires. D'abord, Alice et Bob échangent leurs partenaires. Ensuite, Claire et Bob échangent leurs partenaires. Enfin, Bob et Alice échangent leurs partenaires. À la fin de la danse, Alice danse avec + + Options : + + (A) Lola + + (B) Rodrigo + + (C) Patrick + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_suivi_objets_mélangés_trois_objets diff --git "a/lm_eval/tasks/leaderboard-french/bbh_mc/s\303\251quences_temporelles.yaml" "b/lm_eval/tasks/leaderboard-french/bbh_mc/s\303\251quences_temporelles.yaml" new file mode 100644 index 0000000000..64587470c4 --- /dev/null +++ "b/lm_eval/tasks/leaderboard-french/bbh_mc/s\303\251quences_temporelles.yaml" @@ -0,0 +1,100 @@ +dataset_name: séquences_temporelles +description: > + Description de la tâche : Répondez aux questions sur les moments où certains événements ont pu se produire. 
+doc_to_choice: ["(A)", "(B)", "(C)", "(D)"] +fewshot_config: + sampler: first_n + samples: + - input: | + Aujourd'hui, Emily est allée au musée. Entre quelles heures aurait-elle pu y aller ? + + Nous savons que : + + Emily s'est réveillée à 13h. + + Elizabeth a vu Emily lire à la bibliothèque de 14h à 16h. + + Jessica a vu Emily regarder un film au cinéma de 16h à 17h. + + Leslie a vu Emily attendre à l'aéroport de 17h à 18h. + + William a vu Emily acheter des vêtements au centre commercial de 18h à 19h. + + Le musée était fermé après 19h. + + Entre quelles heures Emily aurait-elle pu aller au musée ? + + Options : + + (A) 13h à 14h + + (B) 18h à 19h + + (C) 17h à 18h + + (D) 14h à 16h + target: (A) + - input: | + Aujourd'hui, Elizabeth est allée au parc d'attractions. Entre quelles heures aurait-elle pu y aller ? + + Nous savons que : + + Elizabeth s'est réveillée à 7h. + + David a vu Elizabeth réparer son ordinateur au magasin d'électronique de 13h à 14h. + + Sarah a vu Elizabeth jouer au tennis sur le court de tennis de 14h à 15h. + + Susan a vu Elizabeth marcher vers la Statue de la Liberté de 15h à 18h. + + Andrew a vu Elizabeth prendre des photos près de la tour Eiffel de 18h à 21h. + + Emily a vu Elizabeth prendre un café au café de 21h à 22h. + + Le parc d'attractions était fermé après 22h. + + Entre quelles heures Elizabeth aurait-elle pu aller au parc d'attractions ? + + Options : + + (A) 7h à 13h + + (B) 21h à 22h + + (C) 13h à 14h + + (D) 15h à 18h + target: (A) + - input: | + Aujourd'hui, Tiffany est allée à la plage. Entre quelles heures aurait-elle pu y aller ? + + Nous savons que : + + Tiffany s'est réveillée à 5h. + + Betty a vu Tiffany prendre un café au café de 5h à 6h. + + Jessica a vu Tiffany travailler au bureau de 6h à 9h. + + John a vu Tiffany s'étirer au studio de yoga de 9h à 12h. + + Sean a vu Tiffany s'asseoir sur un toit de 12h à 14h. + + Sarah a vu Tiffany jouer au tennis sur le court de tennis de 14h à 15h. 
+ + La plage était fermée après 16h. + + Entre quelles heures Tiffany aurait-elle pu aller à la plage ? + + Options : + + (A) 9h à 12h + + (B) 12h à 14h + + (C) 5h à 6h + + (D) 15h à 16h + target: (D) +include: _fewshot_template_yaml +task: leaderboard_bbh_séquences_temporelles diff --git a/lm_eval/tasks/leaderboard-french/bbh_mc/toile_de_mensonges.yaml b/lm_eval/tasks/leaderboard-french/bbh_mc/toile_de_mensonges.yaml new file mode 100644 index 0000000000..d5ad3ccb6d --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/bbh_mc/toile_de_mensonges.yaml @@ -0,0 +1,23 @@ +dataset_name: toile_de_mensonges +description: > + Évaluez une fonction booléenne aléatoire exprimée sous forme de problème de mots. +doc_to_choice: ["Oui", "Non"] +fewshot_config: + sampler: first_n + samples: + - input: | + Question : Fidel dit la vérité. Jerry dit que Fidel dit la vérité. Vina + dit que Jerry dit la vérité. Millicent dit que Vina ment. Raymond dit que Millicent + ment. Raymond dit-il la vérité ? + target: 'Oui' + - input: | + Question : Kristian ment. Millie dit que Kristian ment. Maybelle dit que Millie + dit la vérité. Fidel dit que Maybelle ment. Leda dit que Fidel ment. Leda dit-elle la vérité ? + target: 'Oui' + - input: | + Question : Kristian dit la vérité. Michaela dit que Kristian ment. Raymond + dit que Michaela dit la vérité. Osvaldo dit que Raymond dit la vérité. Jamey dit + qu'Osvaldo dit la vérité. Jamey dit-il la vérité ? 
+ target: 'Non' +include: _fewshot_template_yaml +task: leaderboard_bbh_toile_de_mensonges diff --git a/lm_eval/tasks/leaderboard-french/gpqa/_leaderboard_gpqa.yaml b/lm_eval/tasks/leaderboard-french/gpqa/_leaderboard_gpqa.yaml new file mode 100644 index 0000000000..5b4ed59f8d --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/gpqa/_leaderboard_gpqa.yaml @@ -0,0 +1,9 @@ +group: leaderboard_gpqa_fr +task: + - leaderboard_gpqa_diamond_fr + - leaderboard_gpqa_extended_fr + - leaderboard_gpqa_main_fr +aggregate_metric_list: + - metric: acc_norm + aggregation: mean + weight_by_size: true diff --git a/lm_eval/tasks/leaderboard-french/gpqa/_template_yaml b/lm_eval/tasks/leaderboard-french/gpqa/_template_yaml new file mode 100644 index 0000000000..8b6335a7fe --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/gpqa/_template_yaml @@ -0,0 +1,19 @@ +dataset_path: le-leadboard/gpqa-fr +output_type: multiple_choice +process_docs: !function utils.process_docs +training_split: train +# Because huggingface dataset only has train split +validation_split: train +test_split: null +doc_to_text: "Quelle est la réponse correcte à cette question : {{problem}}\nChoix :\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nRéponse : " +doc_to_target: answer +doc_to_choice: ["(A)", "(B)", "(C)", "(D)"] +num_fewshot: 0 +metric_list: + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 +fewshot_config: + sampler: first_n diff --git a/lm_eval/tasks/leaderboard-french/gpqa/gpqa_diamond_zeroshot.yaml b/lm_eval/tasks/leaderboard-french/gpqa/gpqa_diamond_zeroshot.yaml new file mode 100644 index 0000000000..363fbbb765 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/gpqa/gpqa_diamond_zeroshot.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: gpqa_diamond +include: _template_yaml +task: leaderboard_gpqa_diamond_fr diff --git
a/lm_eval/tasks/leaderboard-french/gpqa/gpqa_extended_zeroshot.yaml b/lm_eval/tasks/leaderboard-french/gpqa/gpqa_extended_zeroshot.yaml new file mode 100644 index 0000000000..318aa565d6 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/gpqa/gpqa_extended_zeroshot.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: gpqa_extended +include: _template_yaml +task: leaderboard_gpqa_extended_fr diff --git a/lm_eval/tasks/leaderboard-french/gpqa/gpqa_main_zeroshot.yaml b/lm_eval/tasks/leaderboard-french/gpqa/gpqa_main_zeroshot.yaml new file mode 100644 index 0000000000..d95cfa74d2 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/gpqa/gpqa_main_zeroshot.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: gpqa_main +include: _template_yaml +task: leaderboard_gpqa_main_fr diff --git a/lm_eval/tasks/leaderboard-french/gpqa/utils.py b/lm_eval/tasks/leaderboard-french/gpqa/utils.py new file mode 100644 index 0000000000..c2317e02ef --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/gpqa/utils.py @@ -0,0 +1,38 @@ +import random +import re + +import datasets + + +def preprocess(text): + if text is None: + return " " + text = text.strip() + text = text.replace(" [title]", ". 
") + text = re.sub("\\[.*?\\]", "", text) + text = text.replace(" ", " ") + return text + + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc): + choices = [ + preprocess(doc["Incorrect Answer 1"]), + preprocess(doc["Incorrect Answer 2"]), + preprocess(doc["Incorrect Answer 3"]), + preprocess(doc["Correct Answer"]), + ] + + random.shuffle(choices) + correct_answer_index = choices.index(preprocess(doc["Correct Answer"])) + + out_doc = { + "choice1": choices[0], + "choice2": choices[1], + "choice3": choices[2], + "choice4": choices[3], + "answer": f"({chr(65 + correct_answer_index)})", + } + return out_doc + + return dataset.map(_process_doc) diff --git a/lm_eval/tasks/leaderboard-french/ifeval/_leaderboard_instruction_following.yaml b/lm_eval/tasks/leaderboard-french/ifeval/_leaderboard_instruction_following.yaml new file mode 100644 index 0000000000..ec6d29aca7 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/ifeval/_leaderboard_instruction_following.yaml @@ -0,0 +1,3 @@ +group: leaderboard_instruction_following_fr +task: + - leaderboard_ifeval_fr diff --git a/lm_eval/tasks/leaderboard-french/ifeval/ifeval.yaml b/lm_eval/tasks/leaderboard-french/ifeval/ifeval.yaml new file mode 100644 index 0000000000..5bf81470a1 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/ifeval/ifeval.yaml @@ -0,0 +1,31 @@ +task: leaderboard_ifeval_fr +dataset_path: le-leadboard/IFEval-fr +dataset_name: null +output_type: generate_until +test_split: train +num_fewshot: 0 +doc_to_text: prompt +doc_to_target: 0 +generation_kwargs: + until: [] + do_sample: false + temperature: 0.0 + max_gen_toks: 1280 +process_results: !function utils.process_results +metric_list: + - metric: prompt_level_strict_acc + aggregation: mean + higher_is_better: true + - metric: inst_level_strict_acc + aggregation: !function utils.agg_inst_level_acc + higher_is_better: true + - metric: prompt_level_loose_acc + aggregation: mean + higher_is_better: true + - metric: 
inst_level_loose_acc + aggregation: !function utils.agg_inst_level_acc + higher_is_better: true +metadata: + version: 2.0 +fewshot_config: + sampler: first_n diff --git a/lm_eval/tasks/leaderboard-french/ifeval/instructions.py b/lm_eval/tasks/leaderboard-french/ifeval/instructions.py new file mode 100644 index 0000000000..22dabba613 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/ifeval/instructions.py @@ -0,0 +1,1633 @@ +# Copyright 2023 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Library of instructions.""" + +import collections +import json +import logging +import random +import re +import string +from typing import Dict, Optional, Sequence, Union + +import langdetect + +from lm_eval.tasks.ifeval import instructions_util + + +logger = logging.getLogger(__name__) + +_InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]] + +_LANGUAGES = instructions_util.LANGUAGE_CODES + +# The relational operation for comparison. +_COMPARISON_RELATION = ("moins de", "au moins") + +# The maximum number of sentences. +_MAX_NUM_SENTENCES = 20 + +# The number of placeholders. +_NUM_PLACEHOLDERS = 4 + +# The number of bullet lists. +_NUM_BULLETS = 5 + +# The options of constrained response. +_CONSTRAINED_RESPONSE_OPTIONS = ( + "Ma réponse est oui.", + "Ma réponse est non.", + "Ma réponse est peut-être.", +) + +# The options of starter keywords. 
+_STARTER_OPTIONS = ( + "Je dirais", + "Ma réponse est", + "Je crois", + "À mon avis", + "Je pense", + "J'estime", + "Je ressens", + "De mon point de vue", + "Selon moi", + "Pour autant que je suis concerné", + "Dans ma compréhension", + "À mon sens", + "Mon opinion à ce sujet est", + "Selon ma perception", +) + +# The options of ending keywords. +# TODO(jeffreyzhou) add more ending options +_ENDING_OPTIONS = ("Des questions supplémentaires ?", "Y a-t-il autre chose avec laquelle je peux aider ?") + +# The number of highlighted sections. +_NUM_HIGHLIGHTED_SECTIONS = 4 + +# The section spliter. +_SECTION_SPLITER = ("Section", "SECTION") + +# The number of sections. +_NUM_SECTIONS = 5 + +# The number of paragraphs. +_NUM_PARAGRAPHS = 5 + +# The postscript marker. +_POSTSCRIPT_MARKER = ("P.S.", "P.P.S") + +# The number of keywords. +_NUM_KEYWORDS = 2 + +# The occurrences of a single keyword. +_KEYWORD_FREQUENCY = 3 + +# The occurrences of a single letter. +_LETTER_FREQUENCY = 10 + +# The occurrences of words with all capital letters. +_ALL_CAPITAL_WORD_FREQUENCY = 20 + +# The number of words in the response. +_NUM_WORDS_LOWER_LIMIT = 100 +_NUM_WORDS_UPPER_LIMIT = 500 + + +class Instruction: + """An instruction template.""" + + def __init__(self, instruction_id): + self.id = instruction_id + + def build_description(self, **kwargs): + raise NotImplementedError("`build_description` not implemented.") + + def get_instruction_args(self): + raise NotImplementedError("`get_instruction_args` not implemented.") + + def get_instruction_args_keys(self): + raise NotImplementedError("`get_instruction_args_keys` not implemented.") + + def check_following(self, value): + raise NotImplementedError("`check_following` not implemented.") + + +class ResponseLanguageChecker(Instruction): + """Check the language of the entire response.""" + + def build_description(self, *, language=None): + """Build the instruction description.
+ + Args: + language: A string representing the expected language of the response. The + language has to comply to the 97 types defined in + `langid.py` (https://pypi.org/project/langid/1.1.5/), which follows + ISO 639-1 codes (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes); + for example, `en` for English, `zh` for Chinese, `fr` for French. + + Returns: + A string representing the instruction description. + """ + self._language = language + if self._language is None: + self._language = random.choice(list(_LANGUAGES.keys())) + # TODO(tianjianlu): opens the description generation to more choices. + self._description_pattern = ( + "Votre réponse ENTIÈRE doit être en {language}, aucune autre langue n'est autorisée." + ) + return self._description_pattern.format(language=_LANGUAGES[self._language]) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"language": self._language} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["language"] + + def check_following(self, value): + """Check if the language of the entire response follows the instruction. + + Args: + value: A string representing the response. + + Returns: + True if the language of `value` follows instruction; otherwise False. + """ + assert isinstance(value, str) + + try: + return langdetect.detect(value) == self._language + except langdetect.LangDetectException as e: + # Count as instruction is followed. + logging.error( + "Unable to detect language for text %s due to %s", value, e + ) # refex: disable=pytotw.037 + return True + + +class NumberOfSentences(Instruction): + """Check the number of sentences.""" + + def build_description(self, *, num_sentences=None, relation=None): + """Build the instruction description. + + Args: + num_sentences: An integer specifying the number of sentences as a + threshold.
+ relation: A string in (`less than`, `at least`), defining the relational + operator for comparison. + Two relational comparisons are supported for now: + if 'less than', the actual number of sentences < the threshold; + if 'at least', the actual number of sentences >= the threshold. + + Returns: + A string representing the instruction description. + """ + # The number of sentences as a threshold for comparison. + self._num_sentences_threshold = num_sentences + if self._num_sentences_threshold is None or self._num_sentences_threshold < 0: + self._num_sentences_threshold = random.randint(1, _MAX_NUM_SENTENCES) + + if relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif relation not in _COMPARISON_RELATION: + raise ValueError( + "The supported relation for comparison must be in " + f"{_COMPARISON_RELATION}, but {relation} is given." + ) + else: + self._comparison_relation = relation + + self._description_pattern = ( + "Votre réponse doit contenir {relation} {num_sentences} phrases." + ) + + return self._description_pattern.format( + relation=self._comparison_relation, + num_sentences=self._num_sentences_threshold, + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "num_sentences": self._num_sentences_threshold, + "relation": self._comparison_relation, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_sentences", "relation"] + + def check_following(self, value): + """Check if the number of sentences follows the instruction. + + Args: + value: A string representing the response. + + Returns: + True if the response follows the instruction. + + Raise: + ValueError if the string in `instruction_args` is not in + [`less_than`, `at_least`]. 
+ """ + num_sentences = instructions_util.count_sentences(value) + if self._comparison_relation == _COMPARISON_RELATION[0]: + return num_sentences < self._num_sentences_threshold + elif self._comparison_relation == _COMPARISON_RELATION[1]: + return num_sentences >= self._num_sentences_threshold + + +class PlaceholderChecker(Instruction): + """Check the placeholders in template writing.""" + + def build_description(self, *, num_placeholders=None): + """Build the instruction description. + + Args: + num_placeholders: An integer denoting the minimum number of + placeholders required in the response. + + Returns: + A string representing the instruction description. + """ + self._num_placeholders = num_placeholders + if self._num_placeholders is None or self._num_placeholders < 0: + self._num_placeholders = random.randint(1, _NUM_PLACEHOLDERS) + self._description_pattern = ( + "La réponse doit contenir au moins {num_placeholders} espaces réservés " + + "représentés par des crochets, comme [adresse]." + ) + + return self._description_pattern.format(num_placeholders=self._num_placeholders) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_placeholders": self._num_placeholders} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_placeholders"] + + def check_following(self, value): + """Check if the number of placeholders follows the instruction. + + Args: + value: A string representing the response. + + Returns: + True if the actual number of placeholders in the response is greater than + or equal to `num_placeholders`; otherwise, False. + """ + placeholders = re.findall(r"\[.*?\]", value) + num_placeholders = len(placeholders) + return num_placeholders >= self._num_placeholders + + +class BulletListChecker(Instruction): + """Checks the bullet list in the prompt.""" + + def build_description(self, *, num_bullets=None): + """Build the instruction description. 
+ + Args: + num_bullets: An integer specifying the exact number of bullet lists + that is required to appear in the response. + + Returns: + A string representing the instruction description. + """ + self._num_bullets = num_bullets + if self._num_bullets is None or self._num_bullets < 0: + self._num_bullets = random.randint(1, _NUM_BULLETS) + self._description_pattern = ( + "Votre réponse doit contenir exactement {num_bullets} points à puces. " + + "Utilisez les points à puces en markdown comme ceci :\n" + + "* Ceci est le point 1. \n" + + "* Ceci est le point 2" + ) + return self._description_pattern.format(num_bullets=self._num_bullets) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_bullets": self._num_bullets} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_bullets"] + + def check_following(self, value): + r"""Check if the number of bullet lists meets the requirement. + + Args: + value: A string representing the response. The response is expected to + contain some bullet lists that start with `\*`. + + Returns: + True if the actual number of bullet lists in the response meets the + requirement. + """ + bullet_lists = re.findall(r"^\s*\*[^\*].*$", value, flags=re.MULTILINE) + bullet_lists_2 = re.findall(r"^\s*-.*$", value, flags=re.MULTILINE) + num_bullet_lists = len(bullet_lists) + len(bullet_lists_2) + return num_bullet_lists == self._num_bullets + + +class ConstrainedResponseChecker(Instruction): + """Checks the constrained response.""" + + def build_description(self): + """Build the instruction description.""" + # A sequence of string(s) representing the options of the expected response. 
+ self._constrained_responses = _CONSTRAINED_RESPONSE_OPTIONS + self._description_pattern = ( + "Répondez avec l'une des options suivantes : {response_options}" + ) + return self._description_pattern.format( + response_options=self._constrained_responses + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if the response matches the constrained options. + + Args: + value: A string representing the response. + + Returns: + True if the actual response contains one of the options in the constrained + responses; otherwise False. + """ + value = value.strip() + for constrained_response in self._constrained_responses: + if constrained_response in value: + return True + return False + + +class ConstrainedStartChecker(Instruction): + """Checks the response start.""" + + def build_description(self, *, starter=None): + """Build the instruction description. + + Args: + starter: A string representing the keyward that the response should start + with. + + Returns: + A string representing the instruction description. + """ + self._starter = starter.strip() if isinstance(starter, str) else starter + if self._starter is None: + self._starter = random.choice(_STARTER_OPTIONS) + self._description_pattern = ( + "Pendant la conversation, lorsque c'est votre tour, " + + "veuillez toujours commencer par {starter}" + ) + + return self._description_pattern.format(starter=self._starter) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"starter": self._starter} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["starter"] + + def check_following(self, value): + """Checks if the response starts with the constrained keyword or phrase. 
+ + Args: + value: A string representing the response. + + Returns: + True if the response starts with the given phrase or keyword that is + contained in `instruction_args`; otherwise, False. + """ + response_pattern = r"^\s*" + self._starter + r".*$" + response_with_constrained_start = re.search( + response_pattern, value, flags=re.MULTILINE + ) + return True if response_with_constrained_start else False + + +class HighlightSectionChecker(Instruction): + """Checks the highlighted section.""" + + def build_description(self, *, num_highlights=None): + """Build the instruction description. + + Args: + num_highlights: An integer specifying the minimum number of highlighted + sections. + + Returns: + A string representing the instruction description. + """ + self._num_highlights = num_highlights + if self._num_highlights is None or self._num_highlights < 0: + self._num_highlights = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS) + + self._description_pattern = ( + "Mettez en évidence au moins {num_highlights} sections dans votre réponse avec " + + "le markdown, c'est-à-dire *section mise en évidence*." + ) + + return self._description_pattern.format(num_highlights=self._num_highlights) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_highlights": self._num_highlights} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_highlights"] + + def check_following(self, value): + """Checks if the number of highlighted sections meets the requirement. + + Args: + value: a string repesenting the response. The response is expected to + contain highlighted sections in the format of *highlighted*. + + Returns: + True if the actual number of highlighted sections in the format of + *highlighed sections* meets the minimum requirement; otherwise False. 
+ """ + num_highlights = 0 + highlights = re.findall(r"\*[^\n\*]*\*", value) + double_highlights = re.findall(r"\*\*[^\n\*]*\*\*", value) + for highlight in highlights: + if highlight.strip("*").strip(): + num_highlights += 1 + for highlight in double_highlights: + if highlight.removeprefix("**").removesuffix("**").strip(): + num_highlights += 1 + + return num_highlights >= self._num_highlights + + +class SectionChecker(Instruction): + """Checks the sections.""" + + def build_description(self, *, section_spliter=None, num_sections=None): + """Build the instruction description. + + Args: + section_spliter: A string represents the section spliter keyword that + marks a new section, i.e., `Section` or `SECTION`. + num_sections: An integer specifying the number of sections. + + Returns: + A string representing the instruction description. + """ + self._section_spliter = ( + section_spliter.strip() + if isinstance(section_spliter, str) + else section_spliter + ) + if self._section_spliter is None: + self._section_spliter = random.choice(_SECTION_SPLITER) + + self._num_sections = num_sections + if self._num_sections is None or self._num_sections < 0: + self._num_sections = random.randint(1, _NUM_SECTIONS) + + self._description_pattern = ( + "Votre réponse doit comporter {num_sections} sections. 
Marquez le début " + + "de chaque section avec {section_spliter} X, comme suit :\n" + + "{section_spliter} 1\n" + + "[contenu de la section 1]\n" + + "{section_spliter} 2\n" + + "[contenu de la section 2]" + ) + + + return self._description_pattern.format( + num_sections=self._num_sections, section_spliter=self._section_spliter + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "section_spliter": self._section_spliter, + "num_sections": self._num_sections, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["section_spliter", "num_sections"] + + def check_following(self, value): + """Checks the response contains multiple sections. + + Args: + value: A string representing the response. The response is expected + to contain multiple sections (number of sections is greater than 1). + A new section starts with `Section 1`, where the number denotes the + section index. + + Returns: + True if the number of sections in the response is greater than or equal to + the minimum number of sections; otherwise, False. + """ + section_splitter_patten = r"\s?" + self._section_spliter + r"\s?\d+\s?" + sections = re.split(section_splitter_patten, value) + num_sections = len(sections) - 1 + return num_sections >= self._num_sections + + +class ParagraphChecker(Instruction): + """Checks the paragraphs.""" + + def build_description(self, *, num_paragraphs=None): + """Build the instruction description. + + Args: + num_paragraphs: An integer specifying the number of paragraphs. + + Returns: + A string representing the instruction description. + """ + self._num_paragraphs = num_paragraphs + if self._num_paragraphs is None or self._num_paragraphs < 0: + self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS) + + self._description_pattern = ( + "Il doit y avoir {num_paragraphs} paragraphes. 
" + + "Les paragraphes sont séparés par le séparateur markdown : ***" + ) + + + return self._description_pattern.format(num_paragraphs=self._num_paragraphs) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_paragraphs": self._num_paragraphs} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_paragraphs"] + + def check_following(self, value): + """Checks the response contains required number of paragraphs. + + Args: + value: A string representing the response. The response may contain + paragraphs that are separated by the markdown divider: `***`. + + Returns: + True if the actual number of paragraphs is the same as required; + otherwise, False. + """ + paragraphs = re.split(r"\s?\*\*\*\s?", value) + num_paragraphs = len(paragraphs) + + for index, paragraph in enumerate(paragraphs): + if not paragraph.strip(): + if index == 0 or index == len(paragraphs) - 1: + num_paragraphs -= 1 + else: + return False + + return num_paragraphs == self._num_paragraphs + + +class PostscriptChecker(Instruction): + """Checks the postscript.""" + + def build_description(self, *, postscript_marker=None): + """Build the instruction description. + + Args: + postscript_marker: A string containing the keyword that marks the start + of the postscript section. + + Returns: + A string representing the instruction description. 
+ """ + self._postscript_marker = ( + postscript_marker.strip() + if isinstance(postscript_marker, str) + else postscript_marker + ) + if self._postscript_marker is None: + self._postscript_marker = random.choice(_POSTSCRIPT_MARKER) + + self._description_pattern = ( + "À la fin de votre réponse, veuillez ajouter explicitement un post-scriptum " + + "commençant par {postscript}" + ) + + + return self._description_pattern.format(postscript=self._postscript_marker) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"postscript_marker": self._postscript_marker} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["postscript_marker"] + + def check_following(self, value): + """Checks if the response follows the postscript format. + + Args: + value: a string representing the response. The response is expected to + contain a postscript section. + + Returns: + True if the response contains a postscript section starting with + the keyword containing in the `instruction_args`; otherwise False. + """ + value = value.lower() + if self._postscript_marker == "P.P.S": + postscript_pattern = r"\s*p\.\s?p\.\s?s.*$" + elif self._postscript_marker == "P.S.": + postscript_pattern = r"\s*p\.\s?s\..*$" + else: + postscript_pattern = r"\s*" + self._postscript_marker.lower() + r".*$" + postscript = re.findall(postscript_pattern, value, flags=re.MULTILINE) + return True if postscript else False + + +class RephraseChecker(Instruction): + """Checks the repharse.""" + + def build_description(self, *, original_message): + """Build the instruction description. + + Args: + original_message: A string representing the original message. The + rephrased response should only change its words/sentences in between + its two asterisks, for example, *change me*. Both original and rephrased + messages should contain the changes in the form of *change me*. 
+ + Returns: + A string representing the instruction description. + """ + if not self.is_change(original_message): + raise ValueError( + f"Message {original_message} does not contain changes " + "in the form of *change me*." + ) + + self._reference_without_change = original_message + self._description = ( + "Reformulation : Votre réponse reformulée ne doit que" + + "modifier les mots/phrases entre deux astérisques" + + "comme *modifiez-moi*." + ) + + return self._description + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"original_message": self._reference_without_change} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["original_message"] + + def check_following(self, value): + r"""Checks if the rephrasing follows the instruction. + + Args: + value: A string representing the response, which is expected to rephras + the string of `instruction_args`. + + Returns: + True if `value` and `instruction_args` only differ by the words/sentences + in between two asterisks such as *change me*; otherwise, False. + """ + + if not self.is_change(value): + raise ValueError( + f"value {value} does not contain " "changes in the form of *change me*." + ) + + response_without_changes = self.strip_changes(value) + reference_without_changes = self.strip_changes(self._reference_without_change) + + return response_without_changes == reference_without_changes + + def is_change(self, response): + """Check if there is change in the response in the form of *change me*.""" + return re.search(r"\*.*\*", response) + + def strip_changes(self, response): + """Strips off the changes.""" + return re.sub(r"\*.*\*", "", response) + + +class KeywordChecker(Instruction): + """Check the exisitence of certain keywords.""" + + def build_description(self, *, keywords=None): + """Build the instruction description. 
+ + Args: + keywords: A sequence of strings representing the keywords that are + expected in the response. + + Returns: + A string representing the instruction description. + """ + + if not keywords: + self._keywords = instructions_util.generate_keywords( + num_keywords=_NUM_KEYWORDS + ) + else: + self._keywords = keywords + self._keywords = sorted(self._keywords) + + self._description_pattern = "Incluez les mots-clés {keywords} dans la réponse." + + return self._description_pattern.format(keywords=self._keywords) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"keywords": self._keywords} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["keywords"] + + def check_following(self, value): + """Check if the response contain the expected keywords.""" + for keyword in self._keywords: + if not re.search(keyword, value, flags=re.IGNORECASE): + return False + return True + + +class KeywordFrequencyChecker(Instruction): + """Check the keyword frequency.""" + + def build_description(self, *, keyword=None, frequency=None, relation=None): + """Build the instruction description. + + Args: + keyword: A string representing a keyword that is expected in the response. + frequency: An integer specifying the number of times `keyword` is expected + to appear in the response. + relation: A string in (`less than`, `at least`), defining the relational + operator for comparison. + Two relational comparisons are supported for now: + if 'less than', the actual number of occurrences < frequency; + if 'at least', the actual number of occurrences >= frequency. + + Returns: + A string representing the instruction description. 
+ """ + if not keyword: + self._keyword = instructions_util.generate_keywords(num_keywords=1)[0] + else: + self._keyword = keyword.strip() + + self._frequency = frequency + if self._frequency is None or self._frequency < 0: + self._frequency = random.randint(1, _KEYWORD_FREQUENCY) + + if relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif relation not in _COMPARISON_RELATION: + raise ValueError( + "The supported relation for comparison must be in " + f"{_COMPARISON_RELATION}, but {relation} is given." + ) + else: + self._comparison_relation = relation + + self._description_pattern = ( + "Dans votre réponse, le mot {keyword} doit apparaître {relation} " + + "{frequency} fois." + ) + + + return self._description_pattern.format( + keyword=self._keyword, + relation=self._comparison_relation, + frequency=self._frequency, + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "keyword": self._keyword, + "frequency": self._frequency, + "relation": self._comparison_relation, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["keyword", "frequency", "relation"] + + def check_following(self, value): + """Checks if the response contain the keyword with required frequency.""" + actual_occurrences = len(re.findall(self._keyword, value, flags=re.IGNORECASE)) + + if self._comparison_relation == _COMPARISON_RELATION[0]: + return actual_occurrences < self._frequency + elif self._comparison_relation == _COMPARISON_RELATION[1]: + return actual_occurrences >= self._frequency + + +class NumberOfWords(Instruction): + """Checks the number of words.""" + + def build_description(self, *, num_words=None, relation=None): + """Build the instruction description. + + Args: + num_words: An integer specifying the number of words contained in the + response. 
+ relation: A string in (`less than`, `at least`), defining the relational + operator for comparison. + Two relational comparisons are supported for now: + if 'less than', the actual number of words < num_words; + if 'at least', the actual number of words >= num_words. + + Returns: + A string representing the instruction description. + """ + + self._num_words = num_words + if self._num_words is None or self._num_words < 0: + self._num_words = random.randint( + _NUM_WORDS_LOWER_LIMIT, _NUM_WORDS_UPPER_LIMIT + ) + + if relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif relation not in _COMPARISON_RELATION: + raise ValueError( + "The supported relation for comparison must be in " + f"{_COMPARISON_RELATION}, but {relation} is given." + ) + else: + self._comparison_relation = relation + + self._description_pattern = "Répondez avec {relation} {num_words} mots." + + return self._description_pattern.format( + relation=self._comparison_relation, num_words=self._num_words + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_words": self._num_words, "relation": self._comparison_relation} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_words", "relation"] + + def check_following(self, value): + """Checks if the response contains the expected number of words.""" + num_words = instructions_util.count_words(value) + + if self._comparison_relation == _COMPARISON_RELATION[0]: + return num_words < self._num_words + elif self._comparison_relation == _COMPARISON_RELATION[1]: + return num_words >= self._num_words + + +class JsonFormat(Instruction): + """Check the Json format.""" + + def build_description(self): + self._description_pattern = ( + "La sortie entière doit être enveloppée dans le format JSON. Vous pouvez utiliser" + + " des accents graves markdown comme ```." 
+ ) + + return self._description_pattern + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + value = ( + value.strip() + .removeprefix("```json") + .removeprefix("```Json") + .removeprefix("```JSON") + .removeprefix("```") + .removesuffix("```") + .strip() + ) + try: + json.loads(value) + except ValueError: + return False + return True + + +class ParagraphFirstWordCheck(Instruction): + """Check the paragraph and the first word of the nth paragraph.""" + + def build_description( + self, num_paragraphs=None, nth_paragraph=None, first_word=None + ): + r"""Build the instruction description. + + Args: + num_paragraphs: An integer indicating the number of paragraphs expected + in the response. A paragraph is a subset of the string that is + expected to be separated by '\n\n'. + nth_paragraph: An integer indicating the paragraph number that we look at. + Note that n starts from 1. + first_word: A string that represent the first word of the bth paragraph. + + Returns: + A string representing the instruction description. + """ + self._num_paragraphs = num_paragraphs + if self._num_paragraphs is None or self._num_paragraphs < 0: + self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS) + + self._nth_paragraph = nth_paragraph + if ( + self._nth_paragraph is None + or self._nth_paragraph <= 0 + or self._nth_paragraph > self._num_paragraphs + ): + self._nth_paragraph = random.randint(1, self._num_paragraphs + 1) + + self._first_word = first_word + if self._first_word is None: + self._first_word = instructions_util.generate_keywords(num_keywords=1)[0] + self._first_word = self._first_word.lower() + + self._description_pattern = ( + "Il devrait y avoir {num_paragraphs} paragraphes. 
" + + "Les paragraphes, et seulement les paragraphes, sont séparés par deux " + + "saut de lignes comme s'il s'agissait de '\\n\\n' en python. " + + "Le paragraphe {nth_paragraph} doit commencer par le mot {first_word}." + ) + + + return self._description_pattern.format( + num_paragraphs=self._num_paragraphs, + nth_paragraph=self._nth_paragraph, + first_word=self._first_word, + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "num_paragraphs": self._num_paragraphs, + "nth_paragraph": self._nth_paragraph, + "first_word": self._first_word, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_paragraphs", "nth_paragraph", "first_word"] + + def check_following(self, value): + """Checks for required number of paragraphs and correct first word. + + Args: + value: a string representing the response. The response may contain + paragraphs that are separated by two new lines and the first word of + the nth paragraph will have to match a specified word. + + Returns: + True if the number of paragraphs is the same as required and the first + word of the specified paragraph is the same as required. Otherwise, false. + """ + + paragraphs = re.split(r"\n\n", value) + num_paragraphs = len(paragraphs) + + for paragraph in paragraphs: + if not paragraph.strip(): + num_paragraphs -= 1 + + # check that index doesn't go out of bounds + if self._nth_paragraph <= num_paragraphs: + paragraph = paragraphs[self._nth_paragraph - 1].strip() + if not paragraph: + return False + else: + return False + + first_word = "" + punctuation = {".", ",", "?", "!", "'", '"'} + + # get first word and remove punctuation + word = paragraph.split()[0].strip() + # TODO(jeffrey): make more complex? 
+ word = word.lstrip("'") + word = word.lstrip('"') + + for letter in word: + if letter in punctuation: + break + first_word += letter.lower() + + return num_paragraphs == self._num_paragraphs and first_word == self._first_word + + +# TODO(jeffrey) add relation - at least/at most? +class KeySentenceChecker(Instruction): + """Check the existence of certain key sentences.""" + + def build_description(self, key_sentences=None, num_sentences=None): + """Build the instruction description. + + Args: + key_sentences: A sequences of strings representing the key sentences that + are expected in the response. + num_sentences: The number of key sentences that are expected to be seen in + the response. + + Returns: + A string representing the instruction description. + """ + + if not key_sentences: + # TODO(jeffrey) make a generate sentences function? wonderwords package + self._key_sentences = set(["For now, this is fine."]) + else: + self._key_sentences = key_sentences + + if not num_sentences: + self._num_sentences = random.randint(1, len(self._key_sentences)) + else: + self._num_sentences = num_sentences + + self._description_pattern = ( + "Incluez {num_sentences} des phrases suivantes : {key_sentences}" + ) + + + return self._description_pattern.format( + num_sentences=self._num_sentences, key_sentences=self._key_sentences + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "num_sentences": self._num_sentences, + "key_sentences": list(self._key_sentences), + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_sentences", "key_sentences"] + + def check_following(self, value): + """Checks if the response contains the expected key sentences.""" + count = 0 + sentences = instructions_util.split_into_sentences(value) + for sentence in self._key_sentences: + if sentence in sentences: + count += 1 + + return count == self._num_sentences + + +class 
ForbiddenWords(Instruction): + """Checks that specified words are not used in response.""" + + def build_description(self, forbidden_words=None): + """Build the instruction description. + + Args: + forbidden_words: A sequences of strings respresenting words that are not + allowed in the response. + + Returns: + A string representing the instruction description. + """ + + if not forbidden_words: + self._forbidden_words = instructions_util.generate_keywords( + num_keywords=_NUM_KEYWORDS + ) + else: + self._forbidden_words = list(set(forbidden_words)) + self._forbidden_words = sorted(self._forbidden_words) + self._description_pattern = ( + "N'incluez pas les mots-clés {forbidden_words} dans la réponse." + ) + + + return self._description_pattern.format(forbidden_words=self._forbidden_words) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"forbidden_words": self._forbidden_words} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["forbidden_words"] + + def check_following(self, value): + """Check if the response does not contain the expected keywords.""" + for word in self._forbidden_words: + if re.search(r"\b" + word + r"\b", value, flags=re.IGNORECASE): + return False + return True + + +class RephraseParagraph(Instruction): + """Checks that the paragraph is rephrased.""" + + def build_description(self, *, original_paragraph, low, high): + """Builds the instruction description. + + Args: + original_paragraph: A string presenting the original paragraph. The + rephrases response should have betweeb low-high words in common. + low: An integer presenting the lower bound of similar words. + high: An integer representing the upper bound of similar words. + + Returns: + A string representing the instruction description. 
+ """ + # TODO(jeffrey) make more encompassing + self._original_paragraph = original_paragraph + self._low = low + self._high = high + + self._description = ( + "Reformulez le paragraphe suivant : " + + "{original_paragraph}\nVotre réponse doit contenir " + + "entre {low} et {high} des mêmes mots. " + + "Les mots sont les mêmes uniquement si toutes les " + + "lettres, en ignorant les majuscules, sont identiques. Par " + + "exemple, 'run' est identique à 'Run' mais différent " + + "de 'ran'." + ) + + + return self._description.format( + original_paragraph=original_paragraph, low=self._low, high=self._high + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "original_paragraph": self._original_paragraph, + "low": self._low, + "high": self._high, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["original_paragraph", "low", "high"] + + def check_following(self, value): + val_words = re.findall(r"\w+", value.lower()) + original_words = re.findall(r"\w+", self._original_paragraph.lower()) + similar_words = 0 + + dict_val = collections.Counter(val_words) + dict_original = collections.Counter(original_words) + + for word in dict_original: + similar_words += min(dict_original[word], dict_val[word]) + + return similar_words >= self._low and similar_words <= self._high + + +class TwoResponsesChecker(Instruction): + """Check that two responses were given.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "Donnez deux réponses différentes. Les réponses et seulement les réponses doivent" + " être séparées par 6 astérisques : ******." 
+ ) + + return self._description_pattern + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if the response has two different answers. + + Args: + value: A string representing the response. + + Returns: + True if two responses are detected and false otherwise. + """ + valid_responses = list() + responses = value.split("******") + for index, response in enumerate(responses): + if not response.strip(): + if index != 0 and index != len(responses) - 1: + return False + else: + valid_responses.append(response) + return ( + len(valid_responses) == 2 + and valid_responses[0].strip() != valid_responses[1].strip() + ) + + +class RepeatPromptThenAnswer(Instruction): + """Checks that Prompt is first repeated then answered.""" + + def build_description(self, *, prompt_to_repeat=None): + """Build the instruction description. + + Args: + prompt_to_repeat: The prompt that is meant to be repeated. + + Returns: + A string representing the instruction description. + """ + if not prompt_to_repeat: + raise ValueError("prompt_to_repeat must be set.") + else: + self._prompt_to_repeat = prompt_to_repeat + self._description_pattern = ( + "Répétez d'abord la demande mot pour mot sans changement," + " puis donnez votre réponse (1. ne dites aucun mot ou caractère" + " avant de répéter la demande ; 2. 
la demande que vous devez répéter" + " n'inclut pas cette phrase)" + ) + + return self._description_pattern + + def get_instruction_args(self): + return {"prompt_to_repeat": self._prompt_to_repeat} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["prompt_to_repeat"] + + def check_following(self, value): + if value.strip().lower().startswith(self._prompt_to_repeat.strip().lower()): + return True + return False + + +class EndChecker(Instruction): + """Checks that the prompt ends with a given phrase.""" + + def build_description(self, *, end_phrase=None): + """Build the instruction description. + + Args: + end_phrase: A string representing the phrase the response should end with. + + Returns: + A string representing the instruction description. + """ + self._end_phrase = ( + end_phrase.strip() if isinstance(end_phrase, str) else end_phrase + ) + if self._end_phrase is None: + self._end_phrase = random.choice(_ENDING_OPTIONS) + self._description_pattern = ( + "Terminez votre réponse par cette phrase exacte {ender}. " + "Aucun autre mot ne doit suivre cette phrase." + ) + + return self._description_pattern.format(ender=self._end_phrase) + + def get_instruction_args(self): + return {"end_phrase": self._end_phrase} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["end_phrase"] + + def check_following(self, value): + """Checks if the response ends with the expected phrase.""" + value = value.strip().strip('"').lower() + self._end_phrase = self._end_phrase.strip().lower() + return value.endswith(self._end_phrase) + + +class TitleChecker(Instruction): + """Checks the response for a title.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "Votre réponse doit contenir un titre, encadré par des doubles chevrons," + " comme <>." 
+ ) + + return self._description_pattern + + def get_instruction_args(self): + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if the response contains a title.""" + pattern = r"<<[^\n]+>>" + re_pattern = re.compile(pattern) + titles = re.findall(re_pattern, value) + + for title in titles: + if title.lstrip("<").rstrip(">").strip(): + return True + return False + + +class LetterFrequencyChecker(Instruction): + """Checks letter frequency.""" + + def build_description(self, *, letter=None, let_frequency=None, let_relation=None): + """Build the instruction description. + + Args: + letter: A string representing a letter that is expected in the response. + let_frequency: An integer specifying the number of times `keyword` is + expected to appear in the response. + let_relation: A string in (`less than`, `at least`), defining the + relational operator for comparison. Two relational comparisons are + supported for now; if 'less than', the actual number of + occurrences < frequency; if 'at least', the actual number of + occurrences >= frequency. + + Returns: + A string representing the instruction description. + """ + if ( + not letter + or len(letter) > 1 + or ord(letter.lower()) < 97 + or ord(letter.lower()) > 122 + ): + self._letter = random.choice(list(string.ascii_letters)) + else: + self._letter = letter.strip() + self._letter = self._letter.lower() + + self._frequency = let_frequency + if self._frequency is None or self._frequency < 0: + self._frequency = random.randint(1, _LETTER_FREQUENCY) + + if let_relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif let_relation not in _COMPARISON_RELATION: + raise ValueError( + "The supported relation for comparison must be in " + f"{_COMPARISON_RELATION}, but {let_relation} is given." 
+ ) + else: + self._comparison_relation = let_relation + + self._description_pattern = ( + "Dans votre réponse, la lettre {letter} doit apparaître {let_relation} " + " {let_frequency} fois." + ) + + + return self._description_pattern.format( + letter=self._letter, + let_frequency=self._frequency, + let_relation=self._comparison_relation, + ) + + def get_instruction_args(self): + """Returns the keyword args of build description.""" + return { + "letter": self._letter, + "let_frequency": self._frequency, + "let_relation": self._comparison_relation, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["letter", "let_frequency", "let_relation"] + + def check_following(self, value): + """Checks that the response contains the letter at the right frequency.""" + value = value.lower() + letters = collections.Counter(value) + + if self._comparison_relation == _COMPARISON_RELATION[0]: + return letters[self._letter] < self._frequency + else: + return letters[self._letter] >= self._frequency + + +class CapitalLettersEnglishChecker(Instruction): + """Checks that the response is in english and is in all capital letters.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "Votre réponse entière doit être en français, et en lettres majuscules." + ) + + return self._description_pattern + + def get_instruction_args(self): + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks that the response is in English and in all capital letters.""" + assert isinstance(value, str) + + try: + return value.isupper() and langdetect.detect(value) == "en" + except langdetect.LangDetectException as e: + # Count as instruction is followed. 
+ logging.error( + "Unable to detect language for text %s due to %s", value, e + ) # refex: disable=pytotw.037 + return True + + +class LowercaseLettersEnglishChecker(Instruction): + """Checks that the response is in english and is in all lowercase letters.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "Votre réponse entière doit être en français, et en lettres minuscules." + " Aucune majuscule n'est autorisée." + ) + + return self._description_pattern + + def get_instruction_args(self): + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks that the response is in English and in all lowercase letters.""" + assert isinstance(value, str) + + try: + return value.islower() and langdetect.detect(value) == "en" + except langdetect.LangDetectException as e: + # Count as instruction is followed. + logging.error( + "Unable to detect language for text %s due to %s", value, e + ) # refex: disable=pytotw.037 + return True + + +class CommaChecker(Instruction): + """Checks the response for no commas.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "Dans toute votre réponse, évitez l'utilisation de virgules." + ) + + return self._description_pattern + + def get_instruction_args(self): + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks that the response does not contain commas.""" + return not re.search(r"\,", value) + + +class CapitalWordFrequencyChecker(Instruction): + """Checks frequency of words with all capital letters.""" + + def build_description( + self, + capital_frequency=None, + capital_relation=None, + ): + """Build the instruction description. 
+
+        Args:
+          capital_frequency: An integer that represents the number of words that
+            should be in all capital letters.
+          capital_relation: A string that is 'less than' or 'at least' that refers
+            to the frequency.
+
+        Returns:
+          A string representing the instruction description.
+        """
+        self._frequency = capital_frequency
+        if self._frequency is None:
+            self._frequency = random.randint(1, _ALL_CAPITAL_WORD_FREQUENCY)
+
+        self._comparison_relation = capital_relation
+        if capital_relation is None:
+            self._comparison_relation = random.choice(_COMPARISON_RELATION)
+        elif capital_relation not in _COMPARISON_RELATION:
+            raise ValueError(
+                "The supported relation for comparison must be in "
+                f"{_COMPARISON_RELATION}, but {capital_relation} is given."
+            )
+
+        self._description_pattern = (
+            "Dans votre réponse, les mots en majuscules doivent apparaître"
+            " {relation} {frequency} fois."
+        )
+
+
+        return self._description_pattern.format(
+            frequency=self._frequency, relation=self._comparison_relation
+        )
+
+    def get_instruction_args(self):
+        """Returns the keyword args of build description."""
+        return {
+            "capital_frequency": self._frequency,
+            "capital_relation": self._comparison_relation,
+        }
+
+    def get_instruction_args_keys(self):
+        """Returns the args keys of `build_description`."""
+        return ["capital_frequency", "capital_relation"]
+
+    def check_following(self, value):
+        """Checks the frequency of words with all capital letters."""
+        # Hyphenated words will count as one word
+        words = instructions_util.nltk.word_tokenize(value)
+        capital_words = [word for word in words if word.isupper()]
+
+        capital_words = len(capital_words)
+
+        if self._comparison_relation == _COMPARISON_RELATION[0]:
+            return capital_words < self._frequency
+        else:
+            return capital_words >= self._frequency
+
+
+class QuotationChecker(Instruction):
+    """Checks response is wrapped with double quotation marks."""
+
+    def build_description(self):
+        """Build the instruction description."""
+        self._description_pattern = (
+            "Entourez votre réponse entière de guillemets doubles."
+        )
+
+        return self._description_pattern
+
+    def get_instruction_args(self):
+        """Returns the keyword args of build description."""
+        return None
+
+    def get_instruction_args_keys(self):
+        """Returns the args keys of `build_description`."""
+        return []
+
+    def check_following(self, value):
+        """Checks if the response is wrapped with double quotation marks."""
+        value = value.strip()
+        return len(value) > 1 and value[0] == '"' and value[-1] == '"'
diff --git a/lm_eval/tasks/leaderboard-french/ifeval/instructions_registry.py b/lm_eval/tasks/leaderboard-french/ifeval/instructions_registry.py
new file mode 100644
index 0000000000..00d9a1de19
--- /dev/null
+++ b/lm_eval/tasks/leaderboard-french/ifeval/instructions_registry.py
@@ -0,0 +1,168 @@
+# Copyright 2023 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +"""Registry of all instructions.""" + +from lm_eval.tasks.ifeval import instructions + + +_KEYWORD = "keywords:" + +_LANGUAGE = "language:" + +_LENGTH = "length_constraints:" + +_CONTENT = "detectable_content:" + +_FORMAT = "detectable_format:" + +_MULTITURN = "multi-turn:" + +_COMBINATION = "combination:" + +_STARTEND = "startend:" + +_CHANGE_CASES = "change_case:" + +_PUNCTUATION = "punctuation:" + +INSTRUCTION_DICT = { + _KEYWORD + "existence": instructions.KeywordChecker, + _KEYWORD + "frequency": instructions.KeywordFrequencyChecker, + # TODO(jeffreyzhou): make a proper set of sentences to choose from + # _KEYWORD + "key_sentences": instructions.KeySentenceChecker, + _KEYWORD + "forbidden_words": instructions.ForbiddenWords, + _KEYWORD + "letter_frequency": instructions.LetterFrequencyChecker, + _LANGUAGE + "response_language": instructions.ResponseLanguageChecker, + _LENGTH + "number_sentences": instructions.NumberOfSentences, + _LENGTH + "number_paragraphs": instructions.ParagraphChecker, + _LENGTH + "number_words": instructions.NumberOfWords, + _LENGTH + "nth_paragraph_first_word": instructions.ParagraphFirstWordCheck, + _CONTENT + "number_placeholders": instructions.PlaceholderChecker, + _CONTENT + "postscript": instructions.PostscriptChecker, + _FORMAT + "number_bullet_lists": instructions.BulletListChecker, + # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace + # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph, + _FORMAT + "constrained_response": instructions.ConstrainedResponseChecker, + _FORMAT + "number_highlighted_sections": (instructions.HighlightSectionChecker), + _FORMAT + "multiple_sections": instructions.SectionChecker, + # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message. + # _FORMAT + "rephrase": instructions.RephraseChecker, + _FORMAT + "json_format": instructions.JsonFormat, + _FORMAT + "title": instructions.TitleChecker, + # TODO(tianjianlu): Re-enable with specific prompts. 
+ # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker, + _COMBINATION + "two_responses": instructions.TwoResponsesChecker, + _COMBINATION + "repeat_prompt": instructions.RepeatPromptThenAnswer, + _STARTEND + "end_checker": instructions.EndChecker, + _CHANGE_CASES + "capital_word_frequency": instructions.CapitalWordFrequencyChecker, + _CHANGE_CASES + "english_capital": instructions.CapitalLettersEnglishChecker, + _CHANGE_CASES + "english_lowercase": instructions.LowercaseLettersEnglishChecker, + _PUNCTUATION + "no_comma": instructions.CommaChecker, + _STARTEND + "quotation": instructions.QuotationChecker, +} + +INSTRUCTION_CONFLICTS = { + _KEYWORD + "existence": {_KEYWORD + "existence"}, + _KEYWORD + "frequency": {_KEYWORD + "frequency"}, + # TODO(jeffreyzhou): make a proper set of sentences to choose from + # _KEYWORD + "key_sentences": instructions.KeySentenceChecker, + _KEYWORD + "forbidden_words": {_KEYWORD + "forbidden_words"}, + _KEYWORD + "letter_frequency": {_KEYWORD + "letter_frequency"}, + _LANGUAGE + "response_language": { + _LANGUAGE + "response_language", + _FORMAT + "multiple_sections", + _KEYWORD + "existence", + _KEYWORD + "frequency", + _KEYWORD + "forbidden_words", + _STARTEND + "end_checker", + _CHANGE_CASES + "english_capital", + _CHANGE_CASES + "english_lowercase", + }, + _LENGTH + "number_sentences": {_LENGTH + "number_sentences"}, + _LENGTH + "number_paragraphs": { + _LENGTH + "number_paragraphs", + _LENGTH + "nth_paragraph_first_word", + _LENGTH + "number_sentences", + _LENGTH + "nth_paragraph_first_word", + }, + _LENGTH + "number_words": {_LENGTH + "number_words"}, + _LENGTH + "nth_paragraph_first_word": { + _LENGTH + "nth_paragraph_first_word", + _LENGTH + "number_paragraphs", + }, + _CONTENT + "number_placeholders": {_CONTENT + "number_placeholders"}, + _CONTENT + "postscript": {_CONTENT + "postscript"}, + _FORMAT + "number_bullet_lists": {_FORMAT + "number_bullet_lists"}, + # TODO(jeffreyzhou): Pre-create paragraph or 
use prompt to replace + # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph, + _FORMAT + "constrained_response": set(INSTRUCTION_DICT.keys()), + _FORMAT + "number_highlighted_sections": {_FORMAT + "number_highlighted_sections"}, + _FORMAT + "multiple_sections": { + _FORMAT + "multiple_sections", + _LANGUAGE + "response_language", + _FORMAT + "number_highlighted_sections", + }, + # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message. + # _FORMAT + "rephrase": instructions.RephraseChecker, + _FORMAT + "json_format": set(INSTRUCTION_DICT.keys()).difference( + {_KEYWORD + "forbidden_words", _KEYWORD + "existence"} + ), + _FORMAT + "title": {_FORMAT + "title"}, + # TODO(tianjianlu): Re-enable with specific prompts. + # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker, + _COMBINATION + "two_responses": set(INSTRUCTION_DICT.keys()).difference( + { + _KEYWORD + "forbidden_words", + _KEYWORD + "existence", + _LANGUAGE + "response_language", + _FORMAT + "title", + _PUNCTUATION + "no_comma", + } + ), + _COMBINATION + "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference( + {_KEYWORD + "existence", _FORMAT + "title", _PUNCTUATION + "no_comma"} + ), + _STARTEND + "end_checker": {_STARTEND + "end_checker"}, + _CHANGE_CASES + "capital_word_frequency": { + _CHANGE_CASES + "capital_word_frequency", + _CHANGE_CASES + "english_lowercase", + _CHANGE_CASES + "english_capital", + }, + _CHANGE_CASES + "english_capital": {_CHANGE_CASES + "english_capital"}, + _CHANGE_CASES + "english_lowercase": { + _CHANGE_CASES + "english_lowercase", + _CHANGE_CASES + "english_capital", + }, + _PUNCTUATION + "no_comma": {_PUNCTUATION + "no_comma"}, + _STARTEND + "quotation": {_STARTEND + "quotation", _FORMAT + "title"}, +} + + +def conflict_make(conflicts): + """Makes sure if A conflicts with B, B will conflict with A. 
+ + Args: + conflicts: Dictionary of potential conflicts where key is instruction id + and value is set of instruction ids that it conflicts with. + + Returns: + Revised version of the dictionary. All instructions conflict with + themselves. If A conflicts with B, B will conflict with A. + """ + for key in conflicts: + for k in conflicts[key]: + conflicts[k].add(key) + conflicts[key].add(key) + return conflicts diff --git a/lm_eval/tasks/leaderboard-french/ifeval/instructions_util.py b/lm_eval/tasks/leaderboard-french/ifeval/instructions_util.py new file mode 100644 index 0000000000..3e85d4c036 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/ifeval/instructions_util.py @@ -0,0 +1,1679 @@ +# Copyright 2023 The Google Research Authors. +# +# Sous licence Apache, Version 2.0 (la "Licence"); +# vous ne pouvez pas utiliser ce fichier sauf en conformité avec la Licence. +# Vous pouvez obtenir une copie de la Licence à l'adresse suivante : +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Sauf si requis par la loi applicable ou accepté par écrit, le logiciel distribué sous la Licence est distribué "EN L'ÉTAT", +# SANS GARANTIE NI CONDITION D'AUCUNE SORTE, expresse ou implicite. +# Consultez la Licence pour les termes spécifiques régissant les autorisations et limitations sous la Licence. 
+ +"""Bibliothèque utilitaire d'instructions.""" + +import functools +import random +import re + +import immutabledict +import nltk + + +def download_nltk_resources(): + """Télécharge 'punkt' si non déjà installé.""" + try: + nltk.data.find("tokenizers/punkt") + except LookupError: + nltk.download("punkt") + + +download_nltk_resources() + +WORD_LIST = [ + "ouest", + "phrase", + "signal", + "décharge", + "point", + "opposé", + "fond", + "pomme de terre", + "administration", + "travail", + "bienvenue", + "matin", + "bon", + "agence", + "primaire", + "souhait", + "responsabilité", + "presse", + "problème", + "président", + "voler", + "brosser", + "lire", + "type", + "battre", + "entraîneur", + "croissance", + "verrou", + "os", + "cas", + "égal", + "confortable", + "région", + "remplacement", + "performance", + "compagnon", + "marcher", + "médecine", + "film", + "chose", + "roche", + "taper", + "total", + "compétition", + "facilité", + "sud", + "établissement", + "réunir", + "stationnement", + "monde", + "beaucoup", + "souffle", + "revendication", + "alcool", + "commerce", + "cher", + "surligner", + "rue", + "importance", + "décision", + "désordre", + "accord", + "studio", + "coach", + "aider", + "cerveau", + "aile", + "style", + "privé", + "haut", + "marron", + "jambe", + "acheter", + "procédure", + "méthode", + "vitesse", + "élevé", + "entreprise", + "précieux", + "tarte", + "analyste", + "session", + "modèle", + "district", + "plaisir", + "dîner", + "nage", + "blague", + "ordre", + "assiette", + "département", + "moteur", + "cellule", + "dépenser", + "armoire", + "différence", + "puissance", + "examen", + "moteur", + "cheval", + "dimension", + "payer", + "orteil", + "courbe", + "littérature", + "gêner", + "feu", + "possibilité", + "débat", + "activité", + "passage", + "bonjour", + "cycle", + "arrière-plan", + "silence", + "auteur", + "effet", + "acteur", + "page", + "vélo", + "erreur", + "gorge", + "attaque", + "personnage", + "téléphone", + "thé", + "augmentation", 
+ "résultat", + "fichier", + "spécifique", + "inspecteur", + "interne", + "potentiel", + "personnel", + "bâtiment", + "employeur", + "chaussure", + "main", + "direction", + "jardin", + "achat", + "entretien", + "étude", + "reconnaissance", + "membre", + "spirituel", + "four", + "sandwich", + "étrange", + "passager", + "particulier", + "réponse", + "réaction", + "taille", + "variation", + "un", + "annuler", + "bonbon", + "sortie", + "invité", + "condition", + "voler", + "prix", + "faiblesse", + "convertir", + "hôtel", + "génial", + "bouche", + "esprit", + "chanson", + "sucre", + "suspect", + "téléphone", + "oreille", + "toit", + "peinture", + "réfrigérateur", + "organisation", + "jury", + "récompense", + "ingénierie", + "jour", + "possession", + "équipage", + "bar", + "route", + "description", + "célébration", + "score", + "marque", + "lettre", + "douche", + "suggestion", + "monsieur", + "chance", + "national", + "progrès", + "salle", + "coup", + "théorie", + "offre", + "histoire", + "impôt", + "définition", + "histoire", + "balade", + "moyen", + "ouverture", + "verre", + "ascenseur", + "estomac", + "question", + "capacité", + "principal", + "village", + "ordinateur", + "ville", + "grand", + "confiance", + "bougie", + "prêtre", + "recommandation", + "point", + "nécessaire", + "corps", + "bureau", + "secret", + "horreur", + "bruit", + "culture", + "avertissement", + "eau", + "rond", + "régime", + "fleur", + "bus", + "dur", + "permission", + "semaine", + "rapide", + "connexion", + "abus", + "hauteur", + "sauvegarder", + "coin", + "frontière", + "stress", + "conduire", + "arrêter", + "déchirer", + "repas", + "écouter", + "confusion", + "petite amie", + "vie", + "relation", + "signification", + "plan", + "créatif", + "atmosphère", + "blâmer", + "inviter", + "logement", + "papier", + "boisson", + "rouleau", + "argent", + "ivre", + "âge", + "dommage", + "fumée", + "environnement", + "paquet", + "économies", + "influence", + "touriste", + "pluie", + "poste", + "signe", + 
"grand-mère", + "courir", + "profit", + "pousser", + "commis", + "final", + "vin", + "nager", + "pause", + "affaires", + "chanteur", + "funérailles", + "moyenne", + "source", + "scène", + "tradition", + "personnel", + "neige", + "personne", + "distance", + "type", + "sensible", + "animal", + "majeur", + "négociation", + "clic", + "humeur", + "période", + "arrivée", + "expression", + "vacances", + "répéter", + "poussière", + "placard", + "or", + "mauvais", + "naviguer", + "combinaison", + "vêtements", + "emphase", + "devoir", + "noir", + "étape", + "école", + "sauter", + "document", + "professionnel", + "lèvre", + "chimique", + "devant", + "réveil", + "pendant", + "intérieur", + "regarder", + "rangée", + "sujet", + "pénalité", + "équilibre", + "possible", + "adulte", + "côté", + "échantillon", + "appel", + "mariage", + "profondeur", + "roi", + "prix", + "femme", + "coup", + "site", + "camp", + "musique", + "sûr", + "cadeau", + "faute", + "deviner", + "acte", + "honte", + "drame", + "capital", + "examen", + "stupide", + "enregistrement", + "son", + "balancer", + "roman", + "minimum", + "ratio", + "machine", + "forme", + "diriger", + "opération", + "salaire", + "nuage", + "affaire", + "frapper", + "chapitre", + "scène", + "quantité", + "accès", + "armée", + "chaîne", + "trafic", + "coup de pied", + "analyse", + "aéroport", + "temps", + "vacances", + "philosophie", + "balle", + "poitrine", + "merci", + "lieu", + "montagne", + "publicité", + "rouge", + "passé", + "loyer", + "retour", + "tour", + "maison", + "construction", + "filet", + "natif", + "guerre", + "figure", + "frais", + "pulvérisation", + "utilisateur", + "poussière", + "coup", + "tâche", + "bâton", + "ami", + "logiciel", + "promotion", + "interaction", + "environner", + "bloc", + "but", + "pratique", + "conflit", + "routine", + "exigence", + "bonus", + "trou", + "état", + "junior", + "doux", + "attraper", + "larme", + "pli", + "mur", + "éditeur", + "vie", + "position", + "livre", + "respect", + "salle de 
bain", + "manteau", + "scénario", + "travail", + "enseigner", + "naissance", + "vue", + "résoudre", + "thème", + "employé", + "doute", + "marché", + "éducation", + "servir", + "récupérer", + "ton", + "mal", + "manquer", + "union", + "compréhension", + "vache", + "rivière", + "association", + "concept", + "formation", + "recette", + "relation", + "réserve", + "dépression", + "preuve", + "cheveux", + "revenu", + "indépendant", + "ascenseur", + "affectation", + "temporaire", + "montant", + "perte", + "bord", + "piste", + "vérifier", + "corde", + "estimation", + "pollution", + "stable", + "message", + "livraison", + "perspective", + "miroir", + "assistant", + "représentant", + "témoin", + "nature", + "juge", + "fruit", + "conseil", + "diable", + "ville", + "urgence", + "supérieur", + "chute", + "séjour", + "humain", + "cou", + "haut-parleur", + "réseau", + "chanter", + "résister", + "ligue", + "voyage", + "signature", + "avocat", + "importance", + "gaz", + "choix", + "ingénieur", + "succès", + "partie", + "externe", + "travailleur", + "simple", + "quart", + "étudiant", + "cœur", + "passer", + "mépris", + "changement", + "dur", + "dame", + "herbe", + "communauté", + "garage", + "jeunesse", + "standard", + "jupe", + "promesse", + "aveugle", + "télévision", + "maladie", + "commission", + "positif", + "énergie", + "calme", + "présence", + "accord", + "base", + "préférence", + "tête", + "commun", + "coupure", + "quelque part", + "présentation", + "actuel", + "pensée", + "révolution", + "effort", + "maître", + "mettre en œuvre", + "république", + "sol", + "principe", + "étranger", + "épaule", + "classe", + "bouton", + "tennis", + "police", + "collection", + "compte", + "inscription", + "gants", + "diviser", + "professeur", + "chaise", + "priorité", + "combiner", + "paix", + "extension", + "peut-être", + "soirée", + "cadre", + "sœur", + "vague", + "code", + "application", + "souris", + "allumette", + "comptoir", + "bouteille", + "moitié", + "joue", + "résolution", + 
"arrière", + "connaissance", + "faire", + "discussion", + "vis", + "longueur", + "accident", + "bataille", + "robe", + "genou", + "journal", + "paquet", + "il", + "tourner", + "audition", + "journal", + "couche", + "richesse", + "profil", + "imagination", + "réponse", + "week-end", + "enseignant", + "apparence", + "rencontre", + "vélo", + "croître", + "ceinture", + "crash", + "bol", + "équivalent", + "soutien", + "image", + "poème", + "risque", + "excitation", + "télécommande", + "secrétaire", + "public", + "produire", + "avion", + "affichage", + "argent", + "sable", + "situation", + "poing", + "client", + "titre", + "secouer", + "hypothèque", + "option", + "numéro", + "pop", + "fenêtre", + "étendue", + "rien", + "expérience", + "opinion", + "départ", + "danse", + "indication", + "garçon", + "matériel", + "bande", + "leader", + "soleil", + "beau", + "muscle", + "fermier", + "variété", + "graisse", + "poignée", + "directeur", + "opportunité", + "calendrier", + "extérieur", + "rythme", + "bain", + "poisson", + "conséquence", + "mettre", + "propriétaire", + "aller", + "docteur", + "information", + "partager", + "blesser", + "protection", + "carrière", + "finance", + "force", + "golf", + "ordures", + "aspect", + "enfant", + "nourriture", + "bottes", + "lait", + "répondre", + "objectif", + "réalité", + "brut", + "anneau", + "centre commercial", + "un", + "impact", + "zone", + "nouvelles", + "international", + "série", + "impressionner", + "mère", + "abri", + "frappe", + "prêt", + "mois", + "siège", + "tout", + "divertissement", + "familier", + "indice", + "année", + "heureux", + "supermarché", + "naturel", + "dieu", + "coût", + "conversation", + "cravate", + "ruiner", + "confort", + "terre", + "tempête", + "pourcentage", + "assistance", + "budget", + "force", + "début", + "sommeil", + "autre", + "jeune", + "unité", + "remplir", + "magasin", + "désir", + "cacher", + "valeur", + "tasse", + "entretien", + "infirmière", + "fonction", + "tour", + "rôle", + "classe", + 
"caméra", + "base de données", + "panique", + "nation", + "panier", + "glace", + "art", + "esprit", + "graphique", + "échange", + "retour", + "réputation", + "recherche", + "chasse", + "exercice", + "méchant", + "remarque", + "mâle", + "cour", + "annuel", + "col", + "date", + "plateforme", + "plante", + "fortune", + "passion", + "amitié", + "propagation", + "cancer", + "billet", + "attitude", + "île", + "actif", + "objet", + "service", + "acheteur", + "morsure", + "carte", + "visage", + "steak", + "proposition", + "patient", + "chaleur", + "règle", + "résident", + "large", + "politique", + "ouest", + "couteau", + "expert", + "fille", + "design", + "sel", + "base-ball", + "attraper", + "inspection", + "cousin", + "couple", + "magazine", + "cuisinier", + "dépendant", + "sécurité", + "poulet", + "version", + "devise", + "échelle", + "schéma", + "cuisine", + "emploi", + "local", + "attention", + "directeur", + "fait", + "couvrir", + "triste", + "garde", + "parent", + "comté", + "taux", + "déjeuner", + "programme", + "initiative", + "équipement", + "pont", + "poitrine", + "parler", + "plat", + "garantie", + "bière", + "véhicule", + "réception", + "femme", + "substance", + "copie", + "conférence", + "avantage", + "parc", + "froid", + "mort", + "mélange", + "tenir", + "échelle", + "demain", + "sang", + "demande", + "vert", + "biscuit", + "église", + "bande", + "toujours", + "au-delà", + "dette", + "s'occuper", + "laver", + "suivant", + "ressentir", + "maximum", + "secteur", + "mer", + "propriété", + "économie", + "menu", + "banc", + "essayer", + "langue", + "départ", + "appel", + "solide", + "adresse", + "revenu", + "pied", + "supérieur", + "chéri", + "peu", + "mélange", + "argent", + "épicerie", + "lien", + "carte", + "formulaire", + "facteur", + "pot", + "modèle", + "écrivain", + "ferme", + "hiver", + "compétence", + "partout", + "anniversaire", + "politique", + "sortie", + "mari", + "laboratoire", + "se dépêcher", + "courrier", + "équipement", + "évier", + "paire", + 
"conducteur", + "considération", + "cuir", + "peau", + "bleu", + "bateau", + "vente", + "brique", + "deux", + "nourrir", + "carré", + "point", + "se précipiter", + "rêve", + "emplacement", + "après-midi", + "fabricant", + "contrôle", + "occasion", + "problème", + "introduction", + "conseil", + "parier", + "manger", + "tuer", + "catégorie", + "manière", + "bureau", + "propriété", + "fierté", + "conscience", + "glisser", + "fissure", + "client", + "ongle", + "tirer", + "adhésion", + "doux", + "n'importe qui", + "web", + "officiel", + "individuel", + "pizza", + "intérêt", + "sac", + "épeler", + "profession", + "reine", + "accord", + "ressource", + "bateau", + "gars", + "chocolat", + "joint", + "formel", + "en haut", + "voiture", + "station", + "étranger", + "dépositaire", + "partenaire", + "doigt", + "chirurgie", + "commentaire", + "équipe", + "détail", + "fou", + "chemin", + "histoire", + "initiale", + "bras", + "radio", + "demande", + "simple", + "tirer", + "jaune", + "concours", + "morceau", + "citation", + "tirer", + "commercial", + "chemise", + "contribution", + "crème", + "chaîne", + "costume", + "discipline", + "instruction", + "concert", + "discours", + "bas", + "efficace", + "accrocher", + "gratter", + "industrie", + "petit déjeuner", + "poser", + "joindre", + "métal", + "chambre", + "minute", + "produit", + "repos", + "température", + "beaucoup", + "donner", + "discussion", + "imprimer", + "violet", + "rire", + "santé", + "crédit", + "investissement", + "vendre", + "réglage", + "leçon", + "œuf", + "milieu", + "mariage", + "niveau", + "preuve", + "phrase", + "amour", + "soi", + "avantage", + "orientation", + "affecter", + "toi", + "papa", + "anxiété", + "spécial", + "petit ami", + "test", + "vide", + "paiement", + "soupe", + "obligation", + "répondre", + "sourire", + "profond", + "plainte", + "ajout", + "révision", + "boîte", + "serviette", + "mineur", + "plaisir", + "sol", + "problème", + "cigarette", + "internet", + "gain", + "dire", + "entrée", + 
"supplémentaire", + "incident", + "famille", + "refuser", + "branche", + "peut", + "stylo", + "grand-père", + "constant", + "réservoir", + "oncle", + "climat", + "sol", + "volume", + "communication", + "genre", + "poète", + "enfant", + "écran", + "le mien", + "arrêter", + "gène", + "manque", + "charité", + "mémoire", + "dent", + "peur", + "mentionner", + "marketing", + "révéler", + "raison", + "cour", + "saison", + "liberté", + "terre", + "sport", + "public", + "salle de classe", + "loi", + "crochet", + "gagner", + "porter", + "œil", + "odeur", + "distribution", + "recherche", + "pays", + "oser", + "espoir", + "alors que", + "étirement", + "bibliothèque", + "si", + "retard", + "collège", + "plastique", + "livre", + "présent", + "utilisation", + "inquiétude", + "champion", + "but", + "économie", + "mars", + "élection", + "réflexion", + "minuit", + "diapositive", + "inflation", + "action", + "défi", + "guitare", + "côte", + "pomme", + "campagne", + "champ", + "veste", + "sens", + "chemin", + "visuel", + "retirer", + "temps", + "ordures", + "câble", + "regret", + "copain", + "plage", + "historien", + "courage", + "sympathie", + "camion", + "tension", + "permis", + "nez", + "lit", + "fils", + "personne", + "base", + "viande", + "habituel", + "air", + "réunion", + "valeur", + "jeu", + "indépendance", + "physique", + "court", + "jouer", + "soulever", + "tableau", + "elle", + "clé", + "écriture", + "choisir", + "commande", + "fête", + "hier", + "printemps", + "candidat", + "physique", + "université", + "préoccupation", + "développement", + "changement", + "chaîne", + "objectif", + "exemple", + "salle", + "amer", + "oiseau", + "football", + "normal", + "diviser", + "impression", + "bois", + "long", + "sens", + "stock", + "casquette", + "leadership", + "médias", + "ambition", + "pêche", + "essai", + "salade", + "réparation", + "aujourd'hui", + "designer", + "nuit", + "banque", + "dessin", + "inévitable", + "phase", + "vaste", + "puce", + "colère", + "interrupteur", + 
"crier", + "tordre", + "personnalité", + "tentative", + "stockage", + "être", + "préparation", + "chauve-souris", + "sélection", + "blanc", + "technologie", + "contrat", + "côté", + "section", + "station", + "jusqu'à", + "structure", + "langue", + "goût", + "vérité", + "difficulté", + "groupe", + "limite", + "principal", + "mouvement", + "sentiment", + "lumière", + "exemple", + "mission", + "pouvoir", + "attendre", + "roue", + "magasin", + "hôte", + "classique", + "alternative", + "cause", + "agent", + "consister", + "table", + "compagnie aérienne", + "texte", + "piscine", + "métier", + "gamme", + "carburant", + "outil", + "partenaire", + "charge", + "entrée", + "dépôt", + "haine", + "article", + "vidéo", + "été", + "caractéristique", + "extrême", + "mobile", + "hôpital", + "vol", + "chute", + "pension", + "piano", + "échouer", + "résultat", + "frotter", + "écart", + "système", + "rapport", + "aspirer", + "ordinaire", + "vent", + "nerf", + "demander", + "briller", + "note", + "ligne", + "maman", + "perception", + "frère", + "référence", + "plier", + "charge", + "traiter", + "tromperie", + "terme", + "devoirs", + "cuire", + "enchère", + "statut", + "projet", + "stratégie", + "orange", + "laisser", + "enthousiasme", + "parent", + "concentrer", + "appareil", + "voyage", + "poésie", + "entreprise", + "société", + "baiser", + "fin", + "légume", + "employé", + "calendrier", + "heure", + "courageux", + "focus", + "processus", + "film", + "illégal", + "général", + "café", + "publicité", + "autoroute", + "chimie", + "psychologie", + "embaucher", + "cloche", + "conférence", + "soulagement", + "montrer", + "propre", + "drôle", + "poids", + "qualité", + "club", + "fille", + "zone", + "toucher", + "ce soir", + "choc", + "brûler", + "excuse", + "nom", + "enquête", + "paysage", + "avancer", + "satisfaction", + "pain", + "catastrophe", + "article", + "chapeau", + "précédent", + "courses", + "visite", + "est", + "photo", + "maison", + "idée", + "père", + "comparaison", + "chat", + 
"tuyau", + "gagnant", + "compter", + "lac", + "combattre", + "prix", + "fondation", + "chien", + "garder", + "idéal", + "ventilateur", + "lutte", + "pic", + "sécurité", + "solution", + "enfer", + "conclusion", + "population", + "contrainte", + "alarme", + "mesure", + "seconde", + "train", + "course", + "attente", + "assurance", + "patron", + "arbre", + "écran", + "malade", + "cours", + "tirer", + "rendez-vous", + "tranche", + "encore", + "soin", + "patience", + "riche", + "s'échapper", + "émotion", + "royal", + "femelle", + "enfance", + "gouvernement", + "photo", + "volonté", + "chaussette", + "grand", + "porte", + "huile", + "croix", + "épingle", + "amélioration", + "championnat", + "stupide", + "aide", + "ciel", + "pitch", + "homme", + "diamant", + "plus", + "transition", + "travail", + "science", + "comité", + "moment", + "réparation", + "enseignement", + "creuser", + "spécialiste", + "complexe", + "guide", + "personnes", + "mort", + "voix", + "original", + "pause", + "sujet", + "données", + "degré", + "lecture", + "enregistrement", + "groupe", + "atteindre", + "jugement", + "mensonge", + "régulier", + "ensemble", + "peinture", + "mode", + "liste", + "joueur", + "ours", + "nord", + "merveille", + "tapis", + "lourd", + "officier", + "négatif", + "horloge", + "unique", + "bébé", + "douleur", + "supposition", + "disque", + "fer", + "facture", + "tiroir", + "regard", + "double", + "erreur", + "finir", + "avenir", + "brillant", + "contact", + "maths", + "riz", + "laisser", + "restaurant", + "réduction", + "sexe", + "virus", + "morceau", + "confiance", + "événement", + "porter", + "jus", + "échec", + "bogue", + "contexte", + "boue", + "tout", + "envelopper", + "intention", + "brouillon", + "pression", + "gâteau", + "sombre", + "explication", + "espace", + "angle", + "mot", + "efficacité", + "gestion", + "habitude", + "étoile", + "chance", + "découverte", + "transport", + "support", + "critique", + "flux", + "porte", + "blessure", + "insecte", + "surprise", + 
"appartement", +] # pylint: disable=line-too-long + +# Codes ISO 639-1 pour les noms de langue. +LANGUAGE_CODES = immutabledict.immutabledict( + { + "en": "Anglais", + "es": "Espagnol", + "pt": "Portugais", + "ar": "Arabe", + "hi": "Hindi", + "fr": "Français", + "ru": "Russe", + "de": "Allemand", + "ja": "Japonais", + "it": "Italien", + "bn": "Bengali", + "uk": "Ukrainien", + "th": "Thaï", + "ur": "Ourdou", + "ta": "Tamoul", + "te": "Télougou", + "bg": "Bulgare", + "ko": "Coréen", + "pl": "Polonais", + "he": "Hébreu", + "fa": "Persan", + "vi": "Vietnamien", + "ne": "Népalais", + "sw": "Swahili", + "kn": "Kannada", + "mr": "Marathi", + "gu": "Gujarati", + "pa": "Pendjabi", + "ml": "Malayalam", + "fi": "Finnois", + } +) + +_ALPHABETS = "([A-Za-z])" +_PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]" +_SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)" +_STARTERS = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)" +_ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)" +_WEBSITES = "[.](com|net|org|io|gov|edu|me)" +_DIGITS = "([0-9])" +_MULTIPLE_DOTS = r"\.{2,}" + + +def split_into_sentences(text): + """Split the text into sentences. + + Args: + text: A string that consists of more than or equal to one sentences. + + Returns: + A list of strings where each string is a sentence. + """ + text = " " + text + " " + text = text.replace("\n", " ") + text = re.sub(_PREFIXES, "\\1", text) + text = re.sub(_WEBSITES, "\\1", text) + text = re.sub(_DIGITS + "[.]" + _DIGITS, "\\1\\2", text) + text = re.sub( + _MULTIPLE_DOTS, + lambda match: "" * len(match.group(0)) + "", + text, + ) + if "Ph.D" in text: + text = text.replace("Ph.D.", "PhD") + text = re.sub(r"\s" + _ALPHABETS + "[.] 
", " \\1 ", text) + text = re.sub(_ACRONYMS + " " + _STARTERS, "\\1 \\2", text) + text = re.sub( + _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]", + "\\1\\2\\3", + text, + ) + text = re.sub(_ALPHABETS + "[.]" + _ALPHABETS + "[.]", "\\1\\2", text) + text = re.sub(" " + _SUFFIXES + "[.] " + _STARTERS, " \\1 \\2", text) + text = re.sub(" " + _SUFFIXES + "[.]", " \\1", text) + text = re.sub(" " + _ALPHABETS + "[.]", " \\1", text) + if "”" in text: + text = text.replace(".”", "”.") + if '"' in text: + text = text.replace('."', '".') + if "!" in text: + text = text.replace('!"', '"!') + if "?" in text: + text = text.replace('?"', '"?') + text = text.replace(".", ".") + text = text.replace("?", "?") + text = text.replace("!", "!") + text = text.replace("", ".") + sentences = text.split("") + sentences = [s.strip() for s in sentences] + if sentences and not sentences[-1]: + sentences = sentences[:-1] + return sentences + +def count_words(text): + """Counts the number of words.""" + tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+") + tokens = tokenizer.tokenize(text) + num_words = len(tokens) + return num_words + + + +@functools.lru_cache(maxsize=None) +def _obtenir_tokenizer_phrases(): + return nltk.data.load("nltk:tokenizers/punkt/french.pickle") + + +def count_sentences(text): + """Count the number of sentences.""" + tokenizer = _get_sentence_tokenizer() + tokenized_sentences = tokenizer.tokenize(text) + return len(tokenized_sentences) + + +def generate_keywords(num_keywords): + """Randomly generates a few keywords.""" + return random.sample(WORD_LIST, k=num_keywords) \ No newline at end of file diff --git a/lm_eval/tasks/leaderboard-french/ifeval/utils.py b/lm_eval/tasks/leaderboard-french/ifeval/utils.py new file mode 100644 index 0000000000..b2c9f9f067 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/ifeval/utils.py @@ -0,0 +1,217 @@ +import dataclasses +from typing import Dict, Optional, Union + +from lm_eval.tasks.ifeval import 
instructions_registry + +import nltk +nltk.download('punkt') # Download the punkt tokenizer, which includes French +nltk.download('punkt_tab') + +@dataclasses.dataclass +class InputExample: + key: int + instruction_id_list: list[str] + prompt: str + kwargs: list[Dict[str, Optional[Union[str, int]]]] + + +@dataclasses.dataclass +class OutputExample: + instruction_id_list: list[str] + prompt: str + response: str + follow_all_instructions: bool + follow_instruction_list: list[bool] + +def filter_not_null_entries_all(input_dict): + # Filter the dictionary to include all non-null entries + non_null_entries = {k: v for k, v in input_dict.items() if v is not None} + non_null_entries_arr = [{k: v} for k, v in input_dict.items() if v is not None] + + # If both 'section_spliter' and 'num_sections' are in the dictionary, combine them + if 'section_spliter' in non_null_entries and 'num_sections' in non_null_entries: + non_null_entries + combined_section = { + 'section_spliter': non_null_entries.pop('section_spliter'), + 'num_sections': non_null_entries.pop('num_sections') + } + # List of keys we want to remove + keys_to_remove = ['section_spliter', 'num_sections'] + + # Clean the dictionaries using a for loop + cleaned_data = [] + for entry in non_null_entries_arr: + # Create a copy of the dictionary without the unwanted keys + cleaned_entry = {k: v for k, v in entry.items() if k not in keys_to_remove} + if cleaned_entry: # Only add non-empty dictionaries + cleaned_data.append(cleaned_entry) + non_null_entries_arr.append(combined_section) + return cleaned_data + + return non_null_entries_arr + +def test_instruction_following_strict( + inp, + response, +): + """Tests response to see if instructions are followed.""" + instruction_list = inp.instruction_id_list + is_following_list = [] + + for index, instruction_id in enumerate(instruction_list): + instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id] + instruction = instruction_cls(instruction_id) + 
def parse_kwargs_list(kwargs_list):
    """Parse ``["key:value", ...]`` strings into per-key kwarg dicts.

    Entries without a ``:`` are ignored and a value spelled ``"None"`` is
    treated as null and dropped. Duplicate keys collapse (last value wins,
    first-occurrence order kept). Each surviving key/value pair is returned
    as its own one-entry dict.
    """
    parsed = {}
    for item in kwargs_list:
        if ":" not in item:
            continue
        key, _, value = item.partition(":")
        parsed[key] = None if value == "None" else value

    return [{key: value} for key, value in parsed.items() if value is not None]
+ response_remove_last = "\n".join(r[:-1]).strip() + response_remove_both = "\n".join(r[1:-1]).strip() + revised_response = response.replace("*", "") + revised_response_remove_first = response_remove_first.replace("*", "") + revised_response_remove_last = response_remove_last.replace("*", "") + revised_response_remove_both = response_remove_both.replace("*", "") + all_responses = [ + response, + revised_response, + response_remove_first, + response_remove_last, + response_remove_both, + revised_response_remove_first, + revised_response_remove_last, + revised_response_remove_both, + ] + instruction_list = inp.instruction_id_list + is_following_list = [] + + for index, instruction_id in enumerate(instruction_list): + instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id] + instruction = instruction_cls(instruction_id) + keys = list(inp.kwargs.keys())[:-3] + trimmed_kwargs = {k: inp.kwargs[k] for k in keys} + result = filter_not_null_entries_all(trimmed_kwargs) + # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method. 
def agg_inst_level_acc(items):
    """Micro-average instruction-level accuracy.

    ``items`` is a list of per-prompt boolean lists; the mean is taken over
    the flattened instruction results, so prompts with more instructions
    weigh proportionally more.
    """
    followed = 0
    total = 0
    for per_prompt in items:
        followed += sum(per_prompt)
        total += len(per_prompt)
    return followed / total
+ - leaderboard_math_hard_fr + - leaderboard_ifeval_fr + - leaderboard_musr_fr +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true # defaults to `true`. Set this to `false` to do a "macro" average (taking each subtask's average accuracy, and summing those accuracies and dividing by 3)--by default we do a "micro" average (retain all subtasks' per-document accuracies, and take the mean over all documents' accuracies to get our aggregate mean). + - metric: acc_norm + aggregation: mean + weight_by_size: true # defaults to `true`. Set this to `false` to do a "macro" average (taking each subtask's average accuracy, and summing those accuracies and dividing by 3)--by default we do a "micro" average (retain all subtasks' per-document accuracies, and take the mean over all documents' accuracies to get our aggregate mean). + - metric: exact_match + aggregation: mean + weight_by_size: true # defaults to `true`. Set this to `false` to do a "macro" average (taking each subtask's average accuracy, and summing those accuracies and dividing by 3)--by default we do a "micro" average (retain all subtasks' per-document accuracies, and take the mean over all documents' accuracies to get our aggregate mean). + - metric: inst_level_loose_acc + aggregation: mean + weight_by_size: true # defaults to `true`. Set this to `false` to do a "macro" average (taking each subtask's average accuracy, and summing those accuracies and dividing by 3)--by default we do a "micro" average (retain all subtasks' per-document accuracies, and take the mean over all documents' accuracies to get our aggregate mean). + - metric: inst_level_strict_acc + aggregation: mean + weight_by_size: true # defaults to `true`. 
Set this to `false` to do a "macro" average (taking each subtask's average accuracy, and summing those accuracies and dividing by 3)--by default we do a "micro" average (retain all subtasks' per-document accuracies, and take the mean over all documents' accuracies to get our aggregate mean). + - metric: prompt_level_loose_acc + aggregation: mean + weight_by_size: true # defaults to `true`. Set this to `false` to do a "macro" average (taking each subtask's average accuracy, and summing those accuracies and dividing by 3)--by default we do a "micro" average (retain all subtasks' per-document accuracies, and take the mean over all documents' accuracies to get our aggregate mean). + - metric: prompt_level_strict_acc + aggregation: mean + weight_by_size: true # defaults to `true`. Set this to `false` to do a "macro" average (taking each subtask's average accuracy, and summing those accuracies and dividing by 3)--by default we do a "micro" average (retain all subtasks' per-document accuracies, and take the mean over all documents' accuracies to get our aggregate mean). 
+metadata: + version: 1.0 \ No newline at end of file diff --git a/lm_eval/tasks/leaderboard-french/math/_leaderboard_math.yaml b/lm_eval/tasks/leaderboard-french/math/_leaderboard_math.yaml new file mode 100644 index 0000000000..07b84ed06a --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/math/_leaderboard_math.yaml @@ -0,0 +1,12 @@ +group: leaderboard_math_hard_fr +task: + - leaderboard_math_algebra_hard_fr + - leaderboard_math_counting_and_prob_hard_fr + - leaderboard_math_geometry_hard_fr + - leaderboard_math_num_theory_hard_fr + - leaderboard_math_prealgebra_hard_fr + - leaderboard_math_precalculus_hard_fr +aggregate_metric_list: + - metric: exact_match + aggregation: mean + weight_by_size: true diff --git a/lm_eval/tasks/leaderboard-french/math/_template_yaml b/lm_eval/tasks/leaderboard-french/math/_template_yaml new file mode 100644 index 0000000000..7d2819c291 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/math/_template_yaml @@ -0,0 +1,26 @@ +dataset_path: le-leadboard/MATH_LVL5_fr +process_docs: !function utils.process_docs +output_type: generate_until +training_split: train +test_split: train +doc_to_text: !function utils.doc_to_text +process_results: !function utils.process_results +doc_to_target: "{{answer if few_shot is undefined else solution}}" +generation_kwargs: + until: + - "Problème:" + do_sample: false + temperature: 0 + max_gen_toks: 1024 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true +num_fewshot: 4 +metadata: + version: 1.0 +dataset_kwargs: + trust_remote_code: true +fewshot_config: + sampler: first_n + samples: !function utils.list_fewshot_samples diff --git a/lm_eval/tasks/leaderboard-french/math/math_algebra.yaml b/lm_eval/tasks/leaderboard-french/math/math_algebra.yaml new file mode 100644 index 0000000000..1bb3161ad0 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/math/math_algebra.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: algebra +task: 
leaderboard_math_algebra_hard_fr diff --git a/lm_eval/tasks/leaderboard-french/math/math_counting_and_prob.yaml b/lm_eval/tasks/leaderboard-french/math/math_counting_and_prob.yaml new file mode 100644 index 0000000000..c8993505cb --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/math/math_counting_and_prob.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: counting_and_probability +task: leaderboard_math_counting_and_prob_hard_fr diff --git a/lm_eval/tasks/leaderboard-french/math/math_geometry.yaml b/lm_eval/tasks/leaderboard-french/math/math_geometry.yaml new file mode 100644 index 0000000000..16d74f417a --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/math/math_geometry.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: geometry +task: leaderboard_math_geometry_hard_fr diff --git a/lm_eval/tasks/leaderboard-french/math/math_intermediate_algebra.yaml b/lm_eval/tasks/leaderboard-french/math/math_intermediate_algebra.yaml new file mode 100644 index 0000000000..73f002dbdf --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/math/math_intermediate_algebra.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: intermediate_algebra +task: leaderboard_math_intermediate_algebra_hard_fr diff --git a/lm_eval/tasks/leaderboard-french/math/math_num_theory.yaml b/lm_eval/tasks/leaderboard-french/math/math_num_theory.yaml new file mode 100644 index 0000000000..4c4919e82e --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/math/math_num_theory.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: number_theory +task: leaderboard_math_num_theory_hard_fr diff --git a/lm_eval/tasks/leaderboard-french/math/math_prealgebra.yaml b/lm_eval/tasks/leaderboard-french/math/math_prealgebra.yaml new file mode 100644 index 0000000000..8065c02018 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/math/math_prealgebra.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: prealgebra +task: leaderboard_math_prealgebra_hard_fr diff --git 
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Attach a normalized gold ``answer`` to every MATH document.

    The answer is extracted from the last ``\\boxed{...}`` in the reference
    solution and normalized with ``normalize_final_answer`` so that
    ``process_results`` can compare it against model output.
    """

    def _process_doc(doc: dict) -> dict:
        out_doc = {
            "problem": doc["problem"],
            "solution": doc["solution"],
            "answer": normalize_final_answer(
                remove_boxed(last_boxed_only_string(doc["solution"]))
            ),
        }
        # BUG FIX: ``doc`` is a plain dict here, so ``getattr(doc, "few_shot",
        # None)`` always returned None and the flag was never propagated. Use
        # a dict lookup instead (same intent, actually reachable; identical
        # behavior for docs without a ``few_shot`` field).
        if doc.get("few_shot") is not None:
            out_doc["few_shot"] = True
        return out_doc

    return dataset.map(_process_doc)
Donc, $x - 2 \\ge 0$, donc $x \\ge 2$, et $5 - x \\ge 0$, donc $x \\le 5$. De plus, le dénominateur ne peut pas être égal à zéro, donc $5 - x > 0$, ce qui donne $x < 5$. Par conséquent, le domaine de l'expression est $\\boxed{[2,5)}$.\nRéponse finale : ### $[2,5)$", + "few_shot": "1", + }, + { + "problem": "Si $\\det \\mathbf{A} = 2$ et $\\det \\mathbf{B} = 12$, alors trouvez $\\det (\\mathbf{A} \\mathbf{B})$.", + "solution": "Nous avons que $\\det (\\mathbf{A} \\mathbf{B}) = (\\det \\mathbf{A})(\\det \\mathbf{B}) = (2)(12) = \\boxed{24}$.\nRéponse finale : ### $24$", + "few_shot": "1", + }, + { + "problem": "Terrell soulève habituellement deux haltères de 20 livres 12 fois. S'il utilise deux haltères de 15 livres à la place, combien de fois Terrell doit-il les soulever pour obtenir le même poids total ?", + "solution": "Si Terrell soulève deux haltères de 20 livres 12 fois, il soulève un total de $2\\cdot 12\\cdot20=480$ livres. S'il soulève à la place deux haltères de 15 livres $n$ fois, il soulèvera un total de $2\\cdot15\\cdot n=30n$ livres. 
def last_boxed_only_string(string: str) -> Optional[str]:
    """Return the last ``\\boxed{...}`` (or ``\\fbox{...}``) span of ``string``.

    The space-delimited form ``\\boxed 42`` is handled first: everything up
    to the next ``$`` is taken. Otherwise the braces of the last ``\\boxed``
    / ``\\fbox`` are matched by depth counting. Returns None when no boxed
    expression is found or the braces never close.
    """
    if "\\boxed " in string:
        # Space-delimited variant: keep up to the next "$".
        return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]

    start = string.rfind("\\boxed")
    if start < 0:
        start = string.rfind("\\fbox")
        if start < 0:
            return None

    depth = 0
    end = None
    for pos in range(start, len(string)):
        ch = string[pos]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                end = pos
                break

    if end is None:
        return None
    return string[start : end + 1]
def get_unnormalized_answer(text: str) -> str:
    """Extract the raw final answer from a model completion.

    Several spellings of the "Réponse finale" marker are tolerated; patterns
    are tried in a fixed order and the first hit wins, preserving the
    precedence of the original ``match``/``match_v1``/... cascade. Returns
    ``"[invalidanswer]"`` when no marker is found.
    """
    INVALID_ANSWER = "[invalidanswer]"
    # Ensure the text ends with a newline so an answer on the very last line
    # is still terminated by the (?:\n|$) alternation.
    text += "\n"
    # BUG FIX: the original used five nested bare ``except:`` blocks to walk
    # the candidate matches, swallowing *every* exception (including
    # KeyboardInterrupt). An explicit ordered search has identical behavior.
    patterns = (
        r"Réponse finale : ###(.*?)(?:\n|$)",
        r"Réponse finale :###(.*?)(?:\n|$)",
        r"Réponse finale:###(.*?)(?:\n|$)",
        r"###(.*?)(?:\n|$)",
        r"Réponse finale: (.*?)(?:\n|$)",
    )
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1).strip()
    return INVALID_ANSWER
+ final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer) + final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer) + + # Normalize shorthand TeX: + # \fracab -> \frac{a}{b} + # \frac{abc}{bef} -> \frac{abc}{bef} + # \fracabc -> \frac{a}{b}c + # \sqrta -> \sqrt{a} + # \sqrtab -> sqrt{a}b + final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer) + final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer) + final_answer = final_answer.replace("$", "") + + # Normalize 100,000 -> 100000 + if final_answer.replace(",", "").isdigit(): + final_answer = final_answer.replace(",", "") + + return final_answer diff --git a/lm_eval/tasks/leaderboard-french/mmlu_pro/mmlu_pro.yaml b/lm_eval/tasks/leaderboard-french/mmlu_pro/mmlu_pro.yaml new file mode 100644 index 0000000000..84302acea7 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/mmlu_pro/mmlu_pro.yaml @@ -0,0 +1,17 @@ +dataset_path: le-leadboard/MMMLU-fr # a copy of `cais/leaderboard_mmlu` with no auxiliary_train split +task: leaderboard_mmlu_fr +test_split: test +fewshot_split: test +fewshot_config: + sampler: first_n +output_type: multiple_choice +doc_to_text: !function utils.doc_to_text +doc_to_choice: !function utils.doc_to_choice +doc_to_target: Answer +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +num_fewshot: 5 +metadata: + version: 0.1 diff --git a/lm_eval/tasks/leaderboard-french/mmlu_pro/utils.py b/lm_eval/tasks/leaderboard-french/mmlu_pro/utils.py new file mode 100644 index 0000000000..5872306d75 --- /dev/null +++ b/lm_eval/tasks/leaderboard-french/mmlu_pro/utils.py @@ -0,0 +1,16 @@ +import string + + +def doc_to_text(doc): + doc_to_text = f"{doc['Question']}\n" + options = [doc['A'],doc['B'],doc['C'],doc['D']] + for i 
def doc_to_choice(doc):
    """Return the answer-letter labels for a four-option MMLU document.

    The options are read from the doc (a missing key raises KeyError), but
    only their count matters: the choices are the letters "A".."D".
    """
    options = (doc["A"], doc["B"], doc["C"], doc["D"])
    return list(string.ascii_uppercase[: len(options)])
def doc_to_text(doc):
    """Render a MuSR document as narrative, question, numbered choices and an answer cue."""
    numbered = "".join(
        f"{position + 1} - {option}\n"
        for position, option in enumerate(doc["choices"])
    )
    return f"{doc['narrative']}\n\n{doc['question']}\n\n{numbered}\nAnswer:"