From 7257aa2e4e0e6934a32ce0e005c7d9a534d710a7 Mon Sep 17 00:00:00 2001
From: Maxime <672982+maximegmd@users.noreply.github.com>
Date: Wed, 5 Jun 2024 13:20:27 +0200
Subject: [PATCH] Multiple Choice Questions and Large Languages Models: A Case Study with Fictional Medical Data (#1867)

* glianorex tasks
* Create README.md
* Update README.md
* Update README.md
* fix formatting
* fix internal formatting
---
 lm_eval/tasks/glianorex/README.md             | 20 ++++++++++++++++
 lm_eval/tasks/glianorex/glianorex.yaml        | 14 +++++++++++
 lm_eval/tasks/glianorex/glianorex_en.yaml     | 15 ++++++++++++
 lm_eval/tasks/glianorex/glianorex_fr.yaml     | 15 ++++++++++++
 .../tasks/glianorex/preprocess_glianorex.py   | 23 +++++++++++++++++++
 5 files changed, 87 insertions(+)
 create mode 100644 lm_eval/tasks/glianorex/README.md
 create mode 100755 lm_eval/tasks/glianorex/glianorex.yaml
 create mode 100755 lm_eval/tasks/glianorex/glianorex_en.yaml
 create mode 100755 lm_eval/tasks/glianorex/glianorex_fr.yaml
 create mode 100755 lm_eval/tasks/glianorex/preprocess_glianorex.py

diff --git a/lm_eval/tasks/glianorex/README.md b/lm_eval/tasks/glianorex/README.md
new file mode 100644
index 0000000000..3efc925665
--- /dev/null
+++ b/lm_eval/tasks/glianorex/README.md
@@ -0,0 +1,20 @@
+# Glianorex
+
+The goal of this benchmark is to isolate the test answering capabilities from the content knowledge.
+
+### Paper
+
+Title: Multiple Choice Questions and Large Languages Models: A Case Study with Fictional Medical Data
+
+Abstract: https://arxiv.org/abs/2406.02394
+
+To test the relevance of MCQs to assess LLM performance without prior data exposure, we created a fictional medical benchmark and knowledge base on a non-existent gland, the Glianorex. Using GPT-4 we generated a comprehensive textbook on the Glianorex in both English and French, and created multiple-choice questions in both English and French.
+
+### Tasks
+
+All tasks are multiple choice questions with 4 options, only one correct option.
+
+- `glianorex`: Evaluates all tasks listed below.
+
+- `glianorex_en`: Evaluates the accuracy on 264 questions in English.
+- `glianorex_fr`: Evaluates the accuracy on 264 questions in French.
diff --git a/lm_eval/tasks/glianorex/glianorex.yaml b/lm_eval/tasks/glianorex/glianorex.yaml
new file mode 100755
index 0000000000..a7ba436656
--- /dev/null
+++ b/lm_eval/tasks/glianorex/glianorex.yaml
@@ -0,0 +1,14 @@
+task: glianorex
+dataset_path: maximegmd/glianorex
+output_type: multiple_choice
+test_split: train
+doc_to_text: !function preprocess_glianorex.doc_to_text
+doc_to_target: !function preprocess_glianorex.doc_to_target
+doc_to_choice: [ 'A', 'B', 'C', 'D' ]
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
diff --git a/lm_eval/tasks/glianorex/glianorex_en.yaml b/lm_eval/tasks/glianorex/glianorex_en.yaml
new file mode 100755
index 0000000000..b08c6f8114
--- /dev/null
+++ b/lm_eval/tasks/glianorex/glianorex_en.yaml
@@ -0,0 +1,15 @@
+task: glianorex_en
+dataset_path: maximegmd/glianorex
+output_type: multiple_choice
+test_split: train
+doc_to_text: !function preprocess_glianorex.doc_to_text
+doc_to_target: !function preprocess_glianorex.doc_to_target
+process_docs: !function preprocess_glianorex.filter_english
+doc_to_choice: [ 'A', 'B', 'C', 'D' ]
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
diff --git a/lm_eval/tasks/glianorex/glianorex_fr.yaml b/lm_eval/tasks/glianorex/glianorex_fr.yaml
new file mode 100755
index 0000000000..6d09bc5a7b
--- /dev/null
+++ b/lm_eval/tasks/glianorex/glianorex_fr.yaml
@@ -0,0 +1,15 @@
+task: glianorex_fr
+dataset_path: maximegmd/glianorex
+output_type: multiple_choice
+test_split: train
+doc_to_text: !function preprocess_glianorex.doc_to_text
+doc_to_target: !function preprocess_glianorex.doc_to_target
+process_docs: !function preprocess_glianorex.filter_french
+doc_to_choice: [ 'A', 'B', 'C', 'D' ]
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
diff --git a/lm_eval/tasks/glianorex/preprocess_glianorex.py b/lm_eval/tasks/glianorex/preprocess_glianorex.py
new file mode 100755
index 0000000000..f257df14d8
--- /dev/null
+++ b/lm_eval/tasks/glianorex/preprocess_glianorex.py
@@ -0,0 +1,23 @@
+import datasets
+
+
+def doc_to_text(doc) -> str:
+    option_choices = doc["options"]
+    answers = "".join((f"{k}. {v}\n") for k, v in option_choices.items())
+    return f"Question: {doc['question']}\n{answers}Answer:"
+
+
+def doc_to_target(doc) -> int:
+    return doc["answer_idx"]
+
+
+def filter_dataset(dataset: datasets.Dataset, lang: str) -> datasets.Dataset:
+    return dataset.filter(lambda example: example["language"].startswith(lang))
+
+
+def filter_french(dataset: datasets.Dataset) -> datasets.Dataset:
+    return filter_dataset(dataset, "fr")
+
+
+def filter_english(dataset: datasets.Dataset) -> datasets.Dataset:
+    return filter_dataset(dataset, "en")