From 7257aa2e4e0e6934a32ce0e005c7d9a534d710a7 Mon Sep 17 00:00:00 2001
From: Maxime <672982+maximegmd@users.noreply.github.com>
Date: Wed, 5 Jun 2024 13:20:27 +0200
Subject: [PATCH] Multiple Choice Questions and Large Languages Models: A Case
 Study with Fictional Medical Data (#1867)

* glianorex tasks

* Create README.md

* Update README.md

* Update README.md

* fix formatting

* fix internal formatting
---
 lm_eval/tasks/glianorex/README.md             | 20 ++++++++++++++++
 lm_eval/tasks/glianorex/glianorex.yaml        | 14 +++++++++++
 lm_eval/tasks/glianorex/glianorex_en.yaml     | 15 ++++++++++++
 lm_eval/tasks/glianorex/glianorex_fr.yaml     | 15 ++++++++++++
 .../tasks/glianorex/preprocess_glianorex.py   | 23 +++++++++++++++++++
 5 files changed, 87 insertions(+)
 create mode 100644 lm_eval/tasks/glianorex/README.md
 create mode 100755 lm_eval/tasks/glianorex/glianorex.yaml
 create mode 100755 lm_eval/tasks/glianorex/glianorex_en.yaml
 create mode 100755 lm_eval/tasks/glianorex/glianorex_fr.yaml
 create mode 100755 lm_eval/tasks/glianorex/preprocess_glianorex.py

diff --git a/lm_eval/tasks/glianorex/README.md b/lm_eval/tasks/glianorex/README.md
new file mode 100644
index 0000000000..3efc925665
--- /dev/null
+++ b/lm_eval/tasks/glianorex/README.md
@@ -0,0 +1,20 @@
+# Glianorex
+
+The goal of this benchmark is to isolate a model's test-taking ability from its knowledge of the subject matter.
+
+### Paper
+
+Title: Multiple Choice Questions and Large Languages Models: A Case Study with Fictional Medical Data
+
+Abstract: https://arxiv.org/abs/2406.02394
+
+To test how relevant MCQs are for assessing LLM performance without prior data exposure, we created a fictional medical benchmark and knowledge base on a non-existent gland, the Glianorex. Using GPT-4, we generated a comprehensive textbook on the Glianorex in both English and French, and created multiple-choice questions in both languages.
+
+### Tasks
+
+All tasks are multiple-choice questions with 4 options, exactly one of which is correct.
+
+- `glianorex`: Evaluates all tasks listed below.
+
+- `glianorex_en`: Evaluates the accuracy on 264 questions in English.
+- `glianorex_fr`: Evaluates the accuracy on 264 questions in French.
diff --git a/lm_eval/tasks/glianorex/glianorex.yaml b/lm_eval/tasks/glianorex/glianorex.yaml
new file mode 100755
index 0000000000..a7ba436656
--- /dev/null
+++ b/lm_eval/tasks/glianorex/glianorex.yaml
@@ -0,0 +1,14 @@
+task: glianorex  # no process_docs filter in this config, so every row of the split is evaluated
+dataset_path: maximegmd/glianorex
+output_type: multiple_choice
+test_split: train  # the dataset's "train" split is used as the evaluation set
+doc_to_text: !function preprocess_glianorex.doc_to_text
+doc_to_target: !function preprocess_glianorex.doc_to_target
+doc_to_choice: [ 'A', 'B', 'C', 'D' ]
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
diff --git a/lm_eval/tasks/glianorex/glianorex_en.yaml b/lm_eval/tasks/glianorex/glianorex_en.yaml
new file mode 100755
index 0000000000..b08c6f8114
--- /dev/null
+++ b/lm_eval/tasks/glianorex/glianorex_en.yaml
@@ -0,0 +1,15 @@
+task: glianorex_en  # English subset; rows are selected by the process_docs hook below
+dataset_path: maximegmd/glianorex
+output_type: multiple_choice
+test_split: train  # the dataset's "train" split is used as the evaluation set
+doc_to_text: !function preprocess_glianorex.doc_to_text
+doc_to_target: !function preprocess_glianorex.doc_to_target
+process_docs: !function preprocess_glianorex.filter_english
+doc_to_choice: [ 'A', 'B', 'C', 'D' ]
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
diff --git a/lm_eval/tasks/glianorex/glianorex_fr.yaml b/lm_eval/tasks/glianorex/glianorex_fr.yaml
new file mode 100755
index 0000000000..6d09bc5a7b
--- /dev/null
+++ b/lm_eval/tasks/glianorex/glianorex_fr.yaml
@@ -0,0 +1,15 @@
+task: glianorex_fr  # French subset; rows are selected by the process_docs hook below
+dataset_path: maximegmd/glianorex
+output_type: multiple_choice
+test_split: train  # the dataset's "train" split is used as the evaluation set
+doc_to_text: !function preprocess_glianorex.doc_to_text
+doc_to_target: !function preprocess_glianorex.doc_to_target
+process_docs: !function preprocess_glianorex.filter_french
+doc_to_choice: [ 'A', 'B', 'C', 'D' ]
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
diff --git a/lm_eval/tasks/glianorex/preprocess_glianorex.py b/lm_eval/tasks/glianorex/preprocess_glianorex.py
new file mode 100755
index 0000000000..f257df14d8
--- /dev/null
+++ b/lm_eval/tasks/glianorex/preprocess_glianorex.py
@@ -0,0 +1,23 @@
+import datasets
+
+
+def doc_to_text(doc) -> str:  # Build the prompt: "Question: ...", one "<key>. <value>" line per option, then "Answer:".
+    option_choices = doc["options"]  # must support .items(); presumably option letter -> option text (confirm against dataset)
+    answers = "".join((f"{k}. {v}\n") for k, v in option_choices.items())  # options are emitted in the mapping's iteration order
+    return f"Question: {doc['question']}\n{answers}Answer:"  # nothing follows "Answer:" (no trailing space or newline)
+
+
+def doc_to_target(doc) -> int:  # Return the gold label stored under "answer_idx", unchanged.
+    return doc["answer_idx"]  # NOTE(review): annotated as int but no conversion happens here; confirm the stored value is not a letter such as "A"
+
+
+def filter_dataset(dataset: datasets.Dataset, lang: str) -> datasets.Dataset:  # Keep only rows whose "language" field starts with `lang`.
+    return dataset.filter(lambda example: example["language"].startswith(lang))  # prefix match: any value beginning with `lang` is kept
+
+
+def filter_french(dataset: datasets.Dataset) -> datasets.Dataset:  # Keep only rows whose "language" field starts with "fr".
+    return filter_dataset(dataset, "fr")
+
+
+def filter_english(dataset: datasets.Dataset) -> datasets.Dataset:  # Keep only rows whose "language" field starts with "en".
+    return filter_dataset(dataset, "en")