From 934548ac9ab4f5f816aa18f66fd074b3fa01dee6 Mon Sep 17 00:00:00 2001 From: Karthik Rangasai Date: Fri, 22 Apr 2022 23:38:57 +0530 Subject: [PATCH 1/4] Add Epic QA dataset loader. --- biodatasets/epic_qa/epic_qa.py | 287 +++++++++++++++++++++++++++++++++ 1 file changed, 287 insertions(+) create mode 100644 biodatasets/epic_qa/epic_qa.py diff --git a/biodatasets/epic_qa/epic_qa.py b/biodatasets/epic_qa/epic_qa.py new file mode 100644 index 00000000..f70346fc --- /dev/null +++ b/biodatasets/epic_qa/epic_qa.py @@ -0,0 +1,287 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from pathlib import Path +from typing import Any, Dict, List, Union + +import datasets + +from utils import schemas +from utils.configs import BigBioConfig +from utils.constants import Tasks + +logger = datasets.utils.logging.get_logger(__name__) + +_CITATION = """\ +Unknown +""" + +_DESCRIPTION = """\ +In response to the COVID-19 pandemic, the Epidemic Question +Answering (EPIC-QA) track challenges teams to develop systems +capable of automatically answering ad-hoc questions about the +disease COVID-19, its causal virus SARS-CoV-2, related corona +viruses, and the recommended response to the pandemic. +The 2020 EPIC-QA track involves two tasks: +Task A + Expert QA: In Task A, teams are provided with a set of + questions asked by experts and are asked to provide a + ranked list of expert-level answers to each question. + In Task A, answers should provide information that is + useful to researchers, scientists, or clinicians. +Task B + Consumer QA: In Task B, teams are provided with a set + of questions asked by consumers and are asked to provide + a ranked list of consumer-friendly answers to each question. + In Task B, answers should be understandable by the general + public. +While each task will have its own set of questions, many of the +questions will overlap. This is by design, so that the collection +can be used to explore whether the same approaches or systems can +account for different types of users. 
+""" + +_DATASETNAME = "epic_qa" + +_HOMEPAGE = "https://bionlp.nlm.nih.gov/epic_qa/#collection" + +_LICENSE = "DUA-NC" + +_BASE = "https://bionlp.nlm.nih.gov/epic_qa/data/epic_qa_" + +_URLs = { + "epic_qa_research_source": _BASE + "cord_2020-10-22-split-corrected.tar.gz", + "epic_qa_research_bigbio_qa": _BASE + "cord_2020-10-22-split-corrected.tar.gz", + "epic_qa_consumer_source": _BASE + "consumer_2020-11-02.tar.gz", + "epic_qa_consumer_bigbio_qa": _BASE + "consumer_2020-11-02.tar.gz", +} + +_DIRS = { + "research": "2020-10-22-new", + "consumer": "epic_qa_consumer_2020_11-02", +} + +_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] +_SOURCE_VERSION = "1.0.0" +_BIGBIO_VERSION = "1.0.0" + + +class EpicQaDataset(datasets.GeneratorBasedBuilder): + """EpicQA""" + + DEFAULT_CONFIG_NAME = "epic_qa_research_source" + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + BUILDER_CONFIGS = [] + + for subset in ["research", "consumer"]: + BUILDER_CONFIGS.append( + BigBioConfig( + name=f"epic_qa_{subset}_source", + version=SOURCE_VERSION, + description=f"EpicQA Source schema for the {subset.capitalize()} Articles subset", + schema="source", + subset_id=f"epic_qa_{subset}", + ) + ) + BUILDER_CONFIGS.append( + BigBioConfig( + name=f"epic_qa_{subset}_bigbio_qa", + version=BIGBIO_VERSION, + description=f"EpicQA BigBio schema for the {subset.capitalize()} Articles subset", + schema="bigbio_qa", + subset_id=f"epic_qa_{subset}", + ) + ) + + def _info(self): + if self.config.name == "epic_qa_consumer_source": + features = datasets.Features( + { + "document_id": datasets.Value("string"), + "metadata": { + "url": datasets.Value("string"), + }, + "question": datasets.Value("string"), + "context_id": datasets.Value("string"), + "context": datasets.Value("string"), + "section": datasets.Value("string"), + "answer": [ + { + "id": datasets.Value("string"), + "text": datasets.Value("string"), + "start": datasets.Value("int32"), + "end": datasets.Value("int32"), + } + ], + } + ) + elif self.config.name == "epic_qa_research_source": + features = datasets.Features( + { + "document_id": datasets.Value("string"), + "metadata": { + "urls": datasets.Sequence(datasets.Value("string")), + "authors": datasets.Value("string"), + "full_text_path": datasets.Value("string"), + }, + "question": datasets.Value("string"), + "context_id": datasets.Value("string"), + "context": datasets.Value("string"), + "section": datasets.Value("string"), + "answer": [ + { + "id": datasets.Value("string"), + "text": datasets.Value("string"), + "start": datasets.Value("int32"), + "end": datasets.Value("int32"), + } + ], + } + ) + else: + features = schemas.qa_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + supervised_keys=None, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + + my_urls = _URLs[self.config.name] + data_dir = dl_manager.download_and_extract(my_urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": Path(data_dir), "split": "train"}, + ), + ] + + def _generate_examples(self, filepath: Path, split: str): + """Yields examples as (key, example) tuples.""" + + if self.config.subset_id.endswith("research"): + subset_name = "research" + elif self.config.subset_id.endswith("consumer"): + subset_name = "consumer" + else: + raise ValueError(f"subset_id {self.config.subset_id} does not end with 'research' or 
'consumer'") + + sub_directory = _DIRS[subset_name] + + sub_directory_path = filepath / sub_directory + files_path_glob = sub_directory_path.rglob("*.json") + + uid = 0 + for file_full_path in files_path_glob: + with open(file_full_path, encoding="utf-8") as file: + try: + doc: Dict = json.load(file) + except json.JSONDecodeError: + logger.warning("File %s is malformed and cannot be decoded.", file_full_path.name) + continue + + document_id: str = doc.pop("document_id") + metadata: Dict[str, Any] = doc.pop("metadata") + question: str = metadata.pop("title") + contexts: List[Dict[str, Any]] = doc.pop("contexts") + + for context in contexts: + if self.config.name.endswith("source"): + source_example = self._to_source_example( + document_id=document_id, + metadata=metadata, + question=question, + context=context, + ) + yield uid, source_example + + # Generate the BigBio schema + else: + bigbio_example = self._to_bigbio_example( + document_id=document_id, + question=question, + context=context, + ) + yield uid, bigbio_example + + uid += 1 + + def _to_source_example( + self, + document_id: str, + metadata: Dict[str, Any], + question: str, + context: Dict[str, Any], + ) -> Dict: + + answers: List[Dict[str, Union[str, int]]] = [] + for sentence in context["sentences"]: + start = sentence["start"] + end = sentence["end"] + answer = { + "id": sentence["sentence_id"], + "text": context["text"][start:end], + "start": start, + "end": end, + } + answers.append(answer) + + source_example = { + "document_id": document_id, + "metadata": metadata, + "question": question, + "context_id": context["context_id"], + "context": context["text"], + "section": context["section"], + "answer": answers, + } + + return source_example + + def _to_bigbio_example( + self, + document_id: str, + question: str, + context: Dict[str, Any], + ) -> Dict: + + answers: List[str] = [] + for sentence in context["sentences"]: + start = sentence["start"] + end = sentence["end"] + answer = context["text"][start:end] + answers.append(answer) + + bigbio_example = { + "id": context["context_id"], + "question_id": context["context_id"], + "document_id": document_id, + "question": question, + "type": "list", + "choices": [], + "context": context["text"], + "answer": answers, + } + return bigbio_example From b1e34089fdda2fd3eeb47dcebd083e901056b190 Mon Sep 17 00:00:00 2001 From: Karthik Rangasai Date: Sun, 24 Apr 2022 12:57:12 +0530 Subject: [PATCH 2/4] Add description. --- biodatasets/epic_qa/epic_qa.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/biodatasets/epic_qa/epic_qa.py b/biodatasets/epic_qa/epic_qa.py index f70346fc..75710d9c 100644 --- a/biodatasets/epic_qa/epic_qa.py +++ b/biodatasets/epic_qa/epic_qa.py @@ -80,7 +80,15 @@ class EpicQaDataset(datasets.GeneratorBasedBuilder): - """EpicQA""" + """ + EpicQA + + In response to the COVID-19 pandemic, the Epidemic Question + Answering (EPIC-QA) track challenges teams to develop systems + capable of automatically answering ad-hoc questions about the + disease COVID-19, its causal virus SARS-CoV-2, related corona + viruses, and the recommended response to the pandemic. + """ DEFAULT_CONFIG_NAME = "epic_qa_research_source" SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) From 3dd87589fb1fbd99a7144c809fb81561bc0c0eca Mon Sep 17 00:00:00 2001 From: Karthik Rangasai Date: Fri, 29 Apr 2022 20:59:54 +0530 Subject: [PATCH 3/4] Change imports to use new bigbio module. 
--- biodatasets/epic_qa/epic_qa.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/biodatasets/epic_qa/epic_qa.py b/biodatasets/epic_qa/epic_qa.py index 75710d9c..c1e7c714 100644 --- a/biodatasets/epic_qa/epic_qa.py +++ b/biodatasets/epic_qa/epic_qa.py @@ -19,9 +19,9 @@ import datasets -from utils import schemas -from utils.configs import BigBioConfig -from utils.constants import Tasks +from bigbio.utils import schemas +from bigbio.utils.configs import BigBioConfig +from bigbio.utils.constants import Tasks logger = datasets.utils.logging.get_logger(__name__) From f496762478acbf2144ee036b15773e63895bb487 Mon Sep 17 00:00:00 2001 From: Karthik Rangasai Date: Wed, 25 May 2022 14:03:33 +0530 Subject: [PATCH 4/4] Add init file. --- bigbio/biodatasets/epic_qa/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 bigbio/biodatasets/epic_qa/__init__.py diff --git a/bigbio/biodatasets/epic_qa/__init__.py b/bigbio/biodatasets/epic_qa/__init__.py new file mode 100644 index 00000000..e69de29b
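
Note (not part of the patch series): a minimal usage sketch of the loader added above. It assumes the script ends up at bigbio/biodatasets/epic_qa/epic_qa.py after the move implied by PATCH 3/4 and 4/4, that the bigbio package (providing bigbio.utils) is installed, and that the EPIC-QA archives in _URLs are still downloadable; the local path below is therefore an assumption, not something fixed by these patches. The config names come directly from BUILDER_CONFIGS; everything else is the standard Hugging Face datasets API.

    import datasets

    # Load the research subset in the source schema. The other configs defined in
    # BUILDER_CONFIGS are "epic_qa_research_bigbio_qa", "epic_qa_consumer_source",
    # and "epic_qa_consumer_bigbio_qa".
    dataset = datasets.load_dataset(
        "bigbio/biodatasets/epic_qa/epic_qa.py",  # assumed local path to this loader script
        name="epic_qa_research_source",
    )

    # _split_generators only produces a "train" split, so that is the only key here.
    print(dataset["train"][0]["document_id"])
    print(dataset["train"][0]["question"])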