Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closes #230 : Epic-QA #507

Empty file.
295 changes: 295 additions & 0 deletions bigbio/biodatasets/epic_qa/epic_qa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,295 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from pathlib import Path
from typing import Any, Dict, List, Union

import datasets

from bigbio.utils import schemas
from bigbio.utils.configs import BigBioConfig
from bigbio.utils.constants import Tasks

logger = datasets.utils.logging.get_logger(__name__)

_CITATION = """\
Unknown
"""

_DESCRIPTION = """\
In response to the COVID-19 pandemic, the Epidemic Question
Answering (EPIC-QA) track challenges teams to develop systems
capable of automatically answering ad-hoc questions about the
disease COVID-19, its causal virus SARS-CoV-2, related corona
viruses, and the recommended response to the pandemic.
The 2020 EPIC-QA track involves two tasks:
Task A
Expert QA: In Task A, teams are provided with a set of
questions asked by experts and are asked to provide a
ranked list of expert-level answers to each question.
In Task A, answers should provide information that is
useful to researchers, scientists, or clinicians.
Task B
Consumer QA: In Task B, teams are provided with a set
of questions asked by consumers and are asked to provide
a ranked list of consumer-friendly answers to each question.
In Task B, answers should be understandable by the general
public.
While each task will have its own set of questions, many of the
questions will overlap. This is by design, so that the collection
can be used to explore whether the same approaches or systems can
account for different types of users.
"""

_DATASETNAME = "epic_qa"

_HOMEPAGE = "https://bionlp.nlm.nih.gov/epic_qa/#collection"

_LICENSE = "DUA-NC"

_BASE = "https://bionlp.nlm.nih.gov/epic_qa/data/epic_qa_"

_URLs = {
"epic_qa_research_source": _BASE + "cord_2020-10-22-split-corrected.tar.gz",
"epic_qa_research_bigbio_qa": _BASE + "cord_2020-10-22-split-corrected.tar.gz",
"epic_qa_consumer_source": _BASE + "consumer_2020-11-02.tar.gz",
"epic_qa_consumer_bigbio_qa": _BASE + "consumer_2020-11-02.tar.gz",
}

_DIRS = {
"research": "2020-10-22-new",
"consumer": "epic_qa_consumer_2020_11-02",
}

_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING]
_SOURCE_VERSION = "1.0.0"
_BIGBIO_VERSION = "1.0.0"


class EpicQaDataset(datasets.GeneratorBasedBuilder):
"""
EpicQA

In response to the COVID-19 pandemic, the Epidemic Question
Answering (EPIC-QA) track challenges teams to develop systems
capable of automatically answering ad-hoc questions about the
disease COVID-19, its causal virus SARS-CoV-2, related corona
viruses, and the recommended response to the pandemic.
"""

DEFAULT_CONFIG_NAME = "epic_qa_research_source"
SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)

BUILDER_CONFIGS = []

for subset in ["research", "consumer"]:
BUILDER_CONFIGS.append(
BigBioConfig(
name=f"epic_qa_{subset}_source",
version=SOURCE_VERSION,
description=f"EpicQA Source schema for the {subset.capitalize()} Articles subset",
schema="source",
subset_id=f"epic_qa_{subset}",
)
)
BUILDER_CONFIGS.append(
BigBioConfig(
name=f"epic_qa_{subset}_bigbio_qa",
version=BIGBIO_VERSION,
description=f"EpicQA BigBio schema for the {subset.capitalize()} Articles subset",
schema="bigbio_qa",
subset_id=f"epic_qa_{subset}",
)
)

def _info(self):
if self.config.name == "epic_qa_consumer_source":
features = datasets.Features(
{
"document_id": datasets.Value("string"),
"metadata": {
"url": datasets.Value("string"),
},
"question": datasets.Value("string"),
"context_id": datasets.Value("string"),
"context": datasets.Value("string"),
"section": datasets.Value("string"),
"answer": [
{
"id": datasets.Value("string"),
"text": datasets.Value("string"),
"start": datasets.Value("int32"),
"end": datasets.Value("int32"),
}
],
}
)
elif self.config.name == "epic_qa_research_source":
features = datasets.Features(
{
"document_id": datasets.Value("string"),
"metadata": {
"urls": datasets.Sequence(datasets.Value("string")),
"authors": datasets.Value("string"),
"full_text_path": datasets.Value("string"),
},
"question": datasets.Value("string"),
"context_id": datasets.Value("string"),
"context": datasets.Value("string"),
"section": datasets.Value("string"),
"answer": [
{
"id": datasets.Value("string"),
"text": datasets.Value("string"),
"start": datasets.Value("int32"),
"end": datasets.Value("int32"),
}
],
}
)
else:
features = schemas.qa_features

return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
supervised_keys=None,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""

my_urls = _URLs[self.config.name]
data_dir = dl_manager.download_and_extract(my_urls)

return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={"filepath": Path(data_dir), "split": "train"},
),
]

def _generate_examples(self, filepath: Path, split: str):
"""Yields examples as (key, example) tuples."""

if self.config.subset_id.endswith("research"):
subset_name = "research"
elif self.config.subset_id.endswith("consumer"):
subset_name = "consumer"
else:
raise ValueError(f"subset_id {self.config.subset_id} does not end with 'research' or 'consumer'")

sub_directory = _DIRS[subset_name]

sub_directory_path = filepath / sub_directory
files_path_glob = sub_directory_path.rglob("*.json")

uid = 0
for file_full_path in files_path_glob:
with open(file_full_path, encoding="utf-8") as file:
try:
doc: Dict = json.load(file)
except json.JSONDecodeError:
logger.warning("File %s is malformed and cannot be decoded.", file_full_path.name)
continue

document_id: str = doc.pop("document_id")
metadata: Dict[str, Any] = doc.pop("metadata")
question: str = metadata.pop("title")
contexts: List[Dict[str, Any]] = doc.pop("contexts")

for context in contexts:
if self.config.name.endswith("source"):
source_example = self._to_source_example(
document_id=document_id,
metadata=metadata,
question=question,
context=context,
)
yield uid, source_example

# Generate the BigBio schema
else:
bigbio_example = self._to_bigbio_example(
document_id=document_id,
question=question,
context=context,
)
yield uid, bigbio_example

uid += 1

def _to_source_example(
self,
document_id: str,
metadata: Dict[str, Any],
question: str,
context: Dict[str, Any],
) -> Dict:

answers: List[Dict[str, Union[str, int]]] = []
for sentence in context["sentences"]:
start = sentence["start"]
end = sentence["end"]
answer = {
"id": sentence["sentence_id"],
"text": context["text"][start:end],
"start": start,
"end": end,
}
answers.append(answer)

source_example = {
"document_id": document_id,
"metadata": metadata,
"question": question,
"context_id": context["context_id"],
"context": context["text"],
"section": context["section"],
"answer": answers,
}

return source_example

def _to_bigbio_example(
self,
document_id: str,
question: str,
context: Dict[str, Any],
) -> Dict:

answers: List[str] = []
for sentence in context["sentences"]:
start = sentence["start"]
end = sentence["end"]
answer = context["text"][start:end]
answers.append(answer)

bigbio_example = {
"id": context["context_id"],
"question_id": context["context_id"],
"document_id": document_id,
"question": question,
"type": "list",
"choices": [],
"context": context["text"],
"answer": answers,
}
return bigbio_example