forked from bigscience-workshop/biomedical
Commit 71f7a11 (parent 0758b12)
* _split_generators includes filepath
* add pubmed_id, fix authors; data_dir as dict; pass ner_filepath, corpus_filepath; use _corpus_to_dict function
* modified schema; need to fix entity offsets; fix entities to match schema
* add "full" config; delete _corpus_to_dict; change how _split_generators checks configs
* functional if long sequences are ignored
* make example_id actually unique
* delete check for long lines
* fix: cord-ner full source name

Co-authored-by: Nicholas Broad <[email protected]>
Co-authored-by: Natasha Seelam <[email protected]>
Showing 1 changed file with 334 additions and 0 deletions.
@@ -0,0 +1,334 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
The CORD-NER dataset covers 75 fine-grained entity types. In addition to the
common biomedical entity types (e.g., genes, chemicals, and diseases), it
covers many new entity types specific to COVID-19 studies (e.g., coronaviruses,
viral proteins, evolution, materials, substrates, and immune responses), which
may benefit research on COVID-19-related viruses, spreading mechanisms, and
potential vaccines. The CORD-NER annotations combine four sources that use
different NER methods.
"""

import json
import os
from typing import Dict, Iterator, List, Tuple

import datasets
from utils import schemas
from utils.configs import BigBioConfig
from utils.constants import Tasks
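
# NOTE: `schemas`, `BigBioConfig`, and `Tasks` come from the shared `utils`
# package of the bigscience-workshop/biomedical repo, so this script is meant
# to run from inside that repo (or a fork of it).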

_CITATION = """\
@article{DBLP:journals/corr/abs-2003-12218,
  author     = {Xuan Wang and
                Xiangchen Song and
                Yingjun Guan and
                Bangzheng Li and
                Jiawei Han},
  title      = {Comprehensive Named Entity Recognition on {CORD-19} with Distant or
                Weak Supervision},
  journal    = {CoRR},
  volume     = {abs/2003.12218},
  year       = {2020},
  url        = {https://arxiv.org/abs/2003.12218},
  eprinttype = {arXiv},
  eprint     = {2003.12218},
  timestamp  = {Fri, 08 May 2020 13:20:46 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2003-12218.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
"""

_DATASETNAME = "cord_ner"

_DESCRIPTION = """\
The CORD-NER dataset covers 75 fine-grained entity types. In addition to the
common biomedical entity types (e.g., genes, chemicals, and diseases), it
covers many new entity types specific to COVID-19 studies (e.g., coronaviruses,
viral proteins, evolution, materials, substrates, and immune responses), which
may benefit research on COVID-19-related viruses, spreading mechanisms, and
potential vaccines. The CORD-NER annotations combine four sources that use
different NER methods.
"""

_HOMEPAGE = "https://xuanwang91.github.io/2020-03-20-cord19-ner/"

_LICENSE = """
This dataset is built from multiple datasets by the Allen Institute
for AI in partnership with the Chan Zuckerberg Initiative, Georgetown
University’s Center for Security and Emerging Technology, Microsoft Research,
IBM, and the National Library of Medicine - National Institutes of Health,
in coordination with The White House Office of Science and Technology Policy.
The licenses differ depending on the source. The full license details can be
found here: https://www.kaggle.com/datasets/allen-institute-for-ai/CORD-19-research-challenge?select=metadata.csv
"""

_URLS = {
    _DATASETNAME: {
        "full": "https://uofi.app.box.com/index.php?rm=box_download_shared_file&shared_name=k8pw7d5kozzpoum2jwfaqdaey1oij93x&file_id=f_651148518303",
        "ner": "https://uofi.app.box.com/index.php?rm=box_download_shared_file&shared_name=k8pw7d5kozzpoum2jwfaqdaey1oij93x&file_id=f_642495001609",
        "corpus": "https://uofi.app.box.com/index.php?rm=box_download_shared_file&shared_name=k8pw7d5kozzpoum2jwfaqdaey1oij93x&file_id=f_642522056185",
    }
}
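# The Box links above are direct downloads; if they fail, _split_generators
# falls back to files supplied via load_dataset's data_dir kwarg.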

_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION]

_SOURCE_VERSION = "1.0.0"

_BIGBIO_VERSION = "1.0.0"

class CordNERDataset(datasets.GeneratorBasedBuilder):
    """CORD-NER: named entity annotations over the CORD-19 corpus covering 75
    fine-grained entity types, built with distant or weak supervision."""

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)
    BUILDER_CONFIGS = [
        BigBioConfig(
            name="cord_ner_ner_source",
            version=SOURCE_VERSION,
            description="cord_ner source schema for ner file",
            schema="source",
            subset_id="cord_ner",
        ),
        BigBioConfig(
            name="cord_ner_corpus_source",
            version=SOURCE_VERSION,
            description="cord_ner source schema for corpus file",
            schema="source",
            subset_id="cord_ner",
        ),
        BigBioConfig(
            name="cord_ner_source",
            version=SOURCE_VERSION,
            description="cord_ner source schema for full file",
            schema="source",
            subset_id="cord_ner",
        ),
        BigBioConfig(
            name="cord_ner_bigbio_kb",
            version=BIGBIO_VERSION,
            description="cord_ner BigBio schema",
            schema="bigbio_kb",
            subset_id="cord_ner",
        ),
    ]
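
    # cord_ner_source and cord_ner_bigbio_kb read the "full" file,
    # cord_ner_ner_source the "ner" file, and cord_ner_corpus_source the
    # "corpus" file (see _split_generators below).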

    DEFAULT_CONFIG_NAME = "cord_ner_source"

    def _info(self) -> datasets.DatasetInfo:

        # Create the source schema; this schema will keep all
        # keys/information/labels as close to the original dataset as possible.

        # You can arbitrarily nest lists and dictionaries.
        # For iterables, use lists over tuples or `datasets.Sequence`.

        if self.config.schema == "source":
            if self.config.name == "cord_ner_source":
                features = datasets.Features(
                    {
                        "id": datasets.Value("int32"),
                        "source": datasets.Value("string"),
                        "doi": datasets.Value("string"),
                        "pmcid": datasets.Value("string"),
                        "pubmed_id": datasets.Value("string"),
                        "publish_time": datasets.Value("string"),
                        "authors": datasets.Value("string"),
                        "journal": datasets.Value("string"),
                        "title": datasets.Value("string"),
                        "abstract": datasets.Value("string"),
                        "body": datasets.Value("string"),
                        "entities": [
                            {
                                "end": datasets.Value("int32"),
                                "start": datasets.Value("int32"),
                                "text": datasets.Value("string"),
                                "type": datasets.Value("string"),
                            }
                        ],
                    }
                )
            elif self.config.name == "cord_ner_ner_source":
                features = datasets.Features(
                    {
                        "doc_id": datasets.Value("int32"),
                        "sents": [
                            {
                                "entities": [
                                    {
                                        "end": datasets.Value("int32"),
                                        "start": datasets.Value("int32"),
                                        "text": datasets.Value("string"),
                                        "type": datasets.Value("string"),
                                    }
                                ],
                                "sent_id": datasets.Value("int32"),
                            }
                        ],
                    }
                )
            elif self.config.name == "cord_ner_corpus_source":
                features = datasets.Features(
                    {
                        "doc_id": datasets.Value("int32"),
                        "sents": [
                            {
                                "sent_id": datasets.Value("int32"),
                                "sent_tokens": datasets.Sequence(
                                    datasets.Value("string")
                                ),
                            }
                        ],
                        "source": datasets.Value("string"),
                        "doi": datasets.Value("string"),
                        "pmcid": datasets.Value("string"),
                        "pubmed_id": datasets.Value("string"),
                        "publish_time": datasets.Value("string"),
                        "authors": datasets.Value("string"),
                        "journal": datasets.Value("string"),
                    }
                )
        elif self.config.name == "cord_ner_bigbio_kb":
            features = schemas.kb_features
        else:
            raise NotImplementedError(f"{self.config.name} not a valid config name")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators."""
        if self.config.data_dir is None:
            # The download links may not be reliable; if they fail, this method
            # still works with local files passed via data_dir.
            try:
                if (
                    self.config.name == "cord_ner_source"
                    or self.config.name == "cord_ner_bigbio_kb"
                ):
                    url = {"train": _URLS[_DATASETNAME]["full"]}
                elif self.config.name == "cord_ner_ner_source":
                    url = {"train": _URLS[_DATASETNAME]["ner"]}
                elif self.config.name == "cord_ner_corpus_source":
                    url = {"train": _URLS[_DATASETNAME]["corpus"]}
                data_dir = dl_manager.download_and_extract(url)
            except Exception as e:
                raise ConnectionError(
                    "The dataset could not be downloaded. Please download to"
                    " local storage and pass the data_dir kwarg to load_dataset."
                ) from e
        else:
            filenames = {
                "full": "CORD-NER-full.json",
                "ner": "CORD-NER-ner.json",
                "corpus": "CORD-NER-corpus.json",
            }

            if (
                self.config.name == "cord_ner_source"
                or self.config.name == "cord_ner_bigbio_kb"
            ):
                data_dir = {
                    "train": os.path.join(self.config.data_dir, filenames["full"])
                }
            elif self.config.name == "cord_ner_ner_source":
                data_dir = {
                    "train": os.path.join(self.config.data_dir, filenames["ner"])
                }
            elif self.config.name == "cord_ner_corpus_source":
                data_dir = {
                    "train": os.path.join(self.config.data_dir, filenames["corpus"])
                }

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": data_dir["train"],
                    "split": "train",
                },
            ),
        ]

    def _generate_examples(self, filepath, split: str) -> Iterator[Tuple[int, Dict]]:
        """Yields examples as (key, example) tuples."""
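        # The loaders below treat each file as JSON Lines: one JSON object
        # (one document) per line.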
        if self.config.schema == "source":
            with open(filepath) as fp:
                for key, example in enumerate(fp):
                    yield key, json.loads(example)

        elif self.config.schema == "bigbio_kb":
            with open(filepath) as fp:
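                # unq_id is a single running counter per file: the example
                # itself, each of its passages, and each of its entities draw
                # ids from it, so ids never collide within the dataset.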
                unq_id = 0
                for key, line in enumerate(fp):
                    example_id = unq_id
                    unq_id += 1

                    line = json.loads(line)
                    passages = []
                    offset_start = 0
                    for text_type in ["title", "abstract", "body"]:
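                        # Passages are laid out as title + " " + abstract +
                        # " " + body; offset_start tracks where each section
                        # begins in that concatenation.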
                        passages.append(
                            {
                                "id": str(unq_id),
                                "type": text_type,
                                "text": [line[text_type]],
                                "offsets": [
                                    [
                                        offset_start,
                                        offset_start + len(line[text_type]),
                                    ]
                                ],
                            }
                        )
                        # add +1 for the separating space
                        offset_start += len(line[text_type]) + 1
                        unq_id += 1

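                    # The entities below carry no database grounding, so the
                    # normalized slot is left empty to satisfy the bigbio_kb
                    # schema.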
                    entities = []
                    for ent in line["entities"]:
                        entities.append(
                            {
                                "id": str(unq_id),
                                "type": ent["type"],
                                "text": [ent["text"]],
                                "offsets": [[ent["start"], ent["end"]]],
                                "normalized": [
                                    {
                                        "db_name": "",
                                        "db_id": "",
                                    }
                                ],
                            }
                        )
                        unq_id += 1

                    yield key, {
                        "id": str(example_id),
                        "document_id": str(line["id"]),
                        "passages": passages,
                        "entities": entities,
                        "events": [],
                        "coreferences": [],
                        "relations": [],
                    }


# This allows you to run your dataloader with `python [dataset_name].py` during development
# TODO: Remove this before making your PR
if __name__ == "__main__":
    datasets.load_dataset(__file__)
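    # Local-data variant (a sketch; the path below is hypothetical): download
    # the three CORD-NER JSON files, then point data_dir at their folder, e.g.:
    # datasets.load_dataset(__file__, name="cord_ner_bigbio_kb", data_dir="/path/to/cord_ner")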