diff --git a/bigbio/biodatasets/jnlpba/jnlpba.py b/bigbio/biodatasets/jnlpba/jnlpba.py index c3c42451c..3e65c520e 100644 --- a/bigbio/biodatasets/jnlpba/jnlpba.py +++ b/bigbio/biodatasets/jnlpba/jnlpba.py @@ -23,7 +23,8 @@ from typing import Dict, List, Tuple import datasets - +import os +import itertools from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig from bigbio.utils.constants import Lang, Tasks @@ -33,7 +34,6 @@ _PUBMED = True _LOCAL = False -# TODO: Add BibTeX citation _CITATION = """\ @inproceedings{collier-kim-2004-introduction, title = "Introduction to the Bio-entity Recognition Task at {JNLPBA}", @@ -61,21 +61,20 @@ _LICENSE = Licenses.CC_BY_3p0 _URLS = { - _DATASETNAME: "http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Train/Genia4ERtraining.tar.gz", + "train": "http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Train/Genia4ERtraining.tar.gz", + "test": "http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Evaluation/Genia4ERtest.tar.gz", } -# TODO: add supported task by dataset. One dataset may support multiple tasks _SUPPORTED_TASKS = [ Tasks.NAMED_ENTITY_RECOGNITION ] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] -# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0" -# This version doesn't have to be consistent with semantic versioning. Anything that is -# provided by the original dataset as a version goes. _SOURCE_VERSION = "3.2.0" _BIGBIO_VERSION = "1.0.0" +logger = datasets.utils.logging.get_logger(__name__) + class JNLPBADataset(datasets.GeneratorBasedBuilder): """ @@ -114,7 +113,29 @@ class JNLPBADataset(datasets.GeneratorBasedBuilder): def _info(self) -> datasets.DatasetInfo: if self.config.schema == "source": - features = datasets.load_dataset("jnlpba", split="train").features + features = datasets.Features( + { + "id": datasets.Value("string"), + "tokens": datasets.Sequence(datasets.Value("string")), + "ner_tags": datasets.Sequence( + datasets.features.ClassLabel( + names=[ + "O", + "B-DNA", + "I-DNA", + "B-RNA", + "I-RNA", + "B-cell_line", + "I-cell_line", + "B-cell_type", + "I-cell_type", + "B-protein", + "I-protein", + ] + ) + ), + } + ) elif self.config.schema == "bigbio_kb": features = schemas.kb_features @@ -129,54 +150,139 @@ def _info(self) -> datasets.DatasetInfo: def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: """Returns SplitGenerators.""" - data = datasets.load_dataset("jnlpba") - + train_filepath = dl_manager.download_and_extract(_URLS["train"]) + test_filepath = dl_manager.download_and_extract(_URLS["test"]) + train_file = os.path.join(train_filepath, "Genia4ERtask1.iob2") + test_file = os.path.join(test_filepath, "Genia4EReval1.iob2") return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, # Whatever you put in gen_kwargs will be passed to _generate_examples - gen_kwargs={"data": data["train"]}, + gen_kwargs={"filepath": train_file}, ), datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={"data": data["validation"]}, + name=datasets.Split.TEST, + gen_kwargs={"filepath": test_file}, ), ] - def _generate_examples(self, data: datasets.Dataset) -> Tuple[int, Dict]: - """Yields examples as (key, example) tuples.""" - uid = 0 + def _parse_sentence(self, tokens, ner_tags, uid): + """ + This function takes in two stacks, one with tokens and the other with tags + It returns the passage and the entities as required by the bigbio_kb schema + """ + entities = [] + sentence_words = 
[] + distance_from_back = -1 + while tokens: + curr_token = tokens.pop() + ner_tag = ner_tags.pop() + distance_from_back += len(curr_token) + 1 + sentence_words.append(curr_token) + if ner_tag.startswith("I-"): + # Keep popping elements until the next B-* tag is hit + tag_tokens = [curr_token] + curr_tag = ner_tag[2:] + while not ner_tag.startswith("B-"): + curr_token = tokens.pop() + ner_tag = ner_tags.pop() + distance_from_back += len(curr_token) + 1 + sentence_words.append(curr_token) + tag_tokens.append(curr_token) + tag_text = " ".join(list(reversed(tag_tokens))) + tag_start = distance_from_back + tag_end = tag_start - len(tag_text) + entity = { + "id": next(uid), + "type": curr_tag, + "text": [tag_text], + "normalized": [], + "offsets": [[tag_start, tag_end]], + } + entities.append(entity) + elif ner_tag.startswith("B-"): + curr_tag = ner_tag[2:] + tag_start = distance_from_back + tag_end = tag_start - len(curr_token) + entity = { + "id": next(uid), + "type": curr_tag, + "text": [curr_token], + "normalized": [], + "offsets": [[tag_start, tag_end]], + } + entities.append(entity) + elif ner_tag == "O": + continue + passage = " ".join(list(reversed(sentence_words))) + for entity in entities: + entity_start = len(passage) - entity["offsets"][0][0] + entity_end = len(passage) - entity["offsets"][0][1] + entity["offsets"][0][1] = entity_end + entity["offsets"][0][0] = entity_start + + document = {} + document["id"] = next(uid) + document["document_id"] = document["id"] + document["entities"] = entities + document["passages"] = [ + { + "id": next(uid), + "type": "", + "text": [passage], + "offsets": [[0, len(passage)]], + } + ] + document["relations"] = [] + document["events"] = [] + document["coreferences"] = [] + return document["id"], document + def _generate_examples(self, filepath) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + logger.info(f"Generating examples from {filepath}") + uid = itertools.count(0) if self.config.schema == "source": - for key, sample in enumerate(data): - yield key, sample + with open(filepath, encoding="utf-8") as f: + tokens = [] + ner_tags = [] + for line in f: + if line == "" or line == "\n": + if tokens: + id = next(uid) + yield id, { + "id": id, + "tokens": tokens, + "ner_tags": ner_tags, + } + next(uid) + tokens = [] + ner_tags = [] - elif self.config.schema == "bigbio_kb": - for i, sample in enumerate(data): - feature_dict = { - "id": uid, - "document_id": "NULL", - "passages": [], - "entities": [], - "relations": [], - "events": [], - "coreferences": [], + else: + # tokens are tab separated + splits = line.split("\t") + tokens.append(splits[0]) + ner_tags.append(splits[1].rstrip()) + # last example + id = next(uid) + yield id, { + "id": id, + "tokens": tokens, + "ner_tags": ner_tags, } - uid += 1 - offset_start = 0 - for token, tag in zip(sample["tokens"], sample["ner_tags"]): - offset_start += len(token) + 1 - feature_dict["entities"].append( - { - "id": uid, - "offsets": [[offset_start, offset_start + len(token)]], - "text": [token], - "type": tag, - "normalized": [], - } - ) - uid += 1 - - # entities - yield i, feature_dict + elif self.config.schema == "bigbio_kb": + with open(filepath, encoding="utf-8") as f: + tokens = [] + ner_tags = [] + for line in f: + if line == "" or line == "\n": + document_id, document = self._parse_sentence( + tokens, ner_tags, uid + ) + yield document_id, document + else: + token, tag = line.split("\t") + tokens.append(token.strip()) + ner_tags.append(tag.strip()) diff --git 
a/bigbio/hub/hub_repos/jnlpba/README.md b/bigbio/hub/hub_repos/jnlpba/README.md index 4fb53c317..9778051c7 100644 --- a/bigbio/hub/hub_repos/jnlpba/README.md +++ b/bigbio/hub/hub_repos/jnlpba/README.md @@ -26,24 +26,22 @@ bigbio_tasks: - **Tasks:** NER -NER For Bio-Entities +The data came from the GENIA version 3.02 corpus (Kim et al., 2003). +This was formed from a controlled search on MEDLINE using the MeSH terms human, blood cells and transcription factors. +From this search 2,000 abstracts were selected and hand annotated according to a small taxonomy of 48 classes based on +a chemical classification. Among the classes, 36 terminal classes were used to annotate the GENIA corpus. ## Citation Information ``` -@inproceedings{collier-kim-2004-introduction, -title = "Introduction to the Bio-entity Recognition Task at {JNLPBA}", -author = "Collier, Nigel and Kim, Jin-Dong", -booktitle = "Proceedings of the International Joint Workshop -on Natural Language Processing in Biomedicine and its Applications -({NLPBA}/{B}io{NLP})", -month = aug # " 28th and 29th", year = "2004", -address = "Geneva, Switzerland", -publisher = "COLING", -url = "https://aclanthology.org/W04-1213", -pages = "73--78", +@inproceedings{collier2004introduction, + title={Introduction to the bio-entity recognition task at JNLPBA}, + author={Collier, Nigel and Ohta, Tomoko and Tsuruoka, Yoshimasa and Tateisi, Yuka and Kim, Jin-Dong}, + booktitle={Proceedings of the International Joint Workshop on Natural Language Processing in Biomedicine \ + and its Applications (NLPBA/BioNLP)}, + pages={73--78}, + year={2004} } - ``` diff --git a/bigbio/hub/hub_repos/jnlpba/jnlpba.py b/bigbio/hub/hub_repos/jnlpba/jnlpba.py index b723aaa75..e674d265d 100644 --- a/bigbio/hub/hub_repos/jnlpba/jnlpba.py +++ b/bigbio/hub/hub_repos/jnlpba/jnlpba.py @@ -13,38 +13,26 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -The data came from the GENIA version 3.02 corpus (Kim et al., 2003). -This was formed from a controlled search on MEDLINE using the MeSH terms human, blood cells and transcription factors. -From this search 2,000 abstracts were selected and hand annotated according to a small taxonomy of 48 classes based on -a chemical classification. Among the classes, 36 terminal classes were used to annotate the GENIA corpus. 
-""" - +import itertools +import os from typing import Dict, List, Tuple import datasets -from .bigbiohub import kb_features -from .bigbiohub import BigBioConfig -from .bigbiohub import Tasks +from .bigbiohub import BigBioConfig, Tasks, kb_features -_LANGUAGES = ['English'] +_LANGUAGES = ["English"] _PUBMED = True _LOCAL = False -# TODO: Add BibTeX citation _CITATION = """\ -@inproceedings{collier-kim-2004-introduction, -title = "Introduction to the Bio-entity Recognition Task at {JNLPBA}", -author = "Collier, Nigel and Kim, Jin-Dong", -booktitle = "Proceedings of the International Joint Workshop -on Natural Language Processing in Biomedicine and its Applications -({NLPBA}/{B}io{NLP})", -month = aug # " 28th and 29th", year = "2004", -address = "Geneva, Switzerland", -publisher = "COLING", -url = "https://aclanthology.org/W04-1213", -pages = "73--78", +@inproceedings{collier2004introduction, +title={Introduction to the bio-entity recognition task at JNLPBA}, +author={Collier, Nigel and Ohta, Tomoko and Tsuruoka, Yoshimasa and Tateisi, Yuka and Kim, Jin-Dong}, +booktitle={Proceedings of the International Joint Workshop on Natural Language Processing in Biomedicine \ + and its Applications (NLPBA/BioNLP)}, +pages={73--78}, +year={2004} } """ @@ -52,29 +40,28 @@ _DISPLAYNAME = "JNLPBA" _DESCRIPTION = """\ -NER For Bio-Entities +The data came from the GENIA version 3.02 corpus (Kim et al., 2003). +This was formed from a controlled search on MEDLINE using the MeSH terms human, blood cells and transcription factors. +From this search 2,000 abstracts were selected and hand annotated according to a small taxonomy of 48 classes based on +a chemical classification. Among the classes, 36 terminal classes were used to annotate the GENIA corpus. """ _HOMEPAGE = "http://www.geniaproject.org/shared-tasks/bionlp-jnlpba-shared-task-2004" -_LICENSE = 'Creative Commons Attribution 3.0 Unported' +_LICENSE = "CC_BY_3p0" _URLS = { - _DATASETNAME: "http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Train/Genia4ERtraining.tar.gz", + "train": "http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Train/Genia4ERtraining.tar.gz", + "test": "http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Evaluation/Genia4ERtest.tar.gz", } -# TODO: add supported task by dataset. One dataset may support multiple tasks -_SUPPORTED_TASKS = [ - Tasks.NAMED_ENTITY_RECOGNITION -] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] +_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION] -# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0" -# This version doesn't have to be consistent with semantic versioning. Anything that is -# provided by the original dataset as a version goes. 
_SOURCE_VERSION = "3.2.0" - _BIGBIO_VERSION = "1.0.0" +logger = datasets.utils.logging.get_logger(__name__) + class JNLPBADataset(datasets.GeneratorBasedBuilder): """ @@ -113,10 +100,34 @@ class JNLPBADataset(datasets.GeneratorBasedBuilder): def _info(self) -> datasets.DatasetInfo: if self.config.schema == "source": - features = datasets.load_dataset("jnlpba", split="train").features + features = datasets.Features( + { + "id": datasets.Value("string"), + "tokens": datasets.Sequence(datasets.Value("string")), + "ner_tags": datasets.Sequence( + datasets.features.ClassLabel( + names=[ + "O", + "B-DNA", + "I-DNA", + "B-RNA", + "I-RNA", + "B-cell_line", + "I-cell_line", + "B-cell_type", + "I-cell_type", + "B-protein", + "I-protein", + ] + ) + ), + } + ) elif self.config.schema == "bigbio_kb": features = kb_features + else: + raise NotImplementedError(f"Schema {self.config.schema} not supported") return datasets.DatasetInfo( description=_DESCRIPTION, @@ -127,55 +138,141 @@ def _info(self) -> datasets.DatasetInfo: ) def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: - """Returns SplitGenerators.""" - data = datasets.load_dataset("jnlpba") + """Return SplitGenerators""" + + train_filepath = dl_manager.download_and_extract(_URLS["train"]) + test_filepath = dl_manager.download_and_extract(_URLS["test"]) + + train_file = os.path.join(train_filepath, "Genia4ERtask1.iob2") + test_file = os.path.join(test_filepath, "Genia4EReval1.iob2") return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, - # Whatever you put in gen_kwargs will be passed to _generate_examples - gen_kwargs={"data": data["train"]}, + gen_kwargs={"filepath": train_file}, ), datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={"data": data["validation"]}, + name=datasets.Split.TEST, + gen_kwargs={"filepath": test_file}, ), ] - def _generate_examples(self, data: datasets.Dataset) -> Tuple[int, Dict]: - """Yields examples as (key, example) tuples.""" - uid = 0 + def _parse_sentence(self, tokens, ner_tags, uid): + """ + This function takes in two stacks, one with tokens and the other with tags + It returns the passage and the entities as required by the bigbio_kb schema + """ + entities = [] + sentence_words = [] + distance_from_back = -1 + while tokens: + curr_token = tokens.pop() + ner_tag = ner_tags.pop() + distance_from_back += len(curr_token) + 1 + sentence_words.append(curr_token) + if ner_tag.startswith("I-"): + # Keep popping elements until the next B-* tag is hit + tag_tokens = [curr_token] + curr_tag = ner_tag[2:] + while not ner_tag.startswith("B-"): + curr_token = tokens.pop() + ner_tag = ner_tags.pop() + distance_from_back += len(curr_token) + 1 + sentence_words.append(curr_token) + tag_tokens.append(curr_token) + + tag_text = " ".join(list(reversed(tag_tokens))) + tag_start = distance_from_back + tag_end = tag_start - len(tag_text) + entity = { + "id": next(uid), + "type": curr_tag, + "text": [tag_text], + "normalized": [], + "offsets": [[tag_start, tag_end]], + } + entities.append(entity) + elif ner_tag.startswith("B-"): + curr_tag = ner_tag[2:] + tag_start = distance_from_back + tag_end = tag_start - len(curr_token) + entity = { + "id": next(uid), + "type": curr_tag, + "text": [curr_token], + "normalized": [], + "offsets": [[tag_start, tag_end]], + } + entities.append(entity) + elif ner_tag == "O": + continue + passage = " ".join(list(reversed(sentence_words))) + for entity in entities: + entity_start = len(passage) - entity["offsets"][0][0] + entity_end = len(passage) - 
entity["offsets"][0][1] + entity["offsets"][0][1] = entity_end + entity["offsets"][0][0] = entity_start + + document = {} + document["id"] = next(uid) + document["document_id"] = document["id"] + document["entities"] = entities + document["passages"] = [ + { + "id": next(uid), + "type": "", + "text": [passage], + "offsets": [[0, len(passage)]], + } + ] + document["relations"] = [] + document["events"] = [] + document["coreferences"] = [] + return document["id"], document + def _generate_examples(self, filepath) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + logger.info(f"Generating examples from {filepath}") + uid = itertools.count(0) if self.config.schema == "source": - for key, sample in enumerate(data): - yield key, sample + with open(filepath, encoding="utf-8") as f: + tokens = [] + ner_tags = [] + for line in f: + if line == "" or line == "\n": + if tokens: + id = next(uid) + yield id, { + "id": id, + "tokens": tokens, + "ner_tags": ner_tags, + } + next(uid) + tokens = [] + ner_tags = [] - elif self.config.schema == "bigbio_kb": - for i, sample in enumerate(data): - feature_dict = { - "id": uid, - "document_id": "NULL", - "passages": [], - "entities": [], - "relations": [], - "events": [], - "coreferences": [], + else: + # tokens are tab separated + splits = line.split("\t") + tokens.append(splits[0]) + ner_tags.append(splits[1].rstrip()) + # last example + id = next(uid) + yield id, { + "id": id, + "tokens": tokens, + "ner_tags": ner_tags, } - uid += 1 - offset_start = 0 - for token, tag in zip(sample["tokens"], sample["ner_tags"]): - offset_start += len(token) + 1 - feature_dict["entities"].append( - { - "id": uid, - "offsets": [[offset_start, offset_start + len(token)]], - "text": [token], - "type": tag, - "normalized": [], - } - ) - uid += 1 - - # entities - yield i, feature_dict + elif self.config.schema == "bigbio_kb": + with open(filepath, encoding="utf-8") as f: + tokens = [] + ner_tags = [] + for line in f: + if line == "" or line == "\n": + document_id, document = self._parse_sentence(tokens, ner_tags, uid) + yield document_id, document + else: + token, tag = line.split("\t") + tokens.append(token.strip()) + ner_tags.append(tag.strip())