Skip to content

Commit

Permalink
Closes #873 and revises #874 (#876)
Browse files: browse the repository at this point in the history
* Fixed wrong entity offsets in tmvar_v3 for PMID 21904390

* Reverted the offsets in the 'source' dataset to the original (but wrong) values and added a new 'source_fixed' dataset with corrected offsets

* Update reference + fix license

---------

Co-authored-by: Xing Wang <wangxida@guppi4>
Co-authored-by: Leon Weber <[email protected]>
Co-authored-by: Mario Sänger <[email protected]>
  • Loading branch information
4 people authored Oct 12, 2024
1 parent a5e3a92 commit 9da7283
Showing 1 changed file with 26 additions and 21 deletions.
47 changes: 26 additions & 21 deletions bigbio/hub/hub_repos/tmvar_v3/tmvar_v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,23 +27,16 @@
from .bigbiohub import BigBioConfig, Tasks, kb_features

_CITATION = """\
@misc{https://doi.org/10.48550/arxiv.2204.03637,
title = {tmVar 3.0: an improved variant concept recognition and normalization tool},
author = {
Wei, Chih-Hsuan and Allot, Alexis and Riehle, Kevin and Milosavljevic,
Aleksandar and Lu, Zhiyong
},
year = 2022,
publisher = {arXiv},
doi = {10.48550/ARXIV.2204.03637},
url = {https://arxiv.org/abs/2204.03637},
copyright = {Creative Commons Attribution 4.0 International},
keywords = {
Computation and Language (cs.CL), FOS: Computer and information sciences,
FOS: Computer and information sciences
}
@article{wei2022tmvar,
title={tmVar 3.0: an improved variant concept recognition and normalization tool},
author={Wei, Chih-Hsuan and Allot, Alexis and Riehle, Kevin and Milosavljevic, Aleksandar and Lu, Zhiyong},
journal={Bioinformatics},
volume={38},
number={18},
pages={4449--4451},
year={2022},
publisher={Oxford University Press}
}
"""
_LANGUAGES = ["English"]
_PUBMED = True
Expand All @@ -60,9 +53,9 @@
NED tasks, This dataset does NOT have splits.
"""

_HOMEPAGE = "https://www.ncbi.nlm.nih.gov/research/bionlp/Tools/tmvar/"
_HOMEPAGE = "https://github.com/ncbi/tmVar3"

_LICENSE = "License information unavailable"
_LICENSE = "UNKNOWN"

_URLS = {_DATASETNAME: "ftp://ftp.ncbi.nlm.nih.gov/pub/lu/tmVar3/tmVar3Corpus.txt"}
_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.NAMED_ENTITY_DISAMBIGUATION]
Expand Down Expand Up @@ -90,6 +83,15 @@ class TmvarV3Dataset(datasets.GeneratorBasedBuilder):
subset_id=f"{_DATASETNAME}",
)
)
BUILDER_CONFIGS.append(
BigBioConfig(
name=f"{_DATASETNAME}_source_fixed",
version=SOURCE_VERSION,
description=f"{_DATASETNAME} source schema with fixed offsets",
schema="source",
subset_id=f"{_DATASETNAME}",
)
)
BUILDER_CONFIGS.append(
BigBioConfig(
name=f"{_DATASETNAME}_bigbio_kb",
Expand Down Expand Up @@ -258,9 +260,12 @@ def pubtator_to_source(self, filepath):
}
for mention in doc.annotations
]
document["entities"] = self._correct_wrong_offsets(
document["entities"], doc.pmid
)

if "_fixed" in self.config.name:
document["entities"] = self._correct_wrong_offsets(
document["entities"], doc.pmid
)

yield document

def pubtator_to_bigbio_kb(self, filepath):
Expand Down

0 comments on commit 9da7283

Please sign in to comment.