diff --git a/bigbio/hub/hub_repos/tmvar_v3/tmvar_v3.py b/bigbio/hub/hub_repos/tmvar_v3/tmvar_v3.py index 8689584a..13b4c859 100644 --- a/bigbio/hub/hub_repos/tmvar_v3/tmvar_v3.py +++ b/bigbio/hub/hub_repos/tmvar_v3/tmvar_v3.py @@ -27,23 +27,16 @@ from .bigbiohub import BigBioConfig, Tasks, kb_features _CITATION = """\ -@misc{https://doi.org/10.48550/arxiv.2204.03637, - title = {tmVar 3.0: an improved variant concept recognition and normalization tool}, - author = { - Wei, Chih-Hsuan and Allot, Alexis and Riehle, Kevin and Milosavljevic, - Aleksandar and Lu, Zhiyong - }, - year = 2022, - publisher = {arXiv}, - doi = {10.48550/ARXIV.2204.03637}, - url = {https://arxiv.org/abs/2204.03637}, - copyright = {Creative Commons Attribution 4.0 International}, - keywords = { - Computation and Language (cs.CL), FOS: Computer and information sciences, - FOS: Computer and information sciences - } +@article{wei2022tmvar, + title={tmVar 3.0: an improved variant concept recognition and normalization tool}, + author={Wei, Chih-Hsuan and Allot, Alexis and Riehle, Kevin and Milosavljevic, Aleksandar and Lu, Zhiyong}, + journal={Bioinformatics}, + volume={38}, + number={18}, + pages={4449--4451}, + year={2022}, + publisher={Oxford University Press} } - """ _LANGUAGES = ["English"] _PUBMED = True @@ -60,9 +53,9 @@ NED tasks, This dataset does NOT have splits. """ -_HOMEPAGE = "https://www.ncbi.nlm.nih.gov/research/bionlp/Tools/tmvar/" +_HOMEPAGE = "https://github.com/ncbi/tmVar3" -_LICENSE = "License information unavailable" +_LICENSE = "UNKNOWN" _URLS = {_DATASETNAME: "ftp://ftp.ncbi.nlm.nih.gov/pub/lu/tmVar3/tmVar3Corpus.txt"} _SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.NAMED_ENTITY_DISAMBIGUATION] @@ -90,6 +83,15 @@ class TmvarV3Dataset(datasets.GeneratorBasedBuilder): subset_id=f"{_DATASETNAME}", ) ) + BUILDER_CONFIGS.append( + BigBioConfig( + name=f"{_DATASETNAME}_source_fixed", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema with fixed offsets", + schema="source", + subset_id=f"{_DATASETNAME}", + ) + ) BUILDER_CONFIGS.append( BigBioConfig( name=f"{_DATASETNAME}_bigbio_kb", @@ -258,9 +260,12 @@ def pubtator_to_source(self, filepath): } for mention in doc.annotations ] - document["entities"] = self._correct_wrong_offsets( - document["entities"], doc.pmid - ) + + if "_fixed" in self.config.name: + document["entities"] = self._correct_wrong_offsets( + document["entities"], doc.pmid + ) + yield document def pubtator_to_bigbio_kb(self, filepath):