From fbcd6650a0fc6cb061b84d5cc5f1e8c49f19fb0b Mon Sep 17 00:00:00 2001 From: madanucd Date: Mon, 25 Nov 2024 20:19:56 -0500 Subject: [PATCH 1/3] remove spacy dependancy in pyproject.toml --- backend/pyproject.toml | 3 - .../spacy/spacy_implementation.py | 59 ++++++++++++++++++- 2 files changed, 56 insertions(+), 6 deletions(-) diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 94f842b47..86293687f 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -33,9 +33,6 @@ typer = "^0.12.0" beautifulsoup4 = "^4.12.3" spacy = "^3.7.6" -[tool.poetry.dependencies.en_core_sci_sm] -url = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz" - [tool.poetry.group.dev.dependencies] pytest = "^8.2.0" mkdocs = ">=1.6.0" diff --git a/backend/src/monarch_py/implementations/spacy/spacy_implementation.py b/backend/src/monarch_py/implementations/spacy/spacy_implementation.py index 20c74283e..b36ae9d20 100644 --- a/backend/src/monarch_py/implementations/spacy/spacy_implementation.py +++ b/backend/src/monarch_py/implementations/spacy/spacy_implementation.py @@ -7,6 +7,8 @@ from monarch_py.interfaces.text_annotation_interface import TextAnnotatorInterface from monarch_py.datamodels.model import TextAnnotationResult, SearchResult +import pystow +import tarfile @dataclass class SpacyImplementation(TextAnnotatorInterface): @@ -16,9 +18,60 @@ class SpacyImplementation(TextAnnotatorInterface): grounding_implementation = None def init_spacy(self, grounding_implementation: GroundingInterface): - self.nlp = spacy.load("en_core_sci_sm") - self.grounding_implementation = grounding_implementation - self.nlp("Nystagmus, strabismus, fundus, ocular albinism, lewis.") + try: + # Define the URL for the Spacy model + model_url = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz" + + # Use pystow.ensure to download and cache the model archive + try: + model_archive = pystow.ensure("spacy", "models", url=model_url) + except Exception as e: + raise RuntimeError(f"Failed to download or cache the Spacy model: {e}") + + # Define the expected unpacked directory + model_dir = model_archive.parent / "en_core_sci" + + # Unpack the model if it's not already unpacked + if not model_dir.exists(): + try: + print("Unpacking Spacy model...") + with tarfile.open(model_archive, "r:gz") as tar: + tar.extractall(path=model_dir.parent) + except (tarfile.TarError, IOError) as e: + raise RuntimeError(f"Error while unpacking the Spacy model archive: {e}") + + # Identify the unpacked directory dynamically + model_subdir = next( + (d for d in model_archive.parent.iterdir() if d.is_dir() and d.name.startswith("en_core_sci")), None + ) + if not model_subdir: + raise FileNotFoundError(f"Unpacked directory not found in {model_dir}.") + + inner_model_dir = next( + (d for d in model_subdir.iterdir() if + d.is_dir() and d.name.startswith("en_core_sci") and "egg-info" not in d.name), None + ) + if not inner_model_dir: + raise FileNotFoundError(f"Inner 'en_core_sci' directory not found in {model_subdir}.") + + # Load the model + try: + self.nlp = spacy.load( + str(model_archive.parent / model_subdir.name / inner_model_dir.name / model_subdir.name)) + except Exception as e: + raise RuntimeError(f"Failed to load the Spacy model: {e}") + + self.grounding_implementation = grounding_implementation + + # Test the model + try: + self.nlp("Nystagmus, strabismus, fundus, ocular albinism, lewis.") + except Exception as e: + raise RuntimeError(f"Test run of Spacy NLP model failed: {e}") + + except Exception as e: + print(f"An error occurred during Spacy initialization: {e}") + raise def get_annotated_entities(self, text) -> List[TextAnnotationResult]: """Annotate text using SPACY""" From 7e5119cba6856e8eb22fe795f3743716eb2636c6 Mon Sep 17 00:00:00 2001 From: Patrick Golden Date: Wed, 27 Nov 2024 11:27:37 -0500 Subject: [PATCH 2/3] Update poetry.lock for fbcd6650 --- backend/poetry.lock | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/backend/poetry.lock b/backend/poetry.lock index 3aebca8fe..23746a6e2 100644 --- a/backend/poetry.lock +++ b/backend/poetry.lock @@ -822,23 +822,6 @@ files = [ {file = "docutils-0.20.1.tar.gz", hash = "sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b"}, ] -[[package]] -name = "en_core_sci_sm" -version = "0.5.4" -description = "Spacy Models for Biomedical Text." -optional = false -python-versions = "*" -files = [ - {file = "en_core_sci_sm-0.5.4.tar.gz", hash = "sha256:a54a114ee11dab7cf6db8cf69ef485145011537f45f3c16c71338c48fd6f070b"}, -] - -[package.dependencies] -spacy = ">=3.7.4,<3.8.0" - -[package.source] -type = "url" -url = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz" - [[package]] name = "et-xmlfile" version = "1.1.0" @@ -5319,4 +5302,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "2cce2c4f6dc36c8b23aa4a62bc72f03554f3fb69d0e219b388150642bd78575e" +content-hash = "e4c96e16276d5a535c44767a7b270389e3bd60109bc821182f0b46494075a188" From 5193daf28a9eab7a034e647a740fb1b214663275 Mon Sep 17 00:00:00 2001 From: madanucd Date: Tue, 10 Dec 2024 12:53:06 -0500 Subject: [PATCH 3/3] linear error handling for codecov coverage --- .../spacy/spacy_implementation.py | 83 +++++++------------ 1 file changed, 29 insertions(+), 54 deletions(-) diff --git a/backend/src/monarch_py/implementations/spacy/spacy_implementation.py b/backend/src/monarch_py/implementations/spacy/spacy_implementation.py index b36ae9d20..431c2fe3f 100644 --- a/backend/src/monarch_py/implementations/spacy/spacy_implementation.py +++ b/backend/src/monarch_py/implementations/spacy/spacy_implementation.py @@ -18,60 +18,35 @@ class SpacyImplementation(TextAnnotatorInterface): grounding_implementation = None def init_spacy(self, grounding_implementation: GroundingInterface): - try: - # Define the URL for the Spacy model - model_url = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz" - - # Use pystow.ensure to download and cache the model archive - try: - model_archive = pystow.ensure("spacy", "models", url=model_url) - except Exception as e: - raise RuntimeError(f"Failed to download or cache the Spacy model: {e}") - - # Define the expected unpacked directory - model_dir = model_archive.parent / "en_core_sci" - - # Unpack the model if it's not already unpacked - if not model_dir.exists(): - try: - print("Unpacking Spacy model...") - with tarfile.open(model_archive, "r:gz") as tar: - tar.extractall(path=model_dir.parent) - except (tarfile.TarError, IOError) as e: - raise RuntimeError(f"Error while unpacking the Spacy model archive: {e}") - - # Identify the unpacked directory dynamically - model_subdir = next( - (d for d in model_archive.parent.iterdir() if d.is_dir() and d.name.startswith("en_core_sci")), None - ) - if not model_subdir: - raise FileNotFoundError(f"Unpacked directory not found in {model_dir}.") - - inner_model_dir = next( - (d for d in model_subdir.iterdir() if - d.is_dir() and d.name.startswith("en_core_sci") and "egg-info" not in d.name), None - ) - if not inner_model_dir: - raise FileNotFoundError(f"Inner 'en_core_sci' directory not found in {model_subdir}.") - - # Load the model - try: - self.nlp = spacy.load( - str(model_archive.parent / model_subdir.name / inner_model_dir.name / model_subdir.name)) - except Exception as e: - raise RuntimeError(f"Failed to load the Spacy model: {e}") - - self.grounding_implementation = grounding_implementation - - # Test the model - try: - self.nlp("Nystagmus, strabismus, fundus, ocular albinism, lewis.") - except Exception as e: - raise RuntimeError(f"Test run of Spacy NLP model failed: {e}") - - except Exception as e: - print(f"An error occurred during Spacy initialization: {e}") - raise + # Define the URL for the Spacy model + model_url = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz" + + # Use pystow.ensure to download and cache the model archive + model_archive = pystow.ensure("spacy", "models", url=model_url) + # Define the expected unpacked directory + model_dir = model_archive.parent / "en_core_sci" + + # Unpack the model if it's not already unpacked + if not model_dir.exists(): + print("Unpacking Spacy model...") + with tarfile.open(model_archive, "r:gz") as tar: + tar.extractall(path=model_dir.parent) + + model_subdir = next((d for d in model_archive.parent.iterdir() if d.is_dir() and d.name.startswith("en_core_sci")), None) + + if model_subdir: + inner_model_dir = next((d for d in model_subdir.iterdir() if d.is_dir() and d.name.startswith("en_core_sci") and "egg-info" not in d.name), None) + if inner_model_dir: + # Load the model + self.nlp = spacy.load(str(str(model_archive.parent / model_subdir.name / inner_model_dir.name / model_subdir.name))) + + # Assign the grounding implementation + self.grounding_implementation = grounding_implementation + + # Test the model with a sample sentence + self.nlp("Nystagmus, strabismus, fundus, ocular albinism, lewis.") + + def get_annotated_entities(self, text) -> List[TextAnnotationResult]: """Annotate text using SPACY"""