diff --git a/backend/poetry.lock b/backend/poetry.lock index 3aebca8fe..23746a6e2 100644 --- a/backend/poetry.lock +++ b/backend/poetry.lock @@ -822,23 +822,6 @@ files = [ {file = "docutils-0.20.1.tar.gz", hash = "sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b"}, ] -[[package]] -name = "en_core_sci_sm" -version = "0.5.4" -description = "Spacy Models for Biomedical Text." -optional = false -python-versions = "*" -files = [ - {file = "en_core_sci_sm-0.5.4.tar.gz", hash = "sha256:a54a114ee11dab7cf6db8cf69ef485145011537f45f3c16c71338c48fd6f070b"}, -] - -[package.dependencies] -spacy = ">=3.7.4,<3.8.0" - -[package.source] -type = "url" -url = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz" - [[package]] name = "et-xmlfile" version = "1.1.0" @@ -5319,4 +5302,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "2cce2c4f6dc36c8b23aa4a62bc72f03554f3fb69d0e219b388150642bd78575e" +content-hash = "e4c96e16276d5a535c44767a7b270389e3bd60109bc821182f0b46494075a188" diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 94f842b47..86293687f 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -33,9 +33,6 @@ typer = "^0.12.0" beautifulsoup4 = "^4.12.3" spacy = "^3.7.6" -[tool.poetry.dependencies.en_core_sci_sm] -url = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz" - [tool.poetry.group.dev.dependencies] pytest = "^8.2.0" mkdocs = ">=1.6.0" diff --git a/backend/src/monarch_py/implementations/spacy/spacy_implementation.py b/backend/src/monarch_py/implementations/spacy/spacy_implementation.py index 20c74283e..431c2fe3f 100644 --- a/backend/src/monarch_py/implementations/spacy/spacy_implementation.py +++ b/backend/src/monarch_py/implementations/spacy/spacy_implementation.py @@ -7,6 +7,8 @@ from 
def init_spacy(self, grounding_implementation: GroundingInterface):
    """Fetch, unpack and load the ``en_core_sci_sm`` model, then warm it up.

    The model archive is downloaded and cached via ``pystow.ensure``; it is
    only extracted when no usable unpacked model is found next to it. The
    loaded pipeline is stored on ``self.nlp`` and the supplied grounding
    implementation on ``self.grounding_implementation``.

    Args:
        grounding_implementation: Object stored on the instance for later
            grounding of annotated entities.

    Raises:
        FileNotFoundError: If no model directory can be located even after
            extracting the archive.
    """
    model_url = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz"

    # pystow downloads the archive on first use and returns the cached path.
    model_archive = pystow.ensure("spacy", "models", url=model_url)
    cache_dir = model_archive.parent

    def find_model_path():
        """Return the loadable model directory under the cache, or None.

        The layout searched for is
        ``<en_core_sci*>/<en_core_sci* (not egg-info)>/<outer dir name>``.
        """
        # Sort so the choice is deterministic if several candidates exist.
        for outer in sorted(cache_dir.iterdir()):
            if not (outer.is_dir() and outer.name.startswith("en_core_sci")):
                continue
            for inner in sorted(outer.iterdir()):
                if (
                    inner.is_dir()
                    and inner.name.startswith("en_core_sci")
                    and "egg-info" not in inner.name
                ):
                    candidate = inner / outer.name
                    if candidate.is_dir():
                        return candidate
        return None

    model_path = find_model_path()

    # Extract only when no usable model is present. The previous check looked
    # for a directory name the archive never creates, so it re-extracted the
    # tarball on every initialisation.
    if model_path is None:
        print("Unpacking Spacy model...")
        with tarfile.open(model_archive, "r:gz") as tar:
            if hasattr(tarfile, "data_filter"):
                # Reject absolute paths, "..", and links escaping the target.
                tar.extractall(path=cache_dir, filter="data")
            else:
                # Older Python without extraction filters.
                tar.extractall(path=cache_dir)
        model_path = find_model_path()

    if model_path is None:
        # Fail loudly instead of leaving self.nlp unset and crashing below.
        raise FileNotFoundError(
            f"Could not locate an unpacked en_core_sci model under {cache_dir}"
        )

    self.nlp = spacy.load(str(model_path))
    self.grounding_implementation = grounding_implementation

    # Run the pipeline once on a sample sentence to exercise the model.
    self.nlp("Nystagmus, strabismus, fundus, ocular albinism, lewis.")