first version of script (not tested)

alliance-genome · Aug 13, 2024 · bb78d30 · bb78d30
1 parent 3835599
commit bb78d30
Show file tree

Hide file tree

Showing 6 changed files with 71 additions and 14 deletions.
diff --git a/.env b/.env
@@ -67,3 +67,5 @@ NCBI_API_KEY=
 ATEAM_API_URL=
 WB_API_URL=https://caltech-curation.textpressolab.com/pub/cgi-bin/forms/abc_readonly_api.cgi
 SGD_API_URL=https://backend.yeastgenome.org/entity/
+PDF2TEI_API_URL="https://grobid.alliancegenome.org/api/processFulltextDocument"
+
diff --git a/.env.test b/.env.test
@@ -59,3 +59,4 @@ TEST_CLEANUP=true
 ALLIANCE_FMS_RELEASE=5.2.2
 ID_MATI_URL=
 NCBI_API_KEY=
+PDF2TEI_API_URL=
diff --git a/agr_literature_service/api/crud/referencefile_crud.py b/agr_literature_service/api/crud/referencefile_crud.py
@@ -7,7 +7,7 @@
 import tarfile
 import tempfile
 from itertools import count
-from typing import List
+from typing import List, Union
 
 import boto3
 from fastapi import HTTPException, status, UploadFile
@@ -37,6 +37,25 @@
 logger = logging.getLogger(__name__)
 
 
+def get_main_pdf_referencefile_id(db: Session, curie_or_reference_id: str,
+                                  mod_abbreviation: str = None) -> Union[int, None]:
+    """Return the id of the final main PDF file of a reference, or None.
+
+    Candidates are the reference's files with file_class "main",
+    file_publication_status "final", pdf_type "pdf" and file_extension "pdf".
+    When ``mod_abbreviation`` is given, a candidate linked to that MOD is
+    preferred. Otherwise, or when no such candidate exists, the first
+    candidate that has a MOD-less link is returned.
+
+    :param db: database session used to load the reference
+    :param curie_or_reference_id: curie or id of the reference, passed to get_reference
+    :param mod_abbreviation: abbreviation of the MOD whose file is preferred, if any
+    :return: referencefile_id of the selected file, or None if nothing matches
+    """
+    reference: ReferenceModel = get_reference(db=db, curie_or_reference_id=curie_or_reference_id,
+                                              load_referencefiles=True)
+    referencefile: ReferencefileModel
+    main_pdf_referencefiles = [referencefile for referencefile in reference.referencefiles if
+                               referencefile.file_class == "main" and referencefile.file_publication_status == "final"
+                               and referencefile.pdf_type == "pdf" and referencefile.file_extension == "pdf"]
+    if mod_abbreviation is not None:
+        for main_pdf_ref_file in main_pdf_referencefiles:
+            for ref_file_mod in main_pdf_ref_file.referencefile_mods:
+                # A link may carry no MOD at all (the fallback below looks for exactly those),
+                # so guard the attribute access instead of raising AttributeError.
+                if ref_file_mod.mod is not None and ref_file_mod.mod.abbreviation == mod_abbreviation:
+                    return main_pdf_ref_file.referencefile_id
+    # Fallback: a file whose link is not tied to any MOD.
+    # NOTE(review): assumes a MOD-less link exposes ``mod`` as None - confirm against the model.
+    for main_pdf_ref_file in main_pdf_referencefiles:
+        for ref_file_mod in main_pdf_ref_file.referencefile_mods:
+            if ref_file_mod.mod is None or ref_file_mod.mod.abbreviation is None:
+                return main_pdf_ref_file.referencefile_id
+    return None
+
+
 def set_referencefile_mods(referencefile_obj, referencefile_dict):
     del referencefile_dict["reference_id"]
     referencefile_dict["referencefile_mods"] = []
@@ -388,7 +407,8 @@ def file_upload_single(db: Session, metadata: dict, file: UploadFile):  # pragma
     return md5sum
 
 
-def download_file(db: Session, referencefile_id: int, mod_access: OktaAccess):  # pragma: no cover
+def download_file(db: Session, referencefile_id: int, mod_access: OktaAccess,  # pragma: no cover
+                  use_in_api: bool = True):  # pragma: no cover
     referencefile = read_referencefile_db_obj(db, referencefile_id)
 
     user_permission = False
@@ -411,8 +431,14 @@ def download_file(db: Session, referencefile_id: int, mod_access: OktaAccess):
         with gzip.open(md5sum + ".gz", 'rb') as f_in, open(display_name, 'wb') as f_out:
             shutil.copyfileobj(f_in, f_out)
         os.remove(md5sum + ".gz")
-        return FileResponse(path=display_name, filename=display_name, media_type="application/octet-stream",
-                            background=BackgroundTask(cleanup, display_name))
+        if use_in_api:
+            return FileResponse(path=display_name, filename=display_name, media_type="application/octet-stream",
+                                background=BackgroundTask(cleanup, display_name))
+        else:
+            with open(display_name, 'rb') as file:
+                file_content = file.read()
+            os.remove(display_name)
+            return file_content
 
     raise HTTPException(status_code=status.HTTP_403_FORBIDDEN,
                         detail="The current user does not have permissions to get the requested file url. "

diff --git a/agr_literature_service/api/crud/workflow_tag_crud.py b/agr_literature_service/api/crud/workflow_tag_crud.py
@@ -340,12 +340,12 @@ def get_current_workflow_status(db: Session, curie_or_reference_id: str, workflo
     return None if not current_workflow_tag_db_obj else current_workflow_tag_db_obj.workflow_tag_id
 
 
-def get_ref_ids_with_status(db: Session, workflow_atp_id: str, mod_abbreviation: str = None):
+def get_ref_ids_with_workflow_status(db: Session, workflow_atp_id: str, mod_abbreviation: str = None):
+    """Return the reference ids that carry the given workflow tag.
+
+    :param db: database session used for the queries
+    :param workflow_atp_id: workflow tag id matched against WorkflowTagModel.workflow_tag_id
+    :param mod_abbreviation: when given, only tags belonging to this MOD are considered
+    :return: list of reference ids; empty when the MOD abbreviation is unknown
+    """
     query = db.query(WorkflowTagModel.reference_id).filter(WorkflowTagModel.workflow_tag_id == workflow_atp_id)
     if mod_abbreviation is not None:
-        mod_id = db.query(ModModel).filter(ModModel.abbreviation == mod_abbreviation).first()
-        query = query.filter(WorkflowTagModel.mod_id == mod_id)
-    query.all()
+        # .first() yields a ModModel row (or None), not an id, so compare the column with the row's key.
+        # NOTE(review): assumes the key attribute is ModModel.mod_id - confirm against the model.
+        mod = db.query(ModModel).filter(ModModel.abbreviation == mod_abbreviation).first()
+        if mod is None:
+            # Unknown MOD: nothing can match, and filtering on None would select MOD-less tags instead.
+            return []
+        query = query.filter(WorkflowTagModel.mod_id == mod.mod_id)
+    return [ref.reference_id for ref in query.all()]
 
 
 def create(db: Session, workflow_tag: WorkflowTagSchemaPost) -> int:

diff --git a/agr_literature_service/lit_processing/pdf2tei/pdf2tei.py b/agr_literature_service/lit_processing/pdf2tei/pdf2tei.py
@@ -1,19 +1,46 @@
+import logging
+import os
 from typing import List
 
-from agr_literature_service.api.crud.workflow_tag_crud import g
+import requests
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
 
+from agr_literature_service.api.crud.referencefile_crud import get_main_pdf_referencefile_id, download_file
+from agr_literature_service.api.crud.workflow_tag_crud import get_ref_ids_with_workflow_status
+from agr_literature_service.api.database.config import SQLALCHEMY_DATABASE_URL
+from agr_literature_service.api.models import ModModel
+from agr_literature_service.api.routers.okta_utils import OktaAccess
 
-def get_refs_to_convert() -> List[str]:
-    pass
 
-
-def convert_ref(reference_curie: str):
-    pass
+logger = logging.getLogger(__name__)
 
 
 def main():
-    for ref_curie in get_refs_to_convert():
-        convert_ref(ref_curie)
+    """Send the main PDF of every selected reference to the GROBID API, MOD by MOD.
+
+    For each MOD abbreviation in the database, the references returned by
+    get_ref_ids_with_workflow_status are looked up, their main PDF is
+    downloaded and posted to the endpoint configured in PDF2TEI_API_URL.
+    A reference without a main PDF or a failed conversion is logged and
+    skipped, so one bad file does not stop the batch.
+    """
+    # The endpoint does not change between files, so resolve it once up front.
+    grobid_api_url = os.environ.get("PDF2TEI_API_URL",
+                                    "https://grobid.alliancegenome.org/api/processFulltextDocument")
+    engine = create_engine(SQLALCHEMY_DATABASE_URL, connect_args={"options": "-c timezone=utc"})
+    new_session = sessionmaker(bind=engine, autoflush=True)
+    db = new_session()
+    try:
+        mod_abbreviations = [mod.abbreviation for mod in db.query(ModModel.abbreviation).all()]
+        for mod_abbreviation in mod_abbreviations:
+            # TODO: workflow_atp_id is still an empty placeholder; set the tag that marks references to convert.
+            for ref_id in get_ref_ids_with_workflow_status(db=db, workflow_atp_id="",
+                                                           mod_abbreviation=mod_abbreviation):
+                ref_file_id_to_convert = get_main_pdf_referencefile_id(db=db, curie_or_reference_id=ref_id,
+                                                                       mod_abbreviation=mod_abbreviation)
+                if ref_file_id_to_convert is None:
+                    # No final main PDF for this reference: there is nothing to download or convert.
+                    logger.warning(f"No main PDF found for reference {ref_id} ({mod_abbreviation}), skipping.")
+                    continue
+                file_content = download_file(db=db, referencefile_id=ref_file_id_to_convert,
+                                             mod_access=OktaAccess.ALL_ACCESS, use_in_api=False)
+
+                # Send the file content to the GROBID API
+                response = requests.post(grobid_api_url, files={'input': ("file", file_content)})
+
+                # Check the response and move on to the next file instead of returning from the batch
+                if response.status_code == 200:
+                    logger.info("File successfully processed by GROBID.")
+                    # TODO: response.content holds the conversion output; nothing stores it yet.
+                else:
+                    logger.error(f"Failed to process file. Status code: {response.status_code}")
+    finally:
+        # Release the database connection even when a download or request raises.
+        db.close()
 
 
 if __name__ == '__main__':

diff --git a/docker-compose.yaml b/docker-compose.yaml
@@ -198,6 +198,7 @@ services:
       DEBEZIUM_KSQLDB_HOST: "${DEBEZIUM_KSQLDB_HOST}"
       DEBEZIUM_KSQLDB_PORT: "${DEBEZIUM_KSQLDB_PORT}"
       TZ: "UTC"
+      PDF2TEI_API_URL: "${PDF2TEI_API_URL}"
     volumes:
       - "${LOG_PATH}:/var/log/automated_scripts"
       - /etc/localtime:/etc/localtime:ro