Skip to content

Commit

Permalink
first version of script (not tested)
Browse files Browse the repository at this point in the history
  • Loading branch information
valearna committed Aug 13, 2024
1 parent 3835599 commit bb78d30
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 14 deletions.
2 changes: 2 additions & 0 deletions .env
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,5 @@ NCBI_API_KEY=
ATEAM_API_URL=
WB_API_URL=https://caltech-curation.textpressolab.com/pub/cgi-bin/forms/abc_readonly_api.cgi
SGD_API_URL=https://backend.yeastgenome.org/entity/
PDF2TEI_API_URL="https://grobid.alliancegenome.org/api/processFulltextDocument"

1 change: 1 addition & 0 deletions .env.test
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,4 @@ TEST_CLEANUP=true
ALLIANCE_FMS_RELEASE=5.2.2
ID_MATI_URL=
NCBI_API_KEY=
PDF2TEI_API_URL=
34 changes: 30 additions & 4 deletions agr_literature_service/api/crud/referencefile_crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import tarfile
import tempfile
from itertools import count
from typing import List
from typing import List, Union

import boto3
from fastapi import HTTPException, status, UploadFile
Expand Down Expand Up @@ -37,6 +37,25 @@
logger = logging.getLogger(__name__)


def get_main_pdf_referencefile_id(db: Session, curie_or_reference_id: str,
                                  mod_abbreviation: str = None) -> Union[int, None]:
    """Return the id of a main, final PDF referencefile of a reference.

    Candidate files are the reference's referencefiles with file_class
    "main", file_publication_status "final", pdf_type "pdf" and
    file_extension "pdf".

    Selection order:
      1. if ``mod_abbreviation`` is given, the first candidate linked to a mod
         with that abbreviation;
      2. otherwise (or if step 1 found nothing), the first candidate that has
         a referencefile_mod entry without a mod (or with a mod whose
         abbreviation is None).

    :param db: database session handed to ``get_reference``.
    :param curie_or_reference_id: identifier of the reference to inspect.
    :param mod_abbreviation: optional mod abbreviation to prefer.
    :return: the matching ``referencefile_id``, or None when no candidate
        matches.
    """
    reference: ReferenceModel = get_reference(db=db, curie_or_reference_id=curie_or_reference_id,
                                              load_referencefiles=True)
    main_pdf_referencefiles = [
        referencefile for referencefile in reference.referencefiles
        if referencefile.file_class == "main"
        and referencefile.file_publication_status == "final"
        and referencefile.pdf_type == "pdf"
        and referencefile.file_extension == "pdf"
    ]
    if mod_abbreviation is not None:
        for main_pdf_ref_file in main_pdf_referencefiles:
            for ref_file_mod in main_pdf_ref_file.referencefile_mods:
                # A referencefile_mod row may have no mod attached; guard the
                # attribute access so such rows are skipped instead of raising.
                if ref_file_mod.mod is not None and ref_file_mod.mod.abbreviation == mod_abbreviation:
                    return main_pdf_ref_file.referencefile_id
    for main_pdf_ref_file in main_pdf_referencefiles:
        for ref_file_mod in main_pdf_ref_file.referencefile_mods:
            # NOTE(review): a missing mod presumably marks a file that is not
            # tied to any specific mod -- confirm against the data model.
            if ref_file_mod.mod is None or ref_file_mod.mod.abbreviation is None:
                return main_pdf_ref_file.referencefile_id
    return None


def set_referencefile_mods(referencefile_obj, referencefile_dict):
del referencefile_dict["reference_id"]
referencefile_dict["referencefile_mods"] = []
Expand Down Expand Up @@ -388,7 +407,8 @@ def file_upload_single(db: Session, metadata: dict, file: UploadFile): # pragma
return md5sum


def download_file(db: Session, referencefile_id: int, mod_access: OktaAccess): # pragma: no cover
def download_file(db: Session, referencefile_id: int, mod_access: OktaAccess, # pragma: no cover
use_in_api: bool = True): # pragma: no cover
referencefile = read_referencefile_db_obj(db, referencefile_id)

user_permission = False
Expand All @@ -411,8 +431,14 @@ def download_file(db: Session, referencefile_id: int, mod_access: OktaAccess):
with gzip.open(md5sum + ".gz", 'rb') as f_in, open(display_name, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
os.remove(md5sum + ".gz")
return FileResponse(path=display_name, filename=display_name, media_type="application/octet-stream",
background=BackgroundTask(cleanup, display_name))
if use_in_api:
return FileResponse(path=display_name, filename=display_name, media_type="application/octet-stream",
background=BackgroundTask(cleanup, display_name))
else:
with open(display_name, 'rb') as file:
file_content = file.read()
os.remove(display_name)
return file_content

raise HTTPException(status_code=status.HTTP_403_FORBIDDEN,
detail="The current user does not have permissions to get the requested file url. "
Expand Down
4 changes: 2 additions & 2 deletions agr_literature_service/api/crud/workflow_tag_crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,12 +340,12 @@ def get_current_workflow_status(db: Session, curie_or_reference_id: str, workflo
return None if not current_workflow_tag_db_obj else current_workflow_tag_db_obj.workflow_tag_id


def get_ref_ids_with_status(db: Session, workflow_atp_id: str, mod_abbreviation: str = None):
def get_ref_ids_with_workflow_status(db: Session, workflow_atp_id: str, mod_abbreviation: str = None):
    """Return the reference ids that carry a given workflow tag.

    :param db: database session used for the queries.
    :param workflow_atp_id: value matched against ``WorkflowTagModel.workflow_tag_id``.
    :param mod_abbreviation: optional mod abbreviation; when given, only
        workflow tags belonging to that mod are considered.
    :return: list of ``reference_id`` values (one entry per matching workflow
        tag row); an empty list when ``mod_abbreviation`` matches no mod.
    """
    query = db.query(WorkflowTagModel.reference_id).filter(WorkflowTagModel.workflow_tag_id == workflow_atp_id)
    if mod_abbreviation is not None:
        # Select the key itself: comparing the mod_id column against a whole
        # ModModel instance (what .first() on query(ModModel) returns) is not
        # a valid id comparison.
        mod = db.query(ModModel.mod_id).filter(ModModel.abbreviation == mod_abbreviation).first()
        if mod is None:
            # Unknown mod abbreviation: nothing can match.
            return []
        query = query.filter(WorkflowTagModel.mod_id == mod.mod_id)
    return [ref.reference_id for ref in query.all()]


def create(db: Session, workflow_tag: WorkflowTagSchemaPost) -> int:
Expand Down
43 changes: 35 additions & 8 deletions agr_literature_service/lit_processing/pdf2tei/pdf2tei.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,46 @@
import logging
import os
from typing import List

from agr_literature_service.api.crud.workflow_tag_crud import g
import requests
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from agr_literature_service.api.crud.referencefile_crud import get_main_pdf_referencefile_id, download_file
from agr_literature_service.api.crud.workflow_tag_crud import get_ref_ids_with_workflow_status
from agr_literature_service.api.database.config import SQLALCHEMY_DATABASE_URL
from agr_literature_service.api.models import ModModel
from agr_literature_service.api.routers.okta_utils import OktaAccess

def get_refs_to_convert() -> List[str]:
pass


def convert_ref(reference_curie: str):
pass
logger = logging.getLogger(__name__)


def main():
    """Send the main PDF of references in a given workflow status to GROBID.

    For every mod, the references returned by
    ``get_ref_ids_with_workflow_status`` are scanned; the main PDF of a
    reference is downloaded and posted to the PDF-to-TEI (GROBID) endpoint
    configured through the ``PDF2TEI_API_URL`` environment variable.

    :return: the GROBID response body for the first reference that has a main
        PDF when the service answers with status 200, otherwise None (also
        None when there is nothing to convert).
    """
    engine = create_engine(SQLALCHEMY_DATABASE_URL, connect_args={"options": "-c timezone=utc"})
    new_session = sessionmaker(bind=engine, autoflush=True)
    db = new_session()
    # The variable can be defined but empty (e.g. passed through from an empty
    # .env entry); `or` makes the default apply in that case as well, which
    # os.environ.get(key, default) alone does not.
    grobid_api_url = (os.environ.get("PDF2TEI_API_URL")
                      or "https://grobid.alliancegenome.org/api/processFulltextDocument")
    try:
        mod_abbreviations = [mod.abbreviation for mod in db.query(ModModel.abbreviation).all()]
        for mod_abbreviation in mod_abbreviations:
            # NOTE(review): workflow_atp_id is still an empty placeholder; the
            # ATP id of the "needs conversion" status has to be filled in.
            for ref_id in get_ref_ids_with_workflow_status(db=db, workflow_atp_id="",
                                                           mod_abbreviation=mod_abbreviation):
                ref_file_id_to_convert = get_main_pdf_referencefile_id(db=db, curie_or_reference_id=ref_id,
                                                                       mod_abbreviation=mod_abbreviation)
                if ref_file_id_to_convert is None:
                    # No main PDF for this reference: there is nothing to
                    # download, so move on instead of passing None along.
                    logger.warning(f"No main PDF found for reference {ref_id}, skipping.")
                    continue
                file_content = download_file(db=db, referencefile_id=ref_file_id_to_convert,
                                             mod_access=OktaAccess.ALL_ACCESS, use_in_api=False)

                # Send the file content to the GROBID API
                response = requests.post(grobid_api_url, files={'input': ("file", file_content)})

                # NOTE(review): both branches return, so only the first
                # convertible reference is ever processed -- confirm whether
                # the loop is meant to continue and store each result.
                if response.status_code == 200:
                    logger.info("File successfully processed by GROBID.")
                    return response.content
                logger.error(f"Failed to process file. Status code: {response.status_code}")
                return None
    finally:
        # Release the connection on every exit path (return or exception).
        db.close()


if __name__ == '__main__':
Expand Down
1 change: 1 addition & 0 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ services:
DEBEZIUM_KSQLDB_HOST: "${DEBEZIUM_KSQLDB_HOST}"
DEBEZIUM_KSQLDB_PORT: "${DEBEZIUM_KSQLDB_PORT}"
TZ: "UTC"
PDF2TEI_API_URL: "${PDF2TEI_API_URL}"
volumes:
- "${LOG_PATH}:/var/log/automated_scripts"
- /etc/localtime:/etc/localtime:ro
Expand Down

0 comments on commit bb78d30

Please sign in to comment.