Skip to content

Commit

Permalink
Check whether a title was extracted from the TEI output; if not, mark the job as failed
Browse files: browse the repository at this point in the history
  • Loading branch information
valearna committed Dec 6, 2024
1 parent b889732 commit 882ec56
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 1 deletion.
6 changes: 5 additions & 1 deletion agr_literature_service/lit_processing/pdf2tei/pdf2tei.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from fastapi import UploadFile
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from lxml import etree

from agr_literature_service.api.crud.referencefile_crud import get_main_pdf_referencefile_id, download_file, file_upload
from agr_literature_service.api.crud.workflow_tag_crud import get_jobs, job_change_atp_code
Expand Down Expand Up @@ -57,7 +58,10 @@ def main():
"is_annotation": None,
"mod_abbreviation": mod_abbreviation
}
if response.content == "[NO_BLOCKS] PDF parsing resulted in empty content":
root = etree.fromstring(response.content) # Check for empty elements that indicate failure
title = root.xpath('//tei:title[@level="a"]', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
if (response.content == "[NO_BLOCKS] PDF parsing resulted in empty content" or title is None or
title[0].text is None):
job_change_atp_code(db, reference_workflow_tag_id, "on_failed")
else:
file_upload(db=db, metadata=metadata, file=UploadFile(file=BytesIO(response.content),
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,4 @@ bs4==0.0.1 # No update needed.
elasticsearch==7.13.4 # No update needed.
retry==0.9.2 # No update needed.
cachetools==5.3.1 # Updated for better performance.
lxml==4.9.4

0 comments on commit 882ec56

Please sign in to comment.