From 882ec56e22084361187b735a8a268c9e73bc7b3d Mon Sep 17 00:00:00 2001 From: Valerio Arnaboldi Date: Fri, 6 Dec 2024 12:25:33 -0800 Subject: [PATCH] check if title is extracted, if not set failure --- agr_literature_service/lit_processing/pdf2tei/pdf2tei.py | 6 +++++- requirements.txt | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/agr_literature_service/lit_processing/pdf2tei/pdf2tei.py b/agr_literature_service/lit_processing/pdf2tei/pdf2tei.py index 0543b05ef..1f7962db5 100644 --- a/agr_literature_service/lit_processing/pdf2tei/pdf2tei.py +++ b/agr_literature_service/lit_processing/pdf2tei/pdf2tei.py @@ -6,6 +6,7 @@ from fastapi import UploadFile from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker +from lxml import etree from agr_literature_service.api.crud.referencefile_crud import get_main_pdf_referencefile_id, download_file, file_upload from agr_literature_service.api.crud.workflow_tag_crud import get_jobs, job_change_atp_code @@ -57,7 +58,10 @@ def main(): "is_annotation": None, "mod_abbreviation": mod_abbreviation } - if response.content == "[NO_BLOCKS] PDF parsing resulted in empty content": + root = etree.fromstring(response.content) # Check for empty elements that indicate failure + title = root.xpath('//tei:title[@level="a"]', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}) + if (response.content == "[NO_BLOCKS] PDF parsing resulted in empty content" or title is None or + title[0].text is None): job_change_atp_code(db, reference_workflow_tag_id, "on_failed") else: file_upload(db=db, metadata=metadata, file=UploadFile(file=BytesIO(response.content), diff --git a/requirements.txt b/requirements.txt index c15415cac..6f2768fbb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -52,3 +52,4 @@ bs4==0.0.1 # No update needed. elasticsearch==7.13.4 # No update needed. retry==0.9.2 # No update needed. cachetools==5.3.1 # Updated for better performance. +lxml==4.9.4 \ No newline at end of file