From 2afc64b552f5a5b423a07046c4ba83489e11b42b Mon Sep 17 00:00:00 2001 From: Gabo Date: Thu, 18 Jul 2024 14:09:48 +0200 Subject: [PATCH] Return XML using the API container --- docker-compose-test.yml | 2 ++ docker-compose.yml | 2 ++ src/QueueProcessor.py | 3 +-- src/app.py | 8 ++++++++ src/get_xml.py | 14 ++++++++++++++ 5 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 src/get_xml.py diff --git a/docker-compose-test.yml b/docker-compose-test.yml index 5a1c341..8cc029b 100755 --- a/docker-compose-test.yml +++ b/docker-compose-test.yml @@ -50,6 +50,8 @@ services: - "5060:5060" networks: - network-pdf-layout + volumes: + - data:/app/xmls mongo-pdf-layout: container_name: "mongo-pdf-layout" diff --git a/docker-compose.yml b/docker-compose.yml index 3598994..6f4081f 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -51,6 +51,8 @@ services: - driver: nvidia count: 1 capabilities: [ gpu ] + volumes: + - data:/app/xmls mongo-pdf-layout: container_name: "mongo-pdf-layout" diff --git a/src/QueueProcessor.py b/src/QueueProcessor.py index 570c2bb..6703749 100644 --- a/src/QueueProcessor.py +++ b/src/QueueProcessor.py @@ -56,14 +56,13 @@ def process(self, id, message, rc, ts): xml_file_name = get_xml_name(task) extraction_data = extract_segments(task, xml_file_name) service_url = f"{SERVICE_HOST}:{SERVICE_PORT}" - get_xml_url = f"{SERVICE_HOST}:{DOCUMENT_LAYOUT_ANALYSIS_PORT}" extraction_message = ResultMessage( tenant=extraction_data.tenant, task=task.task, params=task.params, success=True, data_url=f"{service_url}/get_paragraphs/{task.tenant}/{task.params.filename}", - file_url=f"{get_xml_url}/get_xml/{xml_file_name}", + file_url=f"{service_url}/get_xml/{xml_file_name}", ) extraction_data_json = extraction_data.model_dump_json() diff --git a/src/app.py b/src/app.py index 9d37935..2043a01 100755 --- a/src/app.py +++ b/src/app.py @@ -8,11 +8,13 @@ from sentry_sdk.integrations.asgi import SentryAsgiMiddleware import sentry_sdk from starlette.concurrency import run_in_threadpool +from starlette.responses import PlainTextResponse from catch_exceptions import catch_exceptions from configuration import MONGO_HOST, MONGO_PORT, service_logger from PdfFile import PdfFile from get_paragraphs import get_paragraphs +from get_xml import get_xml from run import extract_segments_from_file @@ -69,3 +71,9 @@ async def async_extraction(tenant, file: UploadFile = File(...)): @catch_exceptions async def get_paragraphs_endpoint(tenant: str, pdf_file_name: str): return await run_in_threadpool(get_paragraphs, app.mongodb_client, tenant, pdf_file_name) + + +@app.get("/get_xml/{xml_file_name}", response_class=PlainTextResponse) +@catch_exceptions +async def get_xml_by_name(xml_file_name: str): + return await run_in_threadpool(get_xml, xml_file_name) diff --git a/src/get_xml.py b/src/get_xml.py new file mode 100644 index 0000000..f0ca880 --- /dev/null +++ b/src/get_xml.py @@ -0,0 +1,14 @@ +import os +from os.path import join +from pathlib import Path + +from configuration import DATA_PATH + + +def get_xml(xml_file_name: str) -> str: + xml_file_path = Path(join(DATA_PATH, xml_file_name)) + + with open(xml_file_path, mode="r") as file: + content = file.read() + os.remove(xml_file_path) + return content