
Replace Cached with Yolox-graphic-elements and drop Deplot (#365)
Co-authored-by: Jeremy Dyer <[email protected]>
edknv and jdye64 authored Feb 13, 2025
1 parent b6ba1a0 commit 32a9021
Showing 21 changed files with 931 additions and 728 deletions.
8 changes: 6 additions & 2 deletions README.md
@@ -4,6 +4,10 @@ All rights reserved.
SPDX-License-Identifier: Apache-2.0
-->

> [!NOTE]
> Cached and Deplot are deprecated; docker-compose now points to a beta version of the yolox-graphic-elements container instead. That model and container are slated for full release in March.
> With this change, you should now be able to run nv-ingest on a single 80GB A100 or H100 GPU.
> If you want to continue using the old pipeline with Cached and Deplot, use the [24.12.1 release](https://github.com/NVIDIA/nv-ingest/tree/24.12.1).
## NVIDIA-Ingest: Multi-modal data extraction

@@ -44,8 +44,8 @@ A service that:

| GPU | Family | Memory | # of GPUs (min.) |
| ------ | ------ | ------ | ------ |
| H100 | SXM or PCIe | 80GB | 2 |
| A100 | SXM or PCIe | 80GB | 2 |
| H100 | SXM or PCIe | 80GB | 1 |
| A100 | SXM or PCIe | 80GB | 1 |

### Software

5 changes: 4 additions & 1 deletion client/src/nv_ingest_client/client/client.py
@@ -93,10 +93,13 @@ def __init__(
self._message_client_hostname = message_client_hostname or "localhost"
self._message_client_port = message_client_port or 7670
self._message_counter_id = msg_counter_id or "nv-ingest-message-id"
self._message_client_kwargs = message_client_kwargs or {}

logger.debug("Instantiate NvIngestClient:\n%s", str(self))
self._message_client = message_client_allocator(
host=self._message_client_hostname, port=self._message_client_port
host=self._message_client_hostname,
port=self._message_client_port,
**self._message_client_kwargs,
)

# Initialize the worker pool with the specified size
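For reference, a minimal usage sketch of the new `message_client_kwargs` parameter. The hostname and port mirror the defaults above; the import path and the `timeout` option are assumptions for illustration, not documented API:

```python
from nv_ingest_client.client import NvIngestClient  # import path assumed

client = NvIngestClient(
    message_client_hostname="localhost",
    message_client_port=7670,
    # Extra kwargs are forwarded verbatim to the message client allocator;
    # "timeout" is a hypothetical option shown for illustration only.
    message_client_kwargs={"timeout": 30},
)
```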
@@ -230,7 +230,7 @@ def fetch_message(self, job_id: str, timeout: float = 10) -> ResponseSchema:
except RuntimeError as rte:
raise rte

except requests.HTTPError as err:
except (ConnectionError, requests.HTTPError, requests.exceptions.ConnectionError) as err:
logger.error(f"Error during fetching, retrying... Error: {err}")
self._client = None # Invalidate client to force reconnection
try:
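The broadened except clause above lets `fetch_message` recover from dropped connections as well as HTTP errors. A minimal sketch of the reconnect-and-retry pattern, assuming a `client_factory` callable (the real method keeps this state on `self`):

```python
import requests

def fetch_with_retry(client_factory, job_id: str, max_attempts: int = 3):
    """Illustrative reconnect-on-failure loop; not the verbatim implementation."""
    client = client_factory()
    for _ in range(max_attempts):
        try:
            return client.fetch(job_id)
        except (ConnectionError, requests.HTTPError, requests.exceptions.ConnectionError) as err:
            print(f"Error during fetching, retrying... Error: {err}")
            client = client_factory()  # invalidate the client to force reconnection
    raise RuntimeError(f"Failed to fetch {job_id} after {max_attempts} attempts")
```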
56 changes: 12 additions & 44 deletions docker-compose.yaml
@@ -34,12 +34,12 @@ services:
reservations:
devices:
- driver: nvidia
device_ids: ["1"]
device_ids: ["0"]
capabilities: [gpu]
runtime: nvidia

deplot:
image: ${DEPLOT_IMAGE:-nvcr.io/nvidia/nemo-microservices/deplot}:${DEPLOT_TAG:-1.0.0}
yolox-graphic-elements:
image: ${YOLOX_GRAPHIC_ELEMENTS_IMAGE:-nvcr.io/nvidia/nemo-microservices/nemoretriever-graphic-elements-v1}:${YOLOX_GRAPHIC_ELEMENTS_TAG:-1.1}
ports:
- "8003:8000"
- "8004:8001"
@@ -59,28 +59,6 @@ services:
capabilities: [gpu]
runtime: nvidia

cached:
image: ${CACHED_IMAGE:-nvcr.io/nvidia/nemo-microservices/cached}:${CACHED_TAG:-0.2.1}
shm_size: 2gb
ports:
- "8006:8000"
- "8007:8001"
- "8008:8002"
user: root
environment:
- NIM_HTTP_API_PORT=8000
- NIM_TRITON_LOG_VERBOSE=1
- NGC_API_KEY=${NIM_NGC_API_KEY:-${NGC_API_KEY:-ngcapikey}}
- CUDA_VISIBLE_DEVICES=0
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ["1"]
capabilities: [gpu]
runtime: nvidia

paddle:
image: ${PADDLE_IMAGE:-nvcr.io/nvidia/nemo-microservices/paddleocr}:${PADDLE_TAG:-1.0.0}
shm_size: 2gb
@@ -99,13 +77,13 @@
reservations:
devices:
- driver: nvidia
device_ids: ["1"]
device_ids: ["0"]
capabilities: [gpu]
runtime: nvidia

embedding:
# NIM ON
image: ${EMBEDDING_IMAGE:-nvcr.io/nim/nvidia/nv-embedqa-e5-v5}:${EMBEDDING_TAG:-1.1.0}
image: ${EMBEDDING_IMAGE:-nvcr.io/nim/nvidia/llama-3.2-nv-embedqa-1b-v2}:${EMBEDDING_TAG:-1.3.0}
shm_size: 16gb
ports:
- "8012:8000"
@@ -121,7 +99,7 @@
reservations:
devices:
- driver: nvidia
device_ids: ["1"]
device_ids: ["0"]
capabilities: [gpu]
runtime: nvidia

@@ -141,22 +119,9 @@
cap_add:
- sys_nice
environment:
# Self-hosted cached endpoints.
- CACHED_GRPC_ENDPOINT=cached:8001
- CACHED_HTTP_ENDPOINT=http://cached:8000/v1/infer
- CACHED_INFER_PROTOCOL=grpc
# build.nvidia.com hosted cached endpoints.
#- CACHED_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/university-at-buffalo/cached
#- CACHED_INFER_PROTOCOL=http
- CUDA_VISIBLE_DEVICES=0
#- DEPLOT_GRPC_ENDPOINT=""
# Self-hosted deplot endpoints.
- DEPLOT_HTTP_ENDPOINT=http://deplot:8000/v1/chat/completions
# build.nvidia.com hosted deplot
#- DEPLOT_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/vlm/google/deplot
- DEPLOT_INFER_PROTOCOL=http
- DOUGHNUT_GRPC_TRITON=triton-doughnut:8001
- EMBEDDING_NIM_MODEL_NAME=${EMBEDDING_NIM_MODEL_NAME:-nvidia/nv-embedqa-e5-v5}
- EMBEDDING_NIM_MODEL_NAME=${EMBEDDING_NIM_MODEL_NAME:-nvidia/llama-3.2-nv-embedqa-1b-v2}
- INGEST_LOG_LEVEL=DEFAULT
# Message client for development
#- MESSAGE_CLIENT_HOST=0.0.0.0
@@ -187,6 +152,9 @@ services:
# build.nvidia.com hosted yolox endpoints.
#- YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nv-yolox-page-elements-v1
#- YOLOX_INFER_PROTOCOL=http
- YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT=yolox-graphic-elements:8001
- YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=http://yolox-graphic-elements:8000/v1/infer
- YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=grpc
- VLM_CAPTION_ENDPOINT=https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-90b-vision-instruct/chat/completions
- VLM_CAPTION_MODEL_NAME=meta/llama-3.2-90b-vision-instruct
healthcheck:
@@ -199,7 +167,7 @@
reservations:
devices:
- driver: nvidia
device_ids: ["1"]
device_ids: ["0"]
capabilities: [gpu]

otel-collector:
@@ -321,7 +289,7 @@ services:
reservations:
devices:
- driver: nvidia
device_ids: ["1"]
device_ids: ["0"]
capabilities: [gpu]
depends_on:
- "etcd"
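The compose changes above replace the deplot and cached services with a single yolox-graphic-elements NIM and consolidate everything onto GPU 0. A sketch of how a consumer might resolve the new endpoints, using the variable names and defaults from the compose file (the selection logic itself is an assumption):

```python
import os

# Defaults mirror the values wired up in docker-compose.yaml.
grpc_endpoint = os.getenv("YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT", "yolox-graphic-elements:8001")
http_endpoint = os.getenv("YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT", "http://yolox-graphic-elements:8000/v1/infer")
protocol = os.getenv("YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL", "grpc")

# Assumed convention: the protocol flag selects which endpoint is used.
endpoint = grpc_endpoint if protocol == "grpc" else http_endpoint
```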
20 changes: 13 additions & 7 deletions src/nv_ingest/api/v1/health.py
@@ -64,20 +64,26 @@ async def get_ready_state() -> dict:
# We give the users an option to disable checking all distributed services for "readiness"
check_all_components = os.getenv("READY_CHECK_ALL_COMPONENTS", "True").lower()
if check_all_components in ["1", "true", "yes"]:
yolox_ready = is_ready(os.getenv("YOLOX_HTTP_ENDPOINT", None), "/v1/health/ready")
deplot_ready = is_ready(os.getenv("DEPLOT_HTTP_ENDPOINT", None), "/v1/health/ready")
cached_ready = is_ready(os.getenv("CACHED_HTTP_ENDPOINT", None), "/v1/health/ready")
yolox_page_elements_ready = is_ready(os.getenv("YOLOX_HTTP_ENDPOINT", None), "/v1/health/ready")
yolox_graphic_elements_ready = is_ready(
os.getenv("YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT", None), "/v1/health/ready"
)
paddle_ready = is_ready(os.getenv("PADDLE_HTTP_ENDPOINT", None), "/v1/health/ready")

if ingest_ready and morpheus_pipeline_ready and yolox_ready and deplot_ready and cached_ready and paddle_ready:
if (
ingest_ready
and morpheus_pipeline_ready
and yolox_page_elements_ready
and yolox_graphic_elements_ready
and paddle_ready
):
return JSONResponse(content={"ready": True}, status_code=200)
else:
ready_statuses = {
"ingest_ready": ingest_ready,
"morpheus_pipeline_ready": morpheus_pipeline_ready,
"yolox_ready": yolox_ready,
"deplot_ready": deplot_ready,
"cached_ready": cached_ready,
"yolox_page_elemenst_ready": yolox_page_elements_ready,
"yolox_graphic_elements_ready": yolox_graphic_elements_ready,
"paddle_ready": paddle_ready,
}
logger.debug(f"Ready Statuses: {ready_statuses}")
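For context, a hedged sketch of what an `is_ready`-style probe could look like, consistent with how it is called above; the actual helper in nv-ingest may differ, and the URL handling here is an assumption:

```python
from urllib.parse import urlparse

import requests

def is_ready(http_endpoint, ready_endpoint="/v1/health/ready"):
    """Probe a service's readiness route over HTTP (illustrative sketch)."""
    if not http_endpoint:
        return True  # assumption: unset endpoints are skipped rather than failed
    parsed = urlparse(http_endpoint)
    url = f"{parsed.scheme}://{parsed.netloc}{ready_endpoint}"
    try:
        return requests.get(url, timeout=5).status_code == 200
    except requests.RequestException:
        return False
```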
2 changes: 1 addition & 1 deletion src/nv_ingest/extraction_workflows/image/image_handlers.py
@@ -158,7 +158,7 @@ def extract_table_and_chart_images(
objects = annotation_dict[label]
for idx, bboxes in enumerate(objects):
*bbox, _ = bboxes
h1, w1, h2, w2 = np.array(bbox) * np.array([height, width, height, width])
h1, w1, h2, w2 = bbox

base64_img = crop_image(original_image, (int(h1), int(w1), int(h2), int(w2)))

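Dropping the multiplication by `[height, width, height, width]` indicates the yolox service now returns absolute pixel coordinates rather than normalized ones; the same fix appears in `pdfium_helper.py` below. A minimal crop consistent with the `(h1, w1, h2, w2)` ordering used here (the real `crop_image` may differ):

```python
import numpy as np

def crop_image(image: np.ndarray, bbox: tuple) -> np.ndarray:
    """Crop using absolute (h1, w1, h2, w2) pixel coordinates (illustrative)."""
    h1, w1, h2, w2 = bbox
    return image[h1:h2, w1:w2]
```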
6 changes: 3 additions & 3 deletions src/nv_ingest/extraction_workflows/pdf/pdfium_helper.py
@@ -173,15 +173,15 @@ def extract_table_and_chart_images(
objects = annotation_dict[label]
for idx, bboxes in enumerate(objects):
*bbox, _ = bboxes
h1, w1, h2, w2 = bbox * np.array([height, width, height, width])
h1, w1, h2, w2 = bbox

cropped = crop_image(original_image, (h1, w1, h2, w2))
cropped = crop_image(original_image, (int(h1), int(w1), int(h2), int(w2)))
base64_img = numpy_to_base64(cropped)

table_data = CroppedImageWithContent(
content="",
image=base64_img,
bbox=(w1, h1, w2, h2),
bbox=(int(w1), int(h1), int(w2), int(h2)),
max_width=width,
max_height=height,
type_string=label,
20 changes: 6 additions & 14 deletions src/nv_ingest/schemas/chart_extractor_schema.py
@@ -20,12 +20,8 @@ class ChartExtractorConfigSchema(BaseModel):
auth_token : Optional[str], default=None
Authentication token required for secure services.
cached_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
A tuple containing the gRPC and HTTP services for the cached endpoint.
Either the gRPC or HTTP service can be empty, but not both.
deplot_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
A tuple containing the gRPC and HTTP services for the deplot endpoint.
yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
A tuple containing the gRPC and HTTP services for the yolox endpoint.
Either the gRPC or HTTP service can be empty, but not both.
paddle_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
@@ -50,13 +46,9 @@ class ChartExtractorConfigSchema(BaseModel):

auth_token: Optional[str] = None

cached_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
cached_infer_protocol: str = ""

deplot_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
deplot_infer_protocol: str = ""
yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
yolox_infer_protocol: str = ""

## NOTE: Paddle isn't currently called independently of the cached NIM, but will be in the future.
paddle_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
paddle_infer_protocol: str = ""

@@ -94,7 +86,7 @@ def clean_service(service):
return None
return service

for endpoint_name in ["cached_endpoints", "deplot_endpoints", "paddle_endpoints"]:
for endpoint_name in ["yolox_endpoints", "paddle_endpoints"]:
grpc_service, http_service = values.get(endpoint_name, (None, None))
grpc_service = clean_service(grpc_service)
http_service = clean_service(http_service)
@@ -125,7 +117,7 @@ class ChartExtractorSchema(BaseModel):
A flag indicating whether to raise an exception if a failure occurs during chart extraction.
stage_config : Optional[ChartExtractorConfigSchema], default=None
Configuration for the chart extraction stage, including cached, deplot, and paddle service endpoints.
Configuration for the chart extraction stage, including yolox and paddle service endpoints.
"""

max_queue_size: int = 1
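A hedged usage sketch of the updated schema, with field names taken from the diff above; the endpoint values are illustrative, borrowed from the docker-compose defaults:

```python
from nv_ingest.schemas.chart_extractor_schema import ChartExtractorConfigSchema

# Per the validator above, at least one of the gRPC/HTTP services
# must be non-empty for each endpoint pair.
config = ChartExtractorConfigSchema(
    yolox_endpoints=("yolox-graphic-elements:8001", "http://yolox-graphic-elements:8000/v1/infer"),
    yolox_infer_protocol="grpc",
    paddle_endpoints=(None, "http://paddle:8000/v1/infer"),
    paddle_infer_protocol="http",
)
```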
(Diffs for the remaining changed files are not shown.)
