Enforce minimum dimensions for paddle (#99)

NVIDIA · Sep 27, 2024 · 064add5 · 064add5
1 parent 2fea03c
commit 064add5
Show file tree

Hide file tree

Showing 2 changed files with 63 additions and 43 deletions.
diff --git a/src/nv_ingest/extraction_workflows/pdf/pdfium_helper.py b/src/nv_ingest/extraction_workflows/pdf/pdfium_helper.py
@@ -2,27 +2,39 @@
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
-import io
 import logging
-from math import ceil
-from math import floor
 from math import log
 from typing import List
 from typing import Tuple
 
 import numpy as np
 import pypdfium2 as libpdfium
 import tritonclient.grpc as grpcclient
-from PIL import Image
 
 from nv_ingest.extraction_workflows.pdf import yolox_utils
 from nv_ingest.schemas.metadata_schema import AccessLevelEnum
 from nv_ingest.schemas.metadata_schema import TextTypeEnum
 from nv_ingest.schemas.pdf_extractor_schema import PDFiumConfigSchema
-from nv_ingest.util.converters import bytetools
 from nv_ingest.util.image_processing.table_and_chart import join_cached_and_deplot_output
+from nv_ingest.util.image_processing.transforms import crop_image
 from nv_ingest.util.image_processing.transforms import numpy_to_base64
+from nv_ingest.util.nim.helpers import call_image_inference_model
+from nv_ingest.util.nim.helpers import create_inference_client
+from nv_ingest.util.nim.helpers import perform_model_inference
 from nv_ingest.util.pdf.metadata_aggregators import Base64Image
 from nv_ingest.util.pdf.metadata_aggregators import ImageChart
 from nv_ingest.util.pdf.metadata_aggregators import ImageTable
@@ -33,37 +45,31 @@
 from nv_ingest.util.pdf.pdfium import PDFIUM_PAGEOBJ_MAPPING
 from nv_ingest.util.pdf.pdfium import pdfium_pages_to_numpy
 from nv_ingest.util.pdf.pdfium import pdfium_try_get_bitmap_as_numpy
-from nv_ingest.util.nim.helpers import call_image_inference_model
-from nv_ingest.util.nim.helpers import create_inference_client
-from nv_ingest.util.nim.helpers import perform_model_inference
 
-# Copyright (c) 2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 
+PADDLE_MIN_WIDTH = 32
+PADDLE_MIN_HEIGHT = 32
+YOLOX_MAX_BATCH_SIZE = 8
+YOLOX_MAX_WIDTH = 1536
+YOLOX_MAX_HEIGHT = 1536
+YOLOX_NUM_CLASSES = 3
+YOLOX_CONF_THRESHOLD = 0.01
+YOLOX_IOU_THRESHOLD = 0.5
+YOLOX_MIN_SCORE = 0.1
+YOLOX_FINAL_SCORE = 0.48
 
 logger = logging.getLogger(__name__)
 
 
 def extract_tables_and_charts_using_image_ensemble(
     pages: List[libpdfium.PdfPage],
     config: PDFiumConfigSchema,
-    max_batch_size: int = 8,
-    num_classes: int = 3,
-    conf_thresh: float = 0.01,
-    iou_thresh: float = 0.5,
-    min_score: float = 0.1,
-    final_thresh: float = 0.48,
+    max_batch_size: int = YOLOX_MAX_BATCH_SIZE,
+    num_classes: int = YOLOX_NUM_CLASSES,
+    conf_thresh: float = YOLOX_CONF_THRESHOLD,
+    iou_thresh: float = YOLOX_IOU_THRESHOLD,
+    min_score: float = YOLOX_MIN_SCORE,
+    final_thresh: float = YOLOX_FINAL_SCORE,
 ) -> List[Tuple[int, ImageTable]]:
     """
     Extract tables and charts from a series of document pages using an ensemble of image-based models.
@@ -140,7 +146,7 @@ def extract_tables_and_charts_using_image_ensemble(
 
         page_idx = 0
         for batch in batches:
-            original_images, _ = pdfium_pages_to_numpy(batch, scale_tuple=(1536, 1536))
+            original_images, _ = pdfium_pages_to_numpy(batch, scale_tuple=(YOLOX_MAX_WIDTH, YOLOX_MAX_HEIGHT))
 
             original_image_shapes = [image.shape for image in original_images]
             input_array = prepare_images_for_inference(original_images)
@@ -269,15 +275,15 @@ def process_inference_results(
 
     for annotation_dict in annotation_dicts:
         new_dict = {}
-        if 'table' in annotation_dict:
-            new_dict['table'] = [bb for bb in annotation_dict["table"] if bb[4] >= final_thresh]
-        if 'chart' in annotation_dict:
-            new_dict['chart'] = [bb for bb in annotation_dict["chart"] if bb[4] >= final_thresh]
-        if 'title' in annotation_dict:
-            new_dict['title'] = annotation_dict["title"]
+        if "table" in annotation_dict:
+            new_dict["table"] = [bb for bb in annotation_dict["table"] if bb[4] >= final_thresh]
+        if "chart" in annotation_dict:
+            new_dict["chart"] = [bb for bb in annotation_dict["chart"] if bb[4] >= final_thresh]
+        if "title" in annotation_dict:
+            new_dict["title"] = annotation_dict["title"]
         inference_results.append(new_dict)
 
-    return inference_results 
+    return inference_results
 
 
 # Handle individual table/chart extraction and model inference
@@ -328,18 +334,24 @@ def handle_table_chart_extraction(
         for idx, bboxes in enumerate(objects):
             *bbox, _ = bboxes
             h1, w1, h2, w2 = bbox * np.array([height, width, height, width])
-            cropped = original_image[floor(w1) : ceil(w2), floor(h1) : ceil(h2)]  # noqa: E203
-
-            img = Image.fromarray(cropped.astype(np.uint8))
-            with io.BytesIO() as buffer:
-                img.save(buffer, format="PNG")
-                base64_img = bytetools.base64frombytes(buffer.getvalue())
 
             if label == "table":
+                # PaddleOCR NIM enforces minimum dimensions for TRT engines.
+                cropped = crop_image(
+                    original_image,
+                    (h1, w1, h2, w2),
+                    min_width=PADDLE_MIN_WIDTH,
+                    min_height=PADDLE_MIN_HEIGHT,
+                )
+                base64_img = numpy_to_base64(cropped)
+
                 table_content = call_image_inference_model(paddle_client, "paddle", cropped)
                 table_data = ImageTable(table_content, base64_img, (w1, h1, w2, h2))
                 tables_and_charts.append((page_idx, table_data))
             elif label == "chart":
+                cropped = crop_image(original_image, (h1, w1, h2, w2))
+                base64_img = numpy_to_base64(cropped)
+
                 deplot_result = call_image_inference_model(deplot_client, "google/deplot", cropped)
                 cached_result = call_image_inference_model(cached_client, "cached", cropped)
                 chart_content = join_cached_and_deplot_output(cached_result, deplot_result)

diff --git a/src/nv_ingest/util/image_processing/transforms.py b/src/nv_ingest/util/image_processing/transforms.py
@@ -74,7 +74,9 @@ def pad_image(
     return canvas, (pad_width, pad_height)
 
 
-def crop_image(array: np.array, bbox: Tuple[int, int, int, int]) -> Optional[np.ndarray]:
+def crop_image(
+    array: np.array, bbox: Tuple[int, int, int, int], min_width: int = 1, min_height: int = 1
+) -> Optional[np.ndarray]:
     """
     Crops a NumPy array representing an image according to the specified bounding box.
 
@@ -84,6 +86,12 @@ def crop_image(array: np.array, bbox: Tuple[int, int, int, int]) -> Optional[np.
         The image as a NumPy array.
     bbox : Tuple[int, int, int, int]
         The bounding box to crop the image to, given as (w1, h1, w2, h2).
+    min_width : int, optional
+        The minimum allowable width for the cropped image. If the cropped width is smaller than this value,
+        the function returns None. Default is 1.
+    min_height : int, optional
+        The minimum allowable height for the cropped image. If the cropped height is smaller than this value,
+        the function returns None. Default is 1.
 
     Returns
     -------
@@ -96,7 +104,7 @@ def crop_image(array: np.array, bbox: Tuple[int, int, int, int]) -> Optional[np.
     w1 = max(floor(w1), 0)
     w2 = min(ceil(w2), array.shape[1])
 
-    if (w2 - w1 <= 0) or (h2 - h1 <= 0):
+    if (w2 - w1 < min_width) or (h2 - h1 < min_height):
         return None
 
     # Crop the image using the bounding box