Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ability to group collections of image bounding boxes #386

Merged
merged 15 commits into from
Feb 27, 2025
Merged
23 changes: 20 additions & 3 deletions client/src/nv_ingest_client/primitives/tasks/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@
from typing import Optional
from typing import get_args

from pydantic import BaseModel, field_validator, model_validator
from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic import field_validator
from pydantic import model_validator

from .task_base import Task

Expand Down Expand Up @@ -86,12 +89,15 @@
"pptx": get_args(_Type_Extract_Tables_Method_PPTX),
}

# Closed set of values accepted for the `extract_images_method` option.
_Type_Extract_Images_Method = Literal["simple", "group"]


class ExtractTaskSchema(BaseModel):
document_type: str
extract_method: str = None # Initially allow None to set a smart default
extract_text: bool = True
extract_images: bool = True
extract_images_method: str = "group"
extract_tables: bool = True
extract_tables_method: str = "yolox"
extract_charts: Optional[bool] = None # Initially allow None to set a smart default
Expand Down Expand Up @@ -152,8 +158,15 @@ def extract_tables_method_must_be_valid(cls, v, values, **kwargs):
raise ValueError(f"extract_method must be one of {valid_methods}")
return v

class Config:
extra = "forbid"
@field_validator("extract_images_method")
def extract_images_method_must_be_valid(cls, v):
    """Validate and normalize the ``extract_images_method`` field.

    The comparison is case-insensitive; the accepted value is returned
    lower-cased so downstream code only ever sees the canonical spelling.

    Parameters
    ----------
    v : str
        The raw value supplied for ``extract_images_method``.

    Returns
    -------
    str
        The lower-cased method name.

    Raises
    ------
    ValueError
        If the value is not one of the methods listed in
        ``_Type_Extract_Images_Method``.
    """
    # A Literal alias is not iterable; get_args() yields its allowed values.
    valid_methods = get_args(_Type_Extract_Images_Method)
    normalized = v.lower()
    if normalized not in valid_methods:
        raise ValueError(
            f"Unsupported extract_images_method '{v}'. Supported methods are: {', '.join(valid_methods)}"
        )
    return normalized

# Reject any input field that is not declared on this schema.
model_config = ConfigDict(extra="forbid")


class ExtractTask(Task):
Expand All @@ -169,6 +182,7 @@ def __init__(
extract_images: bool = False,
extract_tables: bool = False,
extract_charts: Optional[bool] = None,
extract_images_method: _Type_Extract_Images_Method = "group",
extract_tables_method: _Type_Extract_Tables_Method_PDF = "yolox",
text_depth: str = "document",
paddle_output_format: str = "pseudo_markdown",
Expand All @@ -182,6 +196,7 @@ def __init__(
self._extract_images = extract_images
self._extract_method = extract_method
self._extract_tables = extract_tables
self._extract_images_method = extract_images_method
self._extract_tables_method = extract_tables_method
# `extract_charts` is initially set to None for backward compatibility.
# {extract_tables: true, extract_charts: None} or {extract_tables: true, extract_charts: true} enables both
Expand All @@ -204,6 +219,7 @@ def __str__(self) -> str:
info += f" extract images: {self._extract_images}\n"
info += f" extract tables: {self._extract_tables}\n"
info += f" extract charts: {self._extract_charts}\n"
info += f" extract images method: {self._extract_images_method}\n"
info += f" extract tables method: {self._extract_tables_method}\n"
info += f" text depth: {self._text_depth}\n"
info += f" paddle_output_format: {self._paddle_output_format}\n"
Expand All @@ -217,6 +233,7 @@ def to_dict(self) -> Dict:
"extract_text": self._extract_text,
"extract_images": self._extract_images,
"extract_tables": self._extract_tables,
"extract_images_method": self._extract_images_method,
"extract_tables_method": self._extract_tables_method,
"extract_charts": self._extract_charts,
"text_depth": self._text_depth,
Expand Down
Binary file added data/test-page-form.pdf
Binary file not shown.
Binary file added data/test-shapes.pdf
Binary file not shown.
62 changes: 29 additions & 33 deletions src/nv_ingest/extraction_workflows/pdf/pdfium_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,24 +26,23 @@

import numpy as np
import pypdfium2 as libpdfium
import nv_ingest.util.nim.yolox as yolox_utils

import nv_ingest.util.nim.yolox as yolox_utils
from nv_ingest.schemas.metadata_schema import AccessLevelEnum
from nv_ingest.schemas.metadata_schema import TableFormatEnum
from nv_ingest.schemas.metadata_schema import TextTypeEnum
from nv_ingest.schemas.pdf_extractor_schema import PDFiumConfigSchema
from nv_ingest.util.image_processing.transforms import crop_image
from nv_ingest.util.image_processing.transforms import numpy_to_base64
from nv_ingest.util.nim.helpers import create_inference_client
from nv_ingest.util.pdf.metadata_aggregators import Base64Image
from nv_ingest.util.pdf.metadata_aggregators import CroppedImageWithContent
from nv_ingest.util.pdf.metadata_aggregators import construct_image_metadata_from_pdf_image
from nv_ingest.util.pdf.metadata_aggregators import construct_table_and_chart_metadata
from nv_ingest.util.pdf.metadata_aggregators import construct_text_metadata
from nv_ingest.util.pdf.metadata_aggregators import extract_pdf_metadata
from nv_ingest.util.pdf.pdfium import PDFIUM_PAGEOBJ_MAPPING
from nv_ingest.util.pdf.pdfium import pdfium_pages_to_numpy
from nv_ingest.util.pdf.pdfium import pdfium_try_get_bitmap_as_numpy
from nv_ingest.util.pdf.pdfium import extract_nested_simple_images_from_pdfium_page
from nv_ingest.util.pdf.pdfium import extract_image_like_objects_from_pdfium_page

YOLOX_MAX_BATCH_SIZE = 8
YOLOX_MAX_WIDTH = 1536
Expand Down Expand Up @@ -174,6 +173,8 @@ def extract_table_and_chart_images(
h1, w1, h2, w2 = bbox * np.array([height, width, height, width])

cropped = crop_image(original_image, (h1, w1, h2, w2))
if cropped is None:
continue
base64_img = numpy_to_base64(cropped)

table_data = CroppedImageWithContent(
Expand All @@ -197,6 +198,7 @@ def _extract_page_text(page) -> str:


def _extract_page_images(
extract_images_method: str,
page,
page_idx: int,
page_width: float,
Expand All @@ -209,36 +211,26 @@ def _extract_page_images(
Always extract images from the given page and return a list of image metadata items.
The caller decides whether to call this based on a flag.
"""
extracted_images = []
for obj in page.get_objects():
obj_type = PDFIUM_PAGEOBJ_MAPPING.get(obj.type, "UNKNOWN")
if obj_type == "IMAGE":
try:
image_numpy = pdfium_try_get_bitmap_as_numpy(obj)
image_base64 = numpy_to_base64(image_numpy)
image_bbox = obj.get_pos()
image_size = obj.get_size()

image_data = Base64Image(
image=image_base64,
bbox=image_bbox,
width=image_size[0],
height=image_size[1],
max_width=page_width,
max_height=page_height,
)
if extract_images_method == "simple":
extracted_image_data = extract_nested_simple_images_from_pdfium_page(page)

image_meta = construct_image_metadata_from_pdf_image(
image_data,
page_idx,
page_count,
source_metadata,
base_unified_metadata,
)
extracted_images.append(image_meta)
except Exception as e:
logger.error(f"Unhandled error extracting image on page {page_idx}: {e}")
# continue extracting other images
elif extract_images_method == "group":
extracted_image_data = extract_image_like_objects_from_pdfium_page(page, merge=True)

extracted_images = []
for image_data in extracted_image_data:
try:
image_meta = construct_image_metadata_from_pdf_image(
image_data,
page_idx,
page_count,
source_metadata,
base_unified_metadata,
)
extracted_images.append(image_meta)
except Exception as e:
logger.error(f"Unhandled error extracting image on page {page_idx}: {e}")
# continue extracting other images

return extracted_images

Expand Down Expand Up @@ -298,6 +290,7 @@ def pdfium_extractor(

paddle_output_format = kwargs.get("paddle_output_format", "pseudo_markdown")
paddle_output_format = TableFormatEnum[paddle_output_format.upper()]
extract_images_method = kwargs.get("extract_images_method", "group")

# Basic config
metadata_col = kwargs.get("metadata_column", "metadata")
Expand Down Expand Up @@ -377,6 +370,7 @@ def pdfium_extractor(
# If we want images, extract images now.
if extract_images:
image_data = _extract_page_images(
extract_images_method,
page,
page_idx,
page_width,
Expand Down Expand Up @@ -445,5 +439,7 @@ def pdfium_extractor(
)
extracted_data.append(doc_text_meta)

doc.close()

logger.debug(f"Extracted {len(extracted_data)} items from PDF.")
return extracted_data
1 change: 1 addition & 0 deletions src/nv_ingest/util/pdf/metadata_aggregators.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,7 @@ def construct_image_metadata_from_pdf_image(
"image_location": pdf_image.bbox,
"image_location_max_dimensions": (max(pdf_image.max_width, 0), max(pdf_image.max_height, 0)),
"height": pdf_image.height,
"width": pdf_image.width,
}

# Update the unified metadata with the extracted image information
Expand Down
Loading
Loading