
Commit

Add extraction support for png, jpeg, tiff, and svg; and VLM captioning stage (#217)
drobison00 authored Nov 12, 2024
1 parent cefb7ef commit 081f787
Showing 41 changed files with 2,743 additions and 591 deletions.
1 change: 1 addition & 0 deletions Dockerfile
@@ -49,6 +49,7 @@ RUN source activate nv_ingest \
&& mamba install -y \
nvidia/label/dev::morpheus-core \
nvidia/label/dev::morpheus-llm \
imagemagick \
# pin to earlier version of cuda-python until __pyx_capi__ fix is upstreamed.
cuda-python=12.6.0 \
-c rapidsai -c pytorch -c nvidia -c conda-forge
72 changes: 38 additions & 34 deletions client/src/nv_ingest_client/nv_ingest_cli.py
@@ -116,57 +116,61 @@
\b
Tasks and Options:
- split: Divides documents according to specified criteria.
- caption: Attempts to extract captions for unstructured images extracted from documents.
Options:
- split_by (str): Criteria ('page', 'size', 'word', 'sentence'). No default.
- split_length (int): Segment length. No default.
- split_overlap (int): Segment overlap. No default.
- max_character_length (int): Maximum segment character count. No default.
- sentence_window_size (int): Sentence window size. No default.
- api_key (str): API key for captioning service.
Default: os.environ('NVIDIA_BUILD_API_KEY').
- endpoint_url (str): Endpoint URL for captioning service.
Default: 'https://build.nvidia.com/meta/llama-3.2-90b-vision-instruct'.
- prompt (str): Prompt for captioning service.
Default: 'Caption the content of this image:'.
\b
- dedup: Identifies and optionally filters duplicate images in extraction.
Options:
- content_type (str): Content type to deduplicate ('image').
- filter (bool): When set to True, duplicates will be filtered, otherwise, an info message will be added.
\b
- embed: Computes embeddings on multimodal extractions.
Options:
- filter_errors (bool): Flag to filter embedding errors. Optional.
- tables (bool): Flag to create embeddings for table extractions. Optional.
- text (bool): Flag to create embeddings for text extractions. Optional.
\b
- extract: Extracts content from documents, customizable per document type.
Can be specified multiple times for different 'document_type' values.
Options:
- document_type (str): Document format ('pdf', 'docx', 'pptx', 'html', 'xml', 'excel', 'csv', 'parquet'). Required.
- extract_method (str): Extraction technique. Defaults are smartly chosen based on 'document_type'.
- extract_text (bool): Enables text extraction. Default: False.
- extract_charts (bool): Enables chart extraction. Default: False.
- extract_images (bool): Enables image extraction. Default: False.
- extract_method (str): Extraction technique. Defaults are smartly chosen based on 'document_type'.
- extract_tables (bool): Enables table extraction. Default: False.
- extract_charts (bool): Enables chart extraction. Default: False.
- text_depth (str): Text extraction granularity ('document', 'page'). Default: 'document'.
- extract_text (bool): Enables text extraction. Default: False.
- text_depth (str): Text extraction granularity ('document', 'page'). Default: 'document'.
Note: this affects the granularity of text extraction and the associated metadata; i.e., 'page' extracts
text per page and yields page-level metadata, while 'document' extracts text for the entire document, so
elements like page numbers will not be associated with individual text elements.
\b
- store: Stores any images extracted from documents.
Options:
- structured (bool): Flag to write extracted charts and tables to object store.
- images (bool): Flag to write extracted images to object store.
- store_method (str): Storage type ('minio', ). Required.
\b
- caption: Attempts to extract captions for images extracted from documents. Note: this is not generative, but rather a
simple extraction.
Options:
N/A
\b
- dedup: Identifies and optionally filters duplicate images in extraction.
- filter: Identifies and optionally filters images above or below scale thresholds.
Options:
- content_type (str): Content type to deduplicate ('image')
- filter (bool): When set to True, duplicates will be filtered, otherwise, an info message will be added.
- content_type (str): Content type to filter ('image').
- filter (bool): When set to True, filtered images will be excluded; otherwise, an info message will be added.
- max_aspect_ratio (Union[float, int]): Maximum allowable aspect ratio of extracted image.
- min_aspect_ratio (Union[float, int]): Minimum allowable aspect ratio of extracted image.
- min_size (int): Minimum allowable size of extracted image.
\b
- filter: Identifies and optionally filters images above or below scale thresholds.
- split: Divides documents according to specified criteria.
Options:
- content_type (str): Content type to deduplicate ('image')
- min_size: (Union[float, int]): Minimum allowable size of extracted image.
- max_aspect_ratio: (Union[float, int]): Maximum allowable aspect ratio of extracted image.
- min_aspect_ratio: (Union[float, int]): Minimum allowable aspect ratio of extracted image.
- filter (bool): When set to True, duplicates will be filtered, otherwise, an info message will be added.
- max_character_length (int): Maximum segment character count. No default.
- sentence_window_size (int): Sentence window size. No default.
- split_by (str): Criteria ('page', 'size', 'word', 'sentence'). No default.
- split_length (int): Segment length. No default.
- split_overlap (int): Segment overlap. No default.
\b
- embed: Computes embeddings on multimodal extractions.
- store: Stores any images extracted from documents.
Options:
- text (bool): Flag to create embeddings for text extractions. Optional.
- tables (bool): Flag to create embeddings for table extractions. Optional.
- filter_errors (bool): Flag to filter embedding errors. Optional.
- images (bool): Flag to write extracted images to object store.
- structured (bool): Flag to write extracted charts and tables to object store.
- store_method (str): Storage type ('minio', ). Required.
\b
- vdb_upload: Uploads extraction embeddings to vector database.
\b
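The reordered docstring above documents the task options exposed by the CLI, including the new VLM caption options. A minimal sketch of building the corresponding task specifications in Python, assuming the `task_name:{json options}` encoding for `--task` arguments (the option names come from the docstring above; the exact flag syntax is not shown in this hunk):

```python
import json

# Hypothetical helper: render a task spec in the "name:{json}" form the CLI's
# --task flag is assumed to accept. Option names mirror the docstring above.
def task_arg(name: str, options: dict) -> str:
    return f"{name}:{json.dumps(options)}"

extract_spec = task_arg("extract", {
    "document_type": "png",      # image types added in this commit: png, jpeg, tiff, svg
    "extract_text": True,
    "extract_images": True,
    "extract_charts": True,
})

caption_spec = task_arg("caption", {
    "endpoint_url": "https://build.nvidia.com/meta/llama-3.2-90b-vision-instruct",
    "prompt": "Caption the content of this image:",
    # api_key is omitted here; per the docstring it falls back to NVIDIA_BUILD_API_KEY.
})

print(extract_spec)
print(caption_spec)
```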
37 changes: 32 additions & 5 deletions client/src/nv_ingest_client/primitives/tasks/caption.py
@@ -7,7 +7,7 @@
# pylint: disable=too-many-arguments

import logging
from typing import Dict
from typing import Dict, Optional

from pydantic import BaseModel

@@ -17,29 +17,56 @@


class CaptionTaskSchema(BaseModel):
api_key: Optional[str] = None
endpoint_url: Optional[str] = None
prompt: Optional[str] = None

class Config:
extra = "forbid"


class CaptionTask(Task):
def __init__(
self,
self,
api_key: str = None,
endpoint_url: str = None,
prompt: str = None,
) -> None:
super().__init__()

self._api_key = api_key
self._endpoint_url = endpoint_url
self._prompt = prompt

def __str__(self) -> str:
"""
Returns a string with the object's config and run time state
"""
info = ""
info += "Image Caption Task:\n"

if (self._api_key):
info += f" api_key: [redacted]\n"
if (self._endpoint_url):
info += f" endpoint_url: {self._endpoint_url}\n"
if (self._prompt):
info += f" prompt: {self._prompt}\n"

return info

def to_dict(self) -> Dict:
"""
Convert to a dict for submission to redis
"""
task_properties = {
"content_type": "image",
}
task_properties = {}

if (self._api_key):
task_properties["api_key"] = self._api_key

if (self._endpoint_url):
task_properties["endpoint_url"] = self._endpoint_url

if (self._prompt):
task_properties["prompt"] = self._prompt

return {"type": "caption", "task_properties": task_properties}
38 changes: 26 additions & 12 deletions client/src/nv_ingest_client/primitives/tasks/extract.py
@@ -34,34 +34,46 @@
ADOBE_CLIENT_SECRET = os.environ.get("ADOBE_CLIENT_SECRET", None)

_DEFAULT_EXTRACTOR_MAP = {
"pdf": "pdfium",
"csv": "pandas",
"docx": "python_docx",
"pptx": "python_pptx",
"html": "beautifulsoup",
"xml": "lxml",
"excel": "openpyxl",
"csv": "pandas",
"html": "beautifulsoup",
"jpeg": "image",
"jpg": "image",
"parquet": "pandas",
"pdf": "pdfium",
"png": "image",
"pptx": "python_pptx",
"svg": "image",
"tiff": "image",
"xml": "lxml",
}

_Type_Extract_Method_PDF = Literal[
"pdfium",
"adobe",
"doughnut",
"haystack",
"llama_parse",
"pdfium",
"tika",
"unstructured_io",
"llama_parse",
"adobe",
]

_Type_Extract_Method_DOCX = Literal["python_docx", "haystack", "unstructured_local", "unstructured_service"]

_Type_Extract_Method_PPTX = Literal["python_pptx", "haystack", "unstructured_local", "unstructured_service"]

_Type_Extract_Method_Image = Literal["image"]

_Type_Extract_Method_Map = {
"pdf": get_args(_Type_Extract_Method_PDF),
"docx": get_args(_Type_Extract_Method_DOCX),
"jpeg": get_args(_Type_Extract_Method_Image),
"jpg": get_args(_Type_Extract_Method_Image),
"pdf": get_args(_Type_Extract_Method_PDF),
"png": get_args(_Type_Extract_Method_Image),
"pptx": get_args(_Type_Extract_Method_PPTX),
"svg": get_args(_Type_Extract_Method_Image),
"tiff": get_args(_Type_Extract_Method_Image),
}

_Type_Extract_Tables_Method_PDF = Literal["yolox", "pdfium"]
@@ -77,12 +89,14 @@
}




class ExtractTaskSchema(BaseModel):
document_type: str
extract_method: str = None # Initially allow None to set a smart default
extract_text: bool = (True,)
extract_images: bool = (True,)
extract_tables: bool = False
extract_text: bool = True
extract_images: bool = True
extract_tables: bool = True
extract_tables_method: str = "yolox"
extract_charts: Optional[bool] = None # Initially allow None to set a smart default
text_depth: str = "document"
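The two maps above drive the "smart default" resolution of extract_method for the new image document types. A self-contained sketch of that lookup, with the helper name being hypothetical and the maps trimmed to the entries relevant to this commit:

```python
from typing import Optional

# Trimmed copies of the maps from extract.py, limited to the image formats
# added in this commit plus pdf for contrast.
_DEFAULT_EXTRACTOR_MAP = {"jpeg": "image", "jpg": "image", "png": "image",
                          "svg": "image", "tiff": "image", "pdf": "pdfium"}
_VALID_METHODS = {"jpeg": ("image",), "jpg": ("image",), "png": ("image",),
                  "svg": ("image",), "tiff": ("image",),
                  "pdf": ("adobe", "doughnut", "haystack", "llama_parse",
                          "pdfium", "tika", "unstructured_io")}

def resolve_extract_method(document_type: str, requested: Optional[str] = None) -> str:
    """Hypothetical helper mirroring ExtractTaskSchema's behavior: fall back to the
    per-type default when no method is given, and reject unsupported methods."""
    method = requested or _DEFAULT_EXTRACTOR_MAP[document_type]
    if method not in _VALID_METHODS[document_type]:
        raise ValueError(f"{method!r} is not a valid extraction method for {document_type!r}")
    return method

print(resolve_extract_method("png"))            # -> "image"
print(resolve_extract_method("pdf", "pdfium"))  # -> "pdfium"
```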
32 changes: 17 additions & 15 deletions client/src/nv_ingest_client/util/file_processing/extract.py
@@ -21,16 +21,17 @@

# Enums
class DocumentTypeEnum(str, Enum):
pdf = "pdf"
txt = "text"
bmp = "bmp"
docx = "docx"
pptx = "pptx"
html = "html"
jpeg = "jpeg"
bmp = "bmp"
md = "md"
pdf = "pdf"
png = "png"
pptx = "pptx"
svg = "svg"
html = "html"
md = "md"
tiff = "tiff"
txt = "text"


# Maps MIME types to DocumentTypeEnum
@@ -49,19 +50,20 @@ class DocumentTypeEnum(str, Enum):

# Maps file extensions to DocumentTypeEnum
EXTENSION_TO_DOCUMENT_TYPE = {
"pdf": DocumentTypeEnum.pdf,
"txt": DocumentTypeEnum.txt,
"docx": DocumentTypeEnum.docx,
"pptx": DocumentTypeEnum.pptx,
"jpg": DocumentTypeEnum.jpeg,
"jpeg": DocumentTypeEnum.jpeg,
"bmp": DocumentTypeEnum.bmp,
"png": DocumentTypeEnum.png,
"svg": DocumentTypeEnum.svg,
"docx": DocumentTypeEnum.docx,
"html": DocumentTypeEnum.html,
"jpeg": DocumentTypeEnum.jpeg,
"jpg": DocumentTypeEnum.jpeg,
"json": DocumentTypeEnum.txt,
"md": DocumentTypeEnum.txt,
"pdf": DocumentTypeEnum.pdf,
"png": DocumentTypeEnum.png,
"pptx": DocumentTypeEnum.pptx,
"sh": DocumentTypeEnum.txt,
"json": DocumentTypeEnum.txt,
"svg": DocumentTypeEnum.svg,
"tiff": DocumentTypeEnum.tiff,
"txt": DocumentTypeEnum.txt,
# Add more as needed
}

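With the enum and extension map reordered and extended, resolving a file's DocumentTypeEnum remains a plain dictionary lookup. A self-contained sketch (the helper name is hypothetical; the mapping is a subset of the table above):

```python
from enum import Enum
from pathlib import Path

class DocumentTypeEnum(str, Enum):
    jpeg = "jpeg"
    pdf = "pdf"
    png = "png"
    svg = "svg"
    tiff = "tiff"
    txt = "text"

# Subset of EXTENSION_TO_DOCUMENT_TYPE from the diff above.
EXTENSION_TO_DOCUMENT_TYPE = {
    "jpeg": DocumentTypeEnum.jpeg,
    "jpg": DocumentTypeEnum.jpeg,
    "pdf": DocumentTypeEnum.pdf,
    "png": DocumentTypeEnum.png,
    "svg": DocumentTypeEnum.svg,
    "tiff": DocumentTypeEnum.tiff,
    "txt": DocumentTypeEnum.txt,
}

def document_type_from_path(path: str) -> DocumentTypeEnum:
    """Hypothetical helper: map a filename extension to its DocumentTypeEnum."""
    ext = Path(path).suffix.lstrip(".").lower()
    try:
        return EXTENSION_TO_DOCUMENT_TYPE[ext]
    except KeyError:
        raise ValueError(f"Unsupported file extension: {ext!r}") from None

print(document_type_from_path("figures/chart.tiff"))  # DocumentTypeEnum.tiff
```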
3 changes: 2 additions & 1 deletion docker-compose.yaml
@@ -117,7 +117,7 @@ services:
runtime: nvidia

nv-ingest-ms-runtime:
image: nvcr.io/ohlfw0olaadg/ea-participants/nv-ingest:24.10
image: nvcr.io/ohlfw0olaadg/ea-participants/nv-ingest:24.10.1
build:
context: ${NV_INGEST_ROOT:-.}
dockerfile: "./Dockerfile"
@@ -157,6 +157,7 @@
- YOLOX_GRPC_ENDPOINT=yolox:8001
- YOLOX_HTTP_ENDPOINT=http://yolox:8000/v1/infer
- YOLOX_INFER_PROTOCOL=grpc
- VLM_CAPTION_ENDPOINT=https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-90b-vision-instruct/chat/completions
healthcheck:
test: curl --fail http://nv-ingest-ms-runtime:7670/v1/health/ready || exit 1
interval: 10s
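The new VLM_CAPTION_ENDPOINT variable points the captioning stage at an OpenAI-style chat/completions URL. A rough sketch of the kind of request such an endpoint accepts; the payload shape and the inline base64 image convention are assumptions, not something shown in this diff:

```python
import base64
import os

import requests

endpoint = os.environ.get(
    "VLM_CAPTION_ENDPOINT",
    "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-90b-vision-instruct/chat/completions",
)
api_key = os.environ["NVIDIA_BUILD_API_KEY"]  # assumed to be set in the environment

with open("extracted_image.png", "rb") as f:  # placeholder path
    image_b64 = base64.b64encode(f.read()).decode()

# Assumed chat/completions payload: the caption prompt plus the image inlined as a data URI.
payload = {
    "model": "meta/llama-3.2-90b-vision-instruct",
    "messages": [{
        "role": "user",
        "content": f'Caption the content of this image: <img src="data:image/png;base64,{image_b64}" />',
    }],
    "max_tokens": 256,
}

resp = requests.post(endpoint, headers={"Authorization": f"Bearer {api_key}"}, json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```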
2 changes: 2 additions & 0 deletions requirements.txt
@@ -1,6 +1,7 @@
aiohttp==3.9.4
charset-normalizer
click
opencv-python
dataclasses
farm-haystack[ocr,inference,pdf,preprocessing,file-conversion]
fastapi==0.109.1
@@ -35,3 +36,4 @@ tabulate
torchvision==0.18.0
unstructured-client==0.23.3
uvicorn==0.24.0-post.1
Wand==0.6.13
3 changes: 3 additions & 0 deletions src/nv_ingest/extraction_workflows/image/__init__.py
@@ -0,0 +1,3 @@
from .image_handlers import image_data_extractor as image

__all__ = ["image"]