Switch split task to token based splitting (#283)
Co-authored-by: Chris Jarrett <[email protected]>
Co-authored-by: Devin Robison <[email protected]>
3 people authored Feb 25, 2025
1 parent 766f35d commit 6960827
Showing 25 changed files with 431 additions and 471 deletions.
10 changes: 10 additions & 0 deletions Dockerfile
@@ -12,6 +12,8 @@ FROM $BASE_IMG:$BASE_IMG_TAG AS base
ARG RELEASE_TYPE="dev"
ARG VERSION=""
ARG VERSION_REV="0"
ARG DOWNLOAD_LLAMA_TOKENIZER=""
ARG HF_ACCESS_TOKEN=""

# Embed the `git rev-parse HEAD` as a Docker metadata label
# Allows for linking container builds to git commits
@@ -71,6 +73,9 @@ WORKDIR /workspace
# Copy custom entrypoint script
COPY ./docker/scripts/entrypoint.sh /workspace/docker/entrypoint.sh

# Copy post build triggers script
COPY ./docker/scripts/post_build_triggers.py /workspace/docker/post_build_triggers.py

FROM base AS nv_ingest_install
# Copy the module code
COPY setup.py setup.py
@@ -124,6 +129,11 @@ RUN --mount=type=cache,target=/opt/conda/pkgs\
&& pip install ./api/dist/*.whl \
&& pip install ./client/dist/*.whl


RUN --mount=type=cache,target=/root/.cache/pip \
source activate nv_ingest_runtime \
&& python3 /workspace/docker/post_build_triggers.py

RUN rm -rf src

FROM nv_ingest_install AS runtime
9 changes: 9 additions & 0 deletions README.md
@@ -131,6 +131,9 @@ NVIDIA_BUILD_API_KEY=<key to use NIMs that are hosted on build.nvidia.com>
> Before you run the docker compose command, make sure that NVIDIA is set as your default container runtime by running:
> `sudo nvidia-ctk runtime configure --runtime=docker --set-as-default`
> [!NOTE]
> The most accurate tokenizer-based splitting depends on the [llama-3.2 tokenizer](https://huggingface.co/meta-llama/Llama-3.2-1B). To download this model at container build time, you must set `DOWNLOAD_LLAMA_TOKENIZER=True` _and_ supply an authorized HuggingFace access token via `HF_ACCESS_TOKEN=<your access token>`. Otherwise, the ungated [e5-large-unsupervised](https://huggingface.co/intfloat/e5-large-unsupervised) tokenizer model is downloaded instead. By default, the split task uses whichever model has been pre-downloaded (see the sketch after step 5). Refer to [Environment Configuration Variables](docs/docs/user-guide/developer-guide/environment-config.md) for more info.
5. Start all services:
`docker compose --profile retrieval up`
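
To confirm which tokenizer was baked into the image, you can run a quick check inside the container. This is a minimal sketch; the paths below are the ones `docker/scripts/post_build_triggers.py` writes in this commit:

```python
# Sketch: report which tokenizer was pre-downloaded at build time.
# Paths match docker/scripts/post_build_triggers.py.
import os

for path in (
    "/workspace/models/llama-3.2-1b/tokenizer/",
    "/workspace/models/e5-large-unsupervised/tokenizer/",
):
    status = "present" if os.path.isdir(path) else "missing"
    print(f"{path}: {status}")
```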

@@ -422,6 +425,12 @@ https://pypi.org/project/pdfservices-sdk/
required if you want to use the Adobe extraction service for PDF decomposition. Please review the
[license agreement](https://github.com/adobe/pdfservices-python-sdk?tab=License-1-ov-file) for the
pdfservices-sdk before enabling this option.
- **`DOWNLOAD_LLAMA_TOKENIZER` (Built With Llama):**
- **Description**: The Split task uses the `meta-llama/Llama-3.2-1B` tokenizer, which will be downloaded
from HuggingFace at build time if `DOWNLOAD_LLAMA_TOKENIZER` is set to `True`. Please review the
[license agreement](https://huggingface.co/meta-llama/Llama-3.2-1B) for Llama 3.2 materials before using this.
  This is a gated model, so you must [request access](https://huggingface.co/meta-llama/Llama-3.2-1B) and
  set `HF_ACCESS_TOKEN` to your HuggingFace access token in order to use it. A quick access check is sketched below.
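
A quick way to sanity-check the token before a build is to attempt the same download that `docker/scripts/post_build_triggers.py` performs. This is a sketch, not part of the repository:

```python
# Sketch: verify that HF_ACCESS_TOKEN can reach the gated Llama-3.2 repo.
# Mirrors the download performed by docker/scripts/post_build_triggers.py.
import os

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.2-1B", token=os.getenv("HF_ACCESS_TOKEN")
)
print(f"Token accepted; vocab size: {tokenizer.vocab_size}")
```

If the token is missing or lacks access to the gated repository, `from_pretrained` raises an error instead of downloading.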


### Contributing
10 changes: 5 additions & 5 deletions client/client_examples/examples/python_client_usage.ipynb
@@ -465,11 +465,11 @@
" min_aspect_ratio=0.2,\n",
" filter=True,\n",
" ).split(\n",
" split_by=\"word\",\n",
" split_length=300,\n",
" split_overlap=10,\n",
" max_character_length=5000,\n",
" sentence_window_size=0,\n",
" chunk_size=300,\n",
" chunk_overlap=10,\n",
" params={\n",
" \"split_source_types\": [\"PDF\"],\n",
" },\n",
" ).store(\n",
" structured=True,\n",
" images=True,\n",
67 changes: 26 additions & 41 deletions client/src/nv_ingest_client/primitives/tasks/split.py
@@ -8,29 +8,20 @@

import logging
from typing import Dict
from typing import Literal
from typing import Optional

from pydantic import BaseModel, field_validator
from pydantic import BaseModel

from .task_base import Task

logger = logging.getLogger(__name__)


class SplitTaskSchema(BaseModel):
split_by: Optional[str] = "sentence"
split_length: Optional[int] = 10
split_overlap: Optional[int] = 0
max_character_length: Optional[int] = 1024
sentence_window_size: Optional[int] = 0

@field_validator("split_by")
def split_by_must_be_valid(cls, v):
valid_criteria = ["page", "size", "word", "sentence"]
if v not in valid_criteria:
raise ValueError(f"split_by must be one of {valid_criteria}")
return v
tokenizer: Optional[str] = None
chunk_size: int = 1024
chunk_overlap: int = 150
params: dict = {}

class Config:
extra = "forbid"
@@ -41,37 +32,33 @@ class SplitTask(Task):
Object for document splitting task
"""

_TypeSplitBy = Literal["word", "sentence", "passage"]

def __init__(
self,
split_by: _TypeSplitBy = None,
split_length: int = None,
split_overlap: int = None,
max_character_length: int = None,
sentence_window_size: int = None,
tokenizer: str = None,
chunk_size: int = 1024,
chunk_overlap: int = 150,
params: dict = {},
) -> None:
"""
Setup Split Task Config
"""
super().__init__()
self._split_by = split_by
self._split_length = split_length
self._split_overlap = split_overlap
self._max_character_length = max_character_length
self._sentence_window_size = sentence_window_size
self._tokenizer = tokenizer
self._chunk_size = chunk_size
self._chunk_overlap = chunk_overlap
self._params = params

def __str__(self) -> str:
"""
Returns a string with the object's config and run time state
"""
info = ""
info += "Split Task:\n"
info += f" split_by: {self._split_by}\n"
info += f" split_length: {self._split_length}\n"
info += f" split_overlap: {self._split_overlap}\n"
info += f" split_max_character_length: {self._max_character_length}\n"
info += f" split_sentence_window_size: {self._sentence_window_size}\n"
info += f" tokenizer: {self._tokenizer}\n"
info += f" chunk_size: {self._chunk_size}\n"
info += f" chunk_overlap: {self._chunk_overlap}\n"
for key, value in self._params.items():
info += f" {key}: {value}\n"
return info

def to_dict(self) -> Dict:
@@ -80,15 +67,13 @@ def to_dict(self) -> Dict:
"""
split_params = {}

if self._split_by is not None:
split_params["split_by"] = self._split_by
if self._split_length is not None:
split_params["split_length"] = self._split_length
if self._split_overlap is not None:
split_params["split_overlap"] = self._split_overlap
if self._max_character_length is not None:
split_params["max_character_length"] = self._max_character_length
if self._sentence_window_size is not None:
split_params["sentence_window_size"] = self._sentence_window_size
if self._tokenizer is not None:
split_params["tokenizer"] = self._tokenizer
if self._chunk_size is not None:
split_params["chunk_size"] = self._chunk_size
if self._chunk_overlap is not None:
split_params["chunk_overlap"] = self._chunk_overlap
if self._params is not None:
split_params["params"] = self._params

return {"type": "split", "task_properties": split_params}
3 changes: 3 additions & 0 deletions docker-compose.yaml
@@ -180,6 +180,9 @@ services:
context: ${NV_INGEST_ROOT:-.}
dockerfile: "./Dockerfile"
target: runtime
args:
DOWNLOAD_LLAMA_TOKENIZER: ${DOWNLOAD_LLAMA_TOKENIZER:-False}
HF_ACCESS_TOKEN: ${HF_ACCESS_TOKEN:-hfaccesstoken}
volumes:
- ${DATASET_ROOT:-./data}:/workspace/data
ports:
15 changes: 15 additions & 0 deletions docker/scripts/post_build_triggers.py
@@ -0,0 +1,15 @@
import os

from transformers import AutoTokenizer

# Pre-download a tokenizer at image build time so the split task can
# tokenize offline at runtime. Which model is fetched depends on the
# DOWNLOAD_LLAMA_TOKENIZER build argument.
if os.getenv("DOWNLOAD_LLAMA_TOKENIZER") == "True":
    # Gated model: requires an HF_ACCESS_TOKEN authorized for Llama-3.2.
    tokenizer_path = "/workspace/models/llama-3.2-1b/tokenizer/"
    os.makedirs(tokenizer_path)

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", token=os.getenv("HF_ACCESS_TOKEN"))
    tokenizer.save_pretrained(tokenizer_path)
else:
    # Ungated fallback; no access token required.
    tokenizer_path = "/workspace/models/e5-large-unsupervised/tokenizer/"
    os.makedirs(tokenizer_path)

    tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-large-unsupervised")
    tokenizer.save_pretrained(tokenizer_path)
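
To make the switch from word/sentence splitting concrete, this is a sketch of what token-based chunking with the pre-downloaded tokenizer looks like; the sliding-window logic below is illustrative, not the actual `text_splitter` module:

```python
# Sketch: token-based chunking. Illustrative only; not the actual
# text_splitter implementation introduced by this commit.
from transformers import AutoTokenizer


def split_text(text: str, tokenizer_path: str, chunk_size: int = 1024, chunk_overlap: int = 150) -> list[str]:
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    ids = tokenizer.encode(text, add_special_tokens=False)
    step = chunk_size - chunk_overlap
    chunks = []
    for start in range(0, len(ids), step):
        chunks.append(tokenizer.decode(ids[start:start + chunk_size]))
        if start + chunk_size >= len(ids):
            break  # final window already covers the tail
    return chunks


# Example: chunks = split_text(text, "/workspace/models/llama-3.2-1b/tokenizer/", 300, 10)
```

The defaults of 1024 tokens per chunk with 150 tokens of overlap match `SplitTaskSchema` above.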
2 changes: 2 additions & 0 deletions docs/docs/user-guide/developer-guide/environment-config.md
@@ -16,3 +16,5 @@ The following are the environment configuration variables that you can specify i
| `NVIDIA_BUILD_API_KEY` || The key to access NIMs that are hosted on build.nvidia.com instead of a self-hosted NIM. This is necessary only in some cases when it is different from `NGC_API_KEY`. If this is not specified, `NGC_API_KEY` is used for build.nvidia.com. |
| `OTEL_EXPORTER_OTLP_ENDPOINT` | `http://otel-collector:4317` <br/> | The endpoint for the OpenTelemetry exporter, used for sending telemetry data. |
| `REDIS_MORPHEUS_TASK_QUEUE` | `morpheus_task_queue` <br/> | The name of the task queue in Redis where tasks are stored and processed. |
| `DOWNLOAD_LLAMA_TOKENIZER` | `True` <br/> | If `True`, the [llama-3.2 tokenizer](https://huggingface.co/meta-llama/Llama-3.2-1B) is pre-downloaded at build time. Otherwise, the [e5-large-unsupervised](https://huggingface.co/intfloat/e5-large-unsupervised) tokenizer is pre-downloaded instead. Note: setting this to `True` requires a HuggingFace access token with access to the gated Llama-3.2 models. See below for more info. |
| `HF_ACCESS_TOKEN` | - | The HuggingFace access token used to pre-download the Llama-3.2 tokenizer from HuggingFace (see above for more info). Llama 3.2 is a gated model, so you must [request access](https://huggingface.co/meta-llama/Llama-3.2-1B) to the Llama-3.2 models and then set this variable to a token that can access gated repositories on your behalf in order to use `DOWNLOAD_LLAMA_TOKENIZER=True`. |
4 changes: 2 additions & 2 deletions src/nv_ingest/modules/transforms/__init__.py
@@ -2,6 +2,6 @@
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from .nemo_doc_splitter import NemoDocSplitterLoaderFactory
from .text_splitter import TextSplitterLoaderFactory

__all__ = ["NemoDocSplitterLoaderFactory"]
__all__ = ["TextSplitterLoaderFactory"]