-
Notifications
You must be signed in to change notification settings - Fork 219
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Introduce a "Convert" endpoint for directly handling HTTP fileupload …
…documents (#313) Co-authored-by: Julio Perez <[email protected]> Co-authored-by: Julio Perez <[email protected]> Co-authored-by: Devin Robison <[email protected]> Co-authored-by: tmonty12 <[email protected]>
- Loading branch information
1 parent
cf8c5f5
commit 0e44b01
Showing
8 changed files
with
406 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary | ||
# | ||
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual | ||
# property and proprietary rights in and to this material, related | ||
# documentation and any modifications thereto. Any use, reproduction, | ||
# disclosure or distribution of this material and related documentation | ||
# without an express license agreement from NVIDIA CORPORATION or | ||
# its affiliates is strictly prohibited. | ||
|
||
from pydantic import BaseModel, ConfigDict | ||
from enum import Enum | ||
|
||
|
||
class ConversionStatus(str, Enum):
    """States a conversion can be in.

    The ``str`` mix-in makes each member compare equal to its plain string
    value (``ConversionStatus.SUCCESS == "success"``).
    """

    # NOTE: deliberately no ``model_config`` here. ``Enum`` turns every plain
    # class attribute into a member, so a pydantic ``ConfigDict`` assigned in
    # this body would become a bogus extra status rather than configure anything.
    IN_PROGRESS = "in_progress"
    SUCCESS = "success"
    FAILED = "failed"
|
||
|
||
class ProcessingJob(BaseModel):
    # Record describing one submitted job and, once available, its output.
    # (Kept as comments rather than a docstring so the generated schema
    # description is left untouched.)

    # Reject any field that is not declared below instead of silently ignoring it.
    model_config = ConfigDict(extra="forbid")

    # Required identifiers.
    # NOTE(review): presumably the id returned at submission time and the
    # uploaded file's name -- confirm against the caller.
    submitted_job_id: str
    filename: str

    # Result payloads; both default to the empty string.
    # NOTE(review): `content` looks like the text derived from `raw_result` -- confirm.
    raw_result: str = ""
    content: str = ""

    # Current state of the job (required).
    status: ConversionStatus

    # Optional error text; None when not set.
    error: str | None = None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary | ||
# | ||
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual | ||
# property and proprietary rights in and to this material, related | ||
# documentation and any modifications thereto. Any use, reproduction, | ||
# disclosure or distribution of this material and related documentation | ||
# without an express license agreement from NVIDIA CORPORATION or | ||
# its affiliates is strictly prohibited. | ||
|
||
# pylint: skip-file | ||
|
||
import json | ||
|
||
|
||
def ingest_json_results_to_blob(result_content):
    """
    Parse JSON results, order the entries, and flatten them into one text blob.

    The decoded payload must be an object with a top-level ``"data"`` list of
    entries. Entries are ordered by page number; within a page, ``"structured"``
    entries come first (ordered by the first two values of their
    ``table_location``), followed by every other entry in its original relative
    order (the sort is stable). Each ``"structured"``, ``"text"`` and ``"image"``
    entry contributes one newline-terminated line; other document types are skipped.

    Args:
        result_content: The JSON document as ``str``, ``bytes``/``bytearray``, or a
            file-like object exposing ``read()`` (e.g. ``io.BytesIO``). File-like
            objects are read from their current position.

    Returns:
        str: The generated blob string, or ``""`` if the content could not be
        parsed or processed (the error is printed, not raised).
    """
    try:
        # json.loads() accepts str/bytes/bytearray but not file-like objects, so
        # drain anything with a read() method first.
        reader = getattr(result_content, "read", None)
        if callable(reader):
            result_content = reader()

        # Load the JSON data
        data = json.loads(result_content)["data"]

        # Smarter sorting: by page, then structured objects by x0, y0
        def sorting_key(entry):
            page = entry["metadata"]["content_metadata"]["page_number"]
            if entry["document_type"] == "structured":
                # Use the first two table_location values as secondary keys
                location = entry["metadata"]["table_metadata"]["table_location"]
                return page, location[0], location[1]
            # Non-structured objects are sorted after structured ones on the same page
            return page, float("inf"), float("inf")

        data.sort(key=sorting_key)

        # Collect the pieces of the blob; joined once at the end
        blob = []

        for entry in data:
            document_type = entry.get("document_type", "")

            if document_type == "structured":
                # Add table content to the blob
                blob.append(entry["metadata"]["table_metadata"]["table_content"])
                blob.append("\n")

            elif document_type == "text":
                # Add content to the blob
                blob.append(entry["metadata"]["content"])
                blob.append("\n")

            elif document_type == "image":
                # Add image caption to the blob
                caption = entry["metadata"]["image_metadata"].get("caption", "")
                blob.append(f"image_caption:[{caption}]")
                blob.append("\n")

        # Join all parts of the blob into a single string
        return "".join(blob)

    except Exception as e:
        # Best-effort conversion: report the problem and fall back to an empty blob.
        print(f"[ERROR] An error occurred while processing JSON content: {e}")
        return ""
1 change: 1 addition & 0 deletions
1
tests/nv_ingest/util/converters/multimodal_test_raw_results.json
Large diffs are not rendered by default.
Oops, something went wrong.
Oops, something went wrong.