Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update dockerfile to address glibcxx issue #468

Merged
merged 10 commits into from
Feb 20, 2025
11 changes: 9 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# syntax=docker/dockerfile:1.3

ARG BASE_IMG=nvcr.io/nvidia/cuda
ARG BASE_IMG_TAG=12.4.1-base-ubuntu22.04
ARG BASE_IMG_TAG=12.5.1-base-ubuntu22.04

# Use the NVIDIA CUDA image ($BASE_IMG:$BASE_IMG_TAG) as the base image
FROM $BASE_IMG:$BASE_IMG_TAG AS base
Expand All @@ -21,13 +21,20 @@ LABEL git_commit=$GIT_COMMIT

# Install necessary dependencies using apt-get
RUN apt-get update && apt-get install -y \
wget \
bzip2 \
ca-certificates \
curl \
libgl1-mesa-glx \
software-properties-common \
wget \
&& apt-get clean

# A workaround for the error (mrc-core): /usr/lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.32' not found
# Issue: https://github.com/NVIDIA/nv-ingest/issues/474
RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test \
&& apt-get update \
&& apt-get install -y --only-upgrade libstdc++6

# Download the Miniforge installer to /tmp, run it in batch mode with /opt/conda as the prefix,
# then delete the installer. A single -O is used so the download target is unambiguous.
RUN wget "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" -O /tmp/miniforge.sh \
    && bash /tmp/miniforge.sh -b -p /opt/conda \
    && rm /tmp/miniforge.sh
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,6 @@ def nemoretriever_parse(
"""
logger.debug("Extracting PDF with nemoretriever_parse backend.")

nemoretriever_parse_config = kwargs.get("nemoretriever_parse_config", {})
nemoretriever_parse_config = nemoretriever_parse_config if nemoretriever_parse_config is not None else {}

row_data = kwargs.get("row_data")
# get source_id
source_id = row_data["source_id"]
Expand All @@ -111,9 +108,10 @@ def nemoretriever_parse(
paddle_output_format = kwargs.get("paddle_output_format", "pseudo_markdown")
paddle_output_format = TableFormatEnum[paddle_output_format.upper()]

pdfium_config = kwargs.get("pdfium_config", {})
if isinstance(pdfium_config, dict):
pdfium_config = PDFiumConfigSchema(**pdfium_config)
if (extract_tables_method == "yolox") and (extract_tables or extract_charts):
pdfium_config = kwargs.get("pdfium_config", {})
if isinstance(pdfium_config, dict):
pdfium_config = PDFiumConfigSchema(**pdfium_config)
nemoretriever_parse_config = kwargs.get("nemoretriever_parse_config", {})
if isinstance(nemoretriever_parse_config, dict):
nemoretriever_parse_config = NemoRetrieverParseConfigSchema(**nemoretriever_parse_config)
Expand Down
4 changes: 2 additions & 2 deletions src/nv_ingest/schemas/pdf_extractor_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def validate_endpoints(cls, values):

for model_name in ["yolox"]:
endpoint_name = f"{model_name}_endpoints"
grpc_service, http_service = values.get(endpoint_name)
grpc_service, http_service = values.get(endpoint_name, ("", ""))
grpc_service = _clean_service(grpc_service)
http_service = _clean_service(http_service)

Expand Down Expand Up @@ -156,7 +156,7 @@ def validate_endpoints(cls, values):

for model_name in ["nemoretriever_parse"]:
endpoint_name = f"{model_name}_endpoints"
grpc_service, http_service = values.get(endpoint_name)
grpc_service, http_service = values.get(endpoint_name, ("", ""))
grpc_service = _clean_service(grpc_service)
http_service = _clean_service(http_service)

Expand Down
3 changes: 0 additions & 3 deletions src/nv_ingest/stages/nim/chart_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,6 @@ def _update_metadata(
# Image is too small; mark as skipped.
results[i] = (img, None)

if not valid_images:
return results

# Prepare data payloads for both clients.
data_yolox = {"images": valid_arrays}
data_paddle = {"base64_images": valid_images}
Expand Down
4 changes: 2 additions & 2 deletions src/nv_ingest/stages/nim/table_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def _update_metadata(
logger.debug(f"Running table extraction using protocol {paddle_client.protocol}")

# Initialize the results list in the same order as base64_images.
results: List[Optional[Tuple[str, Tuple[Any, Any, Any]]]] = ["", (None, None, None)] * len(base64_images)
results: List[Optional[Tuple[str, Tuple[Any, Any, Any]]]] = [("", None, None, None)] * len(base64_images)

valid_images: List[str] = []
valid_indices: List[int] = []
Expand All @@ -70,7 +70,7 @@ def _update_metadata(
valid_indices.append(i)
else:
# Image is too small; mark as skipped.
results[i] = ("", None, None, None)
results[i] = (img, None, None, None)

if not valid_images:
return results
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,26 +34,37 @@ def sample_pdf_stream():
return pdf_stream


@pytest.fixture
def mock_parser_config():
    """Provide a minimal nemoretriever_parse configuration for the tests.

    Returns:
        dict: A mapping with the single key ``nemoretriever_parse_endpoints``
        whose value is a two-element tuple of endpoint strings.
    """
    # NOTE(review): the tuple order appears to be (gRPC, HTTP), matching how
    # the schema validator unpacks ``*_endpoints`` values -- confirm.
    endpoints = ("parser:8001", "http://parser:8000")
    return {"nemoretriever_parse_endpoints": endpoints}


@patch(f"{_MODULE_UNDER_TEST}.create_inference_client")
def test_nemoretriever_parse_text_extraction(mock_client, sample_pdf_stream, document_df):
def test_nemoretriever_parse_text_extraction(mock_client, sample_pdf_stream, document_df, mock_parser_config):
mock_client_instance = MagicMock()
mock_client.return_value = mock_client_instance
mock_client_instance.infer.return_value = [
{
"bbox": {"xmin": 0.16633729456384325, "ymin": 0.0969, "xmax": 0.3097820480404551, "ymax": 0.1102},
"text": "testing",
"type": "Text",
}
[
{
"bbox": {"xmin": 0.16633729456384325, "ymin": 0.0969, "xmax": 0.3097820480404551, "ymax": 0.1102},
"text": "testing",
"type": "Text",
}
]
]

result = nemoretriever_parse(
pdf_stream=sample_pdf_stream,
extract_text=True,
extract_images=False,
extract_tables=False,
extract_charts=False,
row_data=document_df.iloc[0],
text_depth="page",
nemoretriever_parse_config=MagicMock(),
extract_tables_method="nemoretriever_parse",
nemoretriever_parse_config=mock_parser_config,
)

assert len(result) == 1
Expand All @@ -63,25 +74,29 @@ def test_nemoretriever_parse_text_extraction(mock_client, sample_pdf_stream, doc


@patch(f"{_MODULE_UNDER_TEST}.create_inference_client")
def test_nemoretriever_parse_table_extraction(mock_client, sample_pdf_stream, document_df):
def test_nemoretriever_parse_table_extraction(mock_client, sample_pdf_stream, document_df, mock_parser_config):
mock_client_instance = MagicMock()
mock_client.return_value = mock_client_instance
mock_client_instance.infer.return_value = [
{
"bbox": {"xmin": 1 / 1024, "ymin": 2 / 1280, "xmax": 101 / 1024, "ymax": 102 / 1280},
"text": "table text",
"type": "Table",
}
[
{
"bbox": {"xmin": 1 / 1024, "ymin": 2 / 1280, "xmax": 101 / 1024, "ymax": 102 / 1280},
"text": "table text",
"type": "Table",
}
]
]

result = nemoretriever_parse(
pdf_stream=sample_pdf_stream,
extract_text=True,
extract_images=False,
extract_tables=True,
extract_charts=False,
row_data=document_df.iloc[0],
text_depth="page",
nemoretriever_parse_config=MagicMock(),
extract_tables_method="nemoretriever_parse",
nemoretriever_parse_config=mock_parser_config,
)

assert len(result) == 2
Expand All @@ -93,25 +108,29 @@ def test_nemoretriever_parse_table_extraction(mock_client, sample_pdf_stream, do


@patch(f"{_MODULE_UNDER_TEST}.create_inference_client")
def test_nemoretriever_parse_image_extraction(mock_client, sample_pdf_stream, document_df):
def test_nemoretriever_parse_image_extraction(mock_client, sample_pdf_stream, document_df, mock_parser_config):
mock_client_instance = MagicMock()
mock_client.return_value = mock_client_instance
mock_client_instance.infer.return_value = [
{
"bbox": {"xmin": 1 / 1024, "ymin": 2 / 1280, "xmax": 101 / 1024, "ymax": 102 / 1280},
"text": "",
"type": "Picture",
}
[
{
"bbox": {"xmin": 1 / 1024, "ymin": 2 / 1280, "xmax": 101 / 1024, "ymax": 102 / 1280},
"text": "",
"type": "Picture",
}
]
]

result = nemoretriever_parse(
pdf_stream=sample_pdf_stream,
extract_text=True,
extract_images=True,
extract_tables=False,
extract_charts=False,
row_data=document_df.iloc[0],
text_depth="page",
nemoretriever_parse_config=MagicMock(),
extract_tables_method="nemoretriever_parse",
nemoretriever_parse_config=mock_parser_config,
)

assert len(result) == 2
Expand All @@ -123,30 +142,34 @@ def test_nemoretriever_parse_image_extraction(mock_client, sample_pdf_stream, do


@patch(f"{_MODULE_UNDER_TEST}.create_inference_client")
def test_nemoretriever_parse_text_extraction_bboxes(mock_client, sample_pdf_stream, document_df):
def test_nemoretriever_parse_text_extraction_bboxes(mock_client, sample_pdf_stream, document_df, mock_parser_config):
mock_client_instance = MagicMock()
mock_client.return_value = mock_client_instance
mock_client_instance.infer.return_value = [
{
"bbox": {"xmin": 0.16633729456384325, "ymin": 0.0969, "xmax": 0.3097820480404551, "ymax": 0.1102},
"text": "testing0",
"type": "Title",
},
{
"bbox": {"xmin": 0.16633729456384325, "ymin": 0.0969, "xmax": 0.3097820480404551, "ymax": 0.1102},
"text": "testing1",
"type": "Text",
},
[
{
"bbox": {"xmin": 0.16633729456384325, "ymin": 0.0969, "xmax": 0.3097820480404551, "ymax": 0.1102},
"text": "testing0",
"type": "Title",
},
{
"bbox": {"xmin": 0.16633729456384325, "ymin": 0.0969, "xmax": 0.3097820480404551, "ymax": 0.1102},
"text": "testing1",
"type": "Text",
},
]
]

result = nemoretriever_parse(
pdf_stream=sample_pdf_stream,
extract_text=True,
extract_images=False,
extract_tables=False,
extract_charts=False,
row_data=document_df.iloc[0],
text_depth="page",
nemoretriever_parse_config=MagicMock(),
extract_tables_method="nemoretriever_parse",
nemoretriever_parse_config=mock_parser_config,
)

assert len(result) == 1
Expand Down
51 changes: 42 additions & 9 deletions tests/nv_ingest/schemas/test_table_extractor_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,49 +7,79 @@

# Test cases for TableExtractorConfigSchema
def test_valid_config_with_grpc_only():
config = TableExtractorConfigSchema(auth_token="valid_token", paddle_endpoints=("grpc://paddle_service", None))
config = TableExtractorConfigSchema(
auth_token="valid_token",
yolox_endpoints=("grpc://yolox_service", None),
paddle_endpoints=("grpc://paddle_service", None),
)
assert config.auth_token == "valid_token"
assert config.yolox_endpoints == ("grpc://yolox_service", None)
assert config.paddle_endpoints == ("grpc://paddle_service", None)


def test_valid_config_with_http_only():
config = TableExtractorConfigSchema(auth_token="valid_token", paddle_endpoints=(None, "http://paddle_service"))
config = TableExtractorConfigSchema(
auth_token="valid_token",
yolox_endpoints=(None, "http://yolox_service"),
paddle_endpoints=(None, "http://paddle_service"),
)
assert config.auth_token == "valid_token"
assert config.yolox_endpoints == (None, "http://yolox_service")
assert config.paddle_endpoints == (None, "http://paddle_service")


def test_valid_config_with_both_services():
config = TableExtractorConfigSchema(
auth_token="valid_token", paddle_endpoints=("grpc://paddle_service", "http://paddle_service")
auth_token="valid_token",
yolox_endpoints=("grpc://yolox_service", "http://yolox_service"),
paddle_endpoints=("grpc://paddle_service", "http://paddle_service"),
)
assert config.auth_token == "valid_token"
assert config.yolox_endpoints == ("grpc://yolox_service", "http://yolox_service")
assert config.paddle_endpoints == ("grpc://paddle_service", "http://paddle_service")


def test_invalid_config_empty_endpoints():
with pytest.raises(ValidationError) as exc_info:
TableExtractorConfigSchema(paddle_endpoints=(None, None))
TableExtractorConfigSchema(
yolox_endpoints=("grpc://yolox_service", "http://yolox_service"),
paddle_endpoints=(None, None),
)
assert "Both gRPC and HTTP services cannot be empty for paddle_endpoints" in str(exc_info.value)


def test_invalid_extra_fields():
with pytest.raises(ValidationError) as exc_info:
TableExtractorConfigSchema(
auth_token="valid_token", paddle_endpoints=("grpc://paddle_service", None), extra_field="invalid"
auth_token="valid_token",
yolox_endpoints=("grpc://yolox_service", None),
paddle_endpoints=("grpc://paddle_service", None),
extra_field="invalid",
)
assert "Extra inputs are not permitted" in str(exc_info.value)


def test_cleaning_empty_strings_in_endpoints():
config = TableExtractorConfigSchema(paddle_endpoints=(" ", "http://paddle_service"))
config = TableExtractorConfigSchema(
yolox_endpoints=("grpc://yolox_service", " "),
paddle_endpoints=(" ", "http://paddle_service"),
)
assert config.yolox_endpoints == ("grpc://yolox_service", None)
assert config.paddle_endpoints == (None, "http://paddle_service")

config = TableExtractorConfigSchema(paddle_endpoints=("grpc://paddle_service", ""))
config = TableExtractorConfigSchema(
yolox_endpoints=("", "http://yolox_service"),
paddle_endpoints=("grpc://paddle_service", ""),
)
assert config.yolox_endpoints == (None, "http://yolox_service")
assert config.paddle_endpoints == ("grpc://paddle_service", None)


def test_auth_token_is_none_by_default():
config = TableExtractorConfigSchema(paddle_endpoints=("grpc://paddle_service", "http://paddle_service"))
config = TableExtractorConfigSchema(
yolox_endpoints=("grpc://yolox_service", "http://yolox_service"),
paddle_endpoints=("grpc://paddle_service", "http://paddle_service"),
)
assert config.auth_token is None


Expand All @@ -63,7 +93,10 @@ def test_table_extractor_schema_defaults():


def test_table_extractor_schema_with_custom_values():
stage_config = TableExtractorConfigSchema(paddle_endpoints=("grpc://paddle_service", "http://paddle_service"))
stage_config = TableExtractorConfigSchema(
yolox_endpoints=("grpc://yolox_service", "http://yolox_service"),
paddle_endpoints=("grpc://paddle_service", "http://paddle_service"),
)
config = TableExtractorSchema(max_queue_size=15, n_workers=12, raise_on_failure=True, stage_config=stage_config)
assert config.max_queue_size == 15
assert config.n_workers == 12
Expand Down
4 changes: 2 additions & 2 deletions tests/nv_ingest/stages/nims/test_chart_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def test_update_metadata_single_batch_single_worker(mocker, base64_image):

# Patch join_yolox_graphic_elements_and_paddle_output so that it returns a dict per image.
mock_join = mocker.patch(
f"{MODULE_UNDER_TEST}.join_yolox_and_paddle_output",
f"{MODULE_UNDER_TEST}.join_yolox_graphic_elements_and_paddle_output",
side_effect=[{"chart_title": "joined_1"}, {"chart_title": "joined_2"}],
)
# Patch process_yolox_graphic_elements to extract the chart title.
Expand Down Expand Up @@ -176,7 +176,7 @@ def test_update_metadata_multiple_batches_multi_worker(mocker, base64_image):

# Patch join_yolox_graphic_elements_and_paddle_output so it returns the expected joined dict per image.
mock_join = mocker.patch(
f"{MODULE_UNDER_TEST}.join_yolox_and_paddle_output",
f"{MODULE_UNDER_TEST}.join_yolox_graphic_elements_and_paddle_output",
side_effect=[{"chart_title": "joined_1"}, {"chart_title": "joined_2"}, {"chart_title": "joined_3"}],
)
# Patch process_yolox_graphic_elements to extract the chart title.
Expand Down
Loading
Loading