update dockerfile to address glibcxx issue (#468)
Co-authored-by: Jeremy Dyer <[email protected]>
edknv and jdye64 authored Feb 20, 2025
1 parent 863819b · commit f002e4d
Showing 9 changed files with 190 additions and 112 deletions.
11 changes: 9 additions & 2 deletions Dockerfile
@@ -4,7 +4,7 @@
# syntax=docker/dockerfile:1.3

ARG BASE_IMG=nvcr.io/nvidia/cuda
ARG BASE_IMG_TAG=12.4.1-base-ubuntu22.04
ARG BASE_IMG_TAG=12.5.1-base-ubuntu22.04

# Use NVIDIA Morpheus as the base image
FROM $BASE_IMG:$BASE_IMG_TAG AS base
@@ -21,13 +21,20 @@ LABEL git_commit=$GIT_COMMIT

# Install necessary dependencies using apt-get
RUN apt-get update && apt-get install -y \
wget \
bzip2 \
ca-certificates \
curl \
libgl1-mesa-glx \
software-properties-common \
wget \
&& apt-get clean

# A workaround for the error (mrc-core): /usr/lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.32' not found
# Issue: https://github.com/NVIDIA/nv-ingest/issues/474
RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test \
&& apt-get update \
&& apt-get install -y --only-upgrade libstdc++6

RUN wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" -O /tmp/miniforge.sh \
&& bash /tmp/miniforge.sh -b -p /opt/conda \
&& rm /tmp/miniforge.sh
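The workaround above addresses an import failure in which mrc-core requires the GLIBCXX_3.4.32 symbol version, which the libstdc++6 that Ubuntu 22.04 ships by default does not export; upgrading libstdc++6 from the ubuntu-toolchain-r/test PPA pulls in a newer runtime that does. A minimal verification sketch for the built image, assuming the library path from the error message and that the binutils strings utility is available:

# Minimal sketch, assuming the libstdc++ path from the error message and an
# installed `strings` binary (binutils); not part of the repository.
import re
import subprocess

def glibcxx_versions(lib: str = "/usr/lib/x86_64-linux-gnu/libstdc++.so.6") -> list:
    # `strings` dumps printable strings from the shared object, which include
    # the GLIBCXX_* tags recorded in its symbol version table.
    out = subprocess.run(["strings", lib], capture_output=True, text=True, check=True).stdout
    return sorted(set(re.findall(r"GLIBCXX_[0-9.]+", out)))

if __name__ == "__main__":
    versions = glibcxx_versions()
    print("\n".join(versions))
    print("GLIBCXX_3.4.32 available:", "GLIBCXX_3.4.32" in versions)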
@@ -96,9 +96,6 @@ def nemoretriever_parse(
"""
logger.debug("Extracting PDF with nemoretriever_parse backend.")

nemoretriever_parse_config = kwargs.get("nemoretriever_parse_config", {})
nemoretriever_parse_config = nemoretriever_parse_config if nemoretriever_parse_config is not None else {}

row_data = kwargs.get("row_data")
# get source_id
source_id = row_data["source_id"]
@@ -111,9 +108,10 @@ def nemoretriever_parse(
paddle_output_format = kwargs.get("paddle_output_format", "pseudo_markdown")
paddle_output_format = TableFormatEnum[paddle_output_format.upper()]

pdfium_config = kwargs.get("pdfium_config", {})
if isinstance(pdfium_config, dict):
pdfium_config = PDFiumConfigSchema(**pdfium_config)
if (extract_tables_method == "yolox") and (extract_tables or extract_charts):
pdfium_config = kwargs.get("pdfium_config", {})
if isinstance(pdfium_config, dict):
pdfium_config = PDFiumConfigSchema(**pdfium_config)
nemoretriever_parse_config = kwargs.get("nemoretriever_parse_config", {})
if isinstance(nemoretriever_parse_config, dict):
nemoretriever_parse_config = NemoRetrieverParseConfigSchema(**nemoretriever_parse_config)
4 changes: 2 additions & 2 deletions src/nv_ingest/schemas/pdf_extractor_schema.py
@@ -73,7 +73,7 @@ def validate_endpoints(cls, values):

for model_name in ["yolox"]:
endpoint_name = f"{model_name}_endpoints"
grpc_service, http_service = values.get(endpoint_name)
grpc_service, http_service = values.get(endpoint_name, ("", ""))
grpc_service = _clean_service(grpc_service)
http_service = _clean_service(http_service)

@@ -156,7 +156,7 @@ def validate_endpoints(cls, values):

for model_name in ["nemoretriever_parse"]:
endpoint_name = f"{model_name}_endpoints"
grpc_service, http_service = values.get(endpoint_name)
grpc_service, http_service = values.get(endpoint_name, ("", ""))
grpc_service = _clean_service(grpc_service)
http_service = _clean_service(http_service)

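The added ("", "") default keeps validate_endpoints from crashing when a caller omits an endpoints entry altogether: without it, values.get(...) returns None and the tuple unpacking raises a TypeError before the schema can report a useful validation error; with it, the existing empty-string cleaning takes over. A minimal sketch of the difference, using illustrative stand-ins rather than the project's actual schema objects:

# Illustrative stand-ins only; the real code runs inside a pydantic validator.
values = {}  # caller did not supply this endpoints entry
endpoint_name = "yolox_endpoints"

try:
    grpc_service, http_service = values.get(endpoint_name)  # .get() returns None
except TypeError as exc:
    print("without a default:", exc)  # cannot unpack non-iterable NoneType object

# With the ("", "") default the unpacking succeeds and the empty strings fall
# through to the validator's normal cleaning logic.
grpc_service, http_service = values.get(endpoint_name, ("", ""))
print(repr(grpc_service), repr(http_service))  # '' ''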
3 changes: 0 additions & 3 deletions src/nv_ingest/stages/nim/chart_extraction.py
@@ -66,9 +66,6 @@ def _update_metadata(
# Image is too small; mark as skipped.
results[i] = (img, None)

if not valid_images:
return results

# Prepare data payloads for both clients.
data_yolox = {"images": valid_arrays}
data_paddle = {"base64_images": valid_images}
4 changes: 2 additions & 2 deletions src/nv_ingest/stages/nim/table_extraction.py
@@ -54,7 +54,7 @@ def _update_metadata(
logger.debug(f"Running table extraction using protocol {paddle_client.protocol}")

# Initialize the results list in the same order as base64_images.
results: List[Optional[Tuple[str, Tuple[Any, Any, Any]]]] = ["", (None, None, None)] * len(base64_images)
results: List[Optional[Tuple[str, Tuple[Any, Any, Any]]]] = [("", None, None, None)] * len(base64_images)

valid_images: List[str] = []
valid_indices: List[int] = []
@@ -70,7 +70,7 @@
valid_indices.append(i)
else:
# Image is too small; mark as skipped.
results[i] = ("", None, None, None)
results[i] = (img, None, None, None)

if not valid_images:
return results
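Two fixes here: the placeholder list is now one 4-tuple per input image instead of a two-element sequence repeated len(base64_images) times, which had produced a list twice as long with alternating shapes, and a skipped (too-small) image now keeps its original string in its result slot rather than an empty string. A small sketch of the initializer difference, assuming three input images:

# Placeholder values only; shows why the old initializer misaligned results.
base64_images = ["img_a", "img_b", "img_c"]

old = ["", (None, None, None)] * len(base64_images)
print(len(old), old[:2])  # 6 ['', (None, None, None)] -- 2 * N entries, alternating shapes

new = [("", None, None, None)] * len(base64_images)
print(len(new), new[0])   # 3 ('', None, None, None) -- one 4-tuple per image, index-aligned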
@@ -34,26 +34,37 @@ def sample_pdf_stream():
return pdf_stream


@pytest.fixture
def mock_parser_config():
return {
"nemoretriever_parse_endpoints": ("parser:8001", "http://parser:8000"),
}


@patch(f"{_MODULE_UNDER_TEST}.create_inference_client")
def test_nemoretriever_parse_text_extraction(mock_client, sample_pdf_stream, document_df):
def test_nemoretriever_parse_text_extraction(mock_client, sample_pdf_stream, document_df, mock_parser_config):
mock_client_instance = MagicMock()
mock_client.return_value = mock_client_instance
mock_client_instance.infer.return_value = [
{
"bbox": {"xmin": 0.16633729456384325, "ymin": 0.0969, "xmax": 0.3097820480404551, "ymax": 0.1102},
"text": "testing",
"type": "Text",
}
[
{
"bbox": {"xmin": 0.16633729456384325, "ymin": 0.0969, "xmax": 0.3097820480404551, "ymax": 0.1102},
"text": "testing",
"type": "Text",
}
]
]

result = nemoretriever_parse(
pdf_stream=sample_pdf_stream,
extract_text=True,
extract_images=False,
extract_tables=False,
extract_charts=False,
row_data=document_df.iloc[0],
text_depth="page",
nemoretriever_parse_config=MagicMock(),
extract_tables_method="nemoretriever_parse",
nemoretriever_parse_config=mock_parser_config,
)

assert len(result) == 1
@@ -63,25 +74,29 @@ def test_nemoretriever_parse_text_extraction(mock_client, sample_pdf_stream, doc


@patch(f"{_MODULE_UNDER_TEST}.create_inference_client")
def test_nemoretriever_parse_table_extraction(mock_client, sample_pdf_stream, document_df):
def test_nemoretriever_parse_table_extraction(mock_client, sample_pdf_stream, document_df, mock_parser_config):
mock_client_instance = MagicMock()
mock_client.return_value = mock_client_instance
mock_client_instance.infer.return_value = [
{
"bbox": {"xmin": 1 / 1024, "ymin": 2 / 1280, "xmax": 101 / 1024, "ymax": 102 / 1280},
"text": "table text",
"type": "Table",
}
[
{
"bbox": {"xmin": 1 / 1024, "ymin": 2 / 1280, "xmax": 101 / 1024, "ymax": 102 / 1280},
"text": "table text",
"type": "Table",
}
]
]

result = nemoretriever_parse(
pdf_stream=sample_pdf_stream,
extract_text=True,
extract_images=False,
extract_tables=True,
extract_charts=False,
row_data=document_df.iloc[0],
text_depth="page",
nemoretriever_parse_config=MagicMock(),
extract_tables_method="nemoretriever_parse",
nemoretriever_parse_config=mock_parser_config,
)

assert len(result) == 2
@@ -93,25 +108,29 @@ def test_nemoretriever_parse_table_extraction(mock_client, sample_pdf_stream, do


@patch(f"{_MODULE_UNDER_TEST}.create_inference_client")
def test_nemoretriever_parse_image_extraction(mock_client, sample_pdf_stream, document_df):
def test_nemoretriever_parse_image_extraction(mock_client, sample_pdf_stream, document_df, mock_parser_config):
mock_client_instance = MagicMock()
mock_client.return_value = mock_client_instance
mock_client_instance.infer.return_value = [
{
"bbox": {"xmin": 1 / 1024, "ymin": 2 / 1280, "xmax": 101 / 1024, "ymax": 102 / 1280},
"text": "",
"type": "Picture",
}
[
{
"bbox": {"xmin": 1 / 1024, "ymin": 2 / 1280, "xmax": 101 / 1024, "ymax": 102 / 1280},
"text": "",
"type": "Picture",
}
]
]

result = nemoretriever_parse(
pdf_stream=sample_pdf_stream,
extract_text=True,
extract_images=True,
extract_tables=False,
extract_charts=False,
row_data=document_df.iloc[0],
text_depth="page",
nemoretriever_parse_config=MagicMock(),
extract_tables_method="nemoretriever_parse",
nemoretriever_parse_config=mock_parser_config,
)

assert len(result) == 2
@@ -123,30 +142,34 @@ def test_nemoretriever_parse_image_extraction(mock_client, sample_pdf_stream, do


@patch(f"{_MODULE_UNDER_TEST}.create_inference_client")
def test_nemoretriever_parse_text_extraction_bboxes(mock_client, sample_pdf_stream, document_df):
def test_nemoretriever_parse_text_extraction_bboxes(mock_client, sample_pdf_stream, document_df, mock_parser_config):
mock_client_instance = MagicMock()
mock_client.return_value = mock_client_instance
mock_client_instance.infer.return_value = [
{
"bbox": {"xmin": 0.16633729456384325, "ymin": 0.0969, "xmax": 0.3097820480404551, "ymax": 0.1102},
"text": "testing0",
"type": "Title",
},
{
"bbox": {"xmin": 0.16633729456384325, "ymin": 0.0969, "xmax": 0.3097820480404551, "ymax": 0.1102},
"text": "testing1",
"type": "Text",
},
[
{
"bbox": {"xmin": 0.16633729456384325, "ymin": 0.0969, "xmax": 0.3097820480404551, "ymax": 0.1102},
"text": "testing0",
"type": "Title",
},
{
"bbox": {"xmin": 0.16633729456384325, "ymin": 0.0969, "xmax": 0.3097820480404551, "ymax": 0.1102},
"text": "testing1",
"type": "Text",
},
]
]

result = nemoretriever_parse(
pdf_stream=sample_pdf_stream,
extract_text=True,
extract_images=False,
extract_tables=False,
extract_charts=False,
row_data=document_df.iloc[0],
text_depth="page",
nemoretriever_parse_config=MagicMock(),
extract_tables_method="nemoretriever_parse",
nemoretriever_parse_config=mock_parser_config,
)

assert len(result) == 1
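Two themes run through these test updates: each mocked infer() return value gains an extra level of nesting, which suggests the client is now expected to return one list of detections per page rather than a flat list, and the bare MagicMock config is replaced by the mock_parser_config fixture so the endpoints survive schema validation. A small sketch of indexing the nested shape, with values taken from the mocks above:

# Hypothetical shape implied by the updated mocks: response[page][detection].
mock_response = [
    [
        {
            "bbox": {"xmin": 0.166, "ymin": 0.0969, "xmax": 0.31, "ymax": 0.1102},
            "text": "testing",
            "type": "Text",
        }
    ]
]

for page in mock_response:
    for detection in page:
        print(detection["type"], detection["text"])  # Text testing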
51 changes: 42 additions & 9 deletions tests/nv_ingest/schemas/test_table_extractor_schema.py
@@ -7,49 +7,79 @@

# Test cases for TableExtractorConfigSchema
def test_valid_config_with_grpc_only():
config = TableExtractorConfigSchema(auth_token="valid_token", paddle_endpoints=("grpc://paddle_service", None))
config = TableExtractorConfigSchema(
auth_token="valid_token",
yolox_endpoints=("grpc://yolox_service", None),
paddle_endpoints=("grpc://paddle_service", None),
)
assert config.auth_token == "valid_token"
assert config.yolox_endpoints == ("grpc://yolox_service", None)
assert config.paddle_endpoints == ("grpc://paddle_service", None)


def test_valid_config_with_http_only():
config = TableExtractorConfigSchema(auth_token="valid_token", paddle_endpoints=(None, "http://paddle_service"))
config = TableExtractorConfigSchema(
auth_token="valid_token",
yolox_endpoints=(None, "http://yolox_service"),
paddle_endpoints=(None, "http://paddle_service"),
)
assert config.auth_token == "valid_token"
assert config.yolox_endpoints == (None, "http://yolox_service")
assert config.paddle_endpoints == (None, "http://paddle_service")


def test_valid_config_with_both_services():
config = TableExtractorConfigSchema(
auth_token="valid_token", paddle_endpoints=("grpc://paddle_service", "http://paddle_service")
auth_token="valid_token",
yolox_endpoints=("grpc://yolox_service", "http://yolox_service"),
paddle_endpoints=("grpc://paddle_service", "http://paddle_service"),
)
assert config.auth_token == "valid_token"
assert config.yolox_endpoints == ("grpc://yolox_service", "http://yolox_service")
assert config.paddle_endpoints == ("grpc://paddle_service", "http://paddle_service")


def test_invalid_config_empty_endpoints():
with pytest.raises(ValidationError) as exc_info:
TableExtractorConfigSchema(paddle_endpoints=(None, None))
TableExtractorConfigSchema(
yolox_endpoints=("grpc://yolox_service", "http://yolox_service"),
paddle_endpoints=(None, None),
)
assert "Both gRPC and HTTP services cannot be empty for paddle_endpoints" in str(exc_info.value)


def test_invalid_extra_fields():
with pytest.raises(ValidationError) as exc_info:
TableExtractorConfigSchema(
auth_token="valid_token", paddle_endpoints=("grpc://paddle_service", None), extra_field="invalid"
auth_token="valid_token",
yolox_endpoints=("grpc://yolox_service", None),
paddle_endpoints=("grpc://paddle_service", None),
extra_field="invalid",
)
assert "Extra inputs are not permitted" in str(exc_info.value)


def test_cleaning_empty_strings_in_endpoints():
config = TableExtractorConfigSchema(paddle_endpoints=(" ", "http://paddle_service"))
config = TableExtractorConfigSchema(
yolox_endpoints=("grpc://yolox_service", " "),
paddle_endpoints=(" ", "http://paddle_service"),
)
assert config.yolox_endpoints == ("grpc://yolox_service", None)
assert config.paddle_endpoints == (None, "http://paddle_service")

config = TableExtractorConfigSchema(paddle_endpoints=("grpc://paddle_service", ""))
config = TableExtractorConfigSchema(
yolox_endpoints=("", "http://yolox_service"),
paddle_endpoints=("grpc://paddle_service", ""),
)
assert config.yolox_endpoints == (None, "http://yolox_service")
assert config.paddle_endpoints == ("grpc://paddle_service", None)


def test_auth_token_is_none_by_default():
config = TableExtractorConfigSchema(paddle_endpoints=("grpc://paddle_service", "http://paddle_service"))
config = TableExtractorConfigSchema(
yolox_endpoints=("grpc://yolox_service", "http://yolox_service"),
paddle_endpoints=("grpc://paddle_service", "http://paddle_service"),
)
assert config.auth_token is None


@@ -63,7 +93,10 @@ def test_table_extractor_schema_defaults():


def test_table_extractor_schema_with_custom_values():
stage_config = TableExtractorConfigSchema(paddle_endpoints=("grpc://paddle_service", "http://paddle_service"))
stage_config = TableExtractorConfigSchema(
yolox_endpoints=("grpc://yolox_service", "http://yolox_service"),
paddle_endpoints=("grpc://paddle_service", "http://paddle_service"),
)
config = TableExtractorSchema(max_queue_size=15, n_workers=12, raise_on_failure=True, stage_config=stage_config)
assert config.max_queue_size == 15
assert config.n_workers == 12
4 changes: 2 additions & 2 deletions tests/nv_ingest/stages/nims/test_chart_extraction.py
@@ -120,7 +120,7 @@ def test_update_metadata_single_batch_single_worker(mocker, base64_image):

# Patch join_yolox_and_paddle_output so that it returns a dict per image.
mock_join = mocker.patch(
f"{MODULE_UNDER_TEST}.join_yolox_and_paddle_output",
f"{MODULE_UNDER_TEST}.join_yolox_graphic_elements_and_paddle_output",
side_effect=[{"chart_title": "joined_1"}, {"chart_title": "joined_2"}],
)
# Patch process_yolox_graphic_elements to extract the chart title.
@@ -176,7 +176,7 @@ def test_update_metadata_multiple_batches_multi_worker(mocker, base64_image):

# Patch join_yolox_and_paddle_output so it returns the expected joined dict per image.
mock_join = mocker.patch(
f"{MODULE_UNDER_TEST}.join_yolox_and_paddle_output",
f"{MODULE_UNDER_TEST}.join_yolox_graphic_elements_and_paddle_output",
side_effect=[{"chart_title": "joined_1"}, {"chart_title": "joined_2"}, {"chart_title": "joined_3"}],
)
# Patch process_yolox_graphic_elements to extract the chart title.