diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index f135ff2dc2..42a261fdc0 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -43,6 +43,14 @@ permissions: jobs: + pr-builder: + needs: + - prepare + - checks + - ci_pipe + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.02 + prepare: # Executes the get-pr-info action to determine if the PR has the skip-ci label, if the action fails we assume the # PR does not have the label @@ -91,13 +99,3 @@ jobs: test_container: nvcr.io/ea-nvidia-morpheus/morpheus:morpheus-ci-test-240614 secrets: NGC_API_KEY: ${{ secrets.NGC_API_KEY }} - - pr-builder: - # Always run this step even if others are skipped or cancelled - if: '!cancelled()' - needs: - - prepare - - checks - - ci_pipe - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.02 diff --git a/conda/environments/all_cuda-121_arch-x86_64.yaml b/conda/environments/all_cuda-121_arch-x86_64.yaml index f860ece4c6..a6aab42af3 100644 --- a/conda/environments/all_cuda-121_arch-x86_64.yaml +++ b/conda/environments/all_cuda-121_arch-x86_64.yaml @@ -83,6 +83,7 @@ dependencies: - pydantic - pylint=3.0.3 - pypdf=3.17.4 +- pypdfium2=4.30 - pytest-asyncio - pytest-benchmark=4.0 - pytest-cov @@ -120,7 +121,6 @@ dependencies: - pip: - --find-links https://data.dgl.ai/wheels-test/repo.html - --find-links https://data.dgl.ai/wheels/cu121/repo.html - - PyMuPDF==1.23.* - databricks-cli < 0.100 - databricks-connect - dgl==2.0.0 diff --git a/conda/environments/dev_cuda-121_arch-x86_64.yaml b/conda/environments/dev_cuda-121_arch-x86_64.yaml index 30781144a4..468166b3d7 100644 --- a/conda/environments/dev_cuda-121_arch-x86_64.yaml +++ b/conda/environments/dev_cuda-121_arch-x86_64.yaml @@ -67,6 +67,7 @@ dependencies: - pybind11-stubgen=0.10.5 - pydantic - pylint=3.0.3 +- pypdfium2=4.30 - pytest-asyncio - pytest-benchmark=4.0 - pytest-cov @@ -98,7 +99,6 @@ dependencies: - 
yapf=0.40.1 - zlib=1.2.13 - pip: - - PyMuPDF==1.23.* - databricks-cli < 0.100 - databricks-connect - milvus==2.3.5 diff --git a/conda/environments/examples_cuda-121_arch-x86_64.yaml b/conda/environments/examples_cuda-121_arch-x86_64.yaml index b144b38b13..ea17b3bdb3 100644 --- a/conda/environments/examples_cuda-121_arch-x86_64.yaml +++ b/conda/environments/examples_cuda-121_arch-x86_64.yaml @@ -44,6 +44,7 @@ dependencies: - pluggy=1.3 - pydantic - pypdf=3.17.4 +- pypdfium2=4.30 - python-confluent-kafka>=1.9.2,<1.10.0a0 - python-docx==1.1.0 - python-graphviz @@ -67,7 +68,6 @@ dependencies: - pip: - --find-links https://data.dgl.ai/wheels-test/repo.html - --find-links https://data.dgl.ai/wheels/cu121/repo.html - - PyMuPDF==1.23.* - databricks-cli < 0.100 - databricks-connect - dgl==2.0.0 diff --git a/dependencies.yaml b/dependencies.yaml index 089e0d24c9..f95295ee52 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -364,6 +364,7 @@ dependencies: - output_types: [conda] packages: - &nodejs nodejs=18.* + - &pypdfium2 pypdfium2=4.30 - pytest-asyncio - pytest-benchmark=4.0 - pytest-cov @@ -371,7 +372,6 @@ dependencies: - &python-docx python-docx==1.1.0 - pip - pip: - - &PyMuPDF PyMuPDF==1.23.* - pytest-kafka==0.6.0 example-dfp-prod: @@ -410,6 +410,7 @@ dependencies: - onnx=1.15 - openai=1.13 - pypdf=3.17.4 + - *pypdfium2 - *python-docx - requests-toolbelt=1.0 # Transitive dep needed by nemollm, specified here to ensure we get a compatible version - sentence-transformers=2.7 @@ -420,7 +421,6 @@ dependencies: - faiss-gpu==1.7.* - google-search-results==2.4 - nemollm==0.3.5 - - *PyMuPDF model-training-tuning: common: diff --git a/examples/llm/vdb_upload/module/content_extractor_module.py b/examples/llm/vdb_upload/module/content_extractor_module.py index ac5ae771e6..c02dd89718 100755 --- a/examples/llm/vdb_upload/module/content_extractor_module.py +++ b/examples/llm/vdb_upload/module/content_extractor_module.py @@ -22,11 +22,11 @@ from typing import Dict from typing 
import List
 
-import fitz
 import fsspec
 import mrc
 import mrc.core.operators as ops
 import pandas as pd
+import pypdfium2 as libpdfium
 from docx import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from pydantic import BaseModel  # pylint: disable=no-name-in-module
@@ -172,8 +172,33 @@ def wrapper(input_info: ConverterInputInfo, *args, **kwargs):
 @_converter_error_handler
 def _pdf_to_text_converter(input_info: ConverterInputInfo) -> str:
-    text = ""
-    pdf_document = fitz.open(stream=input_info.io_bytes, filetype="pdf")
-    for page_num in range(pdf_document.page_count):
-        page = pdf_document[page_num]
-        text += page.get_text()
-    return text
+    """
+    Extract the text of every page of a PDF document using pypdfium2.
+
+    Parameters
+    ----------
+    input_info : ConverterInputInfo
+        Conversion input; its `io_bytes` attribute is passed to `PdfDocument`.
+
+    Returns
+    -------
+    str
+        Text of all pages, concatenated in page order.
+    """
+    page_texts = []
+    pdf_document = libpdfium.PdfDocument(input_info.io_bytes)
+    try:
+        for page_idx in range(len(pdf_document)):
+            page = pdf_document.get_page(page_idx)
+            try:
+                textpage = page.get_textpage()
+                try:
+                    page_texts.append(textpage.get_text_bounded())
+                finally:
+                    # Close child handles before their parents instead of waiting for garbage collection.
+                    textpage.close()
+            finally:
+                page.close()
+    finally:
+        pdf_document.close()
+
+    return "".join(page_texts)