Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactoring PDF loaders: 01 prepare #29062

Merged
merged 6 commits into from
Jan 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number | Diff line number | Diff line change
Expand Up @@ -87,6 +87,7 @@
from langchain_community.document_loaders.blob_loaders import (
Blob,
BlobLoader,
CloudBlobLoader,
FileSystemBlobLoader,
YoutubeAudioLoader,
)
Expand Down Expand Up @@ -574,6 +575,7 @@
"CSVLoader": "langchain_community.document_loaders.csv_loader",
"CassandraLoader": "langchain_community.document_loaders.cassandra",
"ChatGPTLoader": "langchain_community.document_loaders.chatgpt",
"CloudBlobLoader": "langchain_community.document_loaders.blob_loaders",
"CoNLLULoader": "langchain_community.document_loaders.conllu",
"CollegeConfidentialLoader": "langchain_community.document_loaders.college_confidential", # noqa: E501
"ConcurrentLoader": "langchain_community.document_loaders.concurrent",
Expand Down Expand Up @@ -781,6 +783,7 @@ def __getattr__(name: str) -> Any:
"CSVLoader",
"CassandraLoader",
"ChatGPTLoader",
"CloudBlobLoader",
"CoNLLULoader",
"CollegeConfidentialLoader",
"ConcurrentLoader",
Expand Down
34 changes: 14 additions & 20 deletions libs/community/langchain_community/document_loaders/parsers/pdf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,7 +6,6 @@
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterable,
Iterator,
Mapping,
Expand All @@ -23,15 +22,13 @@
from langchain_community.document_loaders.blob_loaders import Blob

if TYPE_CHECKING:
import fitz.fitz
import pdfminer.layout
import pdfplumber.page
import pypdf._page
import pypdfium2._helpers.page
from pypdf import PageObject
import fitz
import pdfminer
import pdfplumber
import pypdf
import pypdfium2
from textractor.data.text_linearization_config import TextLinearizationConfig


_PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"]
_PDF_FILTER_WITHOUT_LOSS = [
"LZWDecode",
Expand Down Expand Up @@ -90,7 +87,7 @@ def __init__(
extract_images: bool = False,
*,
extraction_mode: str = "plain",
extraction_kwargs: Optional[Dict[str, Any]] = None,
extraction_kwargs: Optional[dict[str, Any]] = None,
):
self.password = password
self.extract_images = extract_images
Expand All @@ -107,7 +104,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
"`pip install pypdf`"
)

def _extract_text_from_page(page: "PageObject") -> str:
def _extract_text_from_page(page: pypdf.PageObject) -> str:
"""
Extract text from image given the version of pypdf.
"""
Expand All @@ -126,12 +123,13 @@ def _extract_text_from_page(page: "PageObject") -> str:
Document(
page_content=_extract_text_from_page(page=page)
+ self._extract_images_from_page(page),
metadata={"source": blob.source, "page": page_number}, # type: ignore[attr-defined]
metadata={"source": blob.source, "page": page_number},
# type: ignore[attr-defined]
)
for page_number, page in enumerate(pdf_reader.pages)
]

def _extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
def _extract_images_from_page(self, page: pypdf.PageObject) -> str:
"""Extract images from page and get the text with RapidOCR."""
if not self.extract_images or "/XObject" not in page["/Resources"].keys(): # type: ignore[attr-defined]
return ""
Expand Down Expand Up @@ -307,9 +305,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
for page in doc
]

def _get_page_content(
self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob
) -> str:
def _get_page_content(self, doc: fitz.Document, page: fitz.Page, blob: Blob) -> str:
"""
Get the text of the page using PyMuPDF and RapidOCR and issue a warning
if it is empty.
Expand All @@ -327,7 +323,7 @@ def _get_page_content(
return content

def _extract_metadata(
self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob
self, doc: fitz.Document, page: fitz.Page, blob: Blob
) -> dict:
"""Extract metadata from the document and page."""
return dict(
Expand All @@ -344,9 +340,7 @@ def _extract_metadata(
},
)

def _extract_images_from_page(
self, doc: fitz.fitz.Document, page: fitz.fitz.Page
) -> str:
def _extract_images_from_page(self, doc: fitz.Document, page: fitz.Page) -> str:
"""Extract images from page and get the text with RapidOCR."""
if not self.extract_images:
return ""
Expand Down Expand Up @@ -558,7 +552,7 @@ def __init__(
textract_features: Optional[Sequence[int]] = None,
client: Optional[Any] = None,
*,
linearization_config: Optional["TextLinearizationConfig"] = None,
linearization_config: Optional[TextLinearizationConfig] = None,
) -> None:
"""Initializes the parser.

Expand Down
Loading
Loading