Skip to content

Commit

Permalink
Extend docx and pptx extractors to attempt to extract tables/charts from images (NVIDIA#334)
Browse files Browse the repository at this point in the history

Co-authored-by: Edward Kim <[email protected]>
  • Loading branch information
drobison00 and edknv authored Jan 17, 2025
1 parent 0e44b01 commit 9b1668d
Show file tree
Hide file tree
Showing 18 changed files with 1,147 additions and 389 deletions.
21 changes: 17 additions & 4 deletions src/nv_ingest/extraction_workflows/docx/docx_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,14 @@
logger = logging.getLogger(__name__)


def python_docx(docx: Union[str, Path, IO], extract_text: bool, extract_images: bool, extract_tables: bool, **kwargs):
def python_docx(
docx: Union[str, Path, IO],
extract_text: bool,
extract_images: bool,
extract_tables: bool,
extract_charts: bool,
**kwargs
):
"""
Helper function that uses python-docx to extract text from a bytestream document
Expand All @@ -57,6 +64,8 @@ def python_docx(docx: Union[str, Path, IO], extract_text: bool, extract_images:
Specifies whether to extract images.
extract_tables : bool
Specifies whether to extract tables.
extract_charts : bool
Specifies whether to extract charts.
**kwargs
The keyword arguments are used for additional extraction parameters.
Expand All @@ -73,10 +82,12 @@ def python_docx(docx: Union[str, Path, IO], extract_text: bool, extract_images:
source_id = row_data["source_id"]
# get text_depth
text_depth = kwargs.get("text_depth", "document")
text_depth = TextTypeEnum[text_depth.upper()]
text_depth = TextTypeEnum(text_depth)
# get base metadata
metadata_col = kwargs.get("metadata_column", "metadata")

docx_extractor_config = kwargs.get("docx_extraction_config", {})

base_unified_metadata = row_data[metadata_col] if metadata_col in row_data.index else {}

# get base source_metadata
Expand All @@ -103,7 +114,9 @@ def python_docx(docx: Union[str, Path, IO], extract_text: bool, extract_images:
}

# Extract data from the document using python-docx
doc = DocxReader(docx, source_metadata)
extracted_data = doc.extract_data(base_unified_metadata, text_depth, extract_text, extract_tables, extract_images)
doc = DocxReader(docx, source_metadata, extraction_config=docx_extractor_config)
extracted_data = doc.extract_data(
base_unified_metadata, text_depth, extract_text, extract_charts, extract_tables, extract_images
)

return extracted_data
Loading

0 comments on commit 9b1668d

Please sign in to comment.