Skip to content

Commit

Permalink
fix: Fixing images in the input Word files (#330)
Browse files Browse the repository at this point in the history
* Fixing image identification in the input Word files

Signed-off-by: Maksym Lysak <[email protected]>

* Populating extracted image data into the docling picture item for the Word (docx) backend

Signed-off-by: Maksym Lysak <[email protected]>

* Updated tests

Signed-off-by: Maksym Lysak <[email protected]>

* removed base64 dependency in msword_backend

Signed-off-by: Maksym Lysak <[email protected]>

---------

Signed-off-by: Maksym Lysak <[email protected]>
Co-authored-by: Maksym Lysak <[email protected]>
  • Loading branch information
maxmnemonic and Maksym Lysak authored Nov 14, 2024
1 parent bf2a85f commit 8533039
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 77 deletions.
37 changes: 26 additions & 11 deletions docling/backend/msword_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@
DoclingDocument,
DocumentOrigin,
GroupLabel,
ImageRef,
TableCell,
TableData,
)
from lxml import etree
from PIL import Image

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
Expand Down Expand Up @@ -130,13 +132,8 @@ def get_level(self) -> int:
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
for element in body:
tag_name = etree.QName(element).localname
# Check for Inline Images (drawings or blip elements)
found_drawing = etree.ElementBase.xpath(
element, ".//w:drawing", namespaces=self.xml_namespaces
)
found_pict = etree.ElementBase.xpath(
element, ".//w:pict", namespaces=self.xml_namespaces
)
# Check for Inline Images (blip elements)
drawing_blip = element.xpath(".//a:blip")

# Check for Tables
if element.tag.endswith("tbl"):
Expand All @@ -145,8 +142,8 @@ def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
except Exception:
_log.debug("could not parse a table, broken docx table")

elif found_drawing or found_pict:
self.handle_pictures(element, docx_obj, doc)
elif drawing_blip:
self.handle_pictures(element, docx_obj, drawing_blip, doc)
# Check for Text
elif tag_name in ["p"]:
self.handle_text_elements(element, docx_obj, doc)
Expand Down Expand Up @@ -491,6 +488,24 @@ def get_rowspan(cell):
doc.add_table(data=data, parent=self.parents[level - 1])
return

def handle_pictures(self, element, docx_obj, doc):
doc.add_picture(parent=self.parents[self.level], caption=None)
def handle_pictures(self, element, docx_obj, drawing_blip, doc):
    """Add a picture item to ``doc`` for an element that contains blips.

    The image bytes are looked up through the relationship id stored in the
    ``r:embed`` attribute of the first blip. If the bytes cannot be found
    (no ``r:embed`` attribute, or an id that is not in the document part's
    relationships) or Pillow cannot decode them, the picture is still added,
    just without image data, so one bad picture does not abort the whole
    conversion.

    Args:
        element: The body element that contains the picture (only forwarded
            to the inner helper; not inspected here).
        docx_obj: Document object whose ``part.rels`` maps relationship ids
            to relationships exposing ``target_part.blob``.
        drawing_blip: Sequence of blip elements found under ``element``;
            only the first one is used.
        doc: Target document; must provide ``add_picture``.
    """
    import logging

    log = logging.getLogger(__name__)
    embed_attr = (
        "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
    )

    def get_docx_image(element, drawing_blip):
        """Return the binary data of the first blip's image, or None."""
        if not drawing_blip:
            return None
        rId = drawing_blip[0].get(embed_attr)
        rels = docx_obj.part.rels
        if rId is None or rId not in rels:
            return None
        # Access the image part using the relationship ID
        return rels[rId].target_part.blob

    image_ref = None
    image_data = get_docx_image(element, drawing_blip)
    if image_data:
        try:
            # Open the BytesIO object with PIL to create an Image
            pil_image = Image.open(BytesIO(image_data))
            image_ref = ImageRef.from_pil(image=pil_image, dpi=72)
        except OSError as exc:
            # Pillow's UnidentifiedImageError is a subclass of OSError, so
            # this also covers formats Pillow cannot identify or decode.
            log.warning("image cannot be loaded by Pillow: %s", exc)
    else:
        log.warning("no embedded image data found for picture element")

    parent = self.parents[self.level]
    if image_ref is None:
        # Keep the picture's position in the document even without pixels.
        doc.add_picture(parent=parent, caption=None)
    else:
        doc.add_picture(parent=parent, image=image_ref, caption=None)
    return
2 changes: 1 addition & 1 deletion tests/data/groundtruth/docling_v2/word_sample.docx.itxt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Summer activities
item-2 at level 1: title: Swimming in the lake
item-3 at level 2: paragraph: Duck
item-4 at level 2: paragraph:
item-4 at level 2: picture
item-5 at level 2: paragraph: Figure 1: This is a cute duckling
item-6 at level 2: section_header: Let’s swim!
item-7 at level 3: paragraph: To get started with swimming, fi ... down in a water and try not to drown:
Expand Down
Loading

0 comments on commit 8533039

Please sign in to comment.