From b132c68a328b29d17490a97ac5e58ddb756d2f3f Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Mon, 18 Nov 2024 14:06:43 +0100 Subject: [PATCH] Inferring image DPI from pptx file Signed-off-by: Maksym Lysak --- docling/backend/mspowerpoint_backend.py | 4 +++- .../groundtruth/docling_v2/powerpoint_with_image.pptx.json | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index fc59adb3e..f595e4bd4 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -273,6 +273,8 @@ def handle_pictures(self, shape, parent_slide, slide_ind, doc): # Get the image bytes image = shape.image image_bytes = image.blob + im_dpi, _ = image.dpi + # Open it with PIL pil_image = Image.open(BytesIO(image_bytes)) @@ -280,7 +282,7 @@ def handle_pictures(self, shape, parent_slide, slide_ind, doc): prov = self.generate_prov(shape, slide_ind, "") doc.add_picture( parent=parent_slide, - image=ImageRef.from_pil(image=pil_image, dpi=72), + image=ImageRef.from_pil(image=pil_image, dpi=im_dpi), caption=None, prov=prov, ) diff --git a/tests/data/groundtruth/docling_v2/powerpoint_with_image.pptx.json b/tests/data/groundtruth/docling_v2/powerpoint_with_image.pptx.json index 1bf93e8a7..eaa343f09 100644 --- a/tests/data/groundtruth/docling_v2/powerpoint_with_image.pptx.json +++ b/tests/data/groundtruth/docling_v2/powerpoint_with_image.pptx.json @@ -127,7 +127,7 @@ "footnotes": [], "image": { "mimetype": "image/png", - "dpi": 72, + "dpi": 300, "size": { "width": 268.0, "height": 268.0