From 4a2fd95fdd733749fd46a978b25fa6a7e83b44cb Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Fri, 22 Mar 2024 13:22:53 -0700 Subject: [PATCH] Fix: embedded text not getting merged with inferred elements (#331) This PR is the first part of fixing "embedded text not getting merged with inferred elements" and works together with the unstructured PR - https://github.com/Unstructured-IO/unstructured/pull/2679. ### Summary - replace `Rectangle.is_in()` with `Rectangle.is_almost_subregion_of()` when filling in an inferred element with embedded text - add env_config `EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD` ### Note The ingest test won't pass until we merge the unstructured PR - https://github.com/Unstructured-IO/unstructured/pull/2679. --- CHANGELOG.md | 3 ++- test_unstructured_inference/test_elements.py | 13 +++++++++++++ unstructured_inference/__version__.py | 2 +- unstructured_inference/config.py | 10 ++++++++++ unstructured_inference/inference/elements.py | 7 ++++++- 5 files changed, 32 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cd42a716..cf1430e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ -## 0.7.25-dev2 +## 0.7.25 +* fix: replace `Rectangle.is_in()` with `Rectangle.is_almost_subregion_of()` when filling in an inferred element with embedded text * bug: check for None in Chipper bounding box reduction * chore: removes `install-detectron2` from the `Makefile` * fix: convert label_map keys read from os.environment `UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH` to int type diff --git a/test_unstructured_inference/test_elements.py b/test_unstructured_inference/test_elements.py index 9e49accd..9d77562e 100644 --- a/test_unstructured_inference/test_elements.py +++ b/test_unstructured_inference/test_elements.py @@ -272,3 +272,16 @@ def test_merge_inferred_layout_with_extracted_layout(): assert merged_layout[0].text == "Example Section Header" assert merged_layout[1].type == ElementType.TEXT assert merged_layout[1].text == "Example Title" + + +def test_aggregate_by_block(): + expected = "Inside region1 Inside region2" + embedded_regions = [ + TextRegion.from_coords(0, 0, 20, 20, "Inside region1"), + TextRegion.from_coords(50, 50, 150, 150, "Inside region2"), + TextRegion.from_coords(250, 250, 350, 350, "Outside region"), + ] + target_region = TextRegion.from_coords(0, 0, 300, 300) + + text = elements.aggregate_by_block(target_region, embedded_regions) + assert text == expected diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 3bd31c04..85e12390 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.7.25-dev2" # pragma: no cover +__version__ = "0.7.25" # pragma: no cover diff --git a/unstructured_inference/config.py b/unstructured_inference/config.py index 0d85b6a3..dfdfdf67 100644 --- a/unstructured_inference/config.py +++ b/unstructured_inference/config.py @@ -91,6 +91,16 @@ def LAYOUT_SUBREGION_THRESHOLD(self) -> float: """ return self._get_float("LAYOUT_SUBREGION_THRESHOLD", 0.75) + @property + def EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD(self) -> float: + """threshold to determine if an embedded region is a sub-region of a given block + when aggregating the text from embedded elements that lie within the given block + + When the intersection region area divided by self area is larger than this threshold self is + considered a subregion of the other + """ + return self._get_float("EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD", 0.99) + @property def ELEMENTS_H_PADDING_COEF(self) -> float: """When extending the boundaries of a PDF object for the purpose of determining which other diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py index 86433430..62ff0a84 100644 --- a/unstructured_inference/inference/elements.py +++ b/unstructured_inference/inference/elements.py @@ -8,6 +8,7 @@ import numpy as np +from unstructured_inference.config import inference_config from unstructured_inference.constants import Source from unstructured_inference.math import safe_division @@ -246,8 +247,12 @@ def aggregate_by_block( ) -> str: """Extracts the text aggregated from the elements of the given layout that lie within the given block.""" + + subregion_threshold = inference_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD filtered_blocks = [ - obj for obj in pdf_objects if obj.bbox.is_in(text_region.bbox, error_margin=5) + obj + for obj in pdf_objects + if obj.bbox.is_almost_subregion_of(text_region.bbox, subregion_threshold) ] text = " ".join([x.text for x in filtered_blocks if x.text]) return text