Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: embedded text not getting merged with inferred elements #331

3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
## 0.7.25-dev2
## 0.7.25

* fix: replace `Rectangle.is_in()` with `Rectangle.is_almost_subregion_of()` when filling in an inferred element with embedded text
* bug: check for None in Chipper bounding box reduction
* chore: removes `install-detectron2` from the `Makefile`
* fix: convert label_map keys read from the environment variable `UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH` to int type
Expand Down
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.7.25-dev2" # pragma: no cover
__version__ = "0.7.25" # pragma: no cover
10 changes: 10 additions & 0 deletions unstructured_inference/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,16 @@ def LAYOUT_SUBREGION_THRESHOLD(self) -> float:
"""
return self._get_float("LAYOUT_SUBREGION_THRESHOLD", 0.75)

@property
def EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD(self) -> float:
"""threshold to determine if an embedded region is a sub-region of a given block
when aggregating the text from embedded elements that lie within the given block

When the area of the intersection divided by the embedded region's own area is larger than this
threshold, the embedded region is considered a subregion of the block
"""
return self._get_float("EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD", 0.99)

@property
def ELEMENTS_H_PADDING_COEF(self) -> float:
"""When extending the boundaries of a PDF object for the purpose of determining which other
Expand Down
7 changes: 6 additions & 1 deletion unstructured_inference/inference/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import numpy as np

from unstructured_inference.config import inference_config
from unstructured_inference.constants import Source
from unstructured_inference.math import safe_division

Expand Down Expand Up @@ -246,8 +247,12 @@ def aggregate_by_block(
) -> str:
"""Extracts the text aggregated from the elements of the given layout that lie within the given
block."""

subregion_threshold = inference_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD
filtered_blocks = [
obj for obj in pdf_objects if obj.bbox.is_in(text_region.bbox, error_margin=5)
obj
for obj in pdf_objects
if obj.bbox.is_almost_subregion_of(text_region.bbox, subregion_threshold)
]
text = " ".join([x.text for x in filtered_blocks if x.text])
return text
Expand Down
Loading