Skip to content

Commit

Permalink
feat: add env_config EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD
Browse files Browse the repository at this point in the history
  • Loading branch information
christinestraub committed Mar 22, 2024
1 parent b4c0f9f commit cadc5b2
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 8 deletions.
10 changes: 10 additions & 0 deletions unstructured_inference/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,16 @@ def LAYOUT_SUBREGION_THRESHOLD(self) -> float:
"""
return self._get_float("LAYOUT_SUBREGION_THRESHOLD", 0.75)

@property
def EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD(self) -> float:
    """Threshold for treating an embedded region as a sub-region of a given block.

    Used when aggregating the text from embedded elements that lie within the given block.
    When the intersection area divided by the embedded region's own area is larger than
    this threshold, the embedded region is considered a subregion of the block.

    The value is looked up through ``self._get_float`` under the key
    ``EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD``, with ``0.95`` passed as the fallback.
    """
    return self._get_float("EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD", 0.95)

@property
def ELEMENTS_H_PADDING_COEF(self) -> float:
"""When extending the boundaries of a PDF object for the purpose of determining which other
Expand Down
16 changes: 8 additions & 8 deletions unstructured_inference/inference/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,11 +129,7 @@ def intersection_over_minimum(self, other: Rectangle) -> float:
min_area = min(self.area, other.area)
return safe_division(intersection_area, min_area)

def is_almost_subregion_of(
self,
other: Rectangle,
subregion_threshold: float = inference_config.LAYOUT_SUBREGION_THRESHOLD,
) -> bool:
def is_almost_subregion_of(self, other: Rectangle, subregion_threshold: float = 0.75) -> bool:
"""Returns whether this region is almost a subregion of other. This is determined by
comparing the intersection area over self area to some threshold, and checking whether self
is the smaller rectangle."""
Expand Down Expand Up @@ -246,13 +242,17 @@ def extract_text(


def aggregate_by_block(
    target_region: TextRegion,
    pdf_objects: Collection[TextRegion],
) -> str:
    """Extract the text aggregated from the embedded objects that lie within the given block.

    An object from ``pdf_objects`` is kept when its ``bbox`` reports, via
    ``is_almost_subregion_of``, that it is almost a subregion of ``target_region.bbox`` at the
    threshold read from ``inference_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD``.
    Kept objects whose ``text`` is falsy (empty or ``None``) are left out, and the remaining
    texts are joined with single spaces in the iteration order of ``pdf_objects``.
    """
    # A dedicated threshold is used here rather than the default of is_almost_subregion_of.
    subregion_threshold = inference_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD
    filtered_blocks = [
        obj
        for obj in pdf_objects
        if obj.bbox.is_almost_subregion_of(target_region.bbox, subregion_threshold)
    ]
    text = " ".join([x.text for x in filtered_blocks if x.text])
    return text
Expand Down Expand Up @@ -288,7 +288,7 @@ def remove_control_characters(text: str) -> str:
def region_bounding_boxes_are_almost_the_same(
region1: Rectangle,
region2: Rectangle,
same_region_threshold: float = inference_config.LAYOUT_SAME_REGION_THRESHOLD,
same_region_threshold: float = 0.75,
) -> bool:
"""Returns whether bounding boxes are almost the same. This is determined by checking if the
intersection over union is above some threshold."""
Expand Down

0 comments on commit cadc5b2

Please sign in to comment.