Skip to content

Commit

Permalink
feat: make PageLayout.elements a cached property (#414)
Browse files Browse the repository at this point in the history
- default `PageLayout.get_elements_with_detection_model` now returns
`LayoutElements`
- `PageLayout.elements` is a cached property computed from the
`elements_array` property to save memory and CPU costs
  • Loading branch information
badGarnet authored Mar 5, 2025
1 parent 5d6e50b commit d41730d
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 10 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
## 0.8.8-dev0
## 0.8.8-dev1

* fix: pdfminer-six dependencies
* feat: `PageLayout.elements` is now a `cached_property` to reduce unnecessary memory and CPU costs

## 0.8.7

Expand Down
2 changes: 1 addition & 1 deletion test_unstructured_inference/inference/test_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def test_get_page_elements(monkeypatch, mock_final_layout):
)
elements = page.get_elements_with_detection_model(inplace=False)
page.get_elements_with_detection_model(inplace=True)
assert elements == page.elements
assert elements == page.elements_array


class MockPool:
Expand Down
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.8.8-dev0" # pragma: no cover
__version__ = "0.8.8-dev1" # pragma: no cover
19 changes: 12 additions & 7 deletions unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import os
import tempfile
from functools import cached_property
from pathlib import PurePath
from typing import Any, BinaryIO, Collection, List, Optional, Union, cast

Expand Down Expand Up @@ -149,7 +150,6 @@ def __init__(
self.number = number
self.detection_model = detection_model
self.element_extraction_model = element_extraction_model
self.elements: Collection[LayoutElement] = []
self.elements_array: LayoutElements | None = None
self.password = password
# NOTE(alan): Dropped LocationlessLayoutElement that was created for chipper - chipper has
Expand All @@ -159,10 +159,18 @@ def __init__(
def __str__(self) -> str:
return "\n\n".join([str(element) for element in self.elements])

@cached_property
def elements(self) -> Collection[LayoutElement]:
"""return a list of layout elements from the array data structure; intended for backward
compatibility"""
if self.elements_array is None:
return []
return self.elements_array.as_list()

def get_elements_using_image_extraction(
self,
inplace=True,
) -> Optional[List[LayoutElement]]:
) -> Optional[list[LayoutElement]]:
"""Uses end-to-end text element extraction model to extract the elements on the page."""
if self.element_extraction_model is None:
raise ValueError(
Expand All @@ -178,8 +186,7 @@ def get_elements_using_image_extraction(
def get_elements_with_detection_model(
self,
inplace: bool = True,
array_only: bool = False,
) -> Optional[List[LayoutElement]]:
) -> Optional[LayoutElements]:
"""Uses specified model to detect the elements on the page."""
if self.detection_model is None:
model = get_model()
Expand All @@ -198,11 +205,9 @@ def get_elements_with_detection_model(

if inplace:
self.elements_array = inferred_layout
if not array_only:
self.elements = inferred_layout.as_list()
return None

return inferred_layout.as_list()
return inferred_layout

def _get_image_array(self) -> Union[np.ndarray[Any, Any], None]:
"""Converts the raw image into a numpy array."""
Expand Down

0 comments on commit d41730d

Please sign in to comment.