Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix html_as_text appearing in every element metadata #319

Merged
merged 10 commits into from
Feb 7, 2024
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.7.24

* fix: assign value to `text_as_html` element attribute only if `text` attribute contains HTML tags.

## 0.7.23

* fix: added handling in `UnstructuredTableTransformerModel` for if `recognize` returns an empty
Expand Down
24 changes: 24 additions & 0 deletions test_unstructured_inference/models/test_chippermodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pytest
import torch
from PIL import Image
from unstructured_inference.inference.layoutelement import LayoutElement

from unstructured_inference.models import chipper
from unstructured_inference.models.base import get_model
Expand Down Expand Up @@ -422,3 +423,26 @@ def test_check_overlap(bbox1, bbox2, output):
model = get_model("chipper")

assert model.check_overlap(bbox1, bbox2) == output


def test_format_table_elements():
table_html = "<table><tr><td>Cell 1</td><td>Cell 2</td></tr><tr><td>Cell 3</td></tr></table>"
texts = [
"Text",
" - List element",
table_html,
None,
]
elements = [LayoutElement(bbox=mock.MagicMock(), text=text) for text in texts]
formatted_elements = chipper.UnstructuredChipperModel.format_table_elements(elements)
text_attributes = [fe.text for fe in formatted_elements]
text_as_html_attributes = [
fe.text_as_html if hasattr(fe, "text_as_html") else None for fe in formatted_elements
]
assert text_attributes == [
"Text",
" - List element",
"Cell 1Cell 2Cell 3",
None,
]
assert text_as_html_attributes == [None, None, table_html, None]
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.7.23" # pragma: no cover
__version__ = "0.7.24" # pragma: no cover
16 changes: 9 additions & 7 deletions unstructured_inference/models/chipper.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,16 +171,18 @@ def predict(self, image) -> List[LayoutElement]:
return elements

@staticmethod
def format_table_elements(elements):
"""makes chipper table element return the same as other layout models
def format_table_elements(elements: List[LayoutElement]) -> List[LayoutElement]:
"""Makes chipper table element return the same as other layout models.

- copies the html representation to attribute text_as_html
- strip html tags from the attribute text
1. If `text` attribute is an html (has html tags in it), copies the `text`
attribute to `text_as_html` attribute.
2. Strips html tags from the `text` attribute.
"""
for element in elements:
element.text_as_html = element.text
element.text = strip_tags(element.text)

text = strip_tags(element.text) if element.text is not None else element.text
if text != element.text:
element.text_as_html = element.text # type: ignore[attr-defined]
element.text = text
return elements

def predict_tokens(
Expand Down
Loading