diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4423e3bb..44314686 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.7.24
+
+* fix: assign value to `text_as_html` element attribute only if `text` attribute contains HTML tags.
+
 ## 0.7.23
 
 * fix: added handling in `UnstructuredTableTransformerModel` for if `recognize` returns an empty
diff --git a/test_unstructured_inference/models/test_chippermodel.py b/test_unstructured_inference/models/test_chippermodel.py
index 065e24bc..c68aa6bc 100644
--- a/test_unstructured_inference/models/test_chippermodel.py
+++ b/test_unstructured_inference/models/test_chippermodel.py
@@ -3,6 +3,7 @@
 import pytest
 import torch
 from PIL import Image
 
+from unstructured_inference.inference.layoutelement import LayoutElement
 from unstructured_inference.models import chipper
 from unstructured_inference.models.base import get_model
@@ -422,3 +423,26 @@ def test_check_overlap(bbox1, bbox2, output):
     model = get_model("chipper")
 
     assert model.check_overlap(bbox1, bbox2) == output
+
+
+def test_format_table_elements():
+    table_html = "<table><tr><td>Cell 1</td><td>Cell 2</td></tr><tr><td>Cell 3</td></tr></table>"
+    texts = [
+        "Text",
+        " - List element",
+        table_html,
+        None,
+    ]
+    elements = [LayoutElement(bbox=mock.MagicMock(), text=text) for text in texts]
+    formatted_elements = chipper.UnstructuredChipperModel.format_table_elements(elements)
+    text_attributes = [fe.text for fe in formatted_elements]
+    text_as_html_attributes = [
+        fe.text_as_html if hasattr(fe, "text_as_html") else None for fe in formatted_elements
+    ]
+    assert text_attributes == [
+        "Text",
+        " - List element",
+        "Cell 1Cell 2Cell 3",
+        None,
+    ]
+    assert text_as_html_attributes == [None, None, table_html, None]
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
index b16a8c5b..688c38bb 100644
--- a/unstructured_inference/__version__.py
+++ b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.23"  # pragma: no cover
+__version__ = "0.7.24"  # pragma: no cover
diff --git a/unstructured_inference/models/chipper.py b/unstructured_inference/models/chipper.py
index 2df09c21..d147f8cb 100644
--- a/unstructured_inference/models/chipper.py
+++ b/unstructured_inference/models/chipper.py
@@ -171,16 +171,18 @@ def predict(self, image) -> List[LayoutElement]:
         return elements
 
     @staticmethod
-    def format_table_elements(elements):
-        """makes chipper table element return the same as other layout models
+    def format_table_elements(elements: List[LayoutElement]) -> List[LayoutElement]:
+        """Makes chipper table element return the same as other layout models.
 
-        - copies the html representation to attribute text_as_html
-        - strip html tags from the attribute text
+        1. If `text` attribute is an html (has html tags in it), copies the `text`
+        attribute to `text_as_html` attribute.
+        2. Strips html tags from the `text` attribute.
         """
         for element in elements:
-            element.text_as_html = element.text
-            element.text = strip_tags(element.text)
-
+            text = strip_tags(element.text) if element.text is not None else element.text
+            if text != element.text:
+                element.text_as_html = element.text  # type: ignore[attr-defined]
+            element.text = text
         return elements
 
     def predict_tokens(