From df1f7bcd0e529c7af5e6848b72c65152c8665653 Mon Sep 17 00:00:00 2001 From: Pluto Date: Thu, 25 Apr 2024 13:14:48 +0200 Subject: [PATCH] Save table prediction in cells format (#2892) This pull request allows to return predictions in raw cell representation from table transformer. It will be later used to save prediction in a cells format for simpler metrics calculation. This PR has to be merged, after https://github.com/Unstructured-IO/unstructured-inference/pull/335 --- CHANGELOG.md | 3 +- requirements/dev.txt | 4 - requirements/extra-pdf-image.in | 2 +- requirements/extra-pdf-image.txt | 36 +- requirements/huggingface.txt | 34 ++ requirements/ingest/embed-huggingface.txt | 34 ++ .../metrics/test_table_formats.py | 33 ++ .../layout-parser-paper-with-table.jpg.json | 170 ++++++++ .../layout-parser-paper.pdf.json | 382 ++++++++++++++++++ unstructured/__version__.py | 2 +- unstructured/documents/elements.py | 4 + unstructured/metrics/table/table_formats.py | 49 +++ unstructured/partition/common.py | 5 +- unstructured/partition/pdf_image/ocr.py | 20 +- 14 files changed, 764 insertions(+), 14 deletions(-) create mode 100644 test_unstructured/metrics/test_table_formats.py create mode 100644 unstructured/metrics/table/table_formats.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d83e74b440..31bd6638e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.13.4-dev2 +## 0.13.4-dev3 ### Enhancements * **Unique and deterministic hash IDs for elements** Element IDs produced by any partitioning @@ -8,6 +8,7 @@ * **Enable remote chunking via unstructured-ingest** Chunking using unstructured-ingest was previously limited to local chunking using the strategies `basic` and `by_title`. Remote chunking options via the API are now accessible. +* **Save table in cells format**. 
`UnstructuredTableTransformerModel` is able to return predicted table in cells format ### Features diff --git a/requirements/dev.txt b/requirements/dev.txt index 54f006385e..d910fdf455 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -9,10 +9,6 @@ anyio==3.7.1 # -c ././deps/constraints.txt # httpx # jupyter-server -appnope==0.1.4 - # via - # ipykernel - # ipython argon2-cffi==23.1.0 # via jupyter-server argon2-cffi-bindings==21.2.0 diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index b4e3f3b8ec..293237632d 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -9,7 +9,7 @@ pillow_heif pypdf # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. -unstructured-inference==0.7.27 +unstructured-inference==0.7.28 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats # from one tesseract call unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 2acc11122a..d1aabf8486 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -37,6 +37,7 @@ filelock==3.13.4 # huggingface-hub # torch # transformers + # triton flatbuffers==24.3.25 # via onnxruntime fonttools==4.51.0 @@ -114,6 +115,37 @@ numpy==1.26.4 # scipy # torchvision # transformers +nvidia-cublas-cu12==12.1.3.1 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via torch +nvidia-cuda-runtime-cu12==12.1.105 + # via torch +nvidia-cudnn-cu12==8.9.2.26 + # via torch +nvidia-cufft-cu12==11.0.2.54 + # via torch +nvidia-curand-cu12==10.3.2.106 + # via torch +nvidia-cusolver-cu12==11.4.5.107 + # via torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.19.3 + # via torch +nvidia-nvjitlink-cu12==12.4.127 
+ # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via torch omegaconf==2.3.0 # via effdet onnx==1.16.0 @@ -275,6 +307,8 @@ tqdm==4.66.2 # transformers transformers==4.40.0 # via unstructured-inference +triton==2.2.0 + # via torch typing-extensions==4.11.0 # via # -c ./base.txt @@ -284,7 +318,7 @@ typing-extensions==4.11.0 # torch tzdata==2024.1 # via pandas -unstructured-inference==0.7.27 +unstructured-inference==0.7.28 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.12 # via diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 73e03b0337..cd73ac4a0b 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -22,6 +22,7 @@ filelock==3.13.4 # huggingface-hub # torch # transformers + # triton fsspec==2024.3.1 # via # huggingface-hub @@ -54,6 +55,37 @@ numpy==1.26.4 # via # -c ./base.txt # transformers +nvidia-cublas-cu12==12.1.3.1 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via torch +nvidia-cuda-runtime-cu12==12.1.105 + # via torch +nvidia-cudnn-cu12==8.9.2.26 + # via torch +nvidia-cufft-cu12==11.0.2.54 + # via torch +nvidia-curand-cu12==10.3.2.106 + # via torch +nvidia-cusolver-cu12==11.4.5.107 + # via torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.19.3 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via torch packaging==23.2 # via # -c ././deps/constraints.txt @@ -100,6 +132,8 @@ tqdm==4.66.2 # transformers transformers==4.40.0 # via -r ./huggingface.in +triton==2.2.0 + # via torch typing-extensions==4.11.0 # via # -c ./base.txt diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt index 4360feb567..fae7b910af 100644 --- a/requirements/ingest/embed-huggingface.txt +++ 
b/requirements/ingest/embed-huggingface.txt @@ -32,6 +32,7 @@ filelock==3.13.4 # huggingface-hub # torch # transformers + # triton frozenlist==1.4.1 # via # aiohttp @@ -98,6 +99,37 @@ numpy==1.26.4 # scipy # sentence-transformers # transformers +nvidia-cublas-cu12==12.1.3.1 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via torch +nvidia-cuda-runtime-cu12==12.1.105 + # via torch +nvidia-cudnn-cu12==8.9.2.26 + # via torch +nvidia-cufft-cu12==11.0.2.54 + # via torch +nvidia-curand-cu12==10.3.2.106 + # via torch +nvidia-cusolver-cu12==11.4.5.107 + # via torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.19.3 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via torch orjson==3.10.1 # via langsmith packaging==23.2 @@ -168,6 +200,8 @@ tqdm==4.66.2 # transformers transformers==4.40.0 # via sentence-transformers +triton==2.2.0 + # via torch typing-extensions==4.11.0 # via # -c ./ingest/../base.txt diff --git a/test_unstructured/metrics/test_table_formats.py b/test_unstructured/metrics/test_table_formats.py new file mode 100644 index 0000000000..e72afd06b5 --- /dev/null +++ b/test_unstructured/metrics/test_table_formats.py @@ -0,0 +1,33 @@ +import pytest + +from unstructured.metrics.table.table_formats import SimpleTableCell + + +@pytest.mark.parametrize( + ("row_nums", "column_nums", "x", "y", "w", "h"), + [ + ([3, 2, 1], [6, 7], 6, 1, 2, 3), + ([2], [6, 7], 6, 2, 2, 1), + ([1, 2, 3], [20], 20, 1, 1, 3), + ([5], [5], 5, 5, 1, 1), + ], +) +def test_simple_table_cell_parsing_from_table_transformer_when_expected_input( + row_nums, column_nums, x, y, w, h +): + table_transformer_cell = {"row_nums": row_nums, "column_nums": column_nums, "cell text": "text"} + transformed_cell = 
SimpleTableCell.from_table_transformer_cell(table_transformer_cell) + expected_cell = SimpleTableCell(x=x, y=y, w=w, h=h, content="text") + assert expected_cell == transformed_cell + + +def test_simple_table_cell_parsing_from_table_transformer_when_missing_row_nums(): + cell = {"row_nums": [], "column_nums": [1], "cell text": "text"} + with pytest.raises(ValueError, match='has missing values under "row_nums" key'): + SimpleTableCell.from_table_transformer_cell(cell) + + +def test_simple_table_cell_parsing_from_table_transformer_when_missing_column_nums(): + cell = {"row_nums": [1], "column_nums": [], "cell text": "text"} + with pytest.raises(ValueError, match='has missing values under "column_nums" key'): + SimpleTableCell.from_table_transformer_cell(cell) diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json index 1b97d213ee..5575a6de9b 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json @@ -55,6 +55,176 @@ "eng" ], "page_number": 1, + "table_as_cells": [ + { + "content": "Dataset", + "h": 1, + "w": 1, + "x": 0, + "y": 0 + }, + { + "content": "PubLayNet [33]", + "h": 1, + "w": 1, + "x": 0, + "y": 1 + }, + { + "content": "PRImA [3]", + "h": 1, + "w": 1, + "x": 0, + "y": 2 + }, + { + "content": "Newspaper [17]", + "h": 1, + "w": 1, + "x": 0, + "y": 3 + }, + { + "content": "TableBank [18]", + "h": 1, + "w": 1, + "x": 0, + "y": 4 + }, + { + "content": "HIDataset [31]", + "h": 1, + "w": 1, + "x": 0, + "y": 5 + }, + { + "content": "| Base Model!|", + "h": 1, + "w": 1, + "x": 1, + "y": 0 
+ }, + { + "content": "P/M", + "h": 1, + "w": 1, + "x": 1, + "y": 1 + }, + { + "content": "M", + "h": 1, + "w": 1, + "x": 1, + "y": 2 + }, + { + "content": "P", + "h": 1, + "w": 1, + "x": 1, + "y": 3 + }, + { + "content": "P", + "h": 1, + "w": 1, + "x": 1, + "y": 4 + }, + { + "content": "P/M", + "h": 1, + "w": 1, + "x": 1, + "y": 5 + }, + { + "content": "Large Model", + "h": 1, + "w": 1, + "x": 2, + "y": 0 + }, + { + "content": "M", + "h": 1, + "w": 1, + "x": 2, + "y": 1 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 2, + "y": 2 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 2, + "y": 3 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 2, + "y": 4 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 2, + "y": 5 + }, + { + "content": "| Notes", + "h": 1, + "w": 1, + "x": 3, + "y": 0 + }, + { + "content": "Layouts of modern scientific documents", + "h": 1, + "w": 1, + "x": 3, + "y": 1 + }, + { + "content": "Layouts of scanned modern magazines and scientific reports", + "h": 1, + "w": 1, + "x": 3, + "y": 2 + }, + { + "content": "Layouts of scanned US newspapers from the 20th century", + "h": 1, + "w": 1, + "x": 3, + "y": 3 + }, + { + "content": "Table region on modern scientific and business document", + "h": 1, + "w": 1, + "x": 3, + "y": 4 + }, + { + "content": "Layouts of history Japanese documents", + "h": 1, + "w": 1, + "x": 3, + "y": 5 + } + ], "text_as_html": "
Dataset| Base Model!|Large Model| Notes
PubLayNet [33]P/MMLayouts of modern scientific documents
PRImA [3]MLayouts of scanned modern magazines and scientific reports
Newspaper [17]PLayouts of scanned US newspapers from the 20th century
TableBank [18]PTable region on modern scientific and business document
HIDataset [31]P/MLayouts of history Japanese documents
" }, "text": "Dataset | Base Model\" Large Model | Notes PubLayNet [38] P/M M Layouts of modern scientific documents PRImA [3) M - Layouts of scanned modern magazines and scientific reports Newspaper [17] P - Layouts of scanned US newspapers from the 20th century ‘TableBank (18) P P Table region on modern scientific and business document HJDataset (31) | F/M - Layouts of history Japanese documents", diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index d28eecd582..ec397366ad 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -775,6 +775,134 @@ "eng" ], "page_number": 5, + "table_as_cells": [ + { + "content": "Dataset", + "h": 1, + "w": 1, + "x": 0, + "y": 0 + }, + { + "content": "PubLayNet B8]|", + "h": 1, + "w": 1, + "x": 0, + "y": 1 + }, + { + "content": "PRImA", + "h": 1, + "w": 1, + "x": 0, + "y": 2 + }, + { + "content": "Newspaper", + "h": 1, + "w": 1, + "x": 0, + "y": 3 + }, + { + "content": "TableBank", + "h": 1, + "w": 1, + "x": 0, + "y": 4 + }, + { + "content": "HJDataset", + "h": 1, + "w": 1, + "x": 0, + "y": 5 + }, + { + "content": "| Base Model'|", + "h": 1, + "w": 1, + "x": 1, + "y": 0 + }, + { + "content": "F/M", + "h": 1, + "w": 1, + "x": 1, + "y": 1 + }, + { + "content": "M", + "h": 1, + "w": 1, + "x": 1, + "y": 2 + }, + { + "content": "F", + "h": 1, + "w": 1, + "x": 1, + "y": 3 + }, + { + "content": "F", + "h": 1, + "w": 1, + "x": 1, + "y": 4 + }, + { + "content": "F/M", + "h": 1, + "w": 1, + "x": 1, + "y": 5 + }, + { + "content": "| Notes", + "h": 1, + "w": 1, + "x": 2, + "y": 0 + }, + { + 
"content": "Layouts of modern scientific documents", + "h": 1, + "w": 1, + "x": 2, + "y": 1 + }, + { + "content": "Layouts of scanned modern magazines and scientific report", + "h": 1, + "w": 1, + "x": 2, + "y": 2 + }, + { + "content": "Layouts of scanned US newspapers from the 20th century", + "h": 1, + "w": 1, + "x": 2, + "y": 3 + }, + { + "content": "Table region on modern scientific and business document", + "h": 1, + "w": 1, + "x": 2, + "y": 4 + }, + { + "content": "Layouts of history Japanese documents", + "h": 1, + "w": 1, + "x": 2, + "y": 5 + } + ], "text_as_html": "
Dataset| Base Model'|| Notes
PubLayNet B8]|F/MLayouts of modern scientific documents
PRImAMLayouts of scanned modern magazines and scientific report
NewspaperFLayouts of scanned US newspapers from the 20th century
TableBankFTable region on modern scientific and business document
HJDatasetF/MLayouts of history Japanese documents
" }, "text": "Dataset Base Model1 Large Model Notes PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31] F / M M F F F / M M - - F - Layouts of modern scientific documents Layouts of scanned modern magazines and scientific reports Layouts of scanned US newspapers from the 20th century Table region on modern scientific and business document Layouts of history Japanese documents", @@ -1276,6 +1404,260 @@ "eng" ], "page_number": 8, + "table_as_cells": [ + { + "content": "block.pad(top, bottom,", + "h": 1, + "w": 1, + "x": 0, + "y": 0 + }, + { + "content": "block.scale(fx, fy)", + "h": 1, + "w": 1, + "x": 0, + "y": 1 + }, + { + "content": "block.shift(dx, dy)", + "h": 1, + "w": 1, + "x": 0, + "y": 2 + }, + { + "content": "block1.is_in(block2)", + "h": 1, + "w": 1, + "x": 0, + "y": 3 + }, + { + "content": "block1. intersect (block2)", + "h": 1, + "w": 1, + "x": 0, + "y": 4 + }, + { + "content": "block1.union(block2)", + "h": 1, + "w": 1, + "x": 0, + "y": 5 + }, + { + "content": "block1.relative_to(block2)", + "h": 1, + "w": 1, + "x": 0, + "y": 6 + }, + { + "content": "block1.condition_on(block2)", + "h": 1, + "w": 1, + "x": 0, + "y": 7 + }, + { + "content": "block. 
crop_image (image)", + "h": 1, + "w": 1, + "x": 0, + "y": 8 + }, + { + "content": "right,", + "h": 1, + "w": 1, + "x": 1, + "y": 0 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 1, + "y": 1 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 1, + "y": 2 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 1, + "y": 3 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 1, + "y": 4 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 1, + "y": 5 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 1, + "y": 6 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 1, + "y": 7 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 1, + "y": 8 + }, + { + "content": "left)", + "h": 1, + "w": 1, + "x": 2, + "y": 0 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 2, + "y": 1 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 2, + "y": 2 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 2, + "y": 3 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 2, + "y": 4 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 2, + "y": 5 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 2, + "y": 6 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 2, + "y": 7 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 2, + "y": 8 + }, + { + "content": "Enlarge the current block according to the input", + "h": 1, + "w": 1, + "x": 3, + "y": 0 + }, + { + "content": "Scale the current block given the ratio in x and y direction", + "h": 1, + "w": 1, + "x": 3, + "y": 1 + }, + { + "content": "Move the current block with the shift distances in x and y direction", + "h": 1, + "w": 1, + "x": 3, + "y": 2 + }, + { + "content": "Whether block] is inside of block2", + "h": 1, + "w": 1, + "x": 3, + "y": 3 + }, + { + "content": "Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs", + "h": 1, + "w": 1, + "x": 3, + "y": 4 + }, + { + "content": "Return the union region of blockl and block2. 
Coordinate type to be determined based on the inputs", + "h": 1, + "w": 1, + "x": 3, + "y": 5 + }, + { + "content": "Convert the absolute coordinates of block to relative coordinates to block2", + "h": 1, + "w": 1, + "x": 3, + "y": 6 + }, + { + "content": "Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates", + "h": 1, + "w": 1, + "x": 3, + "y": 7 + }, + { + "content": "Obtain the image segments in the block region", + "h": 1, + "w": 1, + "x": 3, + "y": 8 + } + ], "text_as_html": "
block.pad(top, bottom,right,left)Enlarge the current block according to the input
block.scale(fx, fy)Scale the current block given the ratio in x and y direction
block.shift(dx, dy)Move the current block with the shift distances in x and y direction
block1.is_in(block2)Whether block] is inside of block2
block1. intersect (block2)Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs
block1.union(block2)Return the union region of blockl and block2. Coordinate type to be determined based on the inputs
block1.relative_to(block2)Convert the absolute coordinates of block to relative coordinates to block2
block1.condition_on(block2)Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates
block. crop_image (image)Obtain the image segments in the block region
" }, "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio in x and y direction block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is in(block2) Whether block1 is inside of block2 block1.intersect(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.relative to(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.condition on(block2) Calculate the absolute coordinates of block1 given the canvas block2’s absolute coordinates block.crop image(image) Obtain the image segments in the block region", diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 2935ce42cb..86cab6287d 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.13.4-dev2" # pragma: no cover +__version__ = "0.13.4-dev3" # pragma: no cover diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index a120a2def9..976ab1271b 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -201,6 +201,7 @@ class ElementMetadata: # -- used for Table elements to capture rows/col structure -- text_as_html: Optional[str] + table_as_cells: Optional[dict[str, str | int]] url: Optional[str] # -- debug fields can be assigned and referenced using dotted-notation but are not serialized @@ -239,6 +240,7 @@ def __init__( signature: Optional[str] = None, subject: Optional[str] = None, text_as_html: Optional[str] = None, + table_as_cells: Optional[dict[str, str | int]] = None, url: Optional[str] = None, ) -> None: self.attached_to_filename = attached_to_filename @@ -278,6 +280,7 @@ def __init__( 
self.signature = signature self.subject = subject self.text_as_html = text_as_html + self.table_as_cells = table_as_cells self.url = url def __eq__(self, other: object) -> bool: @@ -490,6 +493,7 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]: "signature": cls.FIRST, "subject": cls.FIRST, "text_as_html": cls.FIRST, # -- only occurs in Table -- + "table_as_cells": cls.FIRST, # -- only occurs in Table -- "url": cls.FIRST, } diff --git a/unstructured/metrics/table/table_formats.py b/unstructured/metrics/table/table_formats.py new file mode 100644 index 0000000000..6d8b134dd2 --- /dev/null +++ b/unstructured/metrics/table/table_formats.py @@ -0,0 +1,49 @@ +from dataclasses import dataclass +from typing import Union + + +@dataclass +class SimpleTableCell: + x: int + y: int + w: int + h: int + content: str = "" + + def to_dict(self): + return { + "x": self.x, + "y": self.y, + "w": self.w, + "h": self.h, + "content": self.content, + } + + @classmethod + def from_table_transformer_cell(cls, tatr_table_cell: dict[str, Union[list[int], str]]): + """ + Args: + tatr_table_cell (dict): + Cell in a format returned by Table Transformer model, for example: + { + "row_nums": [1,2,3], + "column_nums": [2], + "cell text": "Text inside cell" + } + """ + + row_nums = tatr_table_cell.get("row_nums", []) + column_nums = tatr_table_cell.get("column_nums", []) + + if not row_nums: + raise ValueError(f'Cell {tatr_table_cell} has missing values under "row_nums" key') + if not column_nums: + raise ValueError(f'Cell {tatr_table_cell} has missing values under "column_nums" key') + + return cls( + x=min(column_nums), + y=min(row_nums), + w=len(column_nums), + h=len(row_nums), + content=tatr_table_cell.get("cell text", ""), + ) diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index 3e79573437..61b3bde1e9 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -577,9 +577,8 @@ def 
document_to_element_list( else: if last_modification_date: element.metadata.last_modified = last_modification_date - element.metadata.text_as_html = ( - layout_element.text_as_html if hasattr(layout_element, "text_as_html") else None - ) + element.metadata.text_as_html = getattr(layout_element, "text_as_html", None) + element.metadata.table_as_cells = getattr(layout_element, "table_as_cells", None) try: if ( isinstance(element, Title) and element.metadata.category_depth is None diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py index 1883fbe3aa..05c000578b 100644 --- a/unstructured/partition/pdf_image/ocr.py +++ b/unstructured/partition/pdf_image/ocr.py @@ -13,6 +13,7 @@ from unstructured.documents.elements import ElementType from unstructured.logger import logger +from unstructured.metrics.table.table_formats import SimpleTableCell from unstructured.partition.pdf_image.pdf_image_utils import pad_element_bboxes, valid_text from unstructured.partition.utils.config import env_config from unstructured.partition.utils.constants import ( @@ -255,6 +256,7 @@ def supplement_page_layout_with_ocr( return page_layout +@requires_dependencies("unstructured_inference") def supplement_element_with_table_extraction( elements: List["LayoutElement"], image: PILImage.Image, @@ -264,9 +266,11 @@ def supplement_element_with_table_extraction( extracted_regions: Optional[List["TextRegion"]] = None, ) -> List["LayoutElement"]: """Supplement the existing layout with table extraction. Any Table elements - that are extracted will have a metadata field "text_as_html" where - the table's text content is rendered into an html string. 
+ that are extracted will have metadata fields "text_as_html" where + the table's text content is rendered into an html string and "table_as_cells" + with the raw table cells output from the table agent """ + from unstructured_inference.models.tables import cells_to_html table_elements = [el for el in elements if el.type == ElementType.TABLE] for element in table_elements: @@ -287,7 +291,17 @@ def supplement_element_with_table_extraction( extracted_regions=extracted_regions, table_element=padded_element, ) - element.text_as_html = tables_agent.predict(cropped_image, ocr_tokens=table_tokens) + tatr_cells = tables_agent.predict( + cropped_image, ocr_tokens=table_tokens, result_format="cells" + ) + text_as_html = cells_to_html(tatr_cells) + simple_table_cells = [ + SimpleTableCell.from_table_transformer_cell(cell).to_dict() for cell in tatr_cells + ] + + element.text_as_html = text_as_html + element.table_as_cells = simple_table_cells + return elements