Skip to content

Commit

Permalink
override reading order by global block order
Browse files (browse the repository at this point in the history)
  • Loading branch information
bertsky committed Aug 7, 2024
1 parent 93d766c commit 23e7248
Showing 1 changed file with 7 additions and 1 deletion.
8 changes: 7 additions & 1 deletion textract2page/convert_aws.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -724,7 +724,9 @@ def convert_file(json_path: str, img_path: str, out_path: str) -> None:
key_value_set_blocks,
layout_blocks,
) = ({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})
for block in aws_json["Blocks"]:
block_order = {}
for order, block in enumerate(aws_json["Blocks"]):
block_order[block["Id"]] = order
if block["BlockType"] == "PAGE":
assert not page_block, "page must not have more than 1 PAGE block"
page_block = block
Expand Down Expand Up @@ -795,6 +797,10 @@ def convert_file(json_path: str, img_path: str, out_path: str) -> None:

# reading order of top-level objects
textract_objects_in_reading_order = derive_reading_order(words.values())
def aws_block_order(obj):
return block_order[obj.id]
textract_objects_in_reading_order = sorted(textract_objects_in_reading_order,
key=aws_block_order)

# build PRIMAPageXML
pil_img = Image.open(img_path)
Expand Down

0 comments on commit 23e7248

Please sign in to comment.