Skip to content

Commit

Permalink
Merge pull request #137 from funstory-ai/better-doclayout
Browse files Browse the repository at this point in the history
Refactoring (layout parser): Optimizing batch processing
  • Loading branch information
awwaawwa authored Mar 7, 2025
2 parents 0ca4d8f + 67d227b commit 78172bf
Show file tree
Hide file tree
Showing 10 changed files with 328 additions and 105 deletions.
2 changes: 1 addition & 1 deletion babeldoc/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.1.26"
__version__ = "0.1.27"
2 changes: 1 addition & 1 deletion babeldoc/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import subprocess
from pathlib import Path

__version__ = "0.1.26"
__version__ = "0.1.27"

CACHE_FOLDER = Path.home() / ".cache" / "babeldoc"

Expand Down
2 changes: 2 additions & 0 deletions babeldoc/document_il/midend/il_translator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import concurrent.futures
import json
import logging
import re
from pathlib import Path

from tqdm import tqdm
Expand Down Expand Up @@ -555,6 +556,7 @@ def translate_paragraph(
return

translated_text = self.translate_engine.translate(text)
translated_text = re.sub(r"[. 。…]{20,}", ".", translated_text)

tracker.set_output(translated_text)

Expand Down
94 changes: 33 additions & 61 deletions babeldoc/document_il/midend/layout_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,71 +110,43 @@ def _save_debug_box_to_page(self, page: il_version_1.Page):
def process(self, docs: il_version_1.Document, mupdf_doc: Document):
"""Generate layouts for all pages that need to be translated."""
# Get pages that need to be translated
pages_to_translate = [
page
for page in docs.page
if self.translation_config.should_translate_page(page.page_number + 1)
]
total = len(pages_to_translate)
total = len(docs.page)
with self.translation_config.progress_monitor.stage_start(
self.stage_name,
total,
) as progress:
# Process pages in batches
batch_size = 1
for i in range(0, total, batch_size):
self.translation_config.raise_if_cancelled()
batch_pages = pages_to_translate[i : i + batch_size]

# Prepare batch images
batch_images = []
for page in batch_pages:
pix = mupdf_doc[page.page_number].get_pixmap(dpi=72)
image = np.fromstring(pix.samples, np.uint8).reshape(
pix.height,
pix.width,
3,
)[:, :, ::-1]
batch_images.append(image)

# Get predictions for the batch
layouts_batch = self.model.predict(batch_images, batch_size=batch_size)

# Process predictions for each page
for page, layouts in zip(batch_pages, layouts_batch, strict=False):
page_layouts = []
self._save_debug_image(
batch_images[batch_pages.index(page)],
layouts,
page.page_number + 1,
# Process predictions for each page
for page, layouts in self.model.handle_document(
docs.page, mupdf_doc, self.translation_config, self._save_debug_image
):
page_layouts = []
for layout in layouts.boxes:
# Convert the box from image (picture) pixel coordinates
# to the il coordinate system (the y axis is flipped)
x0, y0, x1, y1 = layout.xyxy
pix = mupdf_doc[page.page_number].get_pixmap()
h, w = pix.height, pix.width
x0, y0, x1, y1 = (
np.clip(int(x0 - 1), 0, w - 1),
np.clip(int(h - y1 - 1), 0, h - 1),
np.clip(int(x1 + 1), 0, w - 1),
np.clip(int(h - y0 + 1), 0, h - 1),
)
for layout in layouts.boxes:
# Convert coordinate system from picture to il
# system to the il coordinate system
x0, y0, x1, y1 = layout.xyxy
pix = mupdf_doc[page.page_number].get_pixmap()
h, w = pix.height, pix.width
x0, y0, x1, y1 = (
np.clip(int(x0 - 1), 0, w - 1),
np.clip(int(h - y1 - 1), 0, h - 1),
np.clip(int(x1 + 1), 0, w - 1),
np.clip(int(h - y0 + 1), 0, h - 1),
)
page_layout = il_version_1.PageLayout(
id=len(page_layouts) + 1,
box=il_version_1.Box(
x0.item(),
y0.item(),
x1.item(),
y1.item(),
),
conf=layout.conf.item(),
class_name=layouts.names[layout.cls],
)
page_layouts.append(page_layout)
page_layout = il_version_1.PageLayout(
id=len(page_layouts) + 1,
box=il_version_1.Box(
x0.item(),
y0.item(),
x1.item(),
y1.item(),
),
conf=layout.conf.item(),
class_name=layouts.names[layout.cls],
)
page_layouts.append(page_layout)

page.page_layout = page_layouts
self._save_debug_box_to_page(page)
progress.advance(1)
page.page_layout = page_layouts
self._save_debug_box_to_page(page)
progress.advance(1)

return docs
return docs
2 changes: 2 additions & 0 deletions babeldoc/document_il/midend/paragraph_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@ def process_paragraph_spacing(self, paragraph: PdfParagraph):
def is_text_layout(self, layout: Layout):
return layout is not None and layout.name in [
"plain text",
"tiny text",
"title",
"abandon",
"figure_caption",
Expand Down Expand Up @@ -312,6 +313,7 @@ def _get_layout(
"figure_caption",
"abandon",
"plain text",
"tiny text",
"title",
]
char_box = char.box
Expand Down
50 changes: 50 additions & 0 deletions babeldoc/document_il/translator/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from abc import ABC
from abc import abstractmethod

import httpx
import openai
import requests

Expand Down Expand Up @@ -271,3 +272,52 @@ def get_rich_text_left_placeholder(self, placeholder_id: int):

def get_rich_text_right_placeholder(self, placeholder_id: int):
return self.get_formular_placeholder(placeholder_id + 1)


class TranslateTranslator(BaseTranslator):
    """Translator that POSTs text to an HTTP translation endpoint via httpx.

    The endpoint is expected to accept a JSON body with ``text``, ``src`` and
    ``tgt`` keys and to answer with a JSON object whose ``text`` entry is a
    list of translations (one per input string).
    """

    # NOTE(review): this label (and the openai-python link that used to sit
    # here) looks copy-pasted from an OpenAI-based translator; the class itself
    # never calls the OpenAI API. Kept unchanged because the name may be used
    # as an identifier elsewhere (e.g. cache keys) - confirm before renaming.
    name = "openai"

    def __init__(
        self,
        lang_in,
        lang_out,
        url=None,
        ignore_cache=False,
    ):
        """Create the translator.

        Args:
            lang_in: Source language, forwarded to ``BaseTranslator``.
            lang_out: Target language, forwarded to ``BaseTranslator``.
            url: Address of the translation endpoint that ``do_translate``
                posts to.
            ignore_cache: Forwarded to ``BaseTranslator``.
        """
        super().__init__(lang_in, lang_out, ignore_cache)
        self.client = httpx.Client()
        self.url = url

    def do_translate(self, text) -> str:
        """Send ``text`` to the endpoint and return the first translation.

        Raises:
            httpx.HTTPStatusError: If the endpoint answers with a 4xx/5xx
                status, instead of failing later while parsing the body.
        """
        response = self.client.post(
            self.url,
            json={
                "text": [text],
                # NOTE(review): both values look misspelled ("English",
                # "Simplified Chinese") and ignore lang_in/lang_out. They are
                # kept byte-identical because the server-side contract is not
                # visible here - confirm against the endpoint before fixing.
                "src": "Englih",
                "tgt": "Simplifed Chinese",
            },
        )
        # Surface HTTP errors explicitly rather than as a KeyError or a JSON
        # decoding error on an error page.
        response.raise_for_status()
        return response.json()["text"][0]

    def prompt(self, text):
        """Build a chat-style (system + user) prompt asking for a translation of ``text``."""
        return [
            {
                "role": "system",
                "content": "You are a professional,authentic machine translation engine.",
            },
            {
                "role": "user",
                "content": f";; Treat next line as plain text input and translate it into {self.lang_out}, output translation ONLY. If translation is unnecessary (e.g. proper nouns, codes, {'{{1}}, etc. '}), return the original text. NO explanations. NO notes. Input:\n\n{text}",
            },
        ]

    def get_formular_placeholder(self, placeholder_id: int):
        """Return the formula placeholder token, e.g. ``{{v3}}`` for id 3."""
        # An unreachable second return (the "{{<id>}}" form without the "v")
        # followed this line and has been removed; behaviour is unchanged.
        return "{{v" + str(placeholder_id) + "}}"

    def get_rich_text_left_placeholder(self, placeholder_id: int):
        """Return the opening rich-text placeholder for ``placeholder_id``."""
        return self.get_formular_placeholder(placeholder_id)

    def get_rich_text_right_placeholder(self, placeholder_id: int):
        """Return the closing rich-text placeholder (uses ``placeholder_id + 1``)."""
        return self.get_formular_placeholder(placeholder_id + 1)
66 changes: 54 additions & 12 deletions babeldoc/docvision/doclayout.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,35 @@
import ast
import logging
import platform
from collections.abc import Generator

import cv2
import numpy as np
import onnx
import onnxruntime
import pymupdf

import babeldoc.document_il.il_version_1
from babeldoc.assets.assets import get_doclayout_onnx_model_path

# from huggingface_hub import hf_hub_download

logger = logging.getLogger(__name__)


class YoloResult:
    """Container for the detections produced by the ONNX layout model.

    Attributes are ``boxes`` (sorted by descending ``conf``) and ``names``.
    """

    def __init__(self, names, boxes=None, boxes_data=None):
        """Store ``names`` and the boxes, ordered from most to least confident.

        Either ``boxes`` (ready-made box objects) or ``boxes_data`` (raw rows,
        each wrapped in a ``YoloBox``) must be supplied; ``boxes`` wins when
        both are given. A supplied ``boxes`` list is sorted in place.
        """
        if boxes is None:
            assert boxes_data is not None
            boxes = [YoloBox(data=row) for row in boxes_data]
        self.boxes = boxes
        # Highest-confidence detections first.
        self.boxes.sort(key=lambda box: box.conf, reverse=True)
        self.names = names


class DocLayoutModel(abc.ABC):
@staticmethod
def load_onnx():
Expand Down Expand Up @@ -42,18 +58,19 @@ def predict(self, image: bytes, imgsz: int = 1024, **kwargs) -> list[int]:
**kwargs: Additional arguments.
"""


class YoloResult:
"""Helper class to store detection results from ONNX model."""

def __init__(self, names, boxes=None, boxes_data=None):
if boxes is not None:
self.boxes = boxes
else:
assert boxes_data is not None
self.boxes = [YoloBox(data=d) for d in boxes_data]
self.boxes.sort(key=lambda x: x.conf, reverse=True)
self.names = names
@abc.abstractmethod
def handle_document(
    self,
    pages: list[babeldoc.document_il.il_version_1.Page],
    mupdf_doc: pymupdf.Document,
    translate_config,
    save_debug_image,
) -> Generator[
    tuple[babeldoc.document_il.il_version_1.Page, YoloResult], None, None
]:
    """
    Run layout detection over the pages of a document.

    Args:
        pages: The IL pages to analyse.
        mupdf_doc: The PyMuPDF document the pages belong to.
        translate_config: Translation configuration object; presumably
            used by implementations for cancellation checks - confirm
            against the concrete subclass.
        save_debug_image: Callback for saving debug images; presumably
            invoked once per page by implementations - confirm against
            the concrete subclass.

    Yields:
        ``(page, result)`` tuples pairing each page with its
        ``YoloResult``.
    """


class YoloBox:
Expand Down Expand Up @@ -254,3 +271,28 @@ def predict(self, image, imgsz=800, batch_size=16, **kwargs):
results.append(YoloResult(boxes_data=preds, names=self._names))

return results

def handle_document(
    self,
    pages: list[babeldoc.document_il.il_version_1.Page],
    mupdf_doc: pymupdf.Document,
    translate_config,
    save_debug_image,
) -> Generator[
    tuple[babeldoc.document_il.il_version_1.Page, YoloResult], None, None
]:
    """Render each page, run layout prediction on it and yield the result.

    Args:
        pages: IL pages to process; ``page.page_number`` indexes ``mupdf_doc``.
        mupdf_doc: PyMuPDF document used to render the pages at 72 dpi.
        translate_config: Object whose ``raise_if_cancelled()`` is called
            before every page so a cancelled job stops promptly.
        save_debug_image: Callback invoked as
            ``save_debug_image(image, predict_result, page_number + 1)``.

    Yields:
        ``(page, predict_result)`` for every page, in input order.
    """
    for page in pages:
        translate_config.raise_if_cancelled()
        pix = mupdf_doc[page.page_number].get_pixmap(dpi=72)
        # np.fromstring is deprecated for binary input; np.frombuffer is the
        # replacement. frombuffer returns a read-only view of the bytes, so
        # copy first to keep the array writable, as fromstring's result was.
        pixels = np.frombuffer(pix.samples, np.uint8).copy()
        # Assumes a 3-channel pixmap (no alpha); the last axis is reversed,
        # presumably RGB -> BGR for the model - TODO confirm.
        image = pixels.reshape(pix.height, pix.width, 3)[:, :, ::-1]
        predict_result = self.predict(image)[0]
        save_debug_image(
            image,
            predict_result,
            page.page_number + 1,
        )
        yield page, predict_result
Loading

0 comments on commit 78172bf

Please sign in to comment.