feat(translation_config): add split short lines functionality

- introduce split_short_lines and short_line_split_factor in TranslationConfig
- update ParagraphFinder to handle the new configuration for splitting short lines
- adjust the paragraph-processing logic to use the new config options

refactor(paragraph_finder): enhance initialization and condition checks

- initialize ParagraphFinder with translation_config
- improve readability of the condition check in the get_layout method
- pass translation_config to ParagraphFinder in high_level.py
funstory-ai · Jan 20, 2025 · f9ca1a3 · f9ca1a3
1 parent f0c76ec
commit f9ca1a3
Show file tree

Hide file tree

Showing 3 changed files with 37 additions and 23 deletions.
diff --git a/yadt/document_il/midend/paragraph_finder.py b/yadt/document_il/midend/paragraph_finder.py
@@ -18,6 +18,9 @@
 
 
 class ParagraphFinder:
+    def __init__(self, translation_config: TranslationConfig):
+        self.translation_config = translation_config
+
     def update_paragraph_data(self, paragraph: PdfParagraph, update_unicode=False):
         if not paragraph.pdf_paragraph_composition:
             return
@@ -203,7 +206,10 @@ def get_layout(
         ] = "middle",
     ):
         # 这几个符号，解析出来的大小经常只有实际大小的一点点。
-        if xy_mode != "bottomright" and char.char_unicode in HEIGHT_NOT_USFUL_CHAR_IN_CHAR:
+        if (
+            xy_mode != "bottomright"
+            and char.char_unicode in HEIGHT_NOT_USFUL_CHAR_IN_CHAR
+        ):
             return self.get_layout(char, page, "bottomright")
         # current layouts
         # {
@@ -337,26 +343,30 @@ def process_independent_paragraphs(
                     break
 
                 # 如果前一行宽度小于中位数的一半，将当前行及后续行分割成新段落
-                # if prev_width < median_width * 0.8:
-                #     # 创建新的段落
-                #     new_paragraph = PdfParagraph(
-                #         box=Box(0, 0, 0, 0),  # 临时边界框
-                #         pdf_paragraph_composition=(
-                #             paragraph.pdf_paragraph_composition[j:]
-                #         ),
-                #         unicode="",
-                #     )
-                #     # 更新原段落
-                #     paragraph.pdf_paragraph_composition = (
-                #         paragraph.pdf_paragraph_composition[:j]
-                #     )
-                #
-                #     # 更新两个段落的数据
-                #     self.update_paragraph_data(paragraph)
-                #     self.update_paragraph_data(new_paragraph)
-                #
-                #     # 在原段落后插入新段落
-                #     paragraphs.insert(i + 1, new_paragraph)
-                #     break
+                if (
+                    self.translation_config.split_short_lines
+                    and prev_width
+                    < median_width * self.translation_config.short_line_split_factor
+                ):
+                    # 创建新的段落
+                    new_paragraph = PdfParagraph(
+                        box=Box(0, 0, 0, 0),  # 临时边界框
+                        pdf_paragraph_composition=(
+                            paragraph.pdf_paragraph_composition[j:]
+                        ),
+                        unicode="",
+                    )
+                    # 更新原段落
+                    paragraph.pdf_paragraph_composition = (
+                        paragraph.pdf_paragraph_composition[:j]
+                    )
+
+                    # 更新两个段落的数据
+                    self.update_paragraph_data(paragraph)
+                    self.update_paragraph_data(new_paragraph)
+
+                    # 在原段落后插入新段落
+                    paragraphs.insert(i + 1, new_paragraph)
+                    break
                 j += 1
             i += 1
diff --git a/yadt/high_level.py b/yadt/high_level.py
@@ -193,7 +193,7 @@ def translate(translation_config: TranslationConfig):
             docs, translation_config.get_working_file_path("create_il.debug.json")
         )
 
-    ParagraphFinder().process(docs)
+    ParagraphFinder(translation_config).process(docs)
     logger.debug(f"finish paragraph finder from {temp_pdf_path}")
     if translation_config.debug:
         xml_converter.write_json(

diff --git a/yadt/translation_config.py b/yadt/translation_config.py
@@ -21,6 +21,8 @@ def __init__(
         formular_font_pattern: str | None = None,
         formular_char_pattern: str | None = None,
         qps: int = 1,
+        split_short_lines: bool = False,  # 是否将比较短的行强制切分成不同段落，此功能可能会导致糟糕的排版&bug
+        short_line_split_factor: float = 0.8,   # 切分阈值系数。实际阈值为当前页所有行长度中位数*此系数
     ):
         self.input_file = input_file
         self.translator = translator
@@ -35,6 +37,8 @@ def __init__(
         self.formular_font_pattern = formular_font_pattern
         self.formular_char_pattern = formular_char_pattern
         self.qps = qps
+        self.split_short_lines = split_short_lines
+        self.short_line_split_factor = short_line_split_factor
 
         if working_dir is None:
             working_dir = os.path.join(