refactor(styles_and_formulas): improve code readability and maintainability

- add LEFT_BRACKET and RIGHT_BRACKET imports for better readability
- simplify is_corner_mark condition with improved formatting
- use regular expression for is_formulas_font method
- replace bracket character checks with LEFT_BRACKET and RIGHT_BRACKET constants

refactor(typesetting): enhance code structure and logic

- improve assert statements for better readability
- simplify Box creation in get_box method
- add add_watermark method to render_page
- streamline create_passthrough_composition call
- optimize space_width calculation
- simplify line height and unit width checks
- update TypesettingUnit creation in create_typesetting_units

refactor(layout_helper): update and expand constants

- add new cid characters to HEIGHT_NOT_USFUL_CHAR_IN_CHAR
- define LEFT_BRACKET and RIGHT_BRACKET constants for bracket characters

refactor(fontmap): enhance font mapping and initialization

- initialize fontid2font dictionary with base, fallback, and kai fonts
- handle different font types and their attributes
- improve font mapping logic for better accuracy
funstory-ai · Jan 22, 2025 · 614ee35 · 614ee35
1 parent 5cfbd87
commit 614ee35
Show file tree

Hide file tree

Showing 4 changed files with 121 additions and 99 deletions.
diff --git a/yadt/document_il/midend/styles_and_formulas.py b/yadt/document_il/midend/styles_and_formulas.py
@@ -19,12 +19,15 @@
     formular_height_ignore_char,
     get_char_unicode_string,
     is_same_style,
+    LEFT_BRACKET,
+    RIGHT_BRACKET,
 )
 from yadt.translation_config import TranslationConfig
 
 
 class StylesAndFormulas:
     stage_name = "解析公式与样式"
+
     def __init__(self, translation_config: TranslationConfig):
         self.translation_config = translation_config
 
@@ -96,22 +99,21 @@ def process_page_formulas(self, page: Page):
                         )
                     )
 
-                    is_corner_mark =( (
-                            len(current_chars) > 0
-                            and not get_char_unicode_string(current_chars).isspace()
-                            # 角标字体，有 0.76 的角标和 0.799 的大写，这里用 0.79 取中，同时考虑首字母放大的情况
-                            and char.pdf_style.font_size
-                            < current_chars[-1].pdf_style.font_size * 0.79
-                            and not in_corner_mark_state
-                        ) or (
-                            len(current_chars) > 0
-                            and not get_char_unicode_string(current_chars).isspace()
-                            # 角标字体，有 0.76 的角标和 0.799 的大写，这里用 0.79 取中，同时考虑首字母放大的情况
-                            and char.pdf_style.font_size
-                            < current_chars[-1].pdf_style.font_size * 1.1
-                            and in_corner_mark_state
-
-                    ))
+                    is_corner_mark = (
+                        len(current_chars) > 0
+                        and not get_char_unicode_string(current_chars).isspace()
+                        # 角标字体，有 0.76 的角标和 0.799 的大写，这里用 0.79 取中，同时考虑首字母放大的情况
+                        and char.pdf_style.font_size
+                        < current_chars[-1].pdf_style.font_size * 0.79
+                        and not in_corner_mark_state
+                    ) or (
+                        len(current_chars) > 0
+                        and not get_char_unicode_string(current_chars).isspace()
+                        # 角标字体，有 0.76 的角标和 0.799 的大写，这里用 0.79 取中，同时考虑首字母放大的情况
+                        and char.pdf_style.font_size
+                        < current_chars[-1].pdf_style.font_size * 1.1
+                        and in_corner_mark_state
+                    )
 
                     is_formula = is_formula or is_corner_mark
 
@@ -466,9 +468,7 @@ def is_translatable_formula(self, formula: PdfFormula) -> bool:
         return bool(re.match(r"^[0-9, ]+$", text))
 
     def is_formulas_font(self, font_name: str) -> bool:
-        pattern2 = (
-            r'^(Cambria|Cambria-BoldItalic|Cambria-Bold|Cambria-Italic)$'
-        )
+        pattern2 = r"^(Cambria|Cambria-BoldItalic|Cambria-Bold|Cambria-Italic)$"
         if self.translation_config.formular_font_pattern:
             pattern = self.translation_config.formular_font_pattern
         else:
@@ -566,25 +566,11 @@ def split_formula_by_comma(
 
         for char in formula.pdf_character:
             # 检查是否是左括号
-            if char.char_unicode in [
-                "(cid:8)",
-                "(",
-                "(cid:16)",
-                "{",
-                "[",
-                "(cid:104)",
-            ]:
+            if char.char_unicode in LEFT_BRACKET:
                 bracket_level += 1
                 current_chars.append(char)
             # 检查是否是右括号
-            elif char.char_unicode in [
-                "(cid:9)",
-                ")",
-                "(cid:17)",
-                "}",
-                "]",
-                "(cid:105)",
-            ]:
+            elif char.char_unicode in RIGHT_BRACKET:
                 bracket_level = max(0, bracket_level - 1)  # 防止括号不匹配的情况
                 current_chars.append(char)
             # 检查是否是逗号，且不在括号内

diff --git a/yadt/document_il/midend/typesetting.py b/yadt/document_il/midend/typesetting.py
@@ -37,9 +37,9 @@ def __init__(
         font_size: float = None,
         style: PdfStyle = None,
     ):
-        assert sum((x is not None for x in [char, formular, unicode])) == 1, (
-            "Only one of chars and formular can be not None"
-        )
+        assert (
+            sum((x is not None for x in [char, formular, unicode])) == 1
+        ), "Only one of chars and formular can be not None"
         self.char = char
         self.formular = formular
         self.unicode = unicode
@@ -48,9 +48,7 @@ def __init__(
         self.scale = None
 
         if unicode:
-            assert font_size, (
-                "Font size must be provided when unicode is provided"
-            )
+            assert font_size, "Font size must be provided when unicode is provided"
             assert font, "Font must be provided when unicode is provided"
             assert style, "Style must be provided when unicode is provided"
             assert len(unicode) == 1, "Unicode must be a single character"
@@ -167,6 +165,7 @@ def is_hung_punctuation(self):
                 "、",
                 "”",
                 '"',
+                "；",
             ]
         return False
 
@@ -189,14 +188,10 @@ def box(self):
         elif self.formular:
             return self.formular.box
         elif self.unicode:
-            char_width = self.font.char_lengths(self.unicode, self.font_size)[
-                0
-            ]
+            char_width = self.font.char_lengths(self.unicode, self.font_size)[0]
             if self.x is None or self.y is None or self.scale is None:
                 return Box(0, 0, char_width, self.font_size)
-            return Box(
-                self.x, self.y, self.x + char_width, self.y + self.font_size
-            )
+            return Box(self.x, self.y, self.x + char_width, self.y + self.font_size)
 
     @property
     def width(self):
@@ -235,9 +230,7 @@ def relocate(self, x: float, y: float, scale: float) -> "TypesettingUnit":
                 ),
                 scale=scale,
                 vertical=self.char.vertical,
-                advance=self.char.advance * scale
-                if self.char.advance
-                else None,
+                advance=self.char.advance * scale if self.char.advance else None,
             )
             return TypesettingUnit(char=new_char)
 
@@ -260,18 +253,10 @@ def relocate(self, x: float, y: float, scale: float) -> "TypesettingUnit":
                         x=x + (rel_x + self.formular.x_offset) * scale,
                         y=y + (rel_y + self.formular.y_offset) * scale,
                         x2=x
-                        + (
-                            rel_x
-                            + (char.box.x2 - char.box.x)
-                            + self.formular.x_offset
-                        )
+                        + (rel_x + (char.box.x2 - char.box.x) + self.formular.x_offset)
                         * scale,
                         y2=y
-                        + (
-                            rel_y
-                            + (char.box.y2 - char.box.y)
-                            + self.formular.y_offset
-                        )
+                        + (rel_y + (char.box.y2 - char.box.y) + self.formular.y_offset)
                         * scale,
                     ),
                     pdf_style=PdfStyle(
@@ -326,15 +311,15 @@ def render(self) -> [PdfCharacter]:
         if self.can_passthrough:
             return self.passthrough()
         elif self.unicode:
-            assert self.x is not None, (
-                "x position must be set, should be set by `relocate`"
-            )
-            assert self.y is not None, (
-                "y position must be set, should be set by `relocate`"
-            )
-            assert self.scale is not None, (
-                "scale must be set, should be set by `relocate`"
-            )
+            assert (
+                self.x is not None
+            ), "x position must be set, should be set by `relocate`"
+            assert (
+                self.y is not None
+            ), "y position must be set, should be set by `relocate`"
+            assert (
+                self.scale is not None
+            ), "scale must be set, should be set by `relocate`"
             # 计算字符宽度
             char_width = self.width
 
@@ -363,24 +348,55 @@ def render(self) -> [PdfCharacter]:
 
 class Typesetting:
     stage_name = "排版"
+
     def __init__(self, translation_config: TranslationConfig):
         self.font_mapper = FontMapper(translation_config)
         self.translation_config = translation_config
 
     def typsetting_document(self, document: il_version_1.Document):
         with self.translation_config.progress_monitor.stage_start(
-                self.stage_name, len(document.page)
+            self.stage_name, len(document.page)
         ) as pbar:
             for page in document.page:
                 self.render_page(page)
                 pbar.advance()
 
     def render_page(self, page: il_version_1.Page):
         fonts = {f.font_id: f for f in page.pdf_font}
+        for k, v in self.font_mapper.fontid2font.items():
+            fonts[k] = v
+        if page.page_number == 0:
+            self.add_watermark(page)
         # 开始实际的渲染过程
         for paragraph in page.pdf_paragraph:
             self.render_paragraph(paragraph, page, fonts)
 
+    def add_watermark(self, page: il_version_1.Page):
+        page_width = page.cropbox.box.x2 - page.cropbox.box.x
+        page_height = page.cropbox.box.y2 - page.cropbox.box.y
+        style = il_version_1.PdfStyle(font_id='base', font_size=6, graphic_state=il_version_1.GraphicState(), )
+        page.pdf_paragraph.append(
+            il_version_1.PdfParagraph(
+                first_line_indent=False,
+                box=il_version_1.Box(
+                    x=page.cropbox.box.x + page_width * 0.05,
+                    y=page.cropbox.box.y,
+                    x2=page.cropbox.box.x2,
+                    y2=page.cropbox.box.y2 - page_height * 0.05,
+                ),
+                vertical=False,
+                pdf_style=style,
+                pdf_paragraph_composition=[
+                    il_version_1.PdfParagraphComposition(
+                        pdf_same_style_unicode_characters=il_version_1.PdfSameStyleUnicodeCharacters(
+                            unicode="本文档由funstory.ai的开源PDF翻译库（https://github.com/funstory-ai/yadt）翻译，本仓库正在积极的建设当中，欢迎star和关注。",
+                            pdf_style=style,
+                        )
+                    )
+                ],
+            )
+        )
+
     def render_paragraph(
         self,
         paragraph: il_version_1.PdfParagraph,
@@ -391,8 +407,8 @@ def render_paragraph(
         # 如果所有单元都可以直接传递，则直接传递
         if all(unit.can_passthrough for unit in typesetting_units):
             paragraph.scale = 1.0
-            paragraph.pdf_paragraph_composition = (
-                self.create_passthrough_composition(typesetting_units)
+            paragraph.pdf_paragraph_composition = self.create_passthrough_composition(
+                typesetting_units
             )
             return
 
@@ -430,8 +446,7 @@ def _layout_typesetting_units(
         font_size = statistics.mode(font_sizes)
 
         space_width = (
-            self.font_mapper.base_font.char_lengths("你", font_size * scale)[0]
-            * 0.5
+            self.font_mapper.base_font.char_lengths("你", font_size * scale)[0] * 0.5
         )
 
         # 计算平均行高
@@ -466,8 +481,7 @@ def _layout_typesetting_units(
 
             if (
                 last_unit  # 有上一个单元
-                and last_unit.is_chinese_char
-                ^ unit.is_chinese_char  # 中英文交界处
+                and last_unit.is_chinese_char ^ unit.is_chinese_char  # 中英文交界处
                 and (
                     last_unit.box
                     and last_unit.box.y
@@ -484,10 +498,7 @@ def _layout_typesetting_units(
                 current_x += space_width * 0.5
 
             # 如果当前行放不下这个元素，换行
-            if (
-                current_x + unit_width > box.x2
-                and not unit.is_hung_punctuation
-            ):
+            if current_x + unit_width > box.x2 and not unit.is_hung_punctuation:
                 # 换行
                 current_x = box.x
                 current_y -= line_height * line_spacing
@@ -508,10 +519,7 @@ def _layout_typesetting_units(
 
             # workaround: 超长行距暂时没找到具体原因，有待进一步修复。这里的1.2是魔法数字！
             # 更新当前行的最大高度
-            if (
-                line_height == 0
-                or line_height * 1.2 > unit_height > line_height
-            ):
+            if line_height == 0 or line_height * 1.2 > unit_height > line_height:
                 line_height = unit_height
 
             # 更新 x 坐标
@@ -627,9 +635,7 @@ def create_typesetting_units(
                     ]
                 )
             elif composition.pdf_formula:
-                result.extend(
-                    [TypesettingUnit(formular=composition.pdf_formula)]
-                )
+                result.extend([TypesettingUnit(formular=composition.pdf_formula)])
             else:
                 raise ValueError(
                     f"Unknown composition type. "
@@ -687,8 +693,7 @@ def get_max_right_space(self, current_box: Box, page) -> float:
         # 检查图形
         for figure in page.pdf_figure:
             if figure.box.x > current_box.x and not (
-                figure.box.y >= current_box.y2
-                or figure.box.y2 <= current_box.y
+                figure.box.y >= current_box.y2 or figure.box.y2 <= current_box.y
             ):
                 max_x = min(max_x, figure.box.x)
 

diff --git a/yadt/document_il/utils/fontmap.py b/yadt/document_il/utils/fontmap.py
@@ -39,19 +39,40 @@ def __init__(self, translation_config: TranslationConfig):
         self.fallback_font.font_id = "fallback"
         self.kai_font.font_id = "kai"
 
+        self.fontid2font = {
+            f.font_id: f for f in self.fonts.values()
+        }
+        self.fontid2font["base"] = self.base_font
+        self.fontid2font["fallback"] = self.fallback_font
+        self.fontid2font["kai"] = self.kai_font
+
     def map(self, original_font: PdfFont, char_unicode: str):
         current_char = ord(char_unicode)
-        if original_font.italic and self.kai_font.has_glyph(current_char):
+        if isinstance(original_font, pymupdf.Font):
+            bold = original_font.is_bold
+            italic = original_font.is_italic
+            monospaced = original_font.is_monospaced
+            serif = original_font.is_serif
+        elif isinstance(original_font, PdfFont):
+            bold = original_font.bold
+            italic = original_font.italic
+            monospaced = original_font.monospace
+            serif = original_font.serif
+        else:
+            raise Exception(
+                f"Unknown font type: {type(original_font)}"
+            )
+        if italic and self.kai_font.has_glyph(current_char):
             return self.kai_font
         for k, font in self.fonts.items():
             if not font.has_glyph(current_char):
                 continue
-            if original_font.bold != font.is_bold:
+            if bold != font.is_bold:
                 continue
             # 不知道什么原因，思源黑体的 serif 属性为1，先workaround
-            if original_font.serif == 1 and 'serif' not in font.font_id:
+            if serif == 1 and 'serif' not in font.font_id:
                 continue
-            if original_font.serif == 0 and 'serif' in font.font_id:
+            if serif == 0 and 'serif' in font.font_id:
                 continue
             return font
         if self.base_font.has_glyph(current_char):