Skip to content

Commit

Permalink
refactor(styles_and_formulas): improve code readability and maintainability
Browse files Browse the repository at this point in the history

- add LEFT_BRACKET and RIGHT_BRACKET imports for better readability
- simplify is_corner_mark condition with improved formatting
- use regular expression for is_formulas_font method
- replace bracket character checks with LEFT_BRACKET and RIGHT_BRACKET constants

refactor(typesetting): enhance code structure and logic

- improve assert statements for better readability
- simplify Box creation in get_box method
- add add_watermark method and call it from render_page for the first page (page_number == 0)
- streamline create_passthrough_composition call
- optimize space_width calculation
- simplify line height and unit width checks
- update TypesettingUnit creation in create_typesetting_units

refactor(layout_helper): update and expand constants

- add new cid characters to HEIGHT_NOT_USFUL_CHAR_IN_CHAR
- define LEFT_BRACKET and RIGHT_BRACKET constants for bracket characters

refactor(fontmap): enhance font mapping and initialization

- initialize fontid2font dictionary with base, fallback, and kai fonts
- handle different font types and their attributes
- improve font mapping logic for better accuracy
  • Loading branch information
awwaawwa committed Jan 22, 2025
1 parent 5cfbd87 commit 614ee35
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 99 deletions.
56 changes: 21 additions & 35 deletions yadt/document_il/midend/styles_and_formulas.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,15 @@
formular_height_ignore_char,
get_char_unicode_string,
is_same_style,
LEFT_BRACKET,
RIGHT_BRACKET,
)
from yadt.translation_config import TranslationConfig


class StylesAndFormulas:
stage_name = "解析公式与样式"

def __init__(self, translation_config: TranslationConfig):
self.translation_config = translation_config

Expand Down Expand Up @@ -96,22 +99,21 @@ def process_page_formulas(self, page: Page):
)
)

is_corner_mark =( (
len(current_chars) > 0
and not get_char_unicode_string(current_chars).isspace()
# 角标字体,有 0.76 的角标和 0.799 的大写,这里用 0.79 取中,同时考虑首字母放大的情况
and char.pdf_style.font_size
< current_chars[-1].pdf_style.font_size * 0.79
and not in_corner_mark_state
) or (
len(current_chars) > 0
and not get_char_unicode_string(current_chars).isspace()
# 角标字体,有 0.76 的角标和 0.799 的大写,这里用 0.79 取中,同时考虑首字母放大的情况
and char.pdf_style.font_size
< current_chars[-1].pdf_style.font_size * 1.1
and in_corner_mark_state

))
is_corner_mark = (
len(current_chars) > 0
and not get_char_unicode_string(current_chars).isspace()
# 角标字体,有 0.76 的角标和 0.799 的大写,这里用 0.79 取中,同时考虑首字母放大的情况
and char.pdf_style.font_size
< current_chars[-1].pdf_style.font_size * 0.79
and not in_corner_mark_state
) or (
len(current_chars) > 0
and not get_char_unicode_string(current_chars).isspace()
# 角标字体,有 0.76 的角标和 0.799 的大写,这里用 0.79 取中,同时考虑首字母放大的情况
and char.pdf_style.font_size
< current_chars[-1].pdf_style.font_size * 1.1
and in_corner_mark_state
)

is_formula = is_formula or is_corner_mark

Expand Down Expand Up @@ -466,9 +468,7 @@ def is_translatable_formula(self, formula: PdfFormula) -> bool:
return bool(re.match(r"^[0-9, ]+$", text))

def is_formulas_font(self, font_name: str) -> bool:
pattern2 = (
r'^(Cambria|Cambria-BoldItalic|Cambria-Bold|Cambria-Italic)$'
)
pattern2 = r"^(Cambria|Cambria-BoldItalic|Cambria-Bold|Cambria-Italic)$"
if self.translation_config.formular_font_pattern:
pattern = self.translation_config.formular_font_pattern
else:
Expand Down Expand Up @@ -566,25 +566,11 @@ def split_formula_by_comma(

for char in formula.pdf_character:
# 检查是否是左括号
if char.char_unicode in [
"(cid:8)",
"(",
"(cid:16)",
"{",
"[",
"(cid:104)",
]:
if char.char_unicode in LEFT_BRACKET:
bracket_level += 1
current_chars.append(char)
# 检查是否是右括号
elif char.char_unicode in [
"(cid:9)",
")",
"(cid:17)",
"}",
"]",
"(cid:105)",
]:
elif char.char_unicode in RIGHT_BRACKET:
bracket_level = max(0, bracket_level - 1) # 防止括号不匹配的情况
current_chars.append(char)
# 检查是否是逗号,且不在括号内
Expand Down
113 changes: 59 additions & 54 deletions yadt/document_il/midend/typesetting.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ def __init__(
font_size: float = None,
style: PdfStyle = None,
):
assert sum((x is not None for x in [char, formular, unicode])) == 1, (
"Only one of chars and formular can be not None"
)
assert (
sum((x is not None for x in [char, formular, unicode])) == 1
), "Only one of chars and formular can be not None"
self.char = char
self.formular = formular
self.unicode = unicode
Expand All @@ -48,9 +48,7 @@ def __init__(
self.scale = None

if unicode:
assert font_size, (
"Font size must be provided when unicode is provided"
)
assert font_size, "Font size must be provided when unicode is provided"
assert font, "Font must be provided when unicode is provided"
assert style, "Style must be provided when unicode is provided"
assert len(unicode) == 1, "Unicode must be a single character"
Expand Down Expand Up @@ -167,6 +165,7 @@ def is_hung_punctuation(self):
"、",
"”",
'"',
";",
]
return False

Expand All @@ -189,14 +188,10 @@ def box(self):
elif self.formular:
return self.formular.box
elif self.unicode:
char_width = self.font.char_lengths(self.unicode, self.font_size)[
0
]
char_width = self.font.char_lengths(self.unicode, self.font_size)[0]
if self.x is None or self.y is None or self.scale is None:
return Box(0, 0, char_width, self.font_size)
return Box(
self.x, self.y, self.x + char_width, self.y + self.font_size
)
return Box(self.x, self.y, self.x + char_width, self.y + self.font_size)

@property
def width(self):
Expand Down Expand Up @@ -235,9 +230,7 @@ def relocate(self, x: float, y: float, scale: float) -> "TypesettingUnit":
),
scale=scale,
vertical=self.char.vertical,
advance=self.char.advance * scale
if self.char.advance
else None,
advance=self.char.advance * scale if self.char.advance else None,
)
return TypesettingUnit(char=new_char)

Expand All @@ -260,18 +253,10 @@ def relocate(self, x: float, y: float, scale: float) -> "TypesettingUnit":
x=x + (rel_x + self.formular.x_offset) * scale,
y=y + (rel_y + self.formular.y_offset) * scale,
x2=x
+ (
rel_x
+ (char.box.x2 - char.box.x)
+ self.formular.x_offset
)
+ (rel_x + (char.box.x2 - char.box.x) + self.formular.x_offset)
* scale,
y2=y
+ (
rel_y
+ (char.box.y2 - char.box.y)
+ self.formular.y_offset
)
+ (rel_y + (char.box.y2 - char.box.y) + self.formular.y_offset)
* scale,
),
pdf_style=PdfStyle(
Expand Down Expand Up @@ -326,15 +311,15 @@ def render(self) -> [PdfCharacter]:
if self.can_passthrough:
return self.passthrough()
elif self.unicode:
assert self.x is not None, (
"x position must be set, should be set by `relocate`"
)
assert self.y is not None, (
"y position must be set, should be set by `relocate`"
)
assert self.scale is not None, (
"scale must be set, should be set by `relocate`"
)
assert (
self.x is not None
), "x position must be set, should be set by `relocate`"
assert (
self.y is not None
), "y position must be set, should be set by `relocate`"
assert (
self.scale is not None
), "scale must be set, should be set by `relocate`"
# 计算字符宽度
char_width = self.width

Expand Down Expand Up @@ -363,24 +348,55 @@ def render(self) -> [PdfCharacter]:

class Typesetting:
stage_name = "排版"

def __init__(self, translation_config: TranslationConfig):
    """Set up the typesetting stage.

    Args:
        translation_config: Configuration object stored on the instance
            and also handed to ``FontMapper`` to build ``self.font_mapper``.
    """
    self.translation_config = translation_config
    self.font_mapper = FontMapper(self.translation_config)

def typsetting_document(self, document: il_version_1.Document):
    """Typeset every page of ``document`` while reporting progress.

    Opens a progress stage named ``self.stage_name`` sized to the number of
    pages, calls ``render_page`` on each page in order, and advances the
    progress bar once per rendered page.

    Args:
        document: The intermediate-language document whose ``page`` list
            is rendered in place.
    """
    progress_monitor = self.translation_config.progress_monitor
    # The stage arguments are passed exactly once; the duplicated argument
    # line was a left-over of the old/new sides of the diff.
    with progress_monitor.stage_start(
        self.stage_name, len(document.page)
    ) as pbar:
        for page in document.page:
            self.render_page(page)
            pbar.advance()

def render_page(self, page: il_version_1.Page):
    """Typeset all paragraphs of a single page.

    Builds a font lookup keyed by font id from the page's own fonts and
    overlays the font mapper's ``fontid2font`` entries on top (mapper
    entries win when ids collide). When ``page.page_number`` is 0 the
    watermark paragraph is appended first, so it is rendered together with
    the page's other paragraphs.

    Args:
        page: The page whose ``pdf_paragraph`` entries are rendered.
    """
    fonts = {font.font_id: font for font in page.pdf_font}
    fonts.update(self.font_mapper.fontid2font)
    if page.page_number == 0:
        self.add_watermark(page)
    # Start the actual rendering pass.
    for paragraph in page.pdf_paragraph:
        self.render_paragraph(paragraph, page, fonts)

def add_watermark(self, page: il_version_1.Page):
    """Append an attribution paragraph to ``page.pdf_paragraph``.

    The paragraph uses the ``base`` font at size 6 with a default
    ``GraphicState``. Its box is derived from the page crop box: ``x`` is
    shifted right by 5% of the page width and ``y2`` is reduced by 5% of
    the page height, while ``y`` and ``x2`` are taken unchanged.

    Args:
        page: The page that receives the extra paragraph (mutated in place).
    """
    crop = page.cropbox.box
    page_width = crop.x2 - crop.x
    page_height = crop.y2 - crop.y

    style = il_version_1.PdfStyle(
        font_id='base',
        font_size=6,
        graphic_state=il_version_1.GraphicState(),
    )
    watermark_box = il_version_1.Box(
        x=crop.x + page_width * 0.05,
        y=crop.y,
        x2=crop.x2,
        y2=crop.y2 - page_height * 0.05,
    )
    watermark_text = il_version_1.PdfSameStyleUnicodeCharacters(
        unicode="本文档由funstory.ai的开源PDF翻译库(https://github.com/funstory-ai/yadt)翻译,本仓库正在积极的建设当中,欢迎star和关注。",
        pdf_style=style,
    )
    composition = il_version_1.PdfParagraphComposition(
        pdf_same_style_unicode_characters=watermark_text
    )
    watermark_paragraph = il_version_1.PdfParagraph(
        first_line_indent=False,
        box=watermark_box,
        vertical=False,
        pdf_style=style,
        pdf_paragraph_composition=[composition],
    )
    page.pdf_paragraph.append(watermark_paragraph)

def render_paragraph(
self,
paragraph: il_version_1.PdfParagraph,
Expand All @@ -391,8 +407,8 @@ def render_paragraph(
# 如果所有单元都可以直接传递,则直接传递
if all(unit.can_passthrough for unit in typesetting_units):
paragraph.scale = 1.0
paragraph.pdf_paragraph_composition = (
self.create_passthrough_composition(typesetting_units)
paragraph.pdf_paragraph_composition = self.create_passthrough_composition(
typesetting_units
)
return

Expand Down Expand Up @@ -430,8 +446,7 @@ def _layout_typesetting_units(
font_size = statistics.mode(font_sizes)

space_width = (
self.font_mapper.base_font.char_lengths("你", font_size * scale)[0]
* 0.5
self.font_mapper.base_font.char_lengths("你", font_size * scale)[0] * 0.5
)

# 计算平均行高
Expand Down Expand Up @@ -466,8 +481,7 @@ def _layout_typesetting_units(

if (
last_unit # 有上一个单元
and last_unit.is_chinese_char
^ unit.is_chinese_char # 中英文交界处
and last_unit.is_chinese_char ^ unit.is_chinese_char # 中英文交界处
and (
last_unit.box
and last_unit.box.y
Expand All @@ -484,10 +498,7 @@ def _layout_typesetting_units(
current_x += space_width * 0.5

# 如果当前行放不下这个元素,换行
if (
current_x + unit_width > box.x2
and not unit.is_hung_punctuation
):
if current_x + unit_width > box.x2 and not unit.is_hung_punctuation:
# 换行
current_x = box.x
current_y -= line_height * line_spacing
Expand All @@ -508,10 +519,7 @@ def _layout_typesetting_units(

# workaround: 超长行距暂时没找到具体原因,有待进一步修复。这里的1.2是魔法数字!
# 更新当前行的最大高度
if (
line_height == 0
or line_height * 1.2 > unit_height > line_height
):
if line_height == 0 or line_height * 1.2 > unit_height > line_height:
line_height = unit_height

# 更新 x 坐标
Expand Down Expand Up @@ -627,9 +635,7 @@ def create_typesetting_units(
]
)
elif composition.pdf_formula:
result.extend(
[TypesettingUnit(formular=composition.pdf_formula)]
)
result.extend([TypesettingUnit(formular=composition.pdf_formula)])
else:
raise ValueError(
f"Unknown composition type. "
Expand Down Expand Up @@ -687,8 +693,7 @@ def get_max_right_space(self, current_box: Box, page) -> float:
# 检查图形
for figure in page.pdf_figure:
if figure.box.x > current_box.x and not (
figure.box.y >= current_box.y2
or figure.box.y2 <= current_box.y
figure.box.y >= current_box.y2 or figure.box.y2 <= current_box.y
):
max_x = min(max_x, figure.box.x)

Expand Down
29 changes: 25 additions & 4 deletions yadt/document_il/utils/fontmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,19 +39,40 @@ def __init__(self, translation_config: TranslationConfig):
self.fallback_font.font_id = "fallback"
self.kai_font.font_id = "kai"

self.fontid2font = {
f.font_id: f for f in self.fonts.values()
}
self.fontid2font["base"] = self.base_font
self.fontid2font["fallback"] = self.fallback_font
self.fontid2font["kai"] = self.kai_font

def map(self, original_font: PdfFont, char_unicode: str):
current_char = ord(char_unicode)
if original_font.italic and self.kai_font.has_glyph(current_char):
if isinstance(original_font, pymupdf.Font):
bold = original_font.is_bold
italic = original_font.is_italic
monospaced = original_font.is_monospaced
serif = original_font.is_serif
elif isinstance(original_font, PdfFont):
bold = original_font.bold
italic = original_font.italic
monospaced = original_font.monospace
serif = original_font.serif
else:
raise Exception(
f"Unknown font type: {type(original_font)}"
)
if italic and self.kai_font.has_glyph(current_char):
return self.kai_font
for k, font in self.fonts.items():
if not font.has_glyph(current_char):
continue
if original_font.bold != font.is_bold:
if bold != font.is_bold:
continue
# 不知道什么原因,思源黑体的 serif 属性为1,先workaround
if original_font.serif == 1 and 'serif' not in font.font_id:
if serif == 1 and 'serif' not in font.font_id:
continue
if original_font.serif == 0 and 'serif' in font.font_id:
if serif == 0 and 'serif' in font.font_id:
continue
return font
if self.base_font.has_glyph(current_char):
Expand Down
Loading

0 comments on commit 614ee35

Please sign in to comment.