Skip to content

Commit

Permalink
feat(translation_config): add split short lines functionality
Browse files Browse the repository at this point in the history
- introduce split_short_lines and short_line_split_factor in TranslationConfig
- update ParagraphFinder to handle new configuration for splitting short lines
- adjust logic in paragraph processing to use new config options

refactor(paragraph_finder): enhance initialization and condition checks

- initialize ParagraphFinder with translation_config
- improve condition check readability in get_layout method
- pass translation_config to ParagraphFinder in high_level.py
  • Loading branch information
awwaawwa committed Jan 20, 2025
1 parent f0c76ec commit f9ca1a3
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 23 deletions.
54 changes: 32 additions & 22 deletions yadt/document_il/midend/paragraph_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@


class ParagraphFinder:
# Keep the translation configuration on the instance. The paragraph-splitting
# logic later reads `split_short_lines` and `short_line_split_factor` from it.
def __init__(self, translation_config: TranslationConfig):
self.translation_config = translation_config

def update_paragraph_data(self, paragraph: PdfParagraph, update_unicode=False):
if not paragraph.pdf_paragraph_composition:
return
Expand Down Expand Up @@ -203,7 +206,10 @@ def get_layout(
] = "middle",
):
# For these symbols, the parsed size is often only a small fraction of the actual size.
if xy_mode != "bottomright" and char.char_unicode in HEIGHT_NOT_USFUL_CHAR_IN_CHAR:
if (
xy_mode != "bottomright"
and char.char_unicode in HEIGHT_NOT_USFUL_CHAR_IN_CHAR
):
return self.get_layout(char, page, "bottomright")
# current layouts
# {
Expand Down Expand Up @@ -337,26 +343,30 @@ def process_independent_paragraphs(
break

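# If the previous line is narrower than the median line width multiplied by
# short_line_split_factor (and split_short_lines is enabled), split the
# current line and all following lines off into a new paragraph.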
# if prev_width < median_width * 0.8:
# # 创建新的段落
# new_paragraph = PdfParagraph(
# box=Box(0, 0, 0, 0), # 临时边界框
# pdf_paragraph_composition=(
# paragraph.pdf_paragraph_composition[j:]
# ),
# unicode="",
# )
# # 更新原段落
# paragraph.pdf_paragraph_composition = (
# paragraph.pdf_paragraph_composition[:j]
# )
#
# # 更新两个段落的数据
# self.update_paragraph_data(paragraph)
# self.update_paragraph_data(new_paragraph)
#
# # 在原段落后插入新段落
# paragraphs.insert(i + 1, new_paragraph)
# break
if (
self.translation_config.split_short_lines
and prev_width
< median_width * self.translation_config.short_line_split_factor
):
# Create the new paragraph from the current line onward.
new_paragraph = PdfParagraph(
box=Box(0, 0, 0, 0), # 临时边界框
pdf_paragraph_composition=(
paragraph.pdf_paragraph_composition[j:]
),
unicode="",
)
# Truncate the original paragraph so it ends before the current line.
paragraph.pdf_paragraph_composition = (
paragraph.pdf_paragraph_composition[:j]
)

# Refresh the derived data of both paragraphs.
self.update_paragraph_data(paragraph)
self.update_paragraph_data(new_paragraph)

# Insert the new paragraph right after the original one.
paragraphs.insert(i + 1, new_paragraph)
break
j += 1
i += 1
2 changes: 1 addition & 1 deletion yadt/high_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def translate(translation_config: TranslationConfig):
docs, translation_config.get_working_file_path("create_il.debug.json")
)

ParagraphFinder().process(docs)
ParagraphFinder(translation_config).process(docs)
logger.debug(f"finish paragraph finder from {temp_pdf_path}")
if translation_config.debug:
xml_converter.write_json(
Expand Down
4 changes: 4 additions & 0 deletions yadt/translation_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ def __init__(
formular_font_pattern: str | None = None,
formular_char_pattern: str | None = None,
qps: int = 1,
split_short_lines: bool = False, # 是否将比较短的行强制切分成不同段落,此功能可能会导致糟糕的排版&bug
short_line_split_factor: float = 0.8, # 切分阈值系数。实际阈值为当前页所有行长度中位数*此系数
):
self.input_file = input_file
self.translator = translator
Expand All @@ -35,6 +37,8 @@ def __init__(
self.formular_font_pattern = formular_font_pattern
self.formular_char_pattern = formular_char_pattern
self.qps = qps
self.split_short_lines = split_short_lines
self.short_line_split_factor = short_line_split_factor

if working_dir is None:
working_dir = os.path.join(
Expand Down

0 comments on commit f9ca1a3

Please sign in to comment.