diff --git a/babeldoc/converter.py b/babeldoc/converter.py index a5e56287..b5840782 100644 --- a/babeldoc/converter.py +++ b/babeldoc/converter.py @@ -2,14 +2,18 @@ import logging import re import unicodedata +from collections.abc import Sequence +from typing import cast import numpy as np from pdfminer.converter import PDFConverter from pdfminer.layout import LTChar from pdfminer.layout import LTComponent +from pdfminer.layout import LTCurve from pdfminer.layout import LTFigure from pdfminer.layout import LTLine from pdfminer.layout import LTPage +from pdfminer.layout import LTRect from pdfminer.layout import LTText from pdfminer.pdfcolor import PDFColorSpace from pdfminer.pdffont import PDFCIDFont @@ -18,6 +22,8 @@ from pdfminer.pdfinterp import PDFGraphicState from pdfminer.pdfinterp import PDFResourceManager from pdfminer.utils import Matrix +from pdfminer.utils import PathSegment +from pdfminer.utils import Point from pdfminer.utils import apply_matrix_pt from pdfminer.utils import bbox2str from pdfminer.utils import matrix2str @@ -120,6 +126,133 @@ def render_char( item.font = font # hack 插入原字符字体 return item.adv + def paint_path( + self, + gstate: PDFGraphicState, + stroke: bool, + fill: bool, + evenodd: bool, + path: Sequence[PathSegment], + graphicstate: PDFGraphicState, + ) -> None: + """Paint paths described in section 4.4 of the PDF reference manual""" + shape = "".join(x[0] for x in path) + + if shape[:1] != "m": + # Per PDF Reference Section 4.4.1, "path construction operators may + # be invoked in any sequence, but the first one invoked must be m + # or re to begin a new subpath." Since pdfminer.six already + # converts all `re` (rectangle) operators to their equivalent + # `mlllh` representation, paths ingested by `.paint_path(...)` that + # do not begin with the `m` operator are invalid. 
+ pass + + elif shape.count("m") > 1: + # recurse if there are multiple m's in this shape + for m in re.finditer(r"m[^m]+", shape): + subpath = path[m.start(0) : m.end(0)] + self.paint_path(gstate, stroke, fill, evenodd, subpath, graphicstate) + + else: + # Although the 'h' command does not literally provide a + # point-position, its position is (by definition) equal to the + # subpath's starting point. + # + # And, per Section 4.4's Table 4.9, all other path commands place + # their point-position in their final two arguments. (Any preceding + # arguments represent control points on Bézier curves.) + raw_pts = [ + cast(Point, p[-2:] if p[0] != "h" else path[0][-2:]) for p in path + ] + pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts] + + operators = [str(operation[0]) for operation in path] + transformed_points = [ + [ + apply_matrix_pt(self.ctm, (float(operand1), float(operand2))) + for operand1, operand2 in zip( + operation[1::2], + operation[2::2], + strict=False, + ) + ] + for operation in path + ] + transformed_path = [ + cast(PathSegment, (o, *p)) + for o, p in zip(operators, transformed_points, strict=False) + ] + + if shape in {"mlh", "ml"}: + # single line segment + # + # Note: 'ml', in conditional above, is a frequent anomaly + # that we want to support. 
+ line = LTLine( + gstate.linewidth, + pts[0], + pts[1], + stroke, + fill, + evenodd, + gstate.scolor, + gstate.ncolor, + original_path=transformed_path, + dashing_style=gstate.dash, + ) + line.graphicstate = graphicstate + self.cur_item.add(line) + + elif shape in {"mlllh", "mllll"}: + (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts + + is_closed_loop = pts[0] == pts[4] + has_square_coordinates = ( + x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0 + ) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0) + if is_closed_loop and has_square_coordinates: + rect = LTRect( + gstate.linewidth, + (*pts[0], *pts[2]), + stroke, + fill, + evenodd, + gstate.scolor, + gstate.ncolor, + transformed_path, + gstate.dash, + ) + rect.graphicstate = graphicstate + self.cur_item.add(rect) + else: + curve = LTCurve( + gstate.linewidth, + pts, + stroke, + fill, + evenodd, + gstate.scolor, + gstate.ncolor, + transformed_path, + gstate.dash, + ) + curve.graphicstate = graphicstate + self.cur_item.add(curve) + else: + curve = LTCurve( + gstate.linewidth, + pts, + stroke, + fill, + evenodd, + gstate.scolor, + gstate.ncolor, + transformed_path, + gstate.dash, + ) + curve.graphicstate = graphicstate + self.cur_item.add(curve) + class AWLTChar(LTChar): """Actual letter in the text as a Unicode string.""" @@ -384,6 +517,10 @@ def vflag(font: str, char: str): # 匹配公式(和角标)字体 vlstk.append(child) else: # 全局线条 lstk.append(child) + elif isinstance(child, LTCurve): + pass + elif isinstance(child, LTRect): + pass else: pass return diff --git a/babeldoc/document_il/frontend/il_creater.py b/babeldoc/document_il/frontend/il_creater.py index 5c25fd93..7e132fa8 100644 --- a/babeldoc/document_il/frontend/il_creater.py +++ b/babeldoc/document_il/frontend/il_creater.py @@ -34,12 +34,19 @@ def __init__(self, translation_config: TranslationConfig): self.xobj_inc = 0 self.xobj_map: dict[int, il_version_1.PdfXobject] = {} self.xobj_stack = [] + color_pattern = r"sc|scn|g|rg|k|cs|gs|ri" + line_pattern = r"w|j|M|d|i" 
+ + self.PASSTHROUGH_PER_CHAR_PATTERN = re.compile( + f"^({color_pattern}|{line_pattern})$", + re.IGNORECASE, + ) def on_finish(self): self.progress.__exit__(None, None, None) def is_passthrough_per_char_operation(self, operator: str): - return re.match("^(sc|scn|g|rg|k|cs|gs|ri)$", operator, re.IGNORECASE) + return self.PASSTHROUGH_PER_CHAR_PATTERN.match(operator) def on_passthrough_per_char(self, operator: str, args: list[str]): if not self.is_passthrough_per_char_operation(operator): diff --git a/babeldoc/document_il/translator/cache.py b/babeldoc/document_il/translator/cache.py index 3d37c6e3..ff18f664 100644 --- a/babeldoc/document_il/translator/cache.py +++ b/babeldoc/document_il/translator/cache.py @@ -8,7 +8,7 @@ from peewee import SqliteDatabase from peewee import TextField -from babeldoc.const import CACHE_FOLDER as cache_folder +from babeldoc.const import CACHE_FOLDER # we don't init the database here db = SqliteDatabase(None) @@ -94,9 +94,9 @@ def set(self, original_text: str, translation: str): def init_db(remove_exists=False): - cache_folder.mkdir(parents=True, exist_ok=True) + CACHE_FOLDER.mkdir(parents=True, exist_ok=True) # The current version does not support database migration, so add the version number to the file name. 
- cache_db_path = cache_folder / "cache.v1.db" + cache_db_path = CACHE_FOLDER / "cache.v1.db" if remove_exists and cache_db_path.exists(): cache_db_path.unlink() db.init( diff --git a/babeldoc/pdfinterp.py b/babeldoc/pdfinterp.py index 666f0e37..dc6f05c4 100644 --- a/babeldoc/pdfinterp.py +++ b/babeldoc/pdfinterp.py @@ -121,29 +121,42 @@ def get_colorspace(spec: object) -> PDFColorSpace | None: pass def do_S(self) -> None: - # 重载过滤非公式线条 """Stroke path""" + self.device.paint_path( + self.graphicstate, + True, + False, + False, + self.curpath, + self.get_graphic_state(), + ) + self.curpath = [] + return - def is_black(color: Color) -> bool: - if isinstance(color, tuple): - return sum(color) == 0 - else: - return color == 0 - - if ( - len(self.curpath) == 2 - and self.curpath[0][0] == "m" - and self.curpath[1][0] == "l" - and apply_matrix_pt(self.ctm, self.curpath[0][-2:])[1] - == apply_matrix_pt(self.ctm, self.curpath[1][-2:])[1] - and is_black(self.graphicstate.scolor) - ): # 独立直线,水平,黑色 - # print(apply_matrix_pt(self.ctm,self.curpath[0][-2:]),apply_matrix_pt(self.ctm,self.curpath[1][-2:]),self.graphicstate.scolor) - self.device.paint_path(self.graphicstate, True, False, False, self.curpath) - self.curpath = [] - return "n" - else: - self.curpath = [] + # def do_S(self) -> None: + # # 重载过滤非公式线条 + # """Stroke path""" + # + # def is_black(color: Color) -> bool: + # if isinstance(color, tuple): + # return sum(color) == 0 + # else: + # return color == 0 + # + # if ( + # len(self.curpath) == 2 + # and self.curpath[0][0] == "m" + # and self.curpath[1][0] == "l" + # and apply_matrix_pt(self.ctm, self.curpath[0][-2:])[1] + # == apply_matrix_pt(self.ctm, self.curpath[1][-2:])[1] + # and is_black(self.graphicstate.scolor) + # ): # 独立直线,水平,黑色 + # # print(apply_matrix_pt(self.ctm,self.curpath[0][-2:]),apply_matrix_pt(self.ctm,self.curpath[1][-2:]),self.graphicstate.scolor) + # self.device.paint_path(self.graphicstate, True, False, False, self.curpath) + # self.curpath = [] + # 
return "n" + # else: + # self.curpath = [] def do_CS(self, name: PDFStackT) -> None: """Set color space for stroking operations @@ -172,25 +185,54 @@ def do_cs(self, name: PDFStackT) -> None: # 重载过滤非公式线条(F/B) def do_f(self) -> None: """Fill path using nonzero winding number rule""" - # self.device.paint_path(self.graphicstate, False, True, False, self.curpath) + gs = self.get_graphic_state() + self.device.paint_path(self.graphicstate, False, True, False, self.curpath, gs) self.curpath = [] - def do_F(self) -> None: - """Fill path using nonzero winding number rule (obsolete)""" + def get_graphic_state(self): + gs = self.graphicstate.copy() + gs.passthrough_instruction = ( + self.il_creater.passthrough_per_char_instruction.copy() + ) + return gs + + # def do_F(self) -> None: + # """Fill path using nonzero winding number rule (obsolete)""" def do_f_a(self) -> None: """Fill path using even-odd rule""" - # self.device.paint_path(self.graphicstate, False, True, True, self.curpath) + self.device.paint_path( + self.graphicstate, + False, + True, + True, + self.curpath, + self.get_graphic_state(), + ) self.curpath = [] def do_B(self) -> None: """Fill and stroke path using nonzero winding number rule""" - # self.device.paint_path(self.graphicstate, True, True, False, self.curpath) + self.device.paint_path( + self.graphicstate, + True, + True, + False, + self.curpath, + self.get_graphic_state(), + ) self.curpath = [] def do_B_a(self) -> None: """Fill and stroke path using even-odd rule""" - # self.device.paint_path(self.graphicstate, True, True, True, self.curpath) + self.device.paint_path( + self.graphicstate, + True, + True, + True, + self.curpath, + self.get_graphic_state(), + ) self.curpath = [] ############################################################ @@ -380,10 +422,7 @@ def do_TJ(self, seq: PDFStackT) -> None: raise PDFInterpreterError("No font specified!") return assert self.ncs is not None - gs = self.graphicstate.copy() - gs.passthrough_instruction = ( - 
self.il_creater.passthrough_per_char_instruction.copy() - ) + gs = self.get_graphic_state() self.device.render_string(self.textstate, cast(PDFTextSeq, seq), self.ncs, gs) return