diff --git a/examples/lib_usage_example.py b/examples/lib_usage_example.py index c2437ae..689bfb3 100644 --- a/examples/lib_usage_example.py +++ b/examples/lib_usage_example.py @@ -4,7 +4,6 @@ import sys import libpdf - from tests.conftest import PDF_LOREM_IPSUM as TEST_PDF LOG = logging.getLogger(__name__) @@ -13,24 +12,26 @@ def main(): """Show how the library is used via API.""" - if 'tqdm' not in sys.modules: + if "tqdm" not in sys.modules: # if tqdm is not available, only basic config for logging is initialized. # if tqdm is installed the root logger is assigned a custom handler libpdf.log.TqdmLoggingHandler # that writes all log messages through tqdm.write() to integrate progress bars with logging - logging.basicConfig(level='DEBUG', format='[%(levelname)5s] %(name)s - %(message)s') + logging.basicConfig( + level="DEBUG", format="[%(levelname)5s] %(name)s - %(message)s" + ) # constrain log levels of pdfminer and PIL to avoid log spam - logging.getLogger('pdfminer').level = logging.WARNING - logging.getLogger('PIL').level = logging.WARNING + logging.getLogger("pdfminer").level = logging.WARNING + logging.getLogger("PIL").level = logging.WARNING objects = libpdf.load( TEST_PDF, verbose=3, visual_debug=True, - visual_debug_output_dir='visual_debug', + visual_debug_output_dir="visual_debug", ) LOG.info(objects) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/libpdf/__init__.py b/libpdf/__init__.py index b19980b..6442077 100644 --- a/libpdf/__init__.py +++ b/libpdf/__init__.py @@ -8,27 +8,27 @@ :copyright: © 2020 by team useblocks :license: MIT, see LICENSE for more details -""" # noqa: RST399, RST201 # needed for autodoc +""" # needed for autodoc try: import importlib_metadata # Python 3.6 and 3.7 except ImportError: import importlib.metadata as importlib_metadata # Python 3.8, 3.9 -__version__: str = importlib_metadata.version('libpdf') -__summary__: str = importlib_metadata.metadata('libpdf')['Summary'] +__version__: str = importlib_metadata.version("libpdf") +__summary__: str = importlib_metadata.metadata("libpdf")["Summary"] # below imports from libpdf.core cannot be at the top avoid circular import errors in core.py when # importing __version__ and __summary__ -import libpdf._import_forks # noqa: E402, F401 -from libpdf.core import main_api as load # noqa: E402 -from libpdf.core import main_cli # noqa: E402 +import libpdf._import_forks # noqa: F401 +from libpdf.core import main_api as load +from libpdf.core import main_cli # define importable objects -__all__ = ['load', '__version__', '__summary__'] +__all__ = ["load", "__version__", "__summary__"] # Enable running # python -m libpdf.__init__ # python libpdf/__init__.py # before installing the package itself -if __name__ == '__main__': +if __name__ == "__main__": main_cli() diff --git a/libpdf/_import_forks.py b/libpdf/_import_forks.py index ae5934f..240e5cf 100644 --- a/libpdf/_import_forks.py +++ b/libpdf/_import_forks.py @@ -12,20 +12,24 @@ import os import sys -DEPS_DIR = os.path.join(os.path.dirname(__file__), '..', 'deps') +DEPS_DIR = os.path.join(os.path.dirname(__file__), "..", "deps") # first try to import the dependencies so active venvs with direct Git dependencies are not overriden try: import pdfminer except ModuleNotFoundError: # make dependency available as wheel to sys.path as first entry - sys.path.insert(0, os.path.join(DEPS_DIR, 'pdfminer.six-20200517.dev1-py3-none-any.whl')) + sys.path.insert( + 0, os.path.join(DEPS_DIR, "pdfminer.six-20200517.dev1-py3-none-any.whl") + ) else: del pdfminer try: import pdfplumber except ModuleNotFoundError: - sys.path.insert(0, os.path.join(DEPS_DIR, 'pdfplumber-0.5.21.dev1-py3-none-any.whl')) + sys.path.insert( + 0, os.path.join(DEPS_DIR, "pdfplumber-0.5.21.dev1-py3-none-any.whl") + ) else: del pdfplumber diff --git a/libpdf/apiobjects.py b/libpdf/apiobjects.py index 91fb961..dd8ff5b 100644 --- a/libpdf/apiobjects.py +++ b/libpdf/apiobjects.py @@ -1,6 +1,9 @@ """Modules defines extracted object instances.""" -from typing import List, NamedTuple +from typing import NamedTuple + +from pdfminer.pdfdocument import PDFDocument +from pdfplumber.pdf import PDF from libpdf.models.chapter import Chapter from libpdf.models.figure import Figure @@ -8,10 +11,6 @@ from libpdf.models.root import Root from libpdf.models.table import Table -from pdfminer.pdfdocument import PDFDocument - -from pdfplumber.pdf import PDF - # disable too-few-public-methods because this is a data storage class # another option is using a dict but this does not feature IDE type hinting @@ -33,10 +32,10 @@ class ApiObjects: # pylint: disable = too-few-public-methods def __init__( # pylint: disable=too-many-arguments # the parameters are needed to describe in Sphinx autodoc self, root: Root, - chapters: List[Chapter], - paragraphs: List[Paragraph], - tables: List[Table], - figures: List[Figure], + chapters: list[Chapter], + paragraphs: list[Paragraph], + tables: list[Table], + figures: list[Figure], pdfplumber: PDF, pdfminer: PDFDocument, ): @@ -45,7 +44,9 @@ def __init__( # pylint: disable=too-many-arguments # the parameters are needed self.root = root # attributes for API convenience - self.flattened = Flattened(chapters=chapters, paragraphs=paragraphs, tables=tables, figures=figures) + self.flattened = Flattened( + chapters=chapters, paragraphs=paragraphs, tables=tables, figures=figures + ) # exposing the pdfplumber PDF object self.pdfplumber = pdfplumber @@ -66,7 +67,7 @@ def __init__( # pylint: disable=too-many-arguments # the parameters are needed class Flattened(NamedTuple): """NamedTuple to hold flattened Element instances featuring also type hinting.""" - chapters: List[Chapter] - paragraphs: List[Paragraph] - tables: List[Table] - figures: List[Figure] + chapters: list[Chapter] + paragraphs: list[Paragraph] + tables: list[Table] + figures: list[Figure] diff --git a/libpdf/catalog.py b/libpdf/catalog.py index e5029ca..16faa0e 100644 --- a/libpdf/catalog.py +++ b/libpdf/catalog.py @@ -1,29 +1,29 @@ """PDF catalog extraction.""" import logging import re -from typing import Any, Dict, List, Union +from typing import Any, Union + +from pdfminer.pdftypes import PDFObjRef +from pdfminer.psparser import PSLiteral from libpdf.log import logging_needed from libpdf.parameters import ANNO_X_TOLERANCE, ANNO_Y_TOLERANCE from libpdf.progress import bar_format_lvl2, tqdm from libpdf.utils import decode_title, to_pdfplumber_bbox -from pdfminer.pdftypes import PDFObjRef -from pdfminer.psparser import PSLiteral - - LOG = logging.getLogger(__name__) catalog = { - 'outline': {}, - 'annos': {}, - 'dests': {}, + "outline": {}, + "annos": {}, + "dests": {}, } def get_named_destination(pdf): # pylint: disable=too-many-branches - """Extract Name destination catalog. + """ + Extract Name destination catalog. Extracts Name destination catalog (link target) from pdf.doc.catalog['Name'] to obtain the coordinates (x,y) and page for the corresponding destination's name. @@ -37,18 +37,21 @@ def get_named_destination(pdf): # pylint: disable=too-many-branches :param pdf: pdf object of pdfplumber.pdf.PDF :return: named destination dictionary mapping reference of destination by name object """ - LOG.info('Catalog extraction: name destination ...') + LOG.info("Catalog extraction: name destination ...") # check if name tree exist in catalog and extract name tree name_tree = {} named_destination = {} pdf_catalog = pdf.doc.catalog - if 'Names' in pdf_catalog: + if "Names" in pdf_catalog: # PDF 1.2 - if isinstance(pdf_catalog['Names'], PDFObjRef) and 'Dests' in pdf_catalog['Names'].resolve(): - name_tree = pdf_catalog['Names'].resolve()['Dests'].resolve() - elif isinstance(pdf_catalog['Names'], dict) and 'Dests' in pdf_catalog['Names']: - name_tree = pdf_catalog['Names']['Dests'].resolve() + if ( + isinstance(pdf_catalog["Names"], PDFObjRef) + and "Dests" in pdf_catalog["Names"].resolve() + ): + name_tree = pdf_catalog["Names"].resolve()["Dests"].resolve() + elif isinstance(pdf_catalog["Names"], dict) and "Dests" in pdf_catalog["Names"]: + name_tree = pdf_catalog["Names"]["Dests"].resolve() # check if name tree not empty if name_tree: # map page id to page number @@ -61,9 +64,9 @@ def get_named_destination(pdf): # pylint: disable=too-many-branches # name_obj_list always contains a flatten name destination catalog. # resolve name objects - if 'Kids' in name_tree: + if "Kids" in name_tree: kids_hierarchy = [] - kids_hierarchy.extend([kid.resolve() for kid in name_tree['Kids']]) + kids_hierarchy.extend([kid.resolve() for kid in name_tree["Kids"]]) name_obj_list = resolve_name_obj(kids_hierarchy) else: name_obj_list = [name_tree] @@ -71,18 +74,18 @@ def get_named_destination(pdf): # pylint: disable=too-many-branches for index_dest, item_dest in enumerate(name_obj_list): # In 'Names', odd indices are destination's names, while even indices are the obj id which can be # referred to the certain page in PDF - for index_name in range(0, len(item_dest['Names']), 2): - named_destination[name_obj_list[index_dest]['Names'][index_name].decode('utf-8')] = name_obj_list[ - index_dest - ]['Names'][index_name + 1] - elif 'Dests' in pdf_catalog: + for index_name in range(0, len(item_dest["Names"]), 2): + named_destination[ + name_obj_list[index_dest]["Names"][index_name].decode("utf-8") + ] = name_obj_list[index_dest]["Names"][index_name + 1] + elif "Dests" in pdf_catalog: # PDF 1.1 - if isinstance(pdf_catalog['Dests'], PDFObjRef): - named_destination = pdf_catalog['Dests'].resolve() - elif isinstance(pdf_catalog['Dests'], dict): - named_destination = pdf_catalog['Dests'] + if isinstance(pdf_catalog["Dests"], PDFObjRef): + named_destination = pdf_catalog["Dests"].resolve() + elif isinstance(pdf_catalog["Dests"], dict): + named_destination = pdf_catalog["Dests"] else: - LOG.debug('Catalog extraction: name destinations do not exist') + LOG.debug("Catalog extraction: name destinations do not exist") return None for key_object in named_destination: @@ -92,25 +95,26 @@ def get_named_destination(pdf): # pylint: disable=too-many-branches for key_name_dest in named_destination: # get the page number and the coordinate for the destination - if 'D' not in named_destination[key_name_dest]: + if "D" not in named_destination[key_name_dest]: # the value of named_destination is a list, contains explict destination explict_dest = get_explict_dest(pdf, named_destination[key_name_dest]) else: # the value of named_destination is a dictionary with a D entry, whose value is a list like above - explict_dest = get_explict_dest(pdf, named_destination[key_name_dest]['D']) + explict_dest = get_explict_dest(pdf, named_destination[key_name_dest]["D"]) named_destination[key_name_dest] = {} named_destination[key_name_dest] = { - 'X': explict_dest[1], - 'Y': explict_dest[2], - 'Num': explict_dest[0], + "X": explict_dest[1], + "Y": explict_dest[2], + "Num": explict_dest[0], } return named_destination def resolve_name_obj(name_tree_kids): - """Resolve 'Names' objects recursively. + """ + Resolve 'Names' objects recursively. If key 'Kids' exists in 'Names', the name destination is nested in a hierarchical structure. In this case, this recursion is used to resolve all the 'Kids' @@ -120,9 +124,9 @@ def resolve_name_obj(name_tree_kids): """ temp_list = [] for kid in name_tree_kids: - if 'Kids' in kid and kid['Kids']: - temp_list.extend([kid_kid.resolve() for kid_kid in kid['Kids']]) - elif 'Names' in kid: + if "Kids" in kid and kid["Kids"]: + temp_list.extend([kid_kid.resolve() for kid_kid in kid["Kids"]]) + elif "Names" in kid: return name_tree_kids return resolve_name_obj(temp_list) @@ -142,34 +146,36 @@ def get_outline(pdf, des_dict): :param des_dict: the dictionary of name destination :return: outline dictionary in a nested structure with each chapter's coordinates (x0, y0) and pages """ - LOG.info('Catalog extraction: outline ...') + LOG.info("Catalog extraction: outline ...") # check if outlines exist in catalog - if 'Outlines' not in pdf.doc.catalog: - LOG.info('Catalog extraction: outline does not exist...') + if "Outlines" not in pdf.doc.catalog: + LOG.info("Catalog extraction: outline does not exist...") return None # check if outline dictionary not empty - if not pdf.doc.catalog['Outlines'].resolve(): - LOG.info('Catalog extraction: outline exists but is empty...') + if not pdf.doc.catalog["Outlines"].resolve(): + LOG.info("Catalog extraction: outline exists but is empty...") return None # TODO: why need dictionary with only one key here??? Can I change to list? This may affect downstream - outlines = {'content': []} + outlines = {"content": []} - outline_obj = pdf.doc.catalog['Outlines'].resolve() - if 'First' not in outline_obj: + outline_obj = pdf.doc.catalog["Outlines"].resolve() + if "First" not in outline_obj: raise ValueError('Key "First" is not in Outlines') - resolve_outline(outline_obj['First'].resolve(), outlines['content'], des_dict, pdf) + resolve_outline(outline_obj["First"].resolve(), outlines["content"], des_dict, pdf) - if outlines['content']: - chapter_number_giver(outlines['content'], '1') + if outlines["content"]: + chapter_number_giver(outlines["content"], "1") return outlines -def chapter_number_giver(chapters_in_outline: List[Dict], virt_hierarchical_level: str) -> None: +def chapter_number_giver( + chapters_in_outline: list[dict], virt_hierarchical_level: str +) -> None: """ Assign chapter number to each chapter from the index in its title or the hierarchical level on the outline. @@ -180,35 +186,46 @@ def chapter_number_giver(chapters_in_outline: List[Dict], virt_hierarchical_leve :param virt_hierarchical_level: the current level of the outline hierarchy aka virtual number :return: None """ - levels = virt_hierarchical_level.split('.') + levels = virt_hierarchical_level.split(".") start_level = int(levels[-1]) # last item in virt_hierarchical_level - parent_level = '.'.join((levels[0:-1])) # all but last item in virt_hierarchical_level + parent_level = ".".join( + levels[0:-1] + ) # all but last item in virt_hierarchical_level for idx_chapter, chapter in enumerate(chapters_in_outline): current_level = start_level + idx_chapter if parent_level: - new_hierarchical_level = f'{parent_level}.{current_level}' + new_hierarchical_level = f"{parent_level}.{current_level}" else: - new_hierarchical_level = f'{current_level}' + new_hierarchical_level = f"{current_level}" # remove leading spaces - chapter_title = chapter['title'].strip() + chapter_title = chapter["title"].strip() # match chapter number/index if exist, supported titles may contain a-z, A-Z, 0-9 and lower/upper case # roman numbers, separated by dots, e.g. 1.2.3 | 2.a.i | 2.a.IV | 1.2.3. | A | A.a.2 - pattern = re.compile(r'^(?!\.)((^|\.)(([iIvVxX]{1,8})|[a-zA-Z]|[0-9]+))+\.?(?=[ \t]+\S+)') - chapter_number = re.match(pattern, chapter['title'].strip()) + pattern = re.compile( + r"^(?!\.)((^|\.)(([iIvVxX]{1,8})|[a-zA-Z]|[0-9]+))+\.?(?=[ \t]+\S+)" + ) + chapter_number = re.match(pattern, chapter["title"].strip()) if chapter_number: # The assumption is that only one match is found - chapters_in_outline[idx_chapter].update({'number': chapter_number[0]}) - chapters_in_outline[idx_chapter].update({'title': chapter_title.replace(chapter_number[0], '', 1).strip()}) + chapters_in_outline[idx_chapter].update({"number": chapter_number[0]}) + chapters_in_outline[idx_chapter].update( + {"title": chapter_title.replace(chapter_number[0], "", 1).strip()} + ) else: - chapters_in_outline[idx_chapter].update({'number': f'virt.{new_hierarchical_level}'}) + chapters_in_outline[idx_chapter].update( + {"number": f"virt.{new_hierarchical_level}"} + ) - if chapter['content']: + if chapter["content"]: # next deeper level - chapter_number_giver(chapters_in_outline[idx_chapter]['content'], f'{new_hierarchical_level}.1') + chapter_number_giver( + chapters_in_outline[idx_chapter]["content"], + f"{new_hierarchical_level}.1", + ) def resolve_outline(outline_obj, outline_list, des_dict, pdf): # pylint: disable=too-many-branches, too-many-statements @@ -230,76 +247,81 @@ def resolve_outline(outline_obj, outline_list, des_dict, pdf): # pylint: disabl :return: outline list """ # check if outline_obj['A'] and outline_obj['Dest'] coexist - if 'A' in outline_obj and 'Dest' in outline_obj: - LOG.error('Key A and Dest can not coexist in outline.') - raise ValueError('Key A and Dest can not coexist in outline.') + if "A" in outline_obj and "Dest" in outline_obj: + LOG.error("Key A and Dest can not coexist in outline.") + raise ValueError("Key A and Dest can not coexist in outline.") # get outline destination - if 'A' in outline_obj: + if "A" in outline_obj: # make sure outline_obj['A'] is resolved - if isinstance(outline_obj['A'], PDFObjRef): - outline_dest_entry = outline_obj['A'].resolve() + if isinstance(outline_obj["A"], PDFObjRef): + outline_dest_entry = outline_obj["A"].resolve() else: - outline_dest_entry = outline_obj['A'] + outline_dest_entry = outline_obj["A"] # consider only go-to action, used for various targets in PDF standard - if outline_dest_entry['S'].name == 'GoTo': - if isinstance(outline_dest_entry['D'], list): + if outline_dest_entry["S"].name == "GoTo": + if isinstance(outline_dest_entry["D"], list): # explict destination - if isinstance(outline_dest_entry['D'][0], PDFObjRef): - explict_dest = get_explict_dest(pdf, outline_dest_entry['D']) + if isinstance(outline_dest_entry["D"][0], PDFObjRef): + explict_dest = get_explict_dest(pdf, outline_dest_entry["D"]) outline_dest = { - 'page': explict_dest[0], - 'rect_X': explict_dest[1], - 'rect_Y': explict_dest[2], + "page": explict_dest[0], + "rect_X": explict_dest[1], + "rect_Y": explict_dest[2], } - title_bytes = outline_obj['Title'] + title_bytes = outline_obj["Title"] else: raise RuntimeError( f"Page {outline_dest_entry['D'][0]} is not an indirect reference to a page object", ) else: # named destination - if isinstance(outline_dest_entry['D'], PSLiteral): + if isinstance(outline_dest_entry["D"], PSLiteral): # PDF 1.1 name object - outline_dest = outline_dest_entry['D'].name + outline_dest = outline_dest_entry["D"].name else: # PDF 1.2 byte string - outline_dest = outline_dest_entry['D'].decode('utf-8') + outline_dest = outline_dest_entry["D"].decode("utf-8") - if isinstance(outline_obj['Title'], PDFObjRef): - title_bytes = outline_obj['Title'].resolve() # title is a PDFObjRef + if isinstance(outline_obj["Title"], PDFObjRef): + title_bytes = outline_obj["Title"].resolve() # title is a PDFObjRef else: - title_bytes = outline_obj['Title'] + title_bytes = outline_obj["Title"] else: # not go-to action, no destination in this document to jump to outline_dest = None - title_bytes = outline_obj['Title'] - LOG.info('Jump target of outline entry "%s" is outside of this document.', outline_obj) - elif 'Dest' in outline_obj: + title_bytes = outline_obj["Title"] + LOG.info( + 'Jump target of outline entry "%s" is outside of this document.', + outline_obj, + ) + elif "Dest" in outline_obj: # direct destination, used to directly address page locations - if isinstance(outline_obj['Dest'], list): + if isinstance(outline_obj["Dest"], list): # explict destination - if isinstance(outline_obj['Dest'][0], PDFObjRef): - explict_dest = get_explict_dest(pdf, outline_obj['Dest']) + if isinstance(outline_obj["Dest"][0], PDFObjRef): + explict_dest = get_explict_dest(pdf, outline_obj["Dest"]) outline_dest = { - 'page': explict_dest[0], - 'rect_X': explict_dest[1], - 'rect_Y': explict_dest[2], + "page": explict_dest[0], + "rect_X": explict_dest[1], + "rect_Y": explict_dest[2], } else: - raise RuntimeError(f"Page {outline_obj['Dest'][0]} is not an indirect reference to a page object") + raise RuntimeError( + f"Page {outline_obj['Dest'][0]} is not an indirect reference to a page object" + ) else: # named destination - if isinstance(outline_obj['Dest'], PSLiteral): + if isinstance(outline_obj["Dest"], PSLiteral): # PDF 1.1 name object - outline_dest = outline_obj['Dest'].name + outline_dest = outline_obj["Dest"].name else: # PDF 1.2 byte string - outline_dest = outline_obj['Dest'].decode('utf-8') - title_bytes = outline_obj['Title'] + outline_dest = outline_obj["Dest"].decode("utf-8") + title_bytes = outline_obj["Title"] else: - raise ValueError('No key A and Dest in outline.') + raise ValueError("No key A and Dest in outline.") # various encodings like UTF-8 and UTF-16 are in the wild for the title, so using chardet to guess them title_decoded = decode_title(title_bytes) @@ -307,38 +329,51 @@ def resolve_outline(outline_obj, outline_list, des_dict, pdf): # pylint: disabl # check if outline_dest exists if outline_dest: # get outline_dest location and page number and store in temp_dict - if des_dict is not None and not isinstance(outline_dest, dict) and des_dict[outline_dest]: + if ( + des_dict is not None + and not isinstance(outline_dest, dict) + and des_dict[outline_dest] + ): # outline with named destination, which means outline_dest must not be dict, representing explict # destination outline = { - 'number': '', - 'title': title_decoded, - 'position': { + "number": "", + "title": title_decoded, + "position": { # TODO change to x, y and give them meaning when comparing to libpdf elements - 'x0': des_dict[outline_dest]['X'], - 'y1': des_dict[outline_dest]['Y'], # dest (X, Y) is left top, equals (x0, y1) in pdfminer - 'page': des_dict[outline_dest]['Num'], + "x0": des_dict[outline_dest]["X"], + "y1": des_dict[outline_dest][ + "Y" + ], # dest (X, Y) is left top, equals (x0, y1) in pdfminer + "page": des_dict[outline_dest]["Num"], }, - 'content': [], + "content": [], } else: # outline with explict destination outline = { - 'number': '', - 'title': title_decoded, - 'position': { - 'x0': outline_dest['rect_X'], - 'y1': outline_dest['rect_Y'], # dest (X, Y) is left top, equals (x0, y1) in pdfminer - 'page': outline_dest['page'], + "number": "", + "title": title_decoded, + "position": { + "x0": outline_dest["rect_X"], + "y1": outline_dest[ + "rect_Y" + ], # dest (X, Y) is left top, equals (x0, y1) in pdfminer + "page": outline_dest["page"], }, - 'content': [], + "content": [], } outline_list.append(outline) - if 'First' in outline_obj: - resolve_outline(outline_obj['First'].resolve(), outline_list[len(outline_list) - 1]['content'], des_dict, pdf) - if 'Next' in outline_obj: - resolve_outline(outline_obj['Next'].resolve(), outline_list, des_dict, pdf) + if "First" in outline_obj: + resolve_outline( + outline_obj["First"].resolve(), + outline_list[len(outline_list) - 1]["content"], + des_dict, + pdf, + ) + if "Next" in outline_obj: + resolve_outline(outline_obj["Next"].resolve(), outline_list, des_dict, pdf) def get_explict_dest(pdf, dest_list): @@ -358,12 +393,12 @@ def get_explict_dest(pdf, dest_list): # explict destination support a lot possibilities to describe like [page, /XYZ, left, top, zoom], or [page, /Fit] # according to TABLE 8.2 Destination syntax of PDF Reference 1.7 - if dest_list[1].name == 'XYZ': + if dest_list[1].name == "XYZ": dest_rect_x = dest_list[2] dest_rect_y = dest_list[3] else: dest_rect_x = 0 - dest_rect_y = dest_list[0].resolve()['MediaBox'][3] # page top + dest_rect_y = dest_list[0].resolve()["MediaBox"][3] # page top return [dest_page_num, dest_rect_x, dest_rect_y] @@ -386,11 +421,11 @@ def update_ann_info(annotation_page_map, ann_resolved, page, idx_page, pdf): # :return: None """ # safety check - if 'Rect' not in ann_resolved: + if "Rect" not in ann_resolved: LOG.error('"Rect" is missing in annotation.') - if 'A' in ann_resolved and 'Dest' in ann_resolved: - LOG.error('Key A and Dest can not coexist in annotation.') + if "A" in ann_resolved and "Dest" in ann_resolved: + LOG.error("Key A and Dest can not coexist in annotation.") # get annotation location on the page # Rect[0] is the x0 in pdfminer coordination @@ -398,33 +433,37 @@ def update_ann_info(annotation_page_map, ann_resolved, page, idx_page, pdf): # # Rect[2] is the x1 in pdfminer # Rect[3] is the y1 in pdfminer ann_bbox = to_pdfplumber_bbox( - float(ann_resolved['Rect'][0]) - ANNO_X_TOLERANCE, - float(ann_resolved['Rect'][1]) - ANNO_Y_TOLERANCE, - float(ann_resolved['Rect'][2]) + ANNO_X_TOLERANCE, - float(ann_resolved['Rect'][3]) + ANNO_Y_TOLERANCE, + float(ann_resolved["Rect"][0]) - ANNO_X_TOLERANCE, + float(ann_resolved["Rect"][1]) - ANNO_Y_TOLERANCE, + float(ann_resolved["Rect"][2]) + ANNO_X_TOLERANCE, + float(ann_resolved["Rect"][3]) + ANNO_Y_TOLERANCE, page.height, ) page_crop = page.within_bbox(ann_bbox) ann_text = page_crop.extract_text(x_tolerance=1, y_tolerance=4) - if 'A' in ann_resolved: + if "A" in ann_resolved: # make sure ann_resolved['A'] is resolved - if isinstance(ann_resolved['A'], PDFObjRef): - ann_resolved_entry = ann_resolved['A'].resolve() + if isinstance(ann_resolved["A"], PDFObjRef): + ann_resolved_entry = ann_resolved["A"].resolve() else: - ann_resolved_entry = ann_resolved['A'] + ann_resolved_entry = ann_resolved["A"] # consider only go-to action - if ann_resolved_entry['S'].name == 'GoTo': - if isinstance(ann_resolved_entry['D'], list): + if ann_resolved_entry["S"].name == "GoTo": + if isinstance(ann_resolved_entry["D"], list): # explict destination, ann_resolved['A']['D'] is a list - if isinstance(ann_resolved_entry['D'][0], PDFObjRef): - explict_dest = get_explict_dest(pdf, ann_resolved_entry['D']) - annotation_page_map[idx_page + 1]['annotation'].append( + if isinstance(ann_resolved_entry["D"][0], PDFObjRef): + explict_dest = get_explict_dest(pdf, ann_resolved_entry["D"]) + annotation_page_map[idx_page + 1]["annotation"].append( { - 'text': ann_text, - 'rect': ann_resolved['Rect'], - 'dest': {'page': explict_dest[0], 'rect_X': explict_dest[1], 'rect_Y': explict_dest[2]}, + "text": ann_text, + "rect": ann_resolved["Rect"], + "dest": { + "page": explict_dest[0], + "rect_X": explict_dest[1], + "rect_Y": explict_dest[2], + }, }, ) else: @@ -433,59 +472,62 @@ def update_ann_info(annotation_page_map, ann_resolved, page, idx_page, pdf): # ) else: # Named destination - if isinstance(ann_resolved_entry['D'], PSLiteral): + if isinstance(ann_resolved_entry["D"], PSLiteral): # PDF 1.1 name object - des_name = ann_resolved_entry['D'].name + des_name = ann_resolved_entry["D"].name else: # PDF 1.2 byte string - des_name = ann_resolved_entry['D'].decode('utf-8') - annotation_page_map[idx_page + 1]['annotation'].append( + des_name = ann_resolved_entry["D"].decode("utf-8") + annotation_page_map[idx_page + 1]["annotation"].append( { - 'text': ann_text, - 'rect': ann_resolved['Rect'], - 'des_name': des_name, + "text": ann_text, + "rect": ann_resolved["Rect"], + "des_name": des_name, }, ) else: LOG.info( - 'The %s link target on page %s is not in this document.', - ann_resolved_entry['S'].name, + "The %s link target on page %s is not in this document.", + ann_resolved_entry["S"].name, idx_page + 1, ) - elif 'Dest' in ann_resolved: + elif "Dest" in ann_resolved: # direct destination, used to directly address page locations - if isinstance(ann_resolved['Dest'], list): + if isinstance(ann_resolved["Dest"], list): # explict destination - if isinstance(ann_resolved['Dest'][0], PDFObjRef): - explict_dest = get_explict_dest(pdf, ann_resolved['Dest']) + if isinstance(ann_resolved["Dest"][0], PDFObjRef): + explict_dest = get_explict_dest(pdf, ann_resolved["Dest"]) anno_dest = { - 'page': explict_dest[0], - 'rect_X': explict_dest[1], - 'rect_Y': explict_dest[2], + "page": explict_dest[0], + "rect_X": explict_dest[1], + "rect_Y": explict_dest[2], } - annotation_page_map[idx_page + 1]['annotation'].append( - {'text': ann_text, 'rect': ann_resolved['Rect'], 'dest': anno_dest}, + annotation_page_map[idx_page + 1]["annotation"].append( + {"text": ann_text, "rect": ann_resolved["Rect"], "dest": anno_dest}, ) else: - raise RuntimeError(f"Page {ann_resolved['Dest'][0]} is not an indirect reference to a page object") + raise RuntimeError( + f"Page {ann_resolved['Dest'][0]} is not an indirect reference to a page object" + ) else: # Named destination - if isinstance(ann_resolved['Dest'], PSLiteral): + if isinstance(ann_resolved["Dest"], PSLiteral): # PDF 1.1 name object - des_name = ann_resolved['Dest'].name + des_name = ann_resolved["Dest"].name else: # PDF 1.2 byte string - des_name = ann_resolved['Dest'].decode('utf-8') + des_name = ann_resolved["Dest"].decode("utf-8") - annotation_page_map[idx_page + 1]['annotation'].append( - {'text': ann_text, 'rect': ann_resolved['Rect'], 'des_name': des_name}, + annotation_page_map[idx_page + 1]["annotation"].append( + {"text": ann_text, "rect": ann_resolved["Rect"], "des_name": des_name}, ) else: raise Exception('Key "A" and "Dest" do not exist in annotations.') def annotation_dict_extraction(pdf): - """Extract annotation (link source) from the catalog of the PDF. + """ + Extract annotation (link source) from the catalog of the PDF. The annotation is stored in page.page_obj.annots instead of page.anno or pdf.doc.catalog, if annotations exist in the corresponding page. @@ -498,21 +540,30 @@ def annotation_dict_extraction(pdf): -destination's name, which is the interface to map with the name destination catalog (target link). """ - LOG.info('Catalog extraction: annotations ...') + LOG.info("Catalog extraction: annotations ...") annotation_page_map = {} for idx_page, page in enumerate( - tqdm(pdf.pages, desc='###### Extracting annotations', unit='pages', bar_format=bar_format_lvl2()), + tqdm( + pdf.pages, + desc="###### Extracting annotations", + unit="pages", + bar_format=bar_format_lvl2(), + ), ): if logging_needed(idx_page, len(pdf.pages)): - LOG.debug('Catalog extraction: annotations page %s of %s', idx_page + 1, len(pdf.pages)) + LOG.debug( + "Catalog extraction: annotations page %s of %s", + idx_page + 1, + len(pdf.pages), + ) # extract annotations from page_obj.annots, if any exists page_obj = page.page_obj if page_obj.annots is not None: # TODO remove key 'annotation' and refactor relevant code usage - annotation_page_map.update({idx_page + 1: {'annotation': []}}) + annotation_page_map.update({idx_page + 1: {"annotation": []}}) if isinstance(page_obj.annots, PDFObjRef): annotations = page_obj.annots.resolve() @@ -521,10 +572,12 @@ def annotation_dict_extraction(pdf): for ann in annotations: ann_resolved = ann.resolve() - if ann_resolved['Subtype'].name == 'Link': - update_ann_info(annotation_page_map, ann_resolved, page, idx_page, pdf) + if ann_resolved["Subtype"].name == "Link": + update_ann_info( + annotation_page_map, ann_resolved, page, idx_page, pdf + ) # if no link annotation on this page, remove this page from annotation dictionary - if not annotation_page_map[idx_page + 1]['annotation']: + if not annotation_page_map[idx_page + 1]["annotation"]: del annotation_page_map[idx_page + 1] if not annotation_page_map: @@ -534,8 +587,8 @@ def annotation_dict_extraction(pdf): def _resolve_pdf_obj_refs( - object_to_resolve: Union[List, Dict], - resolved_objects_flat: Dict[int, Any], + object_to_resolve: Union[list, dict], + resolved_objects_flat: dict[int, Any], depth=None, reason=None, ): # pylint: disable=too-many-branches, too-many-statements # warning not fixed due to algorithmic benefits @@ -559,11 +612,15 @@ def _resolve_pdf_obj_refs( for key, value in object_to_resolve.items(): if isinstance(value, dict): # recurse and add child dict - ret_dict, _ = _resolve_pdf_obj_refs(value, resolved_objects_flat, depth, f'key {key} > dict') + ret_dict, _ = _resolve_pdf_obj_refs( + value, resolved_objects_flat, depth, f"key {key} > dict" + ) resolved_dict[key] = ret_dict elif isinstance(value, list): # recurse and set list - _, ret_list = _resolve_pdf_obj_refs(value, resolved_objects_flat, depth, f'key {key} > list') + _, ret_list = _resolve_pdf_obj_refs( + value, resolved_objects_flat, depth, f"key {key} > list" + ) resolved_dict[key] = ret_list elif isinstance(value, PDFObjRef): # Parent: used in Page to navigate to Pages @@ -571,7 +628,7 @@ def _resolve_pdf_obj_refs( # Last: used in Outline to get to last section # ParentTree: used in StructTreeRoot to get to parent # P: used in StructElem to get to parent - forbidden_keys = ['Parent', 'Prev', 'Last', 'ParentTree', 'P'] + forbidden_keys = ["Parent", "Prev", "Last", "ParentTree", "P"] if key in forbidden_keys: # don't resolve PDFObjRef under forbidden keys to avoid endless recursion resolved_dict[key] = value @@ -585,7 +642,7 @@ def _resolve_pdf_obj_refs( resolved, resolved_objects_flat, depth, - f'key {key} > PDFObjRef {value.objid} > dict', + f"key {key} > PDFObjRef {value.objid} > dict", ) resolved_dict[key] = ret_dict elif isinstance(resolved, list): @@ -594,21 +651,27 @@ def _resolve_pdf_obj_refs( resolved, resolved_objects_flat, depth, - f'key {key} > PDFObjRef {value.objid} > list', + f"key {key} > PDFObjRef {value.objid} > list", ) resolved_dict[key] = ret_list else: - resolved_dict[key] = resolved # add resolved element to dictionary + resolved_dict[ + key + ] = resolved # add resolved element to dictionary else: # leave other types as they are resolved_dict[key] = value elif isinstance(object_to_resolve, list): for idx, value in enumerate(object_to_resolve): if isinstance(value, dict): - ret_dict, _ = _resolve_pdf_obj_refs(value, resolved_objects_flat, depth, f'list idx {idx} > dict') + ret_dict, _ = _resolve_pdf_obj_refs( + value, resolved_objects_flat, depth, f"list idx {idx} > dict" + ) resolved_list.append(ret_dict) elif isinstance(value, list): - _, ret_list = _resolve_pdf_obj_refs(value, resolved_objects_flat, depth, f'list idx {idx} > list') + _, ret_list = _resolve_pdf_obj_refs( + value, resolved_objects_flat, depth, f"list idx {idx} > list" + ) resolved_list.append(ret_list) elif isinstance(value, PDFObjRef): resolved = value.resolve() @@ -620,7 +683,7 @@ def _resolve_pdf_obj_refs( resolved, resolved_objects_flat, depth, - f'list idx {idx} > PDFObjRef {value.objid} > dict', + f"list idx {idx} > PDFObjRef {value.objid} > dict", ) resolved_list.append(ret_dict) elif isinstance(resolved, list): @@ -629,7 +692,7 @@ def _resolve_pdf_obj_refs( resolved, resolved_objects_flat, depth, - f'list idx {idx} > PDFObjRef {value.objid} > list', + f"list idx {idx} > PDFObjRef {value.objid} > list", ) resolved_list.append(ret_list) else: @@ -638,7 +701,7 @@ def _resolve_pdf_obj_refs( # leave other types as they are resolved_list.append(value) else: - raise RuntimeError('object_to_resolve must of type dictionary or list') + raise RuntimeError("object_to_resolve must of type dictionary or list") del depth[-1] # pop last item in list return resolved_dict, resolved_list @@ -655,7 +718,7 @@ def extract_catalog(pdf, no_annotations: bool): Extract named destination. """ - LOG.info('Catalog extraction started ...') + LOG.info("Catalog extraction started ...") # debug purpose only # resolved_objects: Dict[int, Any] = {} # key is the PDFObjRef.objid and value the resolved object @@ -664,7 +727,7 @@ def extract_catalog(pdf, no_annotations: bool): if no_annotations: ann_dict = None - LOG.info('Catalog extraction: annotations is excluded') + LOG.info("Catalog extraction: annotations is excluded") else: # extract annotation (link source) and store in the dict by pages for further process of links # on texts in extract() @@ -676,6 +739,6 @@ def extract_catalog(pdf, no_annotations: bool): # extract outline of a pdf, if it exists. All the chapters of outline are in a nested and hierarchical structure outline_dict = get_outline(pdf, des_dict) - catalog['outline'] = outline_dict - catalog['annos'] = ann_dict - catalog['dests'] = des_dict + catalog["outline"] = outline_dict + catalog["annos"] = ann_dict + catalog["dests"] = des_dict diff --git a/libpdf/core.py b/libpdf/core.py index 20e3c00..0f9c6d1 100644 --- a/libpdf/core.py +++ b/libpdf/core.py @@ -3,7 +3,7 @@ import logging import re import sys -from typing import List, Optional, Tuple +from typing import Optional import click @@ -24,7 +24,7 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable pdf: str, verbose: int = 0, page_range: str = None, - page_crop: Tuple[float, float, float, float] = None, + page_crop: tuple[float, float, float, float] = None, smart_page_crop: bool = False, output_format: str = None, output_path: str = None, @@ -39,8 +39,8 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable visual_debug: bool = False, visual_debug_output_dir: str = None, visual_split_elements: bool = False, - visual_debug_include_elements: List[str] = None, - visual_debug_exclude_elements: List[str] = None, + visual_debug_include_elements: list[str] = None, + visual_debug_exclude_elements: list[str] = None, ) -> Optional[ApiObjects]: """ Entry point for both CLI and API. @@ -75,17 +75,17 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable :return: instance of Object class for API usage, None for CLI usage """ if page_crop: - parameters.PAGE_CROP_MARGINS['top'] = page_crop[0] - parameters.PAGE_CROP_MARGINS['right'] = page_crop[1] - parameters.PAGE_CROP_MARGINS['bottom'] = page_crop[2] - parameters.PAGE_CROP_MARGINS['left'] = page_crop[3] + parameters.PAGE_CROP_MARGINS["top"] = page_crop[0] + parameters.PAGE_CROP_MARGINS["right"] = page_crop[1] + parameters.PAGE_CROP_MARGINS["bottom"] = page_crop[2] + parameters.PAGE_CROP_MARGINS["left"] = page_crop[3] if cli_usage: - LOG.info('libpdf version %s - %s', __version__, __summary__) + LOG.info("libpdf version %s - %s", __version__, __summary__) with tqdm( total=100, - desc='### libpdf progress', + desc="### libpdf progress", bar_format=bar_format_lvl1(), - unit='%', + unit="%", leave=False, ) as overall_pbar: pages = None @@ -96,23 +96,30 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable LOG.warning("Install optional dependency 'tqdm' for progress bars") if TQDM_AVAILABLE and not COLORAMA_AVAILABLE: - LOG.warning("Install optional dependency 'colorama' for colored progress bars") + LOG.warning( + "Install optional dependency 'colorama' for colored progress bars" + ) - LOG.info('Verbosity level: %s', get_level_name(verbose)) - LOG.info('Input file: %s', pdf) - LOG.info('Output format: %s', output_format) + LOG.info("Verbosity level: %s", get_level_name(verbose)) + LOG.info("Input file: %s", pdf) + LOG.info("Output format: %s", output_format) if output_path: - LOG.info('Output path: %s', output_path) + LOG.info("Output path: %s", output_path) else: - LOG.info('Writing extracted data to stdout') - LOG.info('Page range: [%s]', 'all' if not pages else ','.join(str(x) for x in pages)) - LOG.info('Page crop: %s', 'not cropped' if not page_crop else ' '.join(str(x) for x in page_crop)) - LOG.info('Smart page crop: %s', 'on' if smart_page_crop else 'off') - LOG.info('Extract annotations: %s', 'no' if no_annotations else 'yes') - LOG.info('Extract chapters: %s', 'no' if no_chapters else 'yes') - LOG.info('Extract paragraphs: %s', 'no' if no_paragraphs else 'yes') - LOG.info('Extract tables: %s', 'no' if no_tables else 'yes') - LOG.info('Extract figures: %s', 'no' if no_figures else 'yes') + LOG.info("Writing extracted data to stdout") + LOG.info( + "Page range: [%s]", "all" if not pages else ",".join(str(x) for x in pages) + ) + LOG.info( + "Page crop: %s", + "not cropped" if not page_crop else " ".join(str(x) for x in page_crop), + ) + LOG.info("Smart page crop: %s", "on" if smart_page_crop else "off") + LOG.info("Extract annotations: %s", "no" if no_annotations else "yes") + LOG.info("Extract chapters: %s", "no" if no_chapters else "yes") + LOG.info("Extract paragraphs: %s", "no" if no_paragraphs else "yes") + LOG.info("Extract tables: %s", "no" if no_tables else "yes") + LOG.info("Extract figures: %s", "no" if no_figures else "yes") overall_pbar.update(1) try: objects = extract( @@ -130,7 +137,7 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable ) except LibpdfException: if cli_usage: - LOG.critical('Exiting with code 1') + LOG.critical("Exiting with code 1") sys.exit(1) else: raise @@ -148,9 +155,9 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable if not cli_usage: return objects - LOG.info('Write output...') + LOG.info("Write output...") output_dump(output_format, output_path, objects) - LOG.info('Write output... done') + LOG.info("Write output... done") overall_pbar.update(7) @@ -161,10 +168,10 @@ def main_api( # pylint: disable=too-many-arguments, too-many-locals pdf: str, verbose: int = 1, # log level WARNING for library usage is considered a good compromise as a default page_range: str = None, - page_crop: Tuple[float, float, float, float] = None, + page_crop: tuple[float, float, float, float] = None, smart_page_crop: bool = False, save_figures: bool = False, - figure_dir: str = 'figures', + figure_dir: str = "figures", no_annotations: bool = False, no_chapters: bool = False, no_paragraphs: bool = False, @@ -172,10 +179,10 @@ def main_api( # pylint: disable=too-many-arguments, too-many-locals no_figures: bool = False, init_logging: bool = True, visual_debug: bool = False, - visual_debug_output_dir: str = 'visual_debug_libpdf', + visual_debug_output_dir: str = "visual_debug_libpdf", visual_split_elements: bool = False, - visual_debug_include_elements: List[str] = None, - visual_debug_exclude_elements: List[str] = None, + visual_debug_include_elements: list[str] = None, + visual_debug_exclude_elements: list[str] = None, ) -> ApiObjects: """ Entry point for the usage of libpdf as a library. @@ -214,16 +221,16 @@ def main_api( # pylint: disable=too-many-arguments, too-many-locals for visual_incl_element in visual_debug_include_elements: if visual_incl_element not in RENDER_ELEMENTS: raise ValueError( - f'Given visual included elements {visual_incl_element} not in {RENDER_ELEMENTS}', + f"Given visual included elements {visual_incl_element} not in {RENDER_ELEMENTS}", ) if visual_debug_exclude_elements: for visual_excl_element in visual_debug_exclude_elements: if visual_excl_element not in RENDER_ELEMENTS: raise ValueError( - f'Given visual excluded elements {visual_excl_element} not in {RENDER_ELEMENTS}', + f"Given visual excluded elements {visual_excl_element} not in {RENDER_ELEMENTS}", ) if visual_debug_include_elements and visual_debug_exclude_elements: - raise ValueError('Can not visual include and exclude at the same time.') + raise ValueError("Can not visual include and exclude at the same time.") objects = main( pdf, @@ -255,6 +262,7 @@ def docstring_parameter(*sub): This is used in below main function to get the version and description of the package to the click help screen. """ + # decorator definition def dec(obj): obj.__doc__ = obj.__doc__.format(*sub) @@ -272,12 +280,12 @@ def validate_range(ctx, param, value): if value is None: # this can only happen when the range is not given return value - match = re.match(r'^(\d+-\d+|\d+)(,(\d+-\d+|\d+))*$', value) + match = re.match(r"^(\d+-\d+|\d+)(,(\d+-\d+|\d+))*$", value) if match is None: - raise click.BadParameter('must follow the example pattern 2-3,6,8-12') - numbers = value.replace('-', ',').split(',') - if not all(int(x) < int(y) for x, y in zip(numbers, numbers[1:])): - raise click.BadParameter('values must increase monotonic') + raise click.BadParameter("must follow the example pattern 2-3,6,8-12") + numbers = value.replace("-", ",").split(",") + if not all(int(x) < int(y) for x, y in zip(numbers, numbers[1:], strict=False)): + raise click.BadParameter("values must increase monotonic") return value @@ -293,19 +301,25 @@ def validate_visual_elements(ctx, param, value): # check if multiple options are given, e.g. -ve chapter -ve table # TODO check this if len(value) > 1: - raise click.BadParameter('Option cannot be given multiple times. Use comma separation instead.') + raise click.BadParameter( + "Option cannot be given multiple times. Use comma separation instead." + ) value = value[0] - elements = value.split(',') + elements = value.split(",") if len(elements) != len(set(elements)): - raise click.BadParameter(f'Option {param.name} contains duplicate entries.') + raise click.BadParameter(f"Option {param.name} contains duplicate entries.") for element in elements: if element not in RENDER_ELEMENTS: - raise click.BadParameter(f"Option {param.name} contains an unknown entry '{element}'.") - if param.name == 'visual_debug_exclude_elements': + raise click.BadParameter( + f"Option {param.name} contains an unknown entry '{element}'." + ) + if param.name == "visual_debug_exclude_elements": if len(elements) == len(RENDER_ELEMENTS): # TODO Why is this not supported? It will just save the pages as images which might also be useful. - raise click.BadParameter('Cannot exclude all elements from visual debugging.') + raise click.BadParameter( + "Cannot exclude all elements from visual debugging." + ) return elements @@ -322,15 +336,17 @@ class DependentOption(click.Option): def __init__(self, *args, **kwargs): """Initialize class.""" - self.depends_on = set(kwargs.pop('depends_on', [])) - self.mutually_exclusive = set(kwargs.pop('mutually_exclusive', [])) + self.depends_on = set(kwargs.pop("depends_on", [])) + self.mutually_exclusive = set(kwargs.pop("mutually_exclusive", [])) help_msgs = [] if self.depends_on: help_msgs.append(f"this option depends on [{', '.join(self.depends_on)}]") if self.mutually_exclusive: - help_msgs.append(f"this option is mutually exclusive with [{', '.join(self.mutually_exclusive)}]") - kwargs['help'] = kwargs.get('help', '') + (f' NOTE: {"; ".join(help_msgs)}') + help_msgs.append( + f"this option is mutually exclusive with [{', '.join(self.mutually_exclusive)}]" + ) + kwargs["help"] = kwargs.get("help", "") + (f' NOTE: {"; ".join(help_msgs)}') super().__init__(*args, **kwargs) def handle_parse_result(self, ctx, opts, args): @@ -342,21 +358,25 @@ def handle_parse_result(self, ctx, opts, args): if self.mutually_exclusive.intersection(opts) and self.name in opts: raise click.UsageError( f"Illegal usage: '{self.name}' is mutually exclusive with '{', '.join(self.mutually_exclusive)}' " - 'which is also given.', + "which is also given.", ) return super().handle_parse_result(ctx, opts, args) -@click.command(context_settings={'help_option_names': ['-h', '--help']}, no_args_is_help=True) +@click.command( + context_settings={"help_option_names": ["-h", "--help"]}, no_args_is_help=True +) @click.argument( - 'pdf', + "pdf", required=True, - type=click.Path(exists=True, readable=True, resolve_path=True, file_okay=True, dir_okay=False), + type=click.Path( + exists=True, readable=True, resolve_path=True, file_okay=True, dir_okay=False + ), ) @click.option( - '-v', - '--verbose', + "-v", + "--verbose", count=True, help="""Verbosity level, can be passed repeatedly. @@ -368,150 +388,151 @@ def handle_parse_result(self, ctx, opts, args): """, ) @click.option( - '-p', - '--page-range', + "-p", + "--page-range", callback=validate_range, - help='Page range to extract. No spaces allowed. Examples: 3-5 or 3,4,7 or 3-5,7', + help="Page range to extract. No spaces allowed. Examples: 3-5 or 3,4,7 or 3-5,7", ) @click.option( - '-m', - '--page-crop', + "-m", + "--page-crop", nargs=4, type=float, - help='Margins for all pages given as space delimited floats in the order top right bottom left.' - ' The margins will be ignored during extraction, so this can be used to crop all pages.' - ' The values are given in points (72 points = 1 inch = 25.4 mm). Example: 30 0 45 0', + help="Margins for all pages given as space delimited floats in the order top right bottom left." + " The margins will be ignored during extraction, so this can be used to crop all pages." + " The values are given in points (72 points = 1 inch = 25.4 mm). Example: 30 0 45 0", ) @click.option( - '--smart-page-crop', + "--smart-page-crop", is_flag=True, - help='Flag enabling a smart header/footer detection. The algorithm will get the' - ' bounding boxes of all paragraphs, tables and figures inside a defined area (given by' - ' default parameters) at the top/bottom parts of all pages.' - ' If a certain box is found on multiple pages it is considered a header/footer element and will be' - ' ignored for the extraction. This feature can be used together with --page-crop. In this case the pages will' - ' first be cropped to the values defined in page_margins and then the header/footer detection will run.', + help="Flag enabling a smart header/footer detection. The algorithm will get the" + " bounding boxes of all paragraphs, tables and figures inside a defined area (given by" + " default parameters) at the top/bottom parts of all pages." + " If a certain box is found on multiple pages it is considered a header/footer element and will be" + " ignored for the extraction. This feature can be used together with --page-crop. In this case the pages will" + " first be cropped to the values defined in page_margins and then the header/footer detection will run.", ) @click.option( - '-f', - '--output-format', - type=click.Choice(['yaml', 'json'], case_sensitive=False), - default='yaml', - help='Output format.', + "-f", + "--output-format", + type=click.Choice(["yaml", "json"], case_sensitive=False), + default="yaml", + help="Output format.", ) -@click.option('-o', '--output-path', type=click.Path(file_okay=True, dir_okay=False)) +@click.option("-o", "--output-path", type=click.Path(file_okay=True, dir_okay=False)) @click.option( - '-sf', - '--save-figures', + "-sf", + "--save-figures", is_flag=True, show_default=True, - help='Flag enabling the export of PDF figures into the directory given in --figure-dir.' - ' Has no effect if --no-figures is also given.', + help="Flag enabling the export of PDF figures into the directory given in --figure-dir." + " Has no effect if --no-figures is also given.", ) @click.option( - '-d', - '--figure-dir', + "-d", + "--figure-dir", type=click.Path(file_okay=False, dir_okay=True), - default='figures', + default="figures", show_default=True, - help='Output directory for extracted figures; if it does not exist, it will be created', + help="Output directory for extracted figures; if it does not exist, it will be created", ) @click.option( - '--no-annotations', + "--no-annotations", is_flag=True, show_default=True, - help='Do not extract annotations from catalog. All PDF-internal links will not be resolved.' - ' Chapter detection however will work', + help="Do not extract annotations from catalog. All PDF-internal links will not be resolved." + " Chapter detection however will work", ) @click.option( - '--no-chapters', + "--no-chapters", is_flag=True, show_default=True, - help='Do not extract chapter/outline structure. The list of paragraphs, tables and figures will be flattened.', + help="Do not extract chapter/outline structure. The list of paragraphs, tables and figures will be flattened.", ) @click.option( - '--no-paragraphs', + "--no-paragraphs", is_flag=True, show_default=True, - help='Skip paragraphs. The chapter structure will still be preserved.', + help="Skip paragraphs. The chapter structure will still be preserved.", ) -@click.option('--no-tables', is_flag=True, help='Skip tables.') +@click.option("--no-tables", is_flag=True, help="Skip tables.") @click.option( - '--no-figures', + "--no-figures", is_flag=True, show_default=True, - help='Skip figures. Figures will not be part of the output JSON/YAML structures and also not saved if' - ' --save-figures is given.', + help="Skip figures. Figures will not be part of the output JSON/YAML structures and also not saved if" + " --save-figures is given.", ) -@click.option('-vd', '--visual-debug', is_flag=True, help='Visual debug libpdf.') +@click.option("-vd", "--visual-debug", is_flag=True, help="Visual debug libpdf.") @click.option( - '-vo', - '--visual-debug-output-dir', + "-vo", + "--visual-debug-output-dir", cls=DependentOption, - depends_on=['visual_debug'], + depends_on=["visual_debug"], type=click.Path(file_okay=False, dir_okay=True), - default='visual_debug_libpdf', + default="visual_debug_libpdf", show_default=True, - help='Output directory for visualized pdf pages.', + help="Output directory for visualized pdf pages.", ) @click.option( - '-vs', - '--visual-split-elements', + "-vs", + "--visual-split-elements", is_flag=True, show_default=True, - help='Put visual debugging elements into separate directories.', + help="Put visual debugging elements into separate directories.", ) @click.option( - '-vi', - '--visual-debug-include-elements', + "-vi", + "--visual-debug-include-elements", cls=DependentOption, type=str, callback=validate_visual_elements, - help='Included visualized elements when visual debugging. ' + help="Included visualized elements when visual debugging. " 'No space allowed. Example: "chapter,table" or "paragraph"', - mutually_exclusive=['visual_debug_exclude_elements'], - depends_on=['visual_debug'], + mutually_exclusive=["visual_debug_exclude_elements"], + depends_on=["visual_debug"], multiple=True, # this error conditions is handled in the callback ) @click.option( - '-ve', - '--visual-debug-exclude-elements', + "-ve", + "--visual-debug-exclude-elements", cls=DependentOption, type=str, callback=validate_visual_elements, help='Excluded visualized elements when visual debugging. No space allowed. Example: "chapter,table,paragraph"', - mutually_exclusive=['visual_debug_include_elements'], - depends_on=['visual_debug'], + mutually_exclusive=["visual_debug_include_elements"], + depends_on=["visual_debug"], multiple=True, # this error conditions is handled in the callback ) @click.version_option(version=__version__) -@click.help_option('-h', '--help') +@click.help_option("-h", "--help") @docstring_parameter(__version__, __summary__) # flake8 ignore of docstring issues D400 (end with period) and D403 (first word capitalized) not reasonable here # as the docstring is used by click in the CLI help page def main_cli(**kwargs): - """libpdf version {0}: {1} + """ + libpdf version {0}: {1} The argument PDF points to the PDF path that shall be extracted. - """ # noqa: D400, D403 + """ # noqa: D400 config_logger(cli=True) - set_log_level(kwargs['verbose']) # if not given it's 0 which means log level ERROR + set_log_level(kwargs["verbose"]) # if not given it's 0 which means log level ERROR main(**kwargs, cli_usage=True) -def calculate_pages(page_range_string) -> List[int]: +def calculate_pages(page_range_string) -> list[int]: """ Calculate a list of pages from the ranges given as CLI parameter page-range. :param page_range_string: CLI parameter page-range :return: list of pages in given range """ - page_ranges = page_range_string.split(',') + page_ranges = page_range_string.split(",") pages = [] for page_range in page_ranges: - if '-' in page_range: - start_page = int(page_range.split('-')[0]) - end_page = int(page_range.split('-')[1]) + if "-" in page_range: + start_page = int(page_range.split("-")[0]) + end_page = int(page_range.split("-")[1]) else: start_page = int(page_range) end_page = int(page_range) diff --git a/libpdf/extract.py b/libpdf/extract.py index 9437ae4..761a0f5 100644 --- a/libpdf/extract.py +++ b/libpdf/extract.py @@ -4,7 +4,11 @@ import os from datetime import datetime from pathlib import Path -from typing import List, Optional +from typing import Optional + +import pdfplumber +import yaml +from pdfminer.layout import LTText from libpdf import parameters from libpdf import process as pro @@ -31,12 +35,6 @@ from libpdf.textbox import extract_linked_chars, extract_paragraphs_chapters from libpdf.utils import lt_page_crop, lt_to_libpdf_hbox_converter, to_pdfplumber_bbox -from pdfminer.layout import LTText - -import pdfplumber - -import yaml - LOG = logging.getLogger(__name__) @@ -46,7 +44,7 @@ class FoldedStr(str): def folded_str_representer(dumper, text): """Warp function of the representer.""" - return dumper.represent_scalar('tag', text, style='>') + return dumper.represent_scalar("tag", text, style=">") yaml.add_representer(FoldedStr, folded_str_representer) @@ -54,7 +52,7 @@ def folded_str_representer(dumper, text): def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-statements, too-many-arguments pdf_path: str, - pages: Optional[List[int]], + pages: Optional[list[int]], smart_page_crop: bool, save_figures: bool, figure_dir: Optional[str], @@ -82,21 +80,23 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta :return: instance of Objects class :raise LibpdfException: PDF contains no pages """ - LOG.info('PDF extraction started ...') + LOG.info("PDF extraction started ...") - LOG.info('Loading the PDF with pdfminer LTTextBox analysis ...') + LOG.info("Loading the PDF with pdfminer LTTextBox analysis ...") with pdfplumber.open(pdf_path, laparams=LA_PARAMS) as pdf: - LOG.info('The PDF has %s pages', len(pdf.pages)) + LOG.info("The PDF has %s pages", len(pdf.pages)) if pages: # TODO: # 2. checkout if delete pages works before pdfplumber loading pdf, which can probably improve performance # 3. page_range extract cover feature header/footer??? - includelist_non_existent = [page for page in pages if page <= 0 or page > len(pdf.pages)] + includelist_non_existent = [ + page for page in pages if page <= 0 or page > len(pdf.pages) + ] if includelist_non_existent: LOG.error( - 'The selected page number(s) [%s] do not exist in the pdf. They will be skipped.', - ','.join([str(x) for x in includelist_non_existent]), + "The selected page number(s) [%s] do not exist in the pdf. They will be skipped.", + ",".join([str(x) for x in includelist_non_existent]), ) # delete pages from pdfplumber that are not in the extracted_pages list @@ -106,7 +106,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta pdf.pages.remove(page) if len(pdf.pages) == 0: - message = 'Page range selection: no pages left in the PDF to analyze.' + message = "Page range selection: no pages left in the PDF to analyze." LOG.critical(message) raise LibpdfException(message) @@ -123,7 +123,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta pages_list = extract_page_metadata(pdf) if not pages_list: - raise LibpdfException('PDF contains no pages') + raise LibpdfException("PDF contains no pages") overall_pbar.update(1) @@ -135,7 +135,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta # It is the pre-process for the table extraction to see if the table is recognised as the figure # In some cases, an element is recognised as a table and a figure at the same time if no_figures: - LOG.info('Excluding figures extraction') + LOG.info("Excluding figures extraction") figure_list = [] else: figure_list = extract_figures(pdf, pages_list, figure_dir) @@ -145,7 +145,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta overall_pbar.update(30) if no_tables: - LOG.info('Excluding tables extraction') + LOG.info("Excluding tables extraction") table_list = [] else: table_list = extract_pdf_table(pdf, pages_list, figure_list) @@ -168,15 +168,17 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta if smart_page_crop: paragraph_list = smart_page_crop_header_footer(pdf, paragraph_list) - element_list = pro.merge_all_elements(figure_list, table_list, paragraph_list, chapter_list) + element_list = pro.merge_all_elements( + figure_list, table_list, paragraph_list, chapter_list + ) # to check if elements shall be mapped into nested outline structure. - if catalog['outline'] is not None and not no_chapters: - element_list = pro.map_elements_outline(element_list, catalog['outline']) + if catalog["outline"] is not None and not no_chapters: + element_list = pro.map_elements_outline(element_list, catalog["outline"]) root = Root(file, pages_list, element_list) - if catalog['annos']: + if catalog["annos"]: pro.libpdf_target_explorer(paragraph_list, pages_list) pro.libpdf_target_explorer(table_list, pages_list) @@ -200,7 +202,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta return objects -def smart_page_crop_header_footer( # pylint: disable=too-many-branches, too-many-locals # noqa: C901 # too-complex +def smart_page_crop_header_footer( # pylint: disable=too-many-branches, too-many-locals # too-complex pdf, elements_list, ): @@ -230,7 +232,9 @@ def smart_page_crop_header_footer( # pylint: disable=too-many-branches, too-man page_height = float(pdf.pages[0].mediabox[3]) header_elements_list = [] - default_header_bottom = (1 - parameters.SMART_PAGE_CROP_REL_MARGINS['top']) * page_height + default_header_bottom = ( + 1 - parameters.SMART_PAGE_CROP_REL_MARGINS["top"] + ) * page_height for pot_header_elements in elements_page_dict.values(): # pylint: disable=too-many-nested-blocks # potential header element @@ -242,8 +246,16 @@ def smart_page_crop_header_footer( # pylint: disable=too-many-branches, too-man element_cnt = 0 for header_element in header_elements: if ( - abs(pot_header_element.position.y0 - header_element.position.y0) < 1 - and abs(pot_header_element.position.y1 - header_element.position.y1) < 1 + abs( + pot_header_element.position.y0 + - header_element.position.y0 + ) + < 1 + and abs( + pot_header_element.position.y1 + - header_element.position.y1 + ) + < 1 ): element_cnt += 1 # on one page several header elements may have same y coordination but count only once @@ -251,20 +263,26 @@ def smart_page_crop_header_footer( # pylint: disable=too-many-branches, too-man page_cnt = page_cnt + element_cnt # occur on more than HEADER_FOOTER_OCCURRENCE_PERCENTAGE pages, considered as header element # and remove from list - if page_cnt >= parameters.HEADER_FOOTER_OCCURRENCE_PERCENTAGE * len(pdf.pages): + if page_cnt >= parameters.HEADER_FOOTER_OCCURRENCE_PERCENTAGE * len( + pdf.pages + ): header_elements_list.append(pot_header_element) # remove false header elements from potential header elements list if header_elements_list: - real_header_elements_list = check_false_positive_header_footer(pdf, header_elements_list) + real_header_elements_list = check_false_positive_header_footer( + pdf, header_elements_list + ) else: real_header_elements_list = header_elements_list # remove header elements - elements_list = [element for element in elements_list if element not in real_header_elements_list] + elements_list = [ + element for element in elements_list if element not in real_header_elements_list + ] footer_elements_list = [] - default_footer_top = parameters.SMART_PAGE_CROP_REL_MARGINS['bottom'] * page_height + default_footer_top = parameters.SMART_PAGE_CROP_REL_MARGINS["bottom"] * page_height for pot_footer_elements in elements_page_dict.values(): # pylint: disable=too-many-nested-blocks # potential footer element for pot_footer_element in pot_footer_elements: @@ -275,8 +293,16 @@ def smart_page_crop_header_footer( # pylint: disable=too-many-branches, too-man element_cnt = 0 for footer_element in footer_elements: if ( - abs(pot_footer_element.position.y0 - footer_element.position.y0) < 1 - and abs(pot_footer_element.position.y1 - footer_element.position.y1) < 1 + abs( + pot_footer_element.position.y0 + - footer_element.position.y0 + ) + < 1 + and abs( + pot_footer_element.position.y1 + - footer_element.position.y1 + ) + < 1 ): element_cnt += 1 # on one page several footer elements may have same y coordination but count only once @@ -284,17 +310,23 @@ def smart_page_crop_header_footer( # pylint: disable=too-many-branches, too-man page_cnt = page_cnt + element_cnt # occur on more than HEADER_FOOTER_OCCURRENCE_PERCENTAGE pages, considered as footer element # and remove from list - if page_cnt >= parameters.HEADER_FOOTER_OCCURRENCE_PERCENTAGE * len(pdf.pages): + if page_cnt >= parameters.HEADER_FOOTER_OCCURRENCE_PERCENTAGE * len( + pdf.pages + ): footer_elements_list.append(pot_footer_element) # filter out false footer elements if footer_elements_list: - real_footer_elements_list = check_false_positive_header_footer(pdf, footer_elements_list) + real_footer_elements_list = check_false_positive_header_footer( + pdf, footer_elements_list + ) else: real_footer_elements_list = footer_elements_list # remove footer elements - elements_list = [element for element in elements_list if element not in real_footer_elements_list] + elements_list = [ + element for element in elements_list if element not in real_footer_elements_list + ] return elements_list @@ -331,9 +363,13 @@ def check_false_positive_header_footer(pdf, elements_list): # pylint: disable=t # search the lowest element height on each page element_low_pos_dict = {} for page_num, elements in elements_page_dict.items(): - lowest_element_pos = float(f'{elements[0].position.y0:.4f}') # restrict to 4 digits precision + lowest_element_pos = float( + f"{elements[0].position.y0:.4f}" + ) # restrict to 4 digits precision for element in elements: - lowest_element_pos = min(lowest_element_pos, float(f'{element.position.y0:.4f}')) + lowest_element_pos = min( + lowest_element_pos, float(f"{element.position.y0:.4f}") + ) element_low_pos_dict[page_num] = lowest_element_pos start_page_low_pos = list(element_low_pos_dict)[0] @@ -342,7 +378,10 @@ def check_false_positive_header_footer(pdf, elements_list): # pylint: disable=t # find the lowest y0 header_low_pos = min(set(element_low_pos_dict.values())) # check continuous of potential header/footer element from start page to end page - if page_breaks / (end_page_low_pos - start_page_low_pos + 1) <= PAGES_MISSING_HEADER_OR_FOOTER_PERCENTAGE: + if ( + page_breaks / (end_page_low_pos - start_page_low_pos + 1) + <= PAGES_MISSING_HEADER_OR_FOOTER_PERCENTAGE + ): # check unique low_pos if len(set(element_low_pos_dict.values())) != 1: # a list of page numbers to check the element's continuous @@ -352,7 +391,9 @@ def check_false_positive_header_footer(pdf, elements_list): # pylint: disable=t continuous_page_list.append(page) # check header/footer continuous sorted_continuous_page_list = sorted(continuous_page_list) - continuous_list_length = sorted_continuous_page_list[-1] - sorted_continuous_page_list[0] + 1 + continuous_list_length = ( + sorted_continuous_page_list[-1] - sorted_continuous_page_list[0] + 1 + ) # TODO: need to improve the parameter UNIQUE_HEADER_OR_FOOTER_ELEMENTS_PERCENTAGE to solve # partially continuous header or footer elements if len( @@ -364,7 +405,7 @@ def check_false_positive_header_footer(pdf, elements_list): # pylint: disable=t UNIQUE_HEADER_OR_FOOTER_ELEMENTS_PERCENTAGE * len(pdf.pages), ): for idx, element in enumerate(elements_list): - if float(f'{element.position.y0:.4f}') == header_low_pos: + if float(f"{element.position.y0:.4f}") == header_low_pos: del elements_list[idx] # recursively check again, to find the next min_low_pos, which will determine the header/footer boundary if elements_list: @@ -374,7 +415,7 @@ def check_false_positive_header_footer(pdf, elements_list): # pylint: disable=t elements_list.pop() else: for idx, element in enumerate(elements_list): - if float(f'{element.position.y0:.4f}') == header_low_pos: + if float(f"{element.position.y0:.4f}") == header_low_pos: del elements_list[idx] # recursively check again, to find the next min_low_pos, which will determine the header/footer boundary if elements_list: @@ -397,26 +438,31 @@ def delete_page_ann(pdf): :param pdf: :return: """ - LOG.info('Deleting strange anno objects created by layout analysis ...') + LOG.info("Deleting strange anno objects created by layout analysis ...") for idx_page, page in enumerate( - tqdm(pdf.pages, desc='###### Deleting anno objects', unit='pages', bar_format=bar_format_lvl2()), + tqdm( + pdf.pages, + desc="###### Deleting anno objects", + unit="pages", + bar_format=bar_format_lvl2(), + ), ): if logging_needed(idx_page, len(pdf.pages)): LOG.debug( - 'Deleting strange anno objects created by layout analysis page %s of %s', + "Deleting strange anno objects created by layout analysis page %s of %s", idx_page + 1, len(pdf.pages), ) # filter out the strange items - if 'anno' in page.objects: - page.objects['anno'] = [ + if "anno" in page.objects: + page.objects["anno"] = [ item - for item in page.objects['anno'] - if not (item['object_type'] == 'anno' and item['text'] in [' ', '\n']) + for item in page.objects["anno"] + if not (item["object_type"] == "anno" and item["text"] in [" ", "\n"]) ] - if not page.objects['anno']: + if not page.objects["anno"]: # remove the whole key if it's empty after above operation - del page.objects['anno'] + del page.objects["anno"] return pdf @@ -432,13 +478,13 @@ def file_info_extraction(pdf, pdf_path): :param pdf_path: path to the PDF to read :return: File object containing file and file meta information """ - LOG.info('Extracting file information ...') + LOG.info("Extracting file information ...") file_name = os.path.basename(pdf_path) # date format string example D:20110120163651-05'00' # zulu timezone Z0000 will be converted to +0000 def _time_preprocess(date_str): # converts to 20110120163651-0500 - return date_str.replace('D:', '').replace("'", '').replace('Z', '+') + return date_str.replace("D:", "").replace("'", "").replace("Z", "+") def _get_datetime_format(date: str): """ @@ -446,33 +492,37 @@ def _get_datetime_format(date: str): The returned format string is valid after pre-processing, see _time_preprocess(). """ - if '+' in date or '-' in date: - return '%Y%m%d%H%M%S%z' # with timezone - return '%Y%m%d%H%M%S' # without timezone + if "+" in date or "-" in date: + return "%Y%m%d%H%M%S%z" # with timezone + return "%Y%m%d%H%M%S" # without timezone file_meta_params = {} - if 'Author' in pdf.metadata: - file_meta_params.update({'author': pdf.metadata['Author']}) - if 'Title' in pdf.metadata: - file_meta_params.update({'title': pdf.metadata['Title']}) - if 'Subject' in pdf.metadata: - file_meta_params.update({'subject': pdf.metadata['Subject']}) - if 'Creator' in pdf.metadata: - file_meta_params.update({'creator': pdf.metadata['Creator']}) - if 'Producer' in pdf.metadata: - file_meta_params.update({'producer': pdf.metadata['Producer']}) - if 'Keywords' in pdf.metadata: - file_meta_params.update({'keywords': pdf.metadata['Keywords']}) - if 'CreationDate' in pdf.metadata: - preprocessed_date = _time_preprocess(pdf.metadata['CreationDate']) + if "Author" in pdf.metadata: + file_meta_params.update({"author": pdf.metadata["Author"]}) + if "Title" in pdf.metadata: + file_meta_params.update({"title": pdf.metadata["Title"]}) + if "Subject" in pdf.metadata: + file_meta_params.update({"subject": pdf.metadata["Subject"]}) + if "Creator" in pdf.metadata: + file_meta_params.update({"creator": pdf.metadata["Creator"]}) + if "Producer" in pdf.metadata: + file_meta_params.update({"producer": pdf.metadata["Producer"]}) + if "Keywords" in pdf.metadata: + file_meta_params.update({"keywords": pdf.metadata["Keywords"]}) + if "CreationDate" in pdf.metadata: + preprocessed_date = _time_preprocess(pdf.metadata["CreationDate"]) time_format = _get_datetime_format(preprocessed_date) - file_meta_params.update({'creation_date': datetime.strptime(preprocessed_date, time_format)}) - if 'ModDate' in pdf.metadata: - preprocessed_date = _time_preprocess(pdf.metadata['ModDate']) + file_meta_params.update( + {"creation_date": datetime.strptime(preprocessed_date, time_format)} + ) + if "ModDate" in pdf.metadata: + preprocessed_date = _time_preprocess(pdf.metadata["ModDate"]) time_format = _get_datetime_format(preprocessed_date) - file_meta_params.update({'modified_date': datetime.strptime(preprocessed_date, time_format)}) - if 'Trapped' in pdf.metadata: - file_meta_params.update({'trapped': pdf.metadata['Trapped']}) + file_meta_params.update( + {"modified_date": datetime.strptime(preprocessed_date, time_format)} + ) + if "Trapped" in pdf.metadata: + file_meta_params.update({"trapped": pdf.metadata["Trapped"]}) file_meta_data = FileMeta(**file_meta_params) @@ -480,10 +530,10 @@ def _get_datetime_format(date: str): file_name, pdf_path, len(pdf.pages), - parameters.PAGE_CROP_MARGINS['top'], - parameters.PAGE_CROP_MARGINS['bottom'], - parameters.PAGE_CROP_MARGINS['left'], - parameters.PAGE_CROP_MARGINS['right'], + parameters.PAGE_CROP_MARGINS["top"], + parameters.PAGE_CROP_MARGINS["bottom"], + parameters.PAGE_CROP_MARGINS["left"], + parameters.PAGE_CROP_MARGINS["right"], file_meta_data, ) @@ -502,14 +552,19 @@ def extract_page_metadata(pdf): :param pdf: instance of pdfplumber.pdf.PDF class :return: A list of Page objects """ - LOG.info('Extracting page metadata ...') + LOG.info("Extracting page metadata ...") page_list = [] for idx_page, page in enumerate( - tqdm(pdf.pages, desc='###### Extracting metadata', unit='pages', bar_format=bar_format_lvl2()), + tqdm( + pdf.pages, + desc="###### Extracting metadata", + unit="pages", + bar_format=bar_format_lvl2(), + ), ): if logging_needed(idx_page, len(pdf.pages)): - LOG.debug('Extracting metadata page %s of %s', idx_page + 1, len(pdf.pages)) + LOG.debug("Extracting metadata page %s of %s", idx_page + 1, len(pdf.pages)) page_obj = Page(page.page_number, float(page.width), float(page.height)) page_list.append(page_obj) @@ -520,18 +575,21 @@ def extract_figures( pdf, pages_list, figure_dir, -) -> List[ - Figure -]: # pylint: disable=too-many-nested-blocks, too-many-branches # local algorithm, easier to read when not split up +) -> list[Figure]: # pylint: disable=too-many-nested-blocks, too-many-branches # local algorithm, easier to read when not split up """Extract figures in PDF.""" - LOG.info('Extracting figures ...') + LOG.info("Extracting figures ...") figure_list = [] for idx_page, page in enumerate( # pylint: disable=too-many-nested-blocks - tqdm(pdf.pages, desc='###### Extracting figures', unit='pages', bar_format=bar_format_lvl2()), + tqdm( + pdf.pages, + desc="###### Extracting figures", + unit="pages", + bar_format=bar_format_lvl2(), + ), ): if logging_needed(idx_page, len(pdf.pages)): - LOG.debug('Extracting figures page %s of %s', idx_page + 1, len(pdf.pages)) + LOG.debug("Extracting figures page %s of %s", idx_page + 1, len(pdf.pages)) page_crop = pro.remove_page_header_footer(page) lt_page = page._layout # pylint: disable=protected-access # easiest way to obtain LTPage @@ -541,10 +599,10 @@ def extract_figures( if len(figures) != 0: for idx_figure, figure in enumerate(figures): fig_pos = Position( - float(figure['x0']), - float(figure['y0']), - float(figure['x1']), - float(figure['y1']), + float(figure["x0"]), + float(figure["y0"]), + float(figure["x1"]), + float(figure["y1"]), pages_list[idx_page], ) bbox = (fig_pos.x0, fig_pos.y0, fig_pos.x1, fig_pos.y1) @@ -559,7 +617,7 @@ def extract_figures( textboxes = [] links = [] for lt_textbox in lt_textboxes: - if catalog['annos']: + if catalog["annos"]: links.extend(extract_linked_chars(lt_textbox, lt_page.pageid)) bbox = (lt_textbox.x0, lt_textbox.y0, lt_textbox.x1, lt_textbox.y1) @@ -567,14 +625,16 @@ def extract_figures( textboxes.append(hbox) - image_name = f'page_{page.page_number}_figure.{idx_figure + 1}.png' + image_name = f"page_{page.page_number}_figure.{idx_figure + 1}.png" # create figures directory if not exist Path(figure_dir).mkdir(parents=True, exist_ok=True) image_path = os.path.abspath(os.path.join(figure_dir, image_name)) - figure = Figure(idx_figure + 1, image_path, fig_pos, links, textboxes, 'None') + figure = Figure( + idx_figure + 1, image_path, fig_pos, links, textboxes, "None" + ) figure_list.append(figure) return figure_list @@ -589,12 +649,18 @@ def images_to_save(pdf, figure_list): page_crop = pro.remove_page_header_footer(page) - bbox = to_pdfplumber_bbox(fig.position.x0, fig.position.y0, fig.position.x1, fig.position.y1, page.height) + bbox = to_pdfplumber_bbox( + fig.position.x0, + fig.position.y0, + fig.position.x1, + fig.position.y1, + page.height, + ) crop_page_figure = page_crop.within_bbox(bbox) image_path = fig.rel_path image = crop_page_figure.to_image(resolution=300) - image.save(image_path, format='png') + image.save(image_path, format="png") def check_and_filter_figures(figures_list): # pylint: disable=too-many-branches @@ -639,28 +705,33 @@ def check_and_filter_figures(figures_list): # pylint: disable=too-many-branches filtered_figures = [] for figure in figures_list: # check if figure really exist and figure's size if it's human readable - if figure['height'] > FIGURE_MIN_HEIGHT and figure['width'] > FIGURE_MIN_WIDTH: + if figure["height"] > FIGURE_MIN_HEIGHT and figure["width"] > FIGURE_MIN_WIDTH: filtered_figures.append(figure) for figure in filtered_figures: # if figure exceed the boundary of the page, then only keep the part of figure that inside this page - if not (figure['x0'] >= 0 and figure['y0'] >= 0 and figure['x1'] >= 0 and figure['y1'] >= 0): - if figure['x0'] < 0: - figure['x0'] = 0 - if figure['y0'] < 0: - figure['y0'] = 0 - if figure['x1'] < 0: - figure['x1'] = 0 - if figure['y1'] < 0: - figure['y1'] = 0 + if not ( + figure["x0"] >= 0 + and figure["y0"] >= 0 + and figure["x1"] >= 0 + and figure["y1"] >= 0 + ): + if figure["x0"] < 0: + figure["x0"] = 0 + if figure["y0"] < 0: + figure["y0"] = 0 + if figure["x1"] < 0: + figure["x1"] = 0 + if figure["y1"] < 0: + figure["y1"] = 0 # check if figures completely inside another figures and remove small figures for fig0, fig1 in itertools.combinations(filtered_figures, 2): if ( - fig0['x0'] <= fig1['x0'] - and fig0['y0'] <= fig1['y0'] - and fig0['x1'] >= fig1['x1'] - and fig0['y1'] >= fig1['y1'] + fig0["x0"] <= fig1["x0"] + and fig0["y0"] <= fig1["y0"] + and fig0["x1"] >= fig1["x1"] + and fig0["y1"] >= fig1["y1"] ): if fig1 in filtered_figures: filtered_figures.remove(fig1) @@ -669,16 +740,19 @@ def check_and_filter_figures(figures_list): # pylint: disable=too-many-branches for fig0, fig1 in itertools.combinations(filtered_figures, 2): # check partially overlap if not ( - fig0['x0'] > fig1['x1'] or fig0['x1'] < fig1['x0'] or fig0['y0'] > fig1['y1'] or fig0['y1'] < fig1['y0'] + fig0["x0"] > fig1["x1"] + or fig0["x1"] < fig1["x0"] + or fig0["y0"] > fig1["y1"] + or fig0["y1"] < fig1["y0"] ): if not ( - fig0['x0'] <= fig1['x0'] - and fig0['y0'] <= fig1['y0'] - and fig0['x1'] >= fig1['x1'] - and fig0['y1'] >= fig1['y1'] + fig0["x0"] <= fig1["x0"] + and fig0["y0"] <= fig1["y0"] + and fig0["x1"] >= fig1["x1"] + and fig0["y1"] >= fig1["y1"] ): # compare the size of two figures, keep the bigger figure - if fig0['width'] * fig0['height'] <= fig1['width'] * fig1['height']: + if fig0["width"] * fig0["height"] <= fig1["width"] * fig1["height"]: if fig0 in filtered_figures: filtered_figures.remove(fig0) else: diff --git a/libpdf/log.py b/libpdf/log.py index 9d1f10e..ca1c9f4 100644 --- a/libpdf/log.py +++ b/libpdf/log.py @@ -15,10 +15,10 @@ def get_level_name(verbose): """Return the log levels for the CLI verbosity flag in words.""" verbose = min(verbose, 3) level_dict = { - 0: 'ERROR/FATAL/CRITICAL', - 1: 'WARNING', - 2: 'INFO', - 3: 'DEBUG', + 0: "ERROR/FATAL/CRITICAL", + 1: "WARNING", + 2: "INFO", + 3: "DEBUG", } return level_dict[verbose] @@ -80,10 +80,12 @@ def config_logger(cli=True): else: # don't init anything, it's up to the user pass - log_format = '[%(levelname)5s] %(name)s - %(message)s' + log_format = "[%(levelname)5s] %(name)s - %(message)s" if init_tqdm: root_logger = logging.getLogger() - handler = TqdmLoggingHandler(level=logging.DEBUG) # output all messages, log level handling is done in logger + handler = TqdmLoggingHandler( + level=logging.DEBUG + ) # output all messages, log level handling is done in logger handler.formatter = logging.Formatter(log_format) root_logger.addHandler(handler) if init_basic: @@ -97,15 +99,15 @@ def set_log_level(verbose): All loggers have libpdf as a parent, setting the log level for libpdf also affects all child loggers like libpdf.core. """ - log = logging.getLogger('libpdf') + log = logging.getLogger("libpdf") if verbose == 0: - log.setLevel('ERROR') + log.setLevel("ERROR") elif verbose == 1: - log.setLevel('WARNING') + log.setLevel("WARNING") elif verbose == 2: - log.setLevel('INFO') + log.setLevel("INFO") else: - log.setLevel('DEBUG') + log.setLevel("DEBUG") def logging_needed(idx_page: int, count_pages: int): @@ -118,4 +120,8 @@ def logging_needed(idx_page: int, count_pages: int): return False twenty_percent = count_pages / 5.0 round_up_next_ten = int(math.ceil(twenty_percent / 10.0)) * 10 - return idx_page == 0 or (idx_page + 1) % round_up_next_ten == 0 or idx_page == count_pages - 1 + return ( + idx_page == 0 + or (idx_page + 1) % round_up_next_ten == 0 + or idx_page == count_pages - 1 + ) diff --git a/libpdf/models/chapter.py b/libpdf/models/chapter.py index f6d02f6..3179485 100644 --- a/libpdf/models/chapter.py +++ b/libpdf/models/chapter.py @@ -1,6 +1,6 @@ """Definition for PDF chapters.""" -from typing import List, TYPE_CHECKING, Union +from typing import TYPE_CHECKING, Union from libpdf.models.element import Element from libpdf.models.horizontal_box import HorizontalBox @@ -9,10 +9,18 @@ # https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: # F401 imported but unused - it's needed for type hinting - from libpdf.models.figure import Figure # noqa: F401 # pylint: disable=cyclic-import, ungrouped-imports - from libpdf.models.paragraph import Paragraph # noqa: F401 # pylint: disable=cyclic-import, ungrouped-imports - from libpdf.models.position import Position # pylint: disable=cyclic-import, ungrouped-imports - from libpdf.models.table import Table # noqa: F401 # pylint: disable=cyclic-import, ungrouped-imports + from libpdf.models.figure import ( + Figure, # pylint: disable=cyclic-import, ungrouped-imports + ) + from libpdf.models.paragraph import ( + Paragraph, # pylint: disable=cyclic-import, ungrouped-imports + ) + from libpdf.models.position import ( + Position, # pylint: disable=cyclic-import, ungrouped-imports + ) + from libpdf.models.table import ( + Table, # pylint: disable=cyclic-import, ungrouped-imports + ) class Chapter(Element): @@ -42,9 +50,9 @@ def __init__( self, title: str, number: str, - position: 'Position', - content: List[Union['Chapter', 'Paragraph', 'Table', 'Figure']] = None, - chapter: 'Chapter' = None, + position: "Position", + content: list[Union["Chapter", "Paragraph", "Table", "Figure"]] = None, + chapter: "Chapter" = None, textbox: HorizontalBox = None, ): """Initialize the instance.""" @@ -69,7 +77,7 @@ def id_(self): :type: str """ - return f'chapter.{self.number}' + return f"chapter.{self.number}" def set_backref(self): """Set b_chapter property on all elements under contents.""" @@ -82,4 +90,4 @@ def __repr__(self): The purpose of it is to improve the readability in the debugger. """ - return f'{self.number} {self.title}' + return f"{self.number} {self.title}" diff --git a/libpdf/models/element.py b/libpdf/models/element.py index b742137..0adf382 100644 --- a/libpdf/models/element.py +++ b/libpdf/models/element.py @@ -2,16 +2,19 @@ from abc import ABC, abstractmethod from typing import TYPE_CHECKING - from libpdf.models.model_base import ModelBase # avoid import cycles for back reference type hinting # https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: # F401 imported but unused - it's needed for type hinting - from libpdf.models.chapter import Chapter # noqa: F401 # pylint: disable=cyclic-import - from libpdf.models.position import Position # noqa: F401 # pylint: disable=cyclic-import - from libpdf.models.root import Root # noqa: F401 # pylint: disable=cyclic-import + from libpdf.models.chapter import ( + Chapter, # pylint: disable=cyclic-import + ) + from libpdf.models.position import ( + Position, # pylint: disable=cyclic-import + ) + from libpdf.models.root import Root # pylint: disable=cyclic-import # need to ignore flake8 errors because sphinx.autodoc sees the multiline as one line. It is actually the summary line. @@ -27,13 +30,13 @@ class Element(ModelBase, ABC): :vartype b_root: Root :ivar b_chapter: parent Chapter instance (mutually exclusive with the b_root parameter) :vartype b_chapter: Chapter - """ # noqa: D205, D400 + """ # noqa: D205 def __init__( self, - position: 'Position', - root: 'Root' = None, - chapter: 'Chapter' = None, + position: "Position", + root: "Root" = None, + chapter: "Chapter" = None, ): """Initialize the instance.""" self.type = self.__class__.__name__.lower() @@ -69,11 +72,11 @@ def uid(self): curr_chapter = self.b_chapter uid_prefix = curr_chapter.id_ while curr_chapter.b_chapter: - uid_prefix = curr_chapter.b_chapter.id_ + '/' + uid_prefix + uid_prefix = curr_chapter.b_chapter.id_ + "/" + uid_prefix curr_chapter = curr_chapter.b_chapter - uid = f'{uid_prefix}/{self.id_}' + uid = f"{uid_prefix}/{self.id_}" else: - uid = f'{self.id_}' + uid = f"{self.id_}" return uid diff --git a/libpdf/models/figure.py b/libpdf/models/figure.py index e47af3e..938ea60 100644 --- a/libpdf/models/figure.py +++ b/libpdf/models/figure.py @@ -1,16 +1,17 @@ """Definition for PDF figures.""" -from typing import List, TYPE_CHECKING +from typing import TYPE_CHECKING from libpdf.models.element import Element from libpdf.models.horizontal_box import HorizontalBox from libpdf.models.link import Link - # avoid import cycles for back reference type hinting # https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: # F401 imported but unused - it's needed for type hinting - from libpdf.models.position import Position # noqa: F401, pylint: disable=ungrouped-imports + from libpdf.models.position import ( + Position, # , pylint: disable=ungrouped-imports + ) class Figure(Element): @@ -42,9 +43,9 @@ def __init__( self, idx: int, rel_path: str, - position: 'Position', - links: List[Link], - textboxes: List[HorizontalBox], + position: "Position", + links: list[Link], + textboxes: list[HorizontalBox], text: str = None, caption: str = None, ): @@ -74,7 +75,7 @@ def id_(self): :type: str """ - return f'figure.{self.idx}' + return f"figure.{self.idx}" def set_links_backref(self): """Set b_source back reference on all links.""" diff --git a/libpdf/models/file.py b/libpdf/models/file.py index d18662c..179e59a 100644 --- a/libpdf/models/file.py +++ b/libpdf/models/file.py @@ -9,7 +9,7 @@ # https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: # F401 imported but unused - it's needed for type hinting - from libpdf.models.root import Root # noqa: F401 # pylint: disable=cyclic-import + from libpdf.models.root import Root # pylint: disable=cyclic-import # TODO the page cropping is defined by the user and must not be stored; the @@ -68,7 +68,7 @@ def __init__( crop_left: float = 0, crop_right: float = 0, file_meta: FileMeta = None, - root: 'Root' = None, + root: "Root" = None, ): """Initialize the instance.""" self.name = name @@ -98,4 +98,4 @@ def id_(self): is used. The file identifier is built from the file name including extension. All characters are removed that do not follow the Python identifier character set (Regex character set ``[_a-zA-Z0-9]``). """ - return 'file.' + string_to_identifier(self.name) + return "file." + string_to_identifier(self.name) diff --git a/libpdf/models/file_meta.py b/libpdf/models/file_meta.py index e8a9aa7..23362eb 100644 --- a/libpdf/models/file_meta.py +++ b/libpdf/models/file_meta.py @@ -8,7 +8,7 @@ # https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: # F401 imported but unused - it's needed for type hinting - from libpdf.models.file import File # noqa: F401 # pylint: disable=cyclic-import + from libpdf.models.file import File # pylint: disable=cyclic-import class FileMeta(ModelBase): @@ -48,7 +48,7 @@ def __init__( # pylint: disable=too-many-arguments # it's a data class creation_date: datetime = None, modified_date: datetime = None, trapped: bool = None, - file: 'File' = None, + file: "File" = None, ): """Initialize the instance.""" self.author = author diff --git a/libpdf/models/horizontal_box.py b/libpdf/models/horizontal_box.py index bb7a6af..2577cb4 100644 --- a/libpdf/models/horizontal_box.py +++ b/libpdf/models/horizontal_box.py @@ -1,7 +1,5 @@ """Definition of HorizontalBox to contain text in the PDF.""" -from typing import List - class Char: # pylint: disable=too-few-public-methods # simplicity is good. """ @@ -36,7 +34,7 @@ def __init__( def __repr__(self): """Make the text part of the repr for better debugging.""" - return f'{type(self).__name__}({self.text})' + return f"{type(self).__name__}({self.text})" class Word: @@ -51,7 +49,7 @@ class Word: def __init__( self, - chars: List[Char], + chars: list[Char], x0: float = None, y0: float = None, x1: float = None, @@ -73,11 +71,11 @@ def __init__( @property def text(self): """Return plain text.""" - return ''.join([x.text for x in self.chars]) + return "".join([x.text for x in self.chars]) def __repr__(self): """Make the text part of the repr for better debugging.""" - return f'{type(self).__name__}({self.text})' + return f"{type(self).__name__}({self.text})" class HorizontalLine: @@ -92,7 +90,7 @@ class HorizontalLine: def __init__( self, - words: List[Word], + words: list[Word], x0: float = None, y0: float = None, x1: float = None, @@ -114,11 +112,11 @@ def __init__( @property def text(self): """Return plain text.""" - return ' '.join([x.text for x in self.words]) + return " ".join([x.text for x in self.words]) def __repr__(self): """Make the text part of the repr for better debugging.""" - return f'{type(self).__name__}({self.text})' + return f"{type(self).__name__}({self.text})" class HorizontalBox: @@ -133,7 +131,7 @@ class HorizontalBox: def __init__( self, - lines: List[HorizontalLine], + lines: list[HorizontalLine], x0: float = None, y0: float = None, x1: float = None, @@ -155,10 +153,10 @@ def __init__( @property def text(self): """Return plain text.""" - return '\n'.join([x.text for x in self.lines]) + return "\n".join([x.text for x in self.lines]) def __repr__(self): """Make the text part of the repr for better debugging.""" if self.lines: - return f'{type(self).__name__}({self.text})' + return f"{type(self).__name__}({self.text})" return None diff --git a/libpdf/models/link.py b/libpdf/models/link.py index a3a813c..8310818 100644 --- a/libpdf/models/link.py +++ b/libpdf/models/link.py @@ -1,5 +1,5 @@ """Definition for PDF linked text.""" -from typing import Dict, TYPE_CHECKING, Union +from typing import TYPE_CHECKING, Union from libpdf.models.model_base import ModelBase @@ -7,9 +7,13 @@ # https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: # F401 imported but unused - it's needed for type hinting - from libpdf.models.table import Cell # noqa: F401 # pylint: disable=cyclic-import - from libpdf.models.figure import Figure # noqa: F401 # pylint: disable=cyclic-import - from libpdf.models.paragraph import Paragraph # noqa: F401 # pylint: disable=cyclic-import + from libpdf.models.figure import ( + Figure, # pylint: disable=cyclic-import + ) + from libpdf.models.paragraph import ( + Paragraph, # pylint: disable=cyclic-import + ) + from libpdf.models.table import Cell # pylint: disable=cyclic-import class Link(ModelBase): @@ -38,9 +42,9 @@ def __init__( self, idx_start: int, idx_stop: int, - pos_target: Dict, + pos_target: dict, libpdf_target: str = None, - b_source: Union['Paragraph', 'Figure', 'Cell'] = None, + b_source: Union["Paragraph", "Figure", "Cell"] = None, ): """Initialize the instance.""" self.idx_start = idx_start @@ -66,4 +70,4 @@ def source_chars(self): def __repr__(self): """Make link of the repr for better debugging.""" - return f'{self.source_chars}' + return f"{self.source_chars}" diff --git a/libpdf/models/model_base.py b/libpdf/models/model_base.py index 7f2060f..3f99327 100644 --- a/libpdf/models/model_base.py +++ b/libpdf/models/model_base.py @@ -16,7 +16,7 @@ def to_dict(self): vars_dict = vars(self).copy() delete_backref_keys = [] for key, value in vars_dict.items(): - if key.startswith('b_'): + if key.startswith("b_"): delete_backref_keys.append(key) else: if isinstance(value, ModelBase): @@ -31,10 +31,12 @@ def check(self): """Check if members are not set.""" for key, value in vars(self).items(): if value is None: - LOG.warning('The member %s of class %s is None', key, type(self).__name__) + LOG.warning( + "The member %s of class %s is None", key, type(self).__name__ + ) def __repr__(self): """Overwrite the object representation for better debugging.""" - if hasattr(self, 'id_'): - return f'{self.__class__.__name__}({self.id_!r})' # pylint: disable=no-member - return f'{self.__class__.__name__}()' + if hasattr(self, "id_"): + return f"{self.__class__.__name__}({self.id_!r})" # pylint: disable=no-member + return f"{self.__class__.__name__}()" diff --git a/libpdf/models/page.py b/libpdf/models/page.py index 270ae7f..715532c 100644 --- a/libpdf/models/page.py +++ b/libpdf/models/page.py @@ -1,19 +1,26 @@ """Definition for PDF pages.""" -from typing import List, TYPE_CHECKING, Union +from typing import TYPE_CHECKING, Union from libpdf.models.model_base import ModelBase - # avoid import cycles for back reference type hinting # https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: # F401 imported but unused - it's needed for type hinting - from libpdf.models.position import Position # noqa: F401 # pylint: disable=cyclic-import - from libpdf.models.root import Root # noqa: F401 # pylint: disable=cyclic-import - from libpdf.models.chapter import Chapter # noqa: F401 # pylint: disable=cyclic-import - from libpdf.models.paragraph import Paragraph # noqa: F401 # pylint: disable=cyclic-import - from libpdf.models.table import Table # noqa: F401 # pylint: disable=cyclic-import - from libpdf.models.figure import Figure # noqa: F401 # pylint: disable=cyclic-import + from libpdf.models.chapter import ( + Chapter, # pylint: disable=cyclic-import + ) + from libpdf.models.figure import ( + Figure, # pylint: disable=cyclic-import + ) + from libpdf.models.paragraph import ( + Paragraph, # pylint: disable=cyclic-import + ) + from libpdf.models.position import ( + Position, # pylint: disable=cyclic-import + ) + from libpdf.models.root import Root # pylint: disable=cyclic-import + from libpdf.models.table import Table # pylint: disable=cyclic-import class Page(ModelBase): @@ -40,9 +47,9 @@ def __init__( number, width, height, - content: List[Union['Chapter', 'Paragraph', 'Table', 'Figure']] = None, - root: 'Root' = None, - positions: List['Position'] = None, + content: list[Union["Chapter", "Paragraph", "Table", "Figure"]] = None, + root: "Root" = None, + positions: list["Position"] = None, ): """Initialize the instance.""" self.number = number @@ -64,7 +71,7 @@ def id_(self): According to PDF model the parameter should be called ``id`` but the name is reserved in Python, so ``id_`` is used. """ - return f'page.{str(self.number)}' + return f"page.{self.number!s}" def __repr__(self): """Page representation using page..""" diff --git a/libpdf/models/paragraph.py b/libpdf/models/paragraph.py index a1d3913..b3f2e10 100644 --- a/libpdf/models/paragraph.py +++ b/libpdf/models/paragraph.py @@ -1,18 +1,23 @@ """Definition for PDF textblocks.""" -from typing import List, TYPE_CHECKING +from typing import TYPE_CHECKING from libpdf.models.element import Element from libpdf.models.horizontal_box import HorizontalBox from libpdf.models.link import Link - # avoid import cycles for back reference type hinting # https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: # F401 imported but unused - it's needed for type hinting - from libpdf.models.chapter import Chapter # noqa: F401 # pylint: disable=cyclic-import, ungrouped-imports - from libpdf.models.position import Position # noqa: F401 # pylint: disable=cyclic-import, ungrouped-imports - from libpdf.models.root import Root # noqa: F401 # pylint: disable=cyclic-import, ungrouped-imports + from libpdf.models.chapter import ( + Chapter, # pylint: disable=cyclic-import, ungrouped-imports + ) + from libpdf.models.position import ( + Position, # pylint: disable=cyclic-import, ungrouped-imports + ) + from libpdf.models.root import ( + Root, # pylint: disable=cyclic-import, ungrouped-imports + ) class Paragraph(Element): @@ -34,11 +39,11 @@ class Paragraph(Element): def __init__( self, idx: int, - position: 'Position', - links: List[Link], + position: "Position", + links: list[Link], textbox: HorizontalBox = None, - root: 'Root' = None, - chapter: 'Chapter' = None, + root: "Root" = None, + chapter: "Chapter" = None, ): """Initialize the instance.""" super().__init__(position=position, root=root, chapter=chapter) @@ -63,7 +68,7 @@ def id_(self): :type: str """ - return f'paragraph.{self.idx}' + return f"paragraph.{self.idx}" def set_links_backref(self): """Set b_source back reference on all links.""" @@ -72,4 +77,4 @@ def set_links_backref(self): def __repr__(self): """Make paragraph text part of the repr for better debugging.""" - return f'{type(self).__name__}({self.id_})({self.textbox.text})' + return f"{type(self).__name__}({self.id_})({self.textbox.text})" diff --git a/libpdf/models/position.py b/libpdf/models/position.py index 1e423e4..7f182f1 100644 --- a/libpdf/models/position.py +++ b/libpdf/models/position.py @@ -7,9 +7,11 @@ # https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: # F401 imported but unused - it's needed for type hinting - from libpdf.models.table import Cell # noqa: F401 # pylint: disable=cyclic-import + from libpdf.models.element import ( + Element, # pylint: disable=cyclic-import + ) from libpdf.models.page import Page # pylint: disable=cyclic-import - from libpdf.models.element import Element # noqa: F401 # pylint: disable=cyclic-import + from libpdf.models.table import Cell # pylint: disable=cyclic-import class Position: @@ -82,9 +84,9 @@ def __init__( y0: float, x1: float, y1: float, - page: 'Page', - element: 'Element' = None, - cell: 'Cell' = None, + page: "Page", + element: "Element" = None, + cell: "Cell" = None, ): """Init the class with rectangular coordinates and a page reference.""" self.x0 = x0 @@ -122,4 +124,4 @@ def __repr__(self): else: ref_type = self.b_cell - return f'Page {self.page.number} [{self.x0}, {self.y0}, {self.x1}, {self.y1}] ({ref_type})' + return f"Page {self.page.number} [{self.x0}, {self.y0}, {self.x1}, {self.y1}] ({ref_type})" diff --git a/libpdf/models/root.py b/libpdf/models/root.py index 5cd065c..ed049ce 100644 --- a/libpdf/models/root.py +++ b/libpdf/models/root.py @@ -1,5 +1,5 @@ """Definition for PDF root element.""" -from typing import List, Union +from typing import Union from libpdf.models.chapter import Chapter from libpdf.models.figure import Figure @@ -26,8 +26,8 @@ class Root(ModelBase): def __init__( self, file: File, - pages: List[Page], - content: List[Union[Chapter, Paragraph, Table, Figure]], + pages: list[Page], + content: list[Union[Chapter, Paragraph, Table, Figure]], ): """Create publicly accessible objects.""" self.file: File = file diff --git a/libpdf/models/table.py b/libpdf/models/table.py index 4b6d117..22f4e2a 100644 --- a/libpdf/models/table.py +++ b/libpdf/models/table.py @@ -1,6 +1,5 @@ """Definition for PDF tables.""" from operator import attrgetter -from typing import List from libpdf.models.element import Element from libpdf.models.horizontal_box import HorizontalBox @@ -23,7 +22,9 @@ class Table(Element): :vartype position: Position """ - def __init__(self, idx: int, cells: List['Cell'], position: 'Position', caption=None): + def __init__( + self, idx: int, cells: list["Cell"], position: "Position", caption=None + ): """Initialize the instance.""" super().__init__(position=position) self.idx = idx @@ -46,7 +47,7 @@ def id_(self): :type: str """ - return f'table.{self.idx}' + return f"table.{self.idx}" def set_cells_backref(self): """Set b_table back reference on all cells.""" @@ -61,10 +62,10 @@ def rows(self): :type: List[List[Cell]] """ rows = [] - max_row = max(self.cells, key=attrgetter('row')).row # get highest row number + max_row = max(self.cells, key=attrgetter("row")).row # get highest row number for row_nr in range(1, max_row + 1): columns_in_row = [cell for cell in self.cells if cell.row == row_nr] - columns_in_row.sort(key=attrgetter('col')) + columns_in_row.sort(key=attrgetter("col")) rows.append(columns_in_row) return rows @@ -76,10 +77,12 @@ def columns(self): :type: List[List[Cell]] """ columns = [] - max_columns = max(self.cells, key=attrgetter('col')).col # get highest column number + max_columns = max( + self.cells, key=attrgetter("col") + ).col # get highest column number for col_nr in range(1, max_columns + 1): rows_in_column = [cell for cell in self.cells if cell.col == col_nr] - rows_in_column.sort(key=attrgetter('row')) + rows_in_column.sort(key=attrgetter("row")) columns.append(rows_in_column) return columns @@ -125,7 +128,7 @@ def __init__( row: int, col: int, position: Position, - links: List[Link], + links: list[Link], table: Table = None, textbox: HorizontalBox = None, ): @@ -152,4 +155,4 @@ def set_links_backref(self): def __repr__(self): """Identify cells by row and column.""" - return f'Cell({self.row}, {self.col}) {self.textbox.text}' + return f"Cell({self.row}, {self.col}) {self.textbox.text}" diff --git a/libpdf/parameters.py b/libpdf/parameters.py index 839ff80..2cbc2fa 100644 --- a/libpdf/parameters.py +++ b/libpdf/parameters.py @@ -129,10 +129,10 @@ # - it is written in 2 places: initialization here and can be set from command line / API # - it is used in multiple modules, passing around is cumbersome PAGE_CROP_MARGINS = { - 'top': 0.0, - 'right': 0.0, - 'bottom': 0.0, - 'left': 0.0, + "top": 0.0, + "right": 0.0, + "bottom": 0.0, + "left": 0.0, } # Page margins used as a search area for smart header/footer detection. @@ -142,8 +142,8 @@ # Given as fraction of page height for top (header) and bottom (footer) # Example: 0.2 (==20% of page height) SMART_PAGE_CROP_REL_MARGINS = { - 'top': 0.2, - 'bottom': 0.2, + "top": 0.2, + "bottom": 0.2, } # Parameter for the extraction of potential header/footer elements. @@ -188,26 +188,26 @@ # The following parameters are used for visual debugging # Give colors a human readable name COLORS = { - 'red': (255, 0, 0), - 'green': (0, 255, 0), - 'blue': (0, 0, 255), - 'yellow': (255, 255, 0), + "red": (255, 0, 0), + "green": (0, 255, 0), + "blue": (0, 0, 255), + "yellow": (255, 255, 0), } # Map extracted elements with color # the numbers at the end means transparency, the value should be set in range (40, 160) VIS_DBG_MAP_ELEMENTS_COLOR = { - 'chapter': COLORS['green'] + (80,), - 'paragraph': COLORS['blue'] + (40,), - 'table': COLORS['red'] + (40,), - 'figure': COLORS['yellow'] + (80,), + "chapter": COLORS["green"] + (80,), + "paragraph": COLORS["blue"] + (40,), + "table": COLORS["red"] + (40,), + "figure": COLORS["yellow"] + (80,), } RENDER_ELEMENTS = [ - 'chapter', - 'paragraph', - 'table', - 'figure', + "chapter", + "paragraph", + "table", + "figure", ] # the elements that shall be rendered # pdfminer layout analysis parameter from from pdfminer.layout -> LAParams.__init__ @@ -215,11 +215,11 @@ # - pdfplumber wrapper around pdfminer only requests layout analysis if at least one laparam is given # - they are adapted to best practice values, the deviations are commented below LA_PARAMS = { - 'line_overlap': 0.5, - 'char_margin': 6.0, # default: 2.0 - 'line_margin': 0.4, # default : 0.5 - 'word_margin': 0.1, - 'boxes_flow': 0.5, - 'detect_vertical': False, - 'all_texts': False, + "line_overlap": 0.5, + "char_margin": 6.0, # default: 2.0 + "line_margin": 0.4, # default : 0.5 + "word_margin": 0.1, + "boxes_flow": 0.5, + "detect_vertical": False, + "all_texts": False, } diff --git a/libpdf/process.py b/libpdf/process.py index 324ffb8..98495d7 100644 --- a/libpdf/process.py +++ b/libpdf/process.py @@ -1,4 +1,5 @@ -"""Process the things which I don't know how to categorize. +""" +Process the things which I don't know how to categorize. It includes: 1. metadata of chars and lines process @@ -12,7 +13,10 @@ import logging import os import sys -from typing import Dict, List, Optional, Union +from typing import Optional, Union + +import ruamel.yaml +from ruamel.yaml.representer import RoundTripRepresenter from libpdf import parameters from libpdf.apiobjects import ApiObjects @@ -28,10 +32,6 @@ from libpdf.models.table import Cell, Table from libpdf.parameters import HEADLINE_TOLERANCE -import ruamel.yaml -from ruamel.yaml.representer import RoundTripRepresenter - - LOG = logging.getLogger(__name__) @@ -40,9 +40,10 @@ def remove_page_header_footer(single_page): page_crop = single_page.within_bbox( ( 0, - decimal.Decimal(parameters.PAGE_CROP_MARGINS['top']), + decimal.Decimal(parameters.PAGE_CROP_MARGINS["top"]), single_page.width, - single_page.height - decimal.Decimal(parameters.PAGE_CROP_MARGINS['bottom']), + single_page.height + - decimal.Decimal(parameters.PAGE_CROP_MARGINS["bottom"]), ), ) @@ -54,12 +55,14 @@ class MyRepresenter(RoundTripRepresenter): # pylint: disable=too-few-public-met def represent_mapping(self, tag, mapping, flow_style=None): """Override represent_mapping.""" - tag = 'tag:yaml.org,2002:map' + tag = "tag:yaml.org,2002:map" - return RoundTripRepresenter.represent_mapping(self, tag, mapping, flow_style=flow_style) + return RoundTripRepresenter.represent_mapping( + self, tag, mapping, flow_style=flow_style + ) -def to_dict_output(obj: Union[ModelBase, Position]) -> Dict: # pylint: disable=too-many-branches #easy to in one func +def to_dict_output(obj: Union[ModelBase, Position]) -> dict: # pylint: disable=too-many-branches #easy to in one func """Turn all objects attributes into a dictionary.""" vars_dict = vars(obj).copy() @@ -68,32 +71,32 @@ def to_dict_output(obj: Union[ModelBase, Position]) -> Dict: # pylint: disable= # After python3.6/3.7, a dict is sorted in insertion order # https://docs.python.org/3.6/whatsnew/3.6.html#whatsnew36-compactdict # https://docs.python.org/3.7/tutorial/datastructures.html#dictionaries - temp_dict = {'id': obj.id_} + temp_dict = {"id": obj.id_} temp_dict.update(vars_dict) vars_dict = temp_dict if isinstance(obj, (Figure, Paragraph, Table)): # idx is not part of the UML model and should not be exported - del vars_dict['idx'] + del vars_dict["idx"] if isinstance(obj, Page): # no serialization for the contents of pages - del vars_dict['content'] + del vars_dict["content"] if isinstance(obj, (Paragraph, Cell, Chapter)): # textboxes with positions are not interest of the output file if obj.textbox: text = obj.textbox.text - vars_dict['text'] = text - del vars_dict['textbox'] + vars_dict["text"] = text + del vars_dict["textbox"] if isinstance(obj, Figure): # textboxes with positions are not interest of the output file if obj.textboxes: - text = '\n'.join(x.text for x in obj.textboxes) - vars_dict['text'] = text - del vars_dict['textboxes'] + text = "\n".join(x.text for x in obj.textboxes) + vars_dict["text"] = text + del vars_dict["textboxes"] # delete back references so the export does not create circular loops delete_backref_keys = [] for key in vars_dict: - if key.startswith('b_'): + if key.startswith("b_"): delete_backref_keys.append(key) for key in delete_backref_keys: del vars_dict[key] @@ -108,10 +111,10 @@ def to_dict_output(obj: Union[ModelBase, Position]) -> Dict: # pylint: disable= if isinstance(element, (ModelBase, Position)): vars_dict[key][index] = to_dict_output(element) - if 'page' in vars_dict: + if "page" in vars_dict: # according to the model pages are serialized as page. # this supports the common adressing scheme in libpdf - vars_dict['page'] = vars_dict['page']['id'] + vars_dict["page"] = vars_dict["page"]["id"] return vars_dict @@ -148,24 +151,36 @@ def output_dump(output_format: str, output_path: str, objects: ApiObjects): # ruamel_yaml.register_class(Chapter) # ruamel_yaml.register_class(Cell) - output_dict = {'root': to_dict_output(objects.root)} + output_dict = {"root": to_dict_output(objects.root)} if output_path is None: - LOG.info('Writing extracted data to stdout') - if output_format == 'json': - print(json.dumps(output_dict, default=json_datetime_converter, indent=2, sort_keys=False)) - elif output_format == 'yaml': + LOG.info("Writing extracted data to stdout") + if output_format == "json": + print( + json.dumps( + output_dict, + default=json_datetime_converter, + indent=2, + sort_keys=False, + ) + ) + elif output_format == "yaml": ruamel_yaml.dump(output_dict, sys.stdout) else: output_dir = os.path.dirname(output_path) if output_dir: if not os.path.isdir(output_dir): os.makedirs(output_dir) - with open(output_path, 'w', encoding='utf-8') as file: - if output_format == 'json': - json_string = json.dumps(output_dict, default=json_datetime_converter, indent=2, sort_keys=False) + with open(output_path, "w", encoding="utf-8") as file: + if output_format == "json": + json_string = json.dumps( + output_dict, + default=json_datetime_converter, + indent=2, + sort_keys=False, + ) file.write(json_string) - elif output_format == 'yaml': + elif output_format == "yaml": ruamel_yaml.dump(output_dict, file) @@ -182,26 +197,31 @@ def merge_all_elements(*elements): for obj in element: element_list.append(obj) - element_list.sort(key=lambda x: (x.position.page.number, (float(x.position.page.height) - x.position.y0))) + element_list.sort( + key=lambda x: ( + x.position.page.number, + (float(x.position.page.height) - x.position.y0), + ) + ) return element_list def filter_out_outline_page(outline_dict): """Filter out outline whose target page are not in the extracted pages list.""" - for outline_chapter in outline_dict['content'].copy(): - if outline_chapter['position']['page'] is None: - outline_dict['content'].remove(outline_chapter) + for outline_chapter in outline_dict["content"].copy(): + if outline_chapter["position"]["page"] is None: + outline_dict["content"].remove(outline_chapter) # recursively check subchapter - if outline_chapter['content']: + if outline_chapter["content"]: filter_out_outline_page(outline_chapter) return outline_dict def map_elements_outline( - element_list: List[Union[Chapter, Figure, Table, Paragraph]], + element_list: list[Union[Chapter, Figure, Table, Paragraph]], outline_dict, -) -> List[Union[Chapter, Figure, Table, Paragraph]]: +) -> list[Union[Chapter, Figure, Table, Paragraph]]: """ Map elements into a nested outline structure. @@ -212,13 +232,16 @@ def map_elements_outline( # filter out outline whose target page are not in the extracted pages list filter_out_outline_page(outline_dict) - if outline_dict['content']: + if outline_dict["content"]: elements_above_outline = list( filter( - lambda x: x.position.page.number < outline_dict['content'][0]['position']['page'] + lambda x: x.position.page.number + < outline_dict["content"][0]["position"]["page"] or ( - x.position.page.number == outline_dict['content'][0]['position']['page'] - and x.position.y1 > outline_dict['content'][0]['position']['y1'] + HEADLINE_TOLERANCE + x.position.page.number + == outline_dict["content"][0]["position"]["page"] + and x.position.y1 + > outline_dict["content"][0]["position"]["y1"] + HEADLINE_TOLERANCE ), element_list, ), @@ -226,10 +249,13 @@ def map_elements_outline( elements_in_outline = list( filter( - lambda x: x.position.page.number > outline_dict['content'][0]['position']['page'] + lambda x: x.position.page.number + > outline_dict["content"][0]["position"]["page"] or ( - x.position.page.number == outline_dict['content'][0]['position']['page'] - and x.position.y1 < outline_dict['content'][0]['position']['y1'] + HEADLINE_TOLERANCE + x.position.page.number + == outline_dict["content"][0]["position"]["page"] + and x.position.y1 + < outline_dict["content"][0]["position"]["y1"] + HEADLINE_TOLERANCE ), element_list, ), @@ -251,9 +277,14 @@ def map_elements_outline( chapters_content_filled = fill_elements_content(elements_in_outline) # turn chapters in a flatten structure into nested one - nested_chapters: List[Chapter] = [] + nested_chapters: list[Chapter] = [] - mapping_chapters(chapters_content_filled, nested_chapters, outline_content=outline_dict['content'], b_chapter=None) + mapping_chapters( + chapters_content_filled, + nested_chapters, + outline_content=outline_dict["content"], + b_chapter=None, + ) # elements below the outline nested_elements = elements_above_outline + nested_chapters @@ -261,7 +292,9 @@ def map_elements_outline( return nested_elements -def fill_elements_content(elements_in_outline: List[Union[Chapter, Figure, Table, Paragraph]]) -> List[Chapter]: +def fill_elements_content( + elements_in_outline: list[Union[Chapter, Figure, Table, Paragraph]], +) -> list[Chapter]: """ Fill the elements, tables, figures and paragraphs into their corresponding chapters' contents. @@ -272,11 +305,11 @@ def fill_elements_content(elements_in_outline: List[Union[Chapter, Figure, Table """ for index_element, element in enumerate(elements_in_outline): if isinstance(element, Chapter): - id_dict = {'table': 1, 'figure': 1, 'paragraph': 1} + id_dict = {"table": 1, "figure": 1, "paragraph": 1} content = elements_in_outline[index_element].content index_b_chapter = index_element else: - if 'content' in locals(): + if "content" in locals(): element.idx = id_dict[element.type] element.b_chapter = elements_in_outline[index_b_chapter] content.append(element) @@ -287,16 +320,20 @@ def fill_elements_content(elements_in_outline: List[Union[Chapter, Figure, Table # 3. if it's a programming error, fix the code # 4. if it's a real runtime issue coming from wrong PDF input, catch the error one level above # and log an understandable, critical error - raise ValueError('elements can not fill into the content because it does not exist') + raise ValueError( + "elements can not fill into the content because it does not exist" + ) - chapters_content = list(filter(lambda x: isinstance(x, Chapter), elements_in_outline)) + chapters_content = list( + filter(lambda x: isinstance(x, Chapter), elements_in_outline) + ) return chapters_content def mapping_chapters( - chapters_content_filled: List[Chapter], - nested_chapters: List, + chapters_content_filled: list[Chapter], + nested_chapters: list, outline_content, b_chapter: Optional[Chapter], ): @@ -314,13 +351,13 @@ def mapping_chapters( :return: """ for outline_chapter in outline_content: - # use a list to contain the filtered chapter because I don't know how to process a filtered object. # check if the title and page number are matched. filter_chapter = [ x for x in chapters_content_filled - if x.title == outline_chapter['title'] and x.number == outline_chapter['number'] + if x.title == outline_chapter["title"] + and x.number == outline_chapter["number"] ] # Presumably, there is only one chapter matched in a flatten structure @@ -329,7 +366,10 @@ def mapping_chapters( # 1. as a developer, I don't understand it # 2. the message is for end users but it contains an internal variable elements_in_outline # 3. level is DEBUG but it looks like a problem - LOG.debug('The expected element %s may be not in elements_in_outline', outline_chapter['title']) + LOG.debug( + "The expected element %s may be not in elements_in_outline", + outline_chapter["title"], + ) # raise ValueError('No filtered chapter found. The expected element may be not in elements_in_outline') continue @@ -338,21 +378,24 @@ def mapping_chapters( nested_chapters.append(filter_chapter[0]) index_chapter = len(nested_chapters) - 1 - if outline_chapter['content']: # next deeper level + if outline_chapter["content"]: # next deeper level if isinstance(nested_chapters[index_chapter], Chapter): mapping_chapters( chapters_content_filled, nested_chapters[index_chapter].content, - outline_chapter['content'], + outline_chapter["content"], b_chapter=nested_chapters[index_chapter], ) else: - LOG.debug('Non-Chapter object %s is detected', nested_chapters[index_chapter].id) + LOG.debug( + "Non-Chapter object %s is detected", + nested_chapters[index_chapter].id, + ) def libpdf_target_explorer( # pylint: disable=too-many-nested-blocks # local algorithm, better readability - elements: List[Union[Paragraph, Table]], - pages_list: List[Page], + elements: list[Union[Paragraph, Table]], + pages_list: list[Page], ): """ Convert the name_destination/target_link to nest ID paths. @@ -377,8 +420,10 @@ def libpdf_target_explorer( # pylint: disable=too-many-nested-blocks # local al """ # find the page containing source links for page in pages_list: - if page.number in catalog['annos']: - elements_on_page = [x for x in elements if x.position.page.number == page.number] + if page.number in catalog["annos"]: + elements_on_page = [ + x for x in elements if x.position.page.number == page.number + ] # find the elements which contains source links on a certain page elements_with_anno = elements_with_anno_finder(elements_on_page) @@ -400,14 +445,14 @@ def libpdf_target_explorer( # pylint: disable=too-many-nested-blocks # local al # The link on page xy with text xy cannot be resolved to a libpdf element; linking # to the target page position instead LOG.error( - 'The source link in the paragraph %s is missing', + "The source link in the paragraph %s is missing", repr(element), ) def elements_with_anno_finder( - elements_on_page: List[Union[Paragraph, Table]], -) -> Union[List[Union[Chapter, Paragraph, Figure, Table, Cell]], None]: + elements_on_page: list[Union[Paragraph, Table]], +) -> Union[list[Union[Chapter, Paragraph, Figure, Table, Cell]], None]: """ Find the elements, tables or paragraphs containing source links. @@ -441,7 +486,7 @@ def elements_with_anno_finder( return elements_with_anno -def find_target_id(link: Link, pages_list: List[Page], src_element: Element) -> str: +def find_target_id(link: Link, pages_list: list[Page], src_element: Element) -> str: """ Find the corresponding libpdf target element ID from positions. @@ -455,14 +500,16 @@ def find_target_id(link: Link, pages_list: List[Page], src_element: Element) -> """ target_id = None - if link.pos_target['page']: + if link.pos_target["page"]: for page in pages_list: - if page.number == link.pos_target['page']: + if page.number == link.pos_target["page"]: target_page = page # target_page = pages_list[link.pos_target['page'] - 1] elements_target_page = get_elements_page(target_page) for element in elements_target_page: - if element.contains_coord(link.pos_target['page'], link.pos_target['x'], link.pos_target['y']): + if element.contains_coord( + link.pos_target["page"], link.pos_target["x"], link.pos_target["y"] + ): target_id = nest_explorer(element) break @@ -470,24 +517,26 @@ def find_target_id(link: Link, pages_list: List[Page], src_element: Element) -> # If no libpdf element is found, # the target is set to the target coordinates given as page./: # To improve element detection, the parameter TARGET_COOR_TOLERANCE may need to be adjusted - target_id = f'{target_page.id_}/{link.pos_target["x"]}:{link.pos_target["y"]}' + target_id = ( + f'{target_page.id_}/{link.pos_target["x"]}:{link.pos_target["y"]}' + ) text = str(src_element) - text_shortened = (text[:60] + '..') if len(text) > 60 else text + text_shortened = (text[:60] + "..") if len(text) > 60 else text LOG.debug( 'The link "%s" on page %s could not be resolved to a libpdf element; replacing it with the raw ' - 'target page coordinate %s', + "target page coordinate %s", text_shortened, src_element.position.page.number, target_id, ) else: - target_id = 'Out Of extracted pages scope' + target_id = "Out Of extracted pages scope" return target_id -def get_elements_page(target_page: Page) -> List[Union[Paragraph, Table, Figure]]: +def get_elements_page(target_page: Page) -> list[Union[Paragraph, Table, Figure]]: """ Collect the elements, which occurs on a certain target page. @@ -512,7 +561,7 @@ def nest_explorer(element: Union[Figure, Table, Chapter, Paragraph]) -> str: """ if element.b_chapter: element_id = nest_explorer(element.b_chapter) - element_id = element_id + '/' + element.id_ + element_id = element_id + "/" + element.id_ else: element_id = element.id_ diff --git a/libpdf/progress.py b/libpdf/progress.py index e530e0b..ca85dc8 100644 --- a/libpdf/progress.py +++ b/libpdf/progress.py @@ -14,7 +14,6 @@ TQDM_AVAILABLE = True except ImportError: - # class name is constrained by the tqdm library class tqdm: # pylint: disable=invalid-name """Mock tqdm.tqdm class and provide the least amount of functionality.""" @@ -71,7 +70,7 @@ class CatchAllAttributesType(type): def __getattr__(cls, key): """Return an empty string for all class attributes if used as a metaclass.""" - return '' + return "" class Fore(metaclass=CatchAllAttributesType): # pylint: disable=too-few-public-methods """ @@ -85,15 +84,15 @@ class Fore(metaclass=CatchAllAttributesType): # pylint: disable=too-few-public- # all libpdf modules should only access tqdm and colorama through this module # COLOR_MAP is used to decouple from colorama.Fore which could be a missing dependency COLOR_MAP = { - 'black': Fore.BLACK, - 'red': Fore.RED, - 'green': Fore.GREEN, - 'yellow': Fore.YELLOW, - 'blue': Fore.BLUE, - 'magenta': Fore.MAGENTA, - 'cyan': Fore.CYAN, - 'white': Fore.WHITE, - 'reset': Fore.RESET, + "black": Fore.BLACK, + "red": Fore.RED, + "green": Fore.GREEN, + "yellow": Fore.YELLOW, + "blue": Fore.BLUE, + "magenta": Fore.MAGENTA, + "cyan": Fore.CYAN, + "white": Fore.WHITE, + "reset": Fore.RESET, } @@ -104,14 +103,14 @@ def bar_format(color): def bar_format_lvl0(): """bar_format for the top level instance of nested tqdm progress bars.""" - return bar_format('red') + return bar_format("red") def bar_format_lvl1(): """bar_format for the 1st level instance of nested tqdm progress bars.""" - return bar_format('cyan') + return bar_format("cyan") def bar_format_lvl2(): """bar_format for the 2nd level instance of nested tqdm progress bars.""" - return bar_format('green') + return bar_format("green") diff --git a/libpdf/tables.py b/libpdf/tables.py index c4258ca..652b767 100644 --- a/libpdf/tables.py +++ b/libpdf/tables.py @@ -1,4 +1,5 @@ -"""Extracts tables cells and texts inside. +""" +Extracts tables cells and texts inside. Coordinate system (positions) of tables is defined below: @@ -17,23 +18,21 @@ """ import logging from decimal import Decimal -from typing import List, Union +from typing import Union + +from pdfminer.layout import LTPage, LTTextBoxHorizontal -from libpdf import textbox -from libpdf import utils +from libpdf import textbox, utils from libpdf.catalog import catalog from libpdf.log import logging_needed from libpdf.models.figure import Figure from libpdf.models.page import Page from libpdf.models.position import Position -from libpdf.models.table import Cell -from libpdf.models.table import Table +from libpdf.models.table import Cell, Table from libpdf.parameters import LA_PARAMS from libpdf.progress import bar_format_lvl2, tqdm from libpdf.utils import from_pdfplumber_bbox, lt_to_libpdf_hbox_converter -from pdfminer.layout import LTPage, LTTextBoxHorizontal - LOG = logging.getLogger(__name__) @@ -43,10 +42,10 @@ class FoldedStr(str): def folded_str_representer(dumper, text): """Warp function of the representer.""" - return dumper.represent_scalar('tag', text, style='>') + return dumper.represent_scalar("tag", text, style=">") -def extract_pdf_table(pdf, pages_list: List[Page], figure_list: List[Figure]): +def extract_pdf_table(pdf, pages_list: list[Page], figure_list: list[Figure]): """ Extract the table in PDF. @@ -58,36 +57,41 @@ def extract_pdf_table(pdf, pages_list: List[Page], figure_list: List[Figure]): :param figure_list: a list of libpdf Figure objects, used to see if tables and figures are overlapped :return: a list of tables """ - LOG.info('Extracting tables ...') + LOG.info("Extracting tables ...") table_settings = { - 'vertical_strategy': 'lines', - 'horizontal_strategy': 'lines', - 'explicit_vertical_lines': [], - 'explicit_horizontal_lines': [], - 'snap_tolerance': 3, - 'join_tolerance': 3, - 'edge_min_length': 3, - 'min_words_vertical': 3, - 'min_words_horizontal': 1, - 'keep_blank_chars': False, - 'text_tolerance': 3, - 'text_x_tolerance': 2, - 'text_y_tolerance': 2, - 'intersection_tolerance': 3, - 'intersection_x_tolerance': None, - 'intersection_y_tolerance': None, + "vertical_strategy": "lines", + "horizontal_strategy": "lines", + "explicit_vertical_lines": [], + "explicit_horizontal_lines": [], + "snap_tolerance": 3, + "join_tolerance": 3, + "edge_min_length": 3, + "min_words_vertical": 3, + "min_words_horizontal": 1, + "keep_blank_chars": False, + "text_tolerance": 3, + "text_x_tolerance": 2, + "text_y_tolerance": 2, + "intersection_tolerance": 3, + "intersection_x_tolerance": None, + "intersection_y_tolerance": None, } - table_dict = {'page': {}} + table_dict = {"page": {}} table_list = [] table_id = 1 for idx_page, page in enumerate( - tqdm(pdf.pages, desc='###### Extracting tables', unit='pages', bar_format=bar_format_lvl2()), + tqdm( + pdf.pages, + desc="###### Extracting tables", + unit="pages", + bar_format=bar_format_lvl2(), + ), ): if logging_needed(idx_page, len(pdf.pages)): - LOG.debug('Extracting tables page %s of %s', idx_page + 1, len(pdf.pages)) - if len((page.find_tables(table_settings))) != 0: - table_dict['page'].update({idx_page + 1: []}) + LOG.debug("Extracting tables page %s of %s", idx_page + 1, len(pdf.pages)) + if len(page.find_tables(table_settings)) != 0: + table_dict["page"].update({idx_page + 1: []}) tables = page.find_tables(table_settings) lt_page = page._layout # pylint: disable=protected-access # easiest way to obtain LTPage for table in tables: @@ -110,20 +114,22 @@ def extract_pdf_table(pdf, pages_list: List[Page], figure_list: List[Figure]): ) if _table_figure_check(table_pos, figure_list) is True: - table_dict['page'][idx_page + 1].append( + table_dict["page"][idx_page + 1].append( { - 'id': 'table.' + str(table_id), - 'type': 'table', - 'positions': table_pos, + "id": "table." + str(table_id), + "type": "table", + "positions": table_pos, # 'text': table_temp.extract(2, 2), - 'cell': [], + "cell": [], }, ) cells = extract_cells( lt_page, table.rows, - table_dict['page'][idx_page + 1][len(table_dict['page'][idx_page + 1]) - 1]['cell'], + table_dict["page"][idx_page + 1][ + len(table_dict["page"][idx_page + 1]) - 1 + ]["cell"], pages_list[idx_page], ) @@ -132,13 +138,13 @@ def extract_pdf_table(pdf, pages_list: List[Page], figure_list: List[Figure]): table_id += 1 - if len(table_dict['page'][idx_page + 1]) == 0: # no table is added - del table_dict['page'][idx_page + 1] + if len(table_dict["page"][idx_page + 1]) == 0: # no table is added + del table_dict["page"][idx_page + 1] return table_list -def extract_cells(lt_page: LTPage, rows: List, list_cell: List[Cell], page: Page): +def extract_cells(lt_page: LTPage, rows: list, list_cell: list[Cell], page: Page): """ Extract cells in the table. @@ -159,14 +165,20 @@ def extract_cells(lt_page: LTPage, rows: List, list_cell: List[Cell], page: Page row_cell[3], Decimal(lt_page.height), ) - pos_cell = Position(pos_cell_bbox[0], pos_cell_bbox[1], pos_cell_bbox[2], pos_cell_bbox[3], page) + pos_cell = Position( + pos_cell_bbox[0], + pos_cell_bbox[1], + pos_cell_bbox[2], + pos_cell_bbox[3], + page, + ) # extract cell text lt_textbox = cell_lttextbox_extraction(pos_cell, lt_page) links = [] - text_cell = '' + text_cell = "" if lt_textbox: text_cell = lt_textbox.get_text() - if catalog['annos']: + if catalog["annos"]: links = textbox.extract_linked_chars(lt_textbox, lt_page.pageid) hbox = lt_to_libpdf_hbox_converter([lt_textbox]) @@ -174,22 +186,24 @@ def extract_cells(lt_page: LTPage, rows: List, list_cell: List[Cell], page: Page hbox = None cell = { - 'row': idx_row + 1, - 'col': idx_cell + 1, - 'positions': pos_cell_bbox, - 'text': FoldedStr(text_cell), - 'links': links, + "row": idx_row + 1, + "col": idx_cell + 1, + "positions": pos_cell_bbox, + "text": FoldedStr(text_cell), + "links": links, } list_cell.append(cell) - cell_obj = Cell(idx_row + 1, idx_cell + 1, pos_cell, links, textbox=hbox) + cell_obj = Cell( + idx_row + 1, idx_cell + 1, pos_cell, links, textbox=hbox + ) cell_obj_list.append(cell_obj) return cell_obj_list -def _table_figure_check(positions: Position, figure_list: List[Figure]): +def _table_figure_check(positions: Position, figure_list: list[Figure]): """ Check if the table is recognized as the figure in the same page. @@ -201,7 +215,9 @@ def _table_figure_check(positions: Position, figure_list: List[Figure]): :return: True means only table is recognised. """ if len(figure_list) > 0: - filter_list_figure = list(filter(lambda x: x.position.page.number == positions.page, figure_list)) + filter_list_figure = list( + filter(lambda x: x.position.page.number == positions.page, figure_list) + ) if len(filter_list_figure) > 0: margin_offset = 5 for figure in filter_list_figure: @@ -217,7 +233,9 @@ def _table_figure_check(positions: Position, figure_list: List[Figure]): return True -def cell_lttextbox_extraction(position: Position, lt_page: LTPage) -> Union[LTTextBoxHorizontal, None]: +def cell_lttextbox_extraction( + position: Position, lt_page: LTPage +) -> Union[LTTextBoxHorizontal, None]: """ Extract the lttextbox in the cell. @@ -228,12 +246,17 @@ def cell_lttextbox_extraction(position: Position, lt_page: LTPage) -> Union[LTTe # TODO: offset explanation offset = 5 - cell_bbox = [position.x0 - offset, position.y0 - offset, position.x1 + offset, position.y1 + offset] + cell_bbox = [ + position.x0 - offset, + position.y0 - offset, + position.x1 + offset, + position.y1 + offset, + ] lt_textbox = utils.lt_textbox_crop( cell_bbox, lt_page._objs, # pylint: disable=protected-access # not publicly available - word_margin=LA_PARAMS['word_margin'], - y_tolerance=LA_PARAMS['line_overlap'], + word_margin=LA_PARAMS["word_margin"], + y_tolerance=LA_PARAMS["line_overlap"], ) return lt_textbox diff --git a/libpdf/textbox.py b/libpdf/textbox.py index 18954ba..b616fcd 100644 --- a/libpdf/textbox.py +++ b/libpdf/textbox.py @@ -1,4 +1,5 @@ -"""Extract paragraphs and chapters from LTTextBox by pdfminer.six. +""" +Extract paragraphs and chapters from LTTextBox by pdfminer.six. Coordinate system of pdfminer LTTextBox is defined below:: @@ -30,7 +31,15 @@ import logging import re from difflib import SequenceMatcher -from typing import Dict, List, Tuple, Union +from typing import Union + +from pdfminer.layout import ( + LTAnno, + LTChar, + LTText, + LTTextBox, + LTTextLineHorizontal, +) from libpdf import parameters from libpdf.catalog import catalog @@ -52,41 +61,34 @@ from libpdf.progress import bar_format_lvl2, tqdm from libpdf.utils import lt_page_crop, lt_to_libpdf_hbox_converter, textbox_crop -from pdfminer.layout import ( - LTAnno, - LTChar, - LTText, - LTTextBox, - LTTextLineHorizontal, -) - - LOG = logging.getLogger(__name__) def extract_paragraphs_chapters( pdf, - figure_list: List[Figure], - table_list: List[Table], - page_list: List[Page], + figure_list: list[Figure], + table_list: list[Table], + page_list: list[Page], no_chapters, no_paragraphs, -) -> Tuple[List[Paragraph], List[Chapter]]: +) -> tuple[list[Paragraph], list[Chapter]]: """Extract paragraphs and chapter's headline from given pdf.""" - extracted_lt_textboxes = extract_lt_textboxes(pdf, figure_list, table_list, page_list) + extracted_lt_textboxes = extract_lt_textboxes( + pdf, figure_list, table_list, page_list + ) chapter_list = [] if no_chapters: - LOG.info('Excluding chapters extraction') + LOG.info("Excluding chapters extraction") else: - if catalog['outline']: - LOG.info('Extracting chapters ...') + if catalog["outline"]: + LOG.info("Extracting chapters ...") chapter_list = render_chapters(extracted_lt_textboxes, page_list, pdf) paragraph_list = [] if no_paragraphs: - LOG.info('Excluding paragraphs extraction') + LOG.info("Excluding paragraphs extraction") else: - LOG.info('Extracting paragraphs ...') + LOG.info("Extracting paragraphs ...") paragraph_list = render_paragraphs(extracted_lt_textboxes, page_list) return paragraph_list, chapter_list @@ -104,7 +106,7 @@ def extract_lt_textboxes(pdf, figure_list, table_list, page_list): :param page_list: :return: """ - page_lt_textboxes = pdfminer_get_lt_textboxes(pdf) # noqa: F841 # flake8 not used warning + page_lt_textboxes = pdfminer_get_lt_textboxes(pdf) # flake8 not used warning for idx_page, _ in page_lt_textboxes.copy().items(): # pages that shall be extracted @@ -112,7 +114,9 @@ def extract_lt_textboxes(pdf, figure_list, table_list, page_list): del page_lt_textboxes[idx_page] if table_list is not None or figure_list is not None: - page_lt_textboxes_filtered = remove_lt_textboxes_in_tables_figures(page_lt_textboxes, figure_list, table_list) + page_lt_textboxes_filtered = remove_lt_textboxes_in_tables_figures( + page_lt_textboxes, figure_list, table_list + ) else: page_lt_textboxes_filtered = page_lt_textboxes @@ -122,9 +126,9 @@ def extract_lt_textboxes(pdf, figure_list, table_list, page_list): lt_textboxes_without_noise = [] for lt_textbox in lt_textboxes: # remove empty texbox only contains whitespaces or newlines - if not re.match(r'^\s*$', lt_textbox.get_text()): + if not re.match(r"^\s*$", lt_textbox.get_text()): # remove all the \n at the end of lt_textbox - if lt_textbox._objs[-1]._objs[-1].get_text() == '\n': # pylint: disable=protected-access + if lt_textbox._objs[-1]._objs[-1].get_text() == "\n": # pylint: disable=protected-access del lt_textbox._objs[-1]._objs[-1] # pylint: disable=protected-access lt_textboxes_without_noise.append(lt_textbox) page_lt_textboxes_filtered_noise[page_idx] = lt_textboxes_without_noise @@ -133,10 +137,10 @@ def extract_lt_textboxes(pdf, figure_list, table_list, page_list): def render_chapters( # pylint: disable=too-many-branches, too-many-locals - page_lt_textboxes_filtered: Dict[int, List[LTTextBox]], - page_list: List[Page], + page_lt_textboxes_filtered: dict[int, list[LTTextBox]], + page_list: list[Page], pdf, -) -> List[Chapter]: +) -> list[Chapter]: """ Render libpdf chapters from LTTextboxes according to outline catalog. @@ -155,21 +159,23 @@ def render_chapters( # pylint: disable=too-many-branches, too-many-locals """ chapter_list = [] flatten_outline = [] - _flatten_outline(nested_outline=catalog['outline']['content'], flatten_outline=flatten_outline) + _flatten_outline( + nested_outline=catalog["outline"]["content"], flatten_outline=flatten_outline + ) # sort the flatten outline chapters into a dict by pages chapters_sorted_by_page = {} extracted_page_nums = [page.number for page in page_list] for chapter in flatten_outline: - if chapter['position']['page'] in extracted_page_nums: - if chapter['position']['page'] not in chapters_sorted_by_page: - chapters_sorted_by_page[chapter['position']['page']] = [] - chapters_sorted_by_page[chapter['position']['page']].append(chapter) + if chapter["position"]["page"] in extracted_page_nums: + if chapter["position"]["page"] not in chapters_sorted_by_page: + chapters_sorted_by_page[chapter["position"]["page"]] = [] + chapters_sorted_by_page[chapter["position"]["page"]].append(chapter) for page_number, chapters in tqdm( chapters_sorted_by_page.items(), - desc='###### Extracting chapters', - unit='pages', + desc="###### Extracting chapters", + unit="pages", bar_format=bar_format_lvl2(), ): if page_number - 1 in page_lt_textboxes_filtered: @@ -178,7 +184,9 @@ def render_chapters( # pylint: disable=too-many-branches, too-many-locals if page.number == page_number: chapter_page = page for chapter in chapters: - chapter_lt_textboxes = chapter_examiner(chapter, lt_textboxes, chapter_page) + chapter_lt_textboxes = chapter_examiner( + chapter, lt_textboxes, chapter_page + ) if chapter_lt_textboxes: # render chapter based on the lt_textbox @@ -188,11 +196,17 @@ def render_chapters( # pylint: disable=too-many-branches, too-many-locals y1 = max(chapter_lt_textboxes, key=lambda x: x.y1).y1 position = Position(x0, y0, x1, y1, chapter_page) - if len(chapter_lt_textboxes) == 2 and ('virt.' in chapter['number']): + if len(chapter_lt_textboxes) == 2 and ( + "virt." in chapter["number"] + ): # the case where chapter's number and title are grouped into two different lt_textboxes, # and the chapter number derives from virtual hierarchical levels because # outline catalog doesn't have chapter number. - chapter['number'] = min(chapter_lt_textboxes, key=lambda x: x.x0).get_text().strip() + chapter["number"] = ( + min(chapter_lt_textboxes, key=lambda x: x.x0) + .get_text() + .strip() + ) # extract LTPage for textbox_crop() to use lt_page = pdf.pages[page_number - 1].layout @@ -222,22 +236,22 @@ def render_chapters( # pylint: disable=too-many-branches, too-many-locals LOG.info( 'The chapter "%s %s" on page %s cannot be detected. A ghost chapter is introduced ' - 'at the jump target location. ', - chapter['number'], - chapter['title'], - chapter['position']['page'], + "at the jump target location. ", + chapter["number"], + chapter["title"], + chapter["position"]["page"], ) - if 'virt.' in chapter['number']: + if "virt." in chapter["number"]: LOG.info( - 'Virtual number %s is applied to chapter number, ' - 'so this number may not be consistent with the numerical order in the content.', - chapter['number'], + "Virtual number %s is applied to chapter number, " + "so this number may not be consistent with the numerical order in the content.", + chapter["number"], ) chapter_obj = Chapter( - chapter['title'], - chapter['number'], + chapter["title"], + chapter["number"], position, content=[], chapter=None, @@ -250,7 +264,7 @@ def render_chapters( # pylint: disable=too-many-branches, too-many-locals return chapter_list -def ghost_chapter_position_generator(chapter: Dict, page: Page) -> Position: +def ghost_chapter_position_generator(chapter: dict, page: Page) -> Position: """ Generate the position of a ghost chapter. @@ -261,25 +275,29 @@ def ghost_chapter_position_generator(chapter: Dict, page: Page) -> Position: :param page: a libpdf page :return: an instance of libpdf position """ - if chapter['position']['y1'] - CHAPTER_RECTANGLE_EXTEND > 0: - y0 = chapter['position']['y1'] - CHAPTER_RECTANGLE_EXTEND + if chapter["position"]["y1"] - CHAPTER_RECTANGLE_EXTEND > 0: + y0 = chapter["position"]["y1"] - CHAPTER_RECTANGLE_EXTEND else: # expand to bottom y0 = 0 # calculate chapter lt_textbox x1 - if chapter['position']['x0'] + CHAPTER_RECTANGLE_EXTEND < page.width: - x1 = chapter['position']['x0'] + CHAPTER_RECTANGLE_EXTEND + if chapter["position"]["x0"] + CHAPTER_RECTANGLE_EXTEND < page.width: + x1 = chapter["position"]["x0"] + CHAPTER_RECTANGLE_EXTEND else: # expand to right side x1 = page.width - position = Position(chapter['position']['x0'], y0, x1, chapter['position']['y1'], page) + position = Position( + chapter["position"]["x0"], y0, x1, chapter["position"]["y1"], page + ) return position -def chapter_examiner(chapter: Dict, lt_textboxes: List[LTTextBox], page: Page) -> Union[None, List[LTTextBox]]: +def chapter_examiner( + chapter: dict, lt_textboxes: list[LTTextBox], page: Page +) -> Union[None, list[LTTextBox]]: """ Check if certain lt_textboxes are or a certain lt_textbox is the chapter. @@ -316,8 +334,8 @@ def chapter_examiner(chapter: Dict, lt_textboxes: List[LTTextBox], page: Page) - # center of the outline chapter. # The coordinates of the rectangle are rect = (x0, y0, x1, y1). # This assumption may not work in PDFs with multiple columns. - y0 = chapter['position']['y1'] - (page.height / 4) - y1 = chapter['position']['y1'] + (page.height / 4) + y0 = chapter["position"]["y1"] - (page.height / 4) + y1 = chapter["position"]["y1"] + (page.height / 4) y0 = max(y0, 0) if y1 > page.height: y1 = page.height @@ -325,7 +343,9 @@ def chapter_examiner(chapter: Dict, lt_textboxes: List[LTTextBox], page: Page) - rect = (0, y0, page.width, y1) # get the lt_textboxes completely in the detection rectangle - lt_textboxes_in_rect = lt_page_crop(rect, lt_textboxes, LTText, contain_completely=True) + lt_textboxes_in_rect = lt_page_crop( + rect, lt_textboxes, LTText, contain_completely=True + ) if not lt_textboxes_in_rect: return None @@ -334,12 +354,16 @@ def chapter_examiner(chapter: Dict, lt_textboxes: List[LTTextBox], page: Page) - # evaluate the similarities of number, title and content of lt_textboxes with a chapter for lt_textbox in lt_textboxes_in_rect: # check if lt_textbox text and headline text has a certain similarity - similarity_title = SequenceMatcher(None, lt_textbox.get_text().strip(), chapter['title']).ratio() - if 'virt.' in chapter['number']: + similarity_title = SequenceMatcher( + None, lt_textbox.get_text().strip(), chapter["title"] + ).ratio() + if "virt." in chapter["number"]: similarity_number = None similarity_content = None else: - similarity_number = SequenceMatcher(None, lt_textbox.get_text().strip(), chapter['number']).ratio() + similarity_number = SequenceMatcher( + None, lt_textbox.get_text().strip(), chapter["number"] + ).ratio() similarity_content = SequenceMatcher( None, lt_textbox.get_text().strip(), @@ -347,7 +371,11 @@ def chapter_examiner(chapter: Dict, lt_textboxes: List[LTTextBox], page: Page) - ).ratio() similarity_lt_textboxes.append( - {'title': similarity_title, 'number': similarity_number, 'content': similarity_content}, + { + "title": similarity_title, + "number": similarity_number, + "content": similarity_content, + }, ) winners = similarity_referee(similarity_lt_textboxes, lt_textboxes_in_rect, chapter) @@ -356,10 +384,10 @@ def chapter_examiner(chapter: Dict, lt_textboxes: List[LTTextBox], page: Page) - def similarity_referee( # pylint: disable=too-many-branches # for readability - similarity_lt_textboxes: List[Dict], - lt_textboxes_in_rect: List[LTTextBox], - chapter: Dict, -) -> List: + similarity_lt_textboxes: list[dict], + lt_textboxes_in_rect: list[LTTextBox], + chapter: dict, +) -> list: """ Select the lt_textboxes with the highest similarity among titles, numbers or contents. @@ -377,7 +405,9 @@ def similarity_referee( # pylint: disable=too-many-branches # for readability # find the winner of the title similarity title_winners_idx = [ - i for i, x in enumerate(similarity_lt_textboxes) if x == max(similarity_lt_textboxes, key=lambda x: x['title']) + i + for i, x in enumerate(similarity_lt_textboxes) + if x == max(similarity_lt_textboxes, key=lambda x: x["title"]) ] if len(title_winners_idx) > 1: # find the winner who has the shortest vertical distance to the jumping point of the outline chapter. @@ -385,15 +415,18 @@ def similarity_referee( # pylint: disable=too-many-branches # for readability title_winner_idx = min( title_winners_idx, - key=lambda x: abs(lt_textboxes_in_rect[x].y1 - chapter['position']['y1']), + key=lambda x: abs(lt_textboxes_in_rect[x].y1 - chapter["position"]["y1"]), ) else: title_winner_idx = title_winners_idx[0] - if 'virt.' in chapter['number']: + if "virt." in chapter["number"]: # if the chapter number is virtual, only the title needs to be taken into account. - if similarity_lt_textboxes[title_winner_idx]['title'] > MIN_OUTLINE_TITLE_TEXTBOX_SIMILARITY: + if ( + similarity_lt_textboxes[title_winner_idx]["title"] + > MIN_OUTLINE_TITLE_TEXTBOX_SIMILARITY + ): winners.append(lt_textboxes_in_rect[title_winner_idx]) # search for the lt_textbox which may be the number of the chapter # The assumption is that the number is always located on the left of the chapter @@ -401,15 +434,21 @@ def similarity_referee( # pylint: disable=too-many-branches # for readability x for x in lt_textboxes_in_rect if x.x0 < lt_textboxes_in_rect[title_winner_idx].x0 - and abs(x.y0 - lt_textboxes_in_rect[title_winner_idx].y0) < CHAPTER_RECTANGLE_EXTEND - and abs(x.y1 - lt_textboxes_in_rect[title_winner_idx].y1) < CHAPTER_RECTANGLE_EXTEND + and abs(x.y0 - lt_textboxes_in_rect[title_winner_idx].y0) + < CHAPTER_RECTANGLE_EXTEND + and abs(x.y1 - lt_textboxes_in_rect[title_winner_idx].y1) + < CHAPTER_RECTANGLE_EXTEND ] if len(potential_chapter_number) == 1: # In case 5, the unexpected chapter number may be extracted. To avoid it, the potential chapter number # extracted around the chapter title shall be checked if it matches the patterns of comman chapter # nummber e.g. 3.9.3, XII.I.V, or A.B.D. - pattern = re.compile(r'^(?=\w)((^|\.)(([iIvVxX]{1,8})|[a-zA-Z]|[0-9]+))+\.?(?!.)') - chapter_number_matches = re.match(pattern, potential_chapter_number[0].get_text().strip()) + pattern = re.compile( + r"^(?=\w)((^|\.)(([iIvVxX]{1,8})|[a-zA-Z]|[0-9]+))+\.?(?!.)" + ) + chapter_number_matches = re.match( + pattern, potential_chapter_number[0].get_text().strip() + ) if chapter_number_matches: # to prevent wrong chapter numbers from being extracted winners.append(potential_chapter_number[0]) @@ -420,12 +459,14 @@ def similarity_referee( # pylint: disable=too-many-branches # for readability content_winners_idx = [ i for i, x in enumerate(similarity_lt_textboxes) - if x == max(similarity_lt_textboxes, key=lambda x: x['content']) + if x == max(similarity_lt_textboxes, key=lambda x: x["content"]) ] if len(content_winners_idx) > 1: content_winner_idx = min( content_winners_idx, - key=lambda x: abs(lt_textboxes_in_rect[x].y1 - chapter['position']['y1']), + key=lambda x: abs( + lt_textboxes_in_rect[x].y1 - chapter["position"]["y1"] + ), ) else: content_winner_idx = content_winners_idx[0] @@ -434,26 +475,31 @@ def similarity_referee( # pylint: disable=too-many-branches # for readability number_winners_idx = [ i for i, x in enumerate(similarity_lt_textboxes) - if x == max(similarity_lt_textboxes, key=lambda x: x['number']) + if x == max(similarity_lt_textboxes, key=lambda x: x["number"]) ] if len(number_winners_idx) > 1: number_winner_idx = min( number_winners_idx, - key=lambda x: abs(lt_textboxes_in_rect[x].y1 - chapter['position']['y1']), + key=lambda x: abs( + lt_textboxes_in_rect[x].y1 - chapter["position"]["y1"] + ), ) else: number_winner_idx = number_winners_idx[0] # If the lt_textbox is 100% similar to the chapter in terms of its content (chapter number amd title), # it is considered the outline chapter jump target. - if similarity_lt_textboxes[content_winner_idx]['content'] == 1: + if similarity_lt_textboxes[content_winner_idx]["content"] == 1: # The case where the content of the lt_textbox is 100% identical to the chapter's number plus title. winners.append(lt_textboxes_in_rect[content_winner_idx]) elif ( - similarity_lt_textboxes[content_winner_idx]['content'] < similarity_lt_textboxes[title_winner_idx]['title'] + similarity_lt_textboxes[content_winner_idx]["content"] + < similarity_lt_textboxes[title_winner_idx]["title"] and number_winner_idx != title_winner_idx - and similarity_lt_textboxes[number_winner_idx]['number'] > MIN_OUTLINE_TITLE_TEXTBOX_SIMILARITY - and similarity_lt_textboxes[title_winner_idx]['title'] > MIN_OUTLINE_TITLE_TEXTBOX_SIMILARITY + and similarity_lt_textboxes[number_winner_idx]["number"] + > MIN_OUTLINE_TITLE_TEXTBOX_SIMILARITY + and similarity_lt_textboxes[title_winner_idx]["title"] + > MIN_OUTLINE_TITLE_TEXTBOX_SIMILARITY ): # The case where chapter number and chapter title are broken into two different lt_textboxes by pdfminer. # For the lt_textbox which wins on the basis of the content, if the similarity of its title is bigger @@ -463,9 +509,10 @@ def similarity_referee( # pylint: disable=too-many-branches # for readability winners.append(lt_textboxes_in_rect[title_winner_idx]) elif ( title_winner_idx == content_winner_idx - and similarity_lt_textboxes[content_winner_idx]['content'] - >= similarity_lt_textboxes[title_winner_idx]['title'] - and similarity_lt_textboxes[content_winner_idx]['content'] > MIN_OUTLINE_TITLE_TEXTBOX_SIMILARITY + and similarity_lt_textboxes[content_winner_idx]["content"] + >= similarity_lt_textboxes[title_winner_idx]["title"] + and similarity_lt_textboxes[content_winner_idx]["content"] + > MIN_OUTLINE_TITLE_TEXTBOX_SIMILARITY ): # The case where chapter number and its title are in the same lt_textbox # For the lt_textbox which has high potential to be a chapter, it shall win the similarity @@ -482,9 +529,9 @@ def similarity_referee( # pylint: disable=too-many-branches # for readability def render_paragraphs( # pylint: disable=too-many-branches - page_lt_textboxes_filtered: Dict[int, List[LTTextBox]], - page_list: List[Page], -) -> List[Paragraph]: + page_lt_textboxes_filtered: dict[int, list[LTTextBox]], + page_list: list[Page], +) -> list[Paragraph]: """ Render paragraphs from LTTextBox. @@ -498,8 +545,8 @@ def render_paragraphs( # pylint: disable=too-many-branches for page_index, lt_textboxes in tqdm( page_lt_textboxes_filtered.items(), - desc='###### Extracting paragraphs', - unit='pages', + desc="###### Extracting paragraphs", + unit="pages", bar_format=bar_format_lvl2(), ): # add lt_textbox to a list of paragraphs @@ -508,10 +555,18 @@ def render_paragraphs( # pylint: disable=too-many-branches for page in page_list: if page.number == page_index + 1: paragraph_page = page - position = Position(lt_textbox.x0, lt_textbox.y0, lt_textbox.x1, lt_textbox.y1, paragraph_page) + position = Position( + lt_textbox.x0, + lt_textbox.y0, + lt_textbox.x1, + lt_textbox.y1, + paragraph_page, + ) page_number = page_index + 1 - paragraph = render_single_paragraph(lt_textbox, page_number, paragraph_id, position) + paragraph = render_single_paragraph( + lt_textbox, page_number, paragraph_id, position + ) paragraph_list.append(paragraph) paragraph_id += 1 @@ -534,16 +589,18 @@ def render_single_paragraph( :return: instance of a paragraph """ links = [] - if catalog['annos']: + if catalog["annos"]: links = extract_linked_chars(lt_textbox, page_number) hbox = lt_to_libpdf_hbox_converter([lt_textbox]) - paragraph = Paragraph(idx=paragraph_id, textbox=hbox, position=position, links=links) + paragraph = Paragraph( + idx=paragraph_id, textbox=hbox, position=position, links=links + ) return paragraph -def extract_linked_chars(lt_textbox: LTTextBox, page_number: int) -> List[Link]: +def extract_linked_chars(lt_textbox: LTTextBox, page_number: int) -> list[Link]: """ Extract plain texts and linked characters in lt_textboxes. @@ -563,15 +620,15 @@ def extract_linked_chars(lt_textbox: LTTextBox, page_number: int) -> List[Link]: # rect[2] is x1 (right) # rect[3] is y1 (top) links = [] - if page_number in catalog['annos']: + if page_number in catalog["annos"]: # collect the annos which are intersected with or in the lt_textbox anno_textboxes = [ x - for x in catalog['annos'][page_number]['annotation'] - if x['rect'][0] < lt_textbox.x1 - and x['rect'][1] < lt_textbox.y1 - and x['rect'][2] > lt_textbox.x0 - and x['rect'][3] > lt_textbox.y0 + for x in catalog["annos"][page_number]["annotation"] + if x["rect"][0] < lt_textbox.x1 + and x["rect"][1] < lt_textbox.y1 + and x["rect"][2] > lt_textbox.x0 + and x["rect"][3] > lt_textbox.y0 ] if anno_textboxes: @@ -585,14 +642,18 @@ def extract_linked_chars(lt_textbox: LTTextBox, page_number: int) -> List[Link]: annos_line = [ x for x in anno_textboxes - if x['rect'][0] < line_horizontal.x1 - and x['rect'][2] > line_horizontal.x0 - and line_horizontal.y1 > (x['rect'][1] + abs(x['rect'][1] - x['rect'][3]) / 2) > line_horizontal.y0 + if x["rect"][0] < line_horizontal.x1 + and x["rect"][2] > line_horizontal.x0 + and line_horizontal.y1 + > (x["rect"][1] + abs(x["rect"][1] - x["rect"][3]) / 2) + > line_horizontal.y0 ] if annos_line: - annos_line.sort(key=lambda x: x['rect'][0]) - links.extend(annos_scanner(line_horizontal, annos_line, char_counter)) + annos_line.sort(key=lambda x: x["rect"][0]) + links.extend( + annos_scanner(line_horizontal, annos_line, char_counter) + ) else: pass char_counter = char_counter + len( @@ -608,9 +669,9 @@ def extract_linked_chars(lt_textbox: LTTextBox, page_number: int) -> List[Link]: def annos_scanner( lt_textline: LTTextLineHorizontal, - annos_line: List, + annos_line: list, char_counter: int, -) -> List[Link]: # pylint: disable=too-many-nested-blocks +) -> list[Link]: # pylint: disable=too-many-nested-blocks """ Scan the characters annotated as the source link in the scope of a textline. @@ -636,7 +697,7 @@ def annos_scanner( links = [] # anno_start_idx is used to index which character is the start of the anno # anno_end_idx is used to index in which character has reached the last char in the anno-rectangle - anno_flags = {'anno_start_idx': None, 'anno_stop_idx': None} + anno_flags = {"anno_start_idx": None, "anno_stop_idx": None} for idx_char, char in enumerate(lt_textline._objs): # pylint: disable=protected-access # if all the anno-rectangles in a line have all been checked, then just get plain text of chars @@ -648,7 +709,10 @@ def annos_scanner( annos_line[idx_anno], anno_flags, ) - if anno_flags['anno_start_idx'] is not None and anno_flags['anno_stop_idx'] is not None: + if ( + anno_flags["anno_start_idx"] is not None + and anno_flags["anno_stop_idx"] is not None + ): # chars are in the anno-rectangle # using "not None" is because the index can be 0 if anno_complete: @@ -673,9 +737,9 @@ def annos_scanner( def first_last_char_in_anno_marker( # pylint: disable=too-many-branches # better readability idx_char: int, char: Union[LTChar, LTAnno], - ltobjs_in_lttextline: List[Union[LTChar, LTAnno]], - anno: Dict, - anno_flags: Dict, + ltobjs_in_lttextline: list[Union[LTChar, LTAnno]], + anno: dict, + anno_flags: dict, ) -> bool: """ Find the indices of the first and the last char in an anno-rectangle from a textline. @@ -692,20 +756,23 @@ def first_last_char_in_anno_marker( # pylint: disable=too-many-branches # bette # As it is already a horizontal line, the vertical margin of each char in the textline is # presumably more and less the same. if isinstance(char, LTChar): - if char.x0 > anno['rect'][0] - ANNO_X_TOLERANCE and char.x1 < anno['rect'][2] + ANNO_X_TOLERANCE: + if ( + char.x0 > anno["rect"][0] - ANNO_X_TOLERANCE + and char.x1 < anno["rect"][2] + ANNO_X_TOLERANCE + ): # a char is in an anno-rectangle - if anno_flags['anno_start_idx'] is None: + if anno_flags["anno_start_idx"] is None: # the first character of an anno. - anno_flags['anno_start_idx'] = idx_char + anno_flags["anno_start_idx"] = idx_char # the original index of a end char plus 1 is more intuitive for the string slicing in python - anno_flags['anno_stop_idx'] = idx_char + 1 + anno_flags["anno_stop_idx"] = idx_char + 1 if idx_char == len(ltobjs_in_lttextline) - 1: # the last char of the textline anno_complete = True elif isinstance(ltobjs_in_lttextline[idx_char + 1], LTChar): - if ltobjs_in_lttextline[idx_char + 1].x0 > anno['rect'][2]: + if ltobjs_in_lttextline[idx_char + 1].x0 > anno["rect"][2]: # the next char is outside of the current anno-rectangle anno_complete = True else: @@ -721,16 +788,16 @@ def first_last_char_in_anno_marker( # pylint: disable=too-many-branches # bette # the last char of the textline anno_complete = True elif isinstance(ltobjs_in_lttextline[idx_char + 1], LTChar): - if ltobjs_in_lttextline[idx_char + 1].x0 > anno['rect'][2]: + if ltobjs_in_lttextline[idx_char + 1].x0 > anno["rect"][2]: # the next char is outside of the current anno-rectangle anno_complete = True else: - raise ValueError('two LTAnno occurs in a row') + raise ValueError("two LTAnno occurs in a row") return anno_complete -def render_link(anno_flags: Dict, anno: Dict, char_counter: int) -> Link: +def render_link(anno_flags: dict, anno: dict, char_counter: int) -> Link: """ Render a single Link. @@ -740,38 +807,46 @@ def render_link(anno_flags: Dict, anno: Dict, char_counter: int) -> Link: this variable is used to index the start and end chars in the lt_textbox :return: a single Link instantiated """ - start_idx = anno_flags['anno_start_idx'] + char_counter + start_idx = anno_flags["anno_start_idx"] + char_counter - if anno_flags['anno_start_idx'] == anno_flags['anno_stop_idx']: + if anno_flags["anno_start_idx"] == anno_flags["anno_stop_idx"]: # the annotation contains only one character stop_idx = start_idx else: - stop_idx = anno_flags['anno_stop_idx'] + char_counter + stop_idx = anno_flags["anno_stop_idx"] + char_counter # get the position of the jump target - if 'des_name' in anno: + if "des_name" in anno: # implicit target which means name destination catalog does not exist in this PDF - if catalog['dests'] and anno['des_name'] in catalog['dests']: + if catalog["dests"] and anno["des_name"] in catalog["dests"]: pos_target = { - 'page': catalog['dests'][anno['des_name']]['Num'], - 'x': catalog['dests'][anno['des_name']]['X'], - 'y': catalog['dests'][anno['des_name']]['Y'], + "page": catalog["dests"][anno["des_name"]]["Num"], + "x": catalog["dests"][anno["des_name"]]["X"], + "y": catalog["dests"][anno["des_name"]]["Y"], } else: - pos_target = {'page': anno['dest']['page'], 'x': anno['dest']['rect_X'], 'y': anno['dest']['rect_Y']} + pos_target = { + "page": anno["dest"]["page"], + "x": anno["dest"]["rect_X"], + "y": anno["dest"]["rect_Y"], + } else: - pos_target = {'page': anno['dest']['page'], 'x': anno['dest']['rect_X'], 'y': anno['dest']['rect_Y']} + pos_target = { + "page": anno["dest"]["page"], + "x": anno["dest"]["rect_X"], + "y": anno["dest"]["rect_Y"], + } link = Link(start_idx, stop_idx, pos_target) # reset the start and end indices of the annotation - anno_flags['anno_start_idx'] = None - anno_flags['anno_stop_idx'] = None + anno_flags["anno_start_idx"] = None + anno_flags["anno_stop_idx"] = None return link -def _flatten_outline(nested_outline, flatten_outline: List): +def _flatten_outline(nested_outline, flatten_outline: list): """ Flat a nested outline for the further process in chapters detection. @@ -783,14 +858,14 @@ def _flatten_outline(nested_outline, flatten_outline: List): """ for chapter in nested_outline: flatten_outline.append(chapter) - if chapter['content']: - _flatten_outline(chapter['content'], flatten_outline) + if chapter["content"]: + _flatten_outline(chapter["content"], flatten_outline) def remove_lt_textboxes_in_tables_figures( - page_lt_textboxes: Dict[int, List[LTTextBox]], - figure_list: List[Figure], - table_list: List[Table], + page_lt_textboxes: dict[int, list[LTTextBox]], + figure_list: list[Figure], + table_list: list[Table], ): """ Remove lt_textboxes in the coverage of tables or figures from page_lt_textboxes. @@ -806,14 +881,17 @@ def remove_lt_textboxes_in_tables_figures( page_lt_textboxes_filter = {} for page_index, lt_textboxes in page_lt_textboxes.items(): figures_tables_list = tables_figures_merge(figure_list, table_list, page_index) - if figures_tables_list is not None: # figures or tables exists in the current page + if ( + figures_tables_list is not None + ): # figures or tables exists in the current page for element in figures_tables_list: # The lt_textbox inside the elements will be filtered out. It returns only the boxes # outside the elements. # Elements here can be tables or figures lt_textboxes = list( filter( - lambda x, ele=element: x.x0 < (ele.position.x0 - TABLE_MARGIN) # left + lambda x, ele=element: x.x0 + < (ele.position.x0 - TABLE_MARGIN) # left or x.x1 > (ele.position.x1 + TABLE_MARGIN) # right or x.y0 < (ele.position.y0 - TABLE_MARGIN) # bottom or x.y1 > (ele.position.y1 + TABLE_MARGIN), # top @@ -827,10 +905,10 @@ def remove_lt_textboxes_in_tables_figures( def tables_figures_merge( - figure_list: List[Figure], - table_list: List[Table], + figure_list: list[Figure], + table_list: list[Table], page_index: int, -) -> List[Union[Figure, Table]]: +) -> list[Union[Figure, Table]]: """ Merge tables and figures in the same page. @@ -842,16 +920,20 @@ def tables_figures_merge( :param page_index: index of current page number :return: """ - filter_list_table = list(filter(lambda x: x.position.page.number == page_index + 1, table_list)) - filter_list_figure = list(filter(lambda x: x.position.page.number == page_index + 1, figure_list)) - merge_list: List[Union[Figure, Table]] = filter_list_table + filter_list_figure + filter_list_table = list( + filter(lambda x: x.position.page.number == page_index + 1, table_list) + ) + filter_list_figure = list( + filter(lambda x: x.position.page.number == page_index + 1, figure_list) + ) + merge_list: list[Union[Figure, Table]] = filter_list_table + filter_list_figure if merge_list: merge_list.sort(key=lambda x: x.position.y0, reverse=True) return merge_list -def pdfminer_get_lt_textboxes(pdf) -> Dict[int, List[LTTextBox]]: +def pdfminer_get_lt_textboxes(pdf) -> dict[int, list[LTTextBox]]: """ Layout analysis done by pdfminer. @@ -862,20 +944,20 @@ def pdfminer_get_lt_textboxes(pdf) -> Dict[int, List[LTTextBox]]: :param pdf: instance of pdfplumber.pdf.PDF class :return: dictionary mapping page numbers (0-based) to a list of LTTextBox objects """ - LOG.info('Extracting layout ...') + LOG.info("Extracting layout ...") page_lt_textboxes = {} for idx_page, page in enumerate( tqdm( pdf.pages, total=len(pdf.pages), - desc='###### Extracting layout', - unit='pages', + desc="###### Extracting layout", + unit="pages", bar_format=bar_format_lvl2(), ), ): if logging_needed(idx_page, len(pdf.pages)): - LOG.debug('Extracting layout page %s of %s', idx_page + 1, len(pdf.pages)) + LOG.debug("Extracting layout page %s of %s", idx_page + 1, len(pdf.pages)) pdf.interpreter.process_page(page.page_obj) result = pdf.device.get_result() @@ -883,10 +965,12 @@ def pdfminer_get_lt_textboxes(pdf) -> Dict[int, List[LTTextBox]]: # remove detected header and footer lt_textboxes based on given page crop margin parameter filter_lt_textboxes = list( filter( - lambda lt_textbox: lt_textbox.y1 < (float(pdf.pages[0].height) - parameters.PAGE_CROP_MARGINS['top']) - and lt_textbox.y0 > parameters.PAGE_CROP_MARGINS['bottom'] - and lt_textbox.x0 > parameters.PAGE_CROP_MARGINS['left'] - and lt_textbox.x1 < (float(pdf.pages[0].width) - parameters.PAGE_CROP_MARGINS['right']), + lambda lt_textbox: lt_textbox.y1 + < (float(pdf.pages[0].height) - parameters.PAGE_CROP_MARGINS["top"]) + and lt_textbox.y0 > parameters.PAGE_CROP_MARGINS["bottom"] + and lt_textbox.x0 > parameters.PAGE_CROP_MARGINS["left"] + and lt_textbox.x1 + < (float(pdf.pages[0].width) - parameters.PAGE_CROP_MARGINS["right"]), lt_textboxes, ), ) diff --git a/libpdf/utils.py b/libpdf/utils.py index 7c19262..37ea38d 100644 --- a/libpdf/utils.py +++ b/libpdf/utils.py @@ -5,21 +5,10 @@ import re from decimal import Decimal from pathlib import Path -from typing import Any, Dict, List, Tuple, Type, Union +from typing import Any, Union import chardet - - -from libpdf.log import logging_needed -from libpdf.models.chapter import Chapter -from libpdf.models.element import Element -from libpdf.models.figure import Figure -from libpdf.models.horizontal_box import Char, HorizontalBox, HorizontalLine, Word -from libpdf.models.paragraph import Paragraph -from libpdf.models.table import Table -from libpdf.parameters import RENDER_ELEMENTS, VIS_DBG_MAP_ELEMENTS_COLOR -from libpdf.progress import bar_format_lvl1, tqdm - +import pdfplumber from pdfminer.converter import PDFPageAggregator from pdfminer.layout import ( LAParams, @@ -41,22 +30,30 @@ from pdfminer.pdfpage import PDFPage from pdfminer.pdfparser import PDFParser -import pdfplumber +from libpdf.log import logging_needed +from libpdf.models.chapter import Chapter +from libpdf.models.element import Element +from libpdf.models.figure import Figure +from libpdf.models.horizontal_box import Char, HorizontalBox, HorizontalLine, Word +from libpdf.models.paragraph import Paragraph +from libpdf.models.table import Table +from libpdf.parameters import RENDER_ELEMENTS, VIS_DBG_MAP_ELEMENTS_COLOR +from libpdf.progress import bar_format_lvl1, tqdm MAP_TYPES = { - Chapter: 'chapter', - Paragraph: 'paragraph', - Table: 'table', - Figure: 'figure', - LTChar: 'paragraph', - LTCurve: 'figure', - LTTextBox: 'paragraph', - LTTextBoxHorizontal: 'paragraph', - LTTextLineHorizontal: 'paragraph', - LTFigure: 'figure', - LTLine: 'figure', - LTRect: 'figure', - LTImage: 'figure', + Chapter: "chapter", + Paragraph: "paragraph", + Table: "table", + Figure: "figure", + LTChar: "paragraph", + LTCurve: "figure", + LTTextBox: "paragraph", + LTTextBoxHorizontal: "paragraph", + LTTextLineHorizontal: "paragraph", + LTFigure: "figure", + LTLine: "figure", + LTRect: "figure", + LTImage: "figure", } LOG = logging.getLogger(__name__) @@ -66,9 +63,9 @@ def decode_title(obj_bytes: bytes) -> str: """Decode catalog headline using chardet library.""" chardet_ret = chardet.detect(obj_bytes) try: - str_ret = obj_bytes.decode(chardet_ret['encoding']) + str_ret = obj_bytes.decode(chardet_ret["encoding"]) except UnicodeDecodeError: - str_ret = obj_bytes.decode(chardet_ret['encoding'], 'backslashreplace') + str_ret = obj_bytes.decode(chardet_ret["encoding"], "backslashreplace") LOG.warning( 'Could not fully decode catalog headline "%s". Replaced character(s) with escaped hex value.', str_ret, @@ -116,14 +113,14 @@ def string_to_identifier(text: str): :raises: ValueError: text contains newline chars \r or \n :return: identifier """ - newline_chars = ['\r', '\n'] + newline_chars = ["\r", "\n"] for newline_char in newline_chars: if newline_char in text: raise ValueError(f'Input text "{text}" contains a new line character.') - allowed_chars_regex = re.compile(r'[^_a-zA-Z0-9]') - replace_string = allowed_chars_regex.sub('_', text) + allowed_chars_regex = re.compile(r"[^_a-zA-Z0-9]") + replace_string = allowed_chars_regex.sub("_", text) if replace_string[0].isdigit(): - replace_string = '_' + replace_string + replace_string = "_" + replace_string return replace_string @@ -186,7 +183,7 @@ def from_pdfplumber_bbox(x0, top, x1, bottom, page_height): return [float(x0), float(page_height - bottom), float(x1), float(page_height - top)] -def check_lt_obj_in_bbox(lt_obj, bbox: Tuple[float, float, float, float]): +def check_lt_obj_in_bbox(lt_obj, bbox: tuple[float, float, float, float]): """ Check if pdfminer LTContainer layout object (lt_obj) is completely inside the given bounding box (bbox). @@ -221,16 +218,21 @@ def check_lt_obj_in_bbox(lt_obj, bbox: Tuple[float, float, float, float]): :return: True if lt_obj is completely containd in bbox else False """ lt_obj_in_bbox = False - if lt_obj.x0 > bbox[0] and lt_obj.y0 > bbox[1] and lt_obj.x1 < bbox[2] and lt_obj.y1 < bbox[3]: + if ( + lt_obj.x0 > bbox[0] + and lt_obj.y0 > bbox[1] + and lt_obj.x1 < bbox[2] + and lt_obj.y1 < bbox[3] + ): lt_obj_in_bbox = True return lt_obj_in_bbox def find_lt_obj_in_bbox( - lt_objs_in_bbox: List, + lt_objs_in_bbox: list, lt_obj, - bbox: Tuple[float, float, float, float], + bbox: tuple[float, float, float, float], ): # pylint: disable=too-many-nested-blocks, too-many-branches # local algorithm, easier to read when not split up """ Find all layout objects (lt_obj) inside given bounding box (bbox) recursively. @@ -302,7 +304,7 @@ def find_lt_obj_in_bbox( else: # This is the case when a LT object is intersected with the given box. In this case, the LT objects inside the # given bounding box need to be hierarchically and recursively found. - if hasattr(lt_obj, '_objs'): + if hasattr(lt_obj, "_objs"): # All the downwards hierarchical LT objects are stored in the attribute "_objs". # If the _objs attribute doesn't exist, it means it's the bottom of the hierarchy. text_inside_bbox = False # True on LTTextLine level when the first LTChar is inside the BBOX @@ -339,11 +341,11 @@ def find_lt_obj_in_bbox( def lt_page_crop( - bbox: Tuple[float, float, float, float], - lt_objs: List, - lt_type_in_filter: Type[Union[LTText, LTCurve, LTImage, LTFigure]], + bbox: tuple[float, float, float, float], + lt_objs: list, + lt_type_in_filter: type[Union[LTText, LTCurve, LTImage, LTFigure]], contain_completely: bool = False, -) -> List: +) -> list: """ Find and filter pdfminer layout objects in the given bounding box. @@ -393,7 +395,9 @@ def lt_page_crop( return lt_objs_in_bbox -def lt_to_libpdf_hbox_converter(lt_objs: List[LTTextBoxHorizontal]) -> Union[HorizontalBox, None]: +def lt_to_libpdf_hbox_converter( + lt_objs: list[LTTextBoxHorizontal], +) -> Union[HorizontalBox, None]: """Convert a LTTextBox to a HorizontalBox.""" flatten_lt_objs = [] flatten_hiearchical_lttext(lt_objs, flatten_lt_objs) @@ -407,8 +411,8 @@ def lt_to_libpdf_hbox_converter(lt_objs: List[LTTextBoxHorizontal]) -> Union[Hor def textbox_crop( - bbox: Tuple[float, float, float, float], - ltpage_objs: List, + bbox: tuple[float, float, float, float], + ltpage_objs: list, ) -> Union[HorizontalBox, None]: """ Collect and group hierachically all LTChar in a given bbox and convert all LTText to libpdf text objects. @@ -426,8 +430,8 @@ def textbox_crop( def assemble_to_textlines( - flatten_lt_objs: List[LTText], -) -> List[LTTextLineHorizontal]: + flatten_lt_objs: list[LTText], +) -> list[LTTextLineHorizontal]: """ Assemble and convert all LTChar into a libpdf horiontal line or several libpdf horizontal lines. @@ -450,7 +454,7 @@ def assemble_to_textlines( # last_ltobj = flatten_lt_objs[1] for lt_obj in flatten_lt_objs: - if lt_obj.get_text() != ' ' and lt_obj.get_text() != '\n': + if lt_obj.get_text() != " " and lt_obj.get_text() != "\n": # instantiate Char char = Char(lt_obj.get_text(), lt_obj.x0, lt_obj.y0, lt_obj.x1, lt_obj.y1) chars.append(char) @@ -464,7 +468,7 @@ def assemble_to_textlines( words.clear() textlines.append(textline) - elif lt_obj.get_text() == ' ' and chars: + elif lt_obj.get_text() == " " and chars: word = Word(copy.deepcopy(chars)) chars.clear() words.append(word) @@ -475,7 +479,7 @@ def assemble_to_textlines( words.clear() textlines.append(textline) - elif isinstance(lt_obj, LTAnno) and lt_obj.get_text() == '\n': + elif isinstance(lt_obj, LTAnno) and lt_obj.get_text() == "\n": if chars: word = Word(copy.deepcopy(chars)) chars.clear() @@ -501,8 +505,8 @@ def assemble_to_textlines( def lt_textbox_crop( - bbox: Tuple[float, float, float, float], - ltpage_objs: List, + bbox: tuple[float, float, float, float], + ltpage_objs: list, word_margin: float, y_tolerance: float, ) -> Union[LTTextBoxHorizontal, None]: @@ -539,10 +543,10 @@ def lt_textbox_crop( def assemble_to_lt_textlines( - flatten_lt_objs: List[LTText], + flatten_lt_objs: list[LTText], word_margin: float, y_tolerance: float, -) -> List[LTTextLineHorizontal]: +) -> list[LTTextLineHorizontal]: """ Assemble all LTChar into a LTTextline or several LTTextlines. @@ -566,10 +570,16 @@ def assemble_to_lt_textlines( for lt_obj in flatten_lt_objs: if isinstance(lt_obj, LTChar): - if abs((lt_obj.y0 + (lt_obj.height / 2)) - (last_ltobj.y0 + (last_ltobj.height / 2))) < y_tolerance: + if ( + abs( + (lt_obj.y0 + (lt_obj.height / 2)) + - (last_ltobj.y0 + (last_ltobj.height / 2)) + ) + < y_tolerance + ): lt_textlines[-1].add(lt_obj) else: - lt_textlines[-1]._objs.append(LTAnno('\n')) # pylint: disable=protected-access # access needed + lt_textlines[-1]._objs.append(LTAnno("\n")) # pylint: disable=protected-access # access needed lt_textlines.append(LTTextLineHorizontal(word_margin)) lt_textlines[-1].add(lt_obj) @@ -578,7 +588,7 @@ def assemble_to_lt_textlines( return lt_textlines -def flatten_hiearchical_lttext(lt_objs: List[LTText], flatten_lt_objs: List[LTChar]): +def flatten_hiearchical_lttext(lt_objs: list[LTText], flatten_lt_objs: list[LTChar]): """ Flatten hierarchical LTText which can be LTTextBox and LTLine. @@ -590,7 +600,7 @@ def flatten_hiearchical_lttext(lt_objs: List[LTText], flatten_lt_objs: List[LTCh """ for lt_obj in lt_objs: if isinstance(lt_obj, (LTTextBoxHorizontal, LTTextLineHorizontal)): - if hasattr(lt_obj, '_objs'): + if hasattr(lt_obj, "_objs"): flatten_hiearchical_lttext( lt_obj._objs, # pylint: disable=protected-access # not publicly available flatten_lt_objs, @@ -599,7 +609,7 @@ def flatten_hiearchical_lttext(lt_objs: List[LTText], flatten_lt_objs: List[LTCh flatten_lt_objs.append(lt_obj) -def get_elements_on_page(elements: List[Element], page_no, element_type=None): +def get_elements_on_page(elements: list[Element], page_no, element_type=None): """ Return all libpdf elements that are on a certain page. @@ -626,30 +636,35 @@ def visual_debug_libpdf( # pylint: disable=too-many-branches visual_debug_exclude_elements, ): """Visual debug.""" - LOG.info('Starting visual debug...') + LOG.info("Starting visual debug...") # collect all elements all_elements = ( - objects.flattened.chapters + objects.flattened.paragraphs + objects.flattened.tables + objects.flattened.figures + objects.flattened.chapters + + objects.flattened.paragraphs + + objects.flattened.tables + + objects.flattened.figures ) # prepare for calling the common draw and output function draw_elements = {} - for page in tqdm(objects.root.pages, desc='###### Calculating bboxes', unit='pages'): + for page in tqdm( + objects.root.pages, desc="###### Calculating bboxes", unit="pages" + ): page_elements = get_elements_on_page(all_elements, page.number) for page_element in page_elements: draw_element = { - 'element': page_element, - 'x0': page_element.position.x0, - 'y0': page_element.position.y0, - 'x1': page_element.position.x1, - 'y1': page_element.position.y1, + "element": page_element, + "x0": page_element.position.x0, + "y0": page_element.position.y0, + "x1": page_element.position.x1, + "y1": page_element.position.y1, } if page.number not in draw_elements: draw_elements[page.number] = [draw_element] else: draw_elements[page.number].append(draw_element) - LOG.info('Rendering images') + LOG.info("Rendering images") if visual_debug_include_elements: rendered_elements = visual_debug_include_elements @@ -668,7 +683,7 @@ def visual_debug_libpdf( # pylint: disable=too-many-branches render_pages( pdf_pages=objects.pdfplumber.pages, target_dir=target_dir, - name_prefix='libpdf_', + name_prefix="libpdf_", draw_elements=draw_elements, render_elements=[render_element], ) @@ -678,19 +693,19 @@ def visual_debug_libpdf( # pylint: disable=too-many-branches render_pages( pdf_pages=objects.pdfplumber.pages, target_dir=visual_output_dir, - name_prefix='libpdf_', + name_prefix="libpdf_", draw_elements=draw_elements, render_elements=rendered_elements, ) - LOG.info('Visual debug finished successfully.') + LOG.info("Visual debug finished successfully.") def render_pages( - pdf_pages: List, + pdf_pages: list, target_dir: str, name_prefix: str, - draw_elements: Dict[int, List[Dict[str, Any]]], - render_elements: List[str], + draw_elements: dict[int, list[dict[str, Any]]], + render_elements: list[str], ): """ Render PDF pages as images containing bounding box of certain elements. @@ -715,13 +730,13 @@ def render_pages( :param render_elements: list of elements to render, options are chapter, paragraph, table, figure :return: None """ - render_elements_joined = ', '.join(render_elements) - LOG.info('Saving annotated images for %s ...', render_elements_joined) + render_elements_joined = ", ".join(render_elements) + LOG.info("Saving annotated images for %s ...", render_elements_joined) for page in tqdm( pdf_pages, - desc=f'### Saving {render_elements_joined}', - unit='pages', + desc=f"### Saving {render_elements_joined}", + unit="pages", bar_format=bar_format_lvl1(), leave=False, ): @@ -729,7 +744,7 @@ def render_pages( if logging_needed(page_no - 1, len(pdf_pages)): LOG.info( - 'Saving annotated images for %s page %s of %s', + "Saving annotated images for %s page %s of %s", render_elements_joined, page_no, len(pdf_pages), @@ -743,10 +758,10 @@ def render_pages( # filter for elements that shall get rendered target_draw_elements = [] for draw_element in draw_elements_page: - element_type = type(draw_element['element']) + element_type = type(draw_element["element"]) if element_type not in MAP_TYPES: continue - str_type = MAP_TYPES[type(draw_element['element'])] + str_type = MAP_TYPES[type(draw_element["element"])] if str_type in render_elements: target_draw_elements.append(draw_element) @@ -755,40 +770,42 @@ def render_pages( image = page.to_image(resolution=150) for target_draw_element in target_draw_elements: bbox = to_pdfplumber_bbox( - target_draw_element['x0'], - target_draw_element['y0'], - target_draw_element['x1'], - target_draw_element['y1'], + target_draw_element["x0"], + target_draw_element["y0"], + target_draw_element["x1"], + target_draw_element["y1"], page.height, ) image.draw_rect( bbox, - fill=VIS_DBG_MAP_ELEMENTS_COLOR[MAP_TYPES[type(target_draw_element['element'])]], + fill=VIS_DBG_MAP_ELEMENTS_COLOR[ + MAP_TYPES[type(target_draw_element["element"])] + ], stroke_width=2, ) - image.save(os.path.join(target_dir, name_prefix + f'{page_no}.png')) + image.save(os.path.join(target_dir, name_prefix + f"{page_no}.png")) def visual_debug_pdfminer(pdf_path, vd_pdfminer_output): """Visual debug pdfminer.""" - logging.basicConfig(format='[%(levelname)5s] %(message)s', level=logging.DEBUG) + logging.basicConfig(format="[%(levelname)5s] %(message)s", level=logging.DEBUG) - LOG.info('Starting layout extraction using only pdfminer') + LOG.info("Starting layout extraction using only pdfminer") - logging.getLogger('pdfminer').level = logging.WARNING - logging.getLogger('PIL').level = logging.WARNING + logging.getLogger("pdfminer").level = logging.WARNING + logging.getLogger("PIL").level = logging.WARNING page_containers = extract_layout(pdf_path) draw_elements = {} for page_no, page_container in page_containers.items(): - for lt_element in page_container['elements']: + for lt_element in page_container["elements"]: draw_element = { - 'element': lt_element, - 'x0': lt_element.x0, - 'y0': lt_element.y0, - 'x1': lt_element.x1, - 'y1': lt_element.y1, + "element": lt_element, + "x0": lt_element.x0, + "y0": lt_element.y0, + "x1": lt_element.x1, + "y1": lt_element.y1, } if page_no not in draw_elements: draw_elements[page_no] = [draw_element] @@ -801,18 +818,18 @@ def visual_debug_pdfminer(pdf_path, vd_pdfminer_output): render_pages( pdf_pages=pages_list, target_dir=vd_pdfminer_output, - name_prefix='pdfminer_', + name_prefix="pdfminer_", draw_elements=draw_elements, render_elements=RENDER_ELEMENTS, ) - LOG.info('Finished successfully') + LOG.info("Finished successfully") def extract_layout(path_pdf, idx_single_page=None): """Use pdfminer.six to extract LTContainer layout boxes.""" - LOG.info('Extracting layout ...') + LOG.info("Extracting layout ...") parser = None - with open(path_pdf, 'rb') as file_pointer: + with open(path_pdf, "rb") as file_pointer: # init pdfminer elements parser = PDFParser(file_pointer) doc = PDFDocument(parser) @@ -824,10 +841,10 @@ def extract_layout(path_pdf, idx_single_page=None): page_containers = {} # return dictionary - page_count = doc.catalog['Pages'].resolve()['Count'] + page_count = doc.catalog["Pages"].resolve()["Count"] for idx_page, page in enumerate(pages): if logging_needed(idx_page, page_count): - LOG.debug('Extracting layout page %s of %s', idx_page + 1, page_count) + LOG.debug("Extracting layout page %s of %s", idx_page + 1, page_count) if idx_single_page is not None and idx_single_page != idx_page: continue @@ -835,11 +852,11 @@ def extract_layout(path_pdf, idx_single_page=None): interpreter.process_page(page) lt_page: LTPage = device.get_result() - page_containers[idx_page + 1] = {'page': page, 'elements': list(lt_page)} + page_containers[idx_page + 1] = {"page": page, "elements": list(lt_page)} - LOG.info('Finished layout extraction') + LOG.info("Finished layout extraction") container_count = 0 for page_container in page_containers.values(): - container_count += len(page_container['elements']) - LOG.info('Extracted %s containers from %s pages', container_count, page_count) + container_count += len(page_container["elements"]) + LOG.info("Extracted %s containers from %s pages", container_count, page_count) return page_containers diff --git a/tests/conftest.py b/tests/conftest.py index 55c35d1..9d21e3b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,25 +1,35 @@ """Pytest conftest module containing common test configuration and fixtures.""" import os -from libpdf import load - import pytest +from libpdf import load + # test PDFs from pdfplumber -PDF_LOREM_IPSUM = os.path.join(os.path.dirname(__file__), 'pdf', 'lorem-ipsum.pdf') -PDF_TWO_COLUMNS = os.path.join(os.path.dirname(__file__), 'pdf', 'two_colums_sampe.pdf') -PDF_WITH_EMPTY_OUTLINE = os.path.join(os.path.dirname(__file__), 'pdf', 'issue-67-example.pdf') -PDF_OUTLINE_NO_DEST = os.path.join(os.path.dirname(__file__), 'pdf', 'pdffill-demo.pdf') -PDF_FIGURE_WITH_INVALID_BBOX = os.path.join(os.path.dirname(__file__), 'pdf', 'pr-138-example.pdf') -PDF_CHAPTER_DETECTION = os.path.join(os.path.dirname(__file__), 'pdf', 'DS93-chapter-issue-fix.pdf') +PDF_LOREM_IPSUM = os.path.join(os.path.dirname(__file__), "pdf", "lorem-ipsum.pdf") +PDF_TWO_COLUMNS = os.path.join(os.path.dirname(__file__), "pdf", "two_colums_sampe.pdf") +PDF_WITH_EMPTY_OUTLINE = os.path.join( + os.path.dirname(__file__), "pdf", "issue-67-example.pdf" +) +PDF_OUTLINE_NO_DEST = os.path.join(os.path.dirname(__file__), "pdf", "pdffill-demo.pdf") +PDF_FIGURE_WITH_INVALID_BBOX = os.path.join( + os.path.dirname(__file__), "pdf", "pr-138-example.pdf" +) +PDF_CHAPTER_DETECTION = os.path.join( + os.path.dirname(__file__), "pdf", "DS93-chapter-issue-fix.pdf" +) # full features PDF -PDF_FULL_FEATURES = os.path.join(os.path.dirname(__file__), 'pdf', 'full_features.pdf') -PDF_FIGURES_EXTRACTION = os.path.join(os.path.dirname(__file__), 'pdf', 'test_figures_extraction.pdf') -PDF_SMART_HEADER_FOOTER_DETECTION = os.path.join(os.path.dirname(__file__), 'pdf', 'test_header_footer_detection.pdf') +PDF_FULL_FEATURES = os.path.join(os.path.dirname(__file__), "pdf", "full_features.pdf") +PDF_FIGURES_EXTRACTION = os.path.join( + os.path.dirname(__file__), "pdf", "test_figures_extraction.pdf" +) +PDF_SMART_HEADER_FOOTER_DETECTION = os.path.join( + os.path.dirname(__file__), "pdf", "test_header_footer_detection.pdf" +) # test PDFs from official python documentation -PDF_PYTHON_LOGGING = os.path.join(os.path.dirname(__file__), 'pdf', 'howto-logging.pdf') +PDF_PYTHON_LOGGING = os.path.join(os.path.dirname(__file__), "pdf", "howto-logging.pdf") def obj_equal(class_type, instance1, instance2): @@ -36,8 +46,16 @@ def obj_equal(class_type, instance1, instance2): return NotImplemented # get attributes of each and exclude special names and back references - self_attr = [attr for attr in dir(instance1) if (not attr.startswith('__') and not attr.startswith('b_'))] - other_attr = [attr for attr in dir(instance2) if (not attr.startswith('__') and not attr.startswith('b_'))] + self_attr = [ + attr + for attr in dir(instance1) + if (not attr.startswith("__") and not attr.startswith("b_")) + ] + other_attr = [ + attr + for attr in dir(instance2) + if (not attr.startswith("__") and not attr.startswith("b_")) + ] if set(self_attr) == set(other_attr): for attr in self_attr: if getattr(instance1, attr) != getattr(instance1, attr): @@ -47,14 +65,14 @@ def obj_equal(class_type, instance1, instance2): return False -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def load_full_features_pdf(tmpdir_factory, request): """Load test pdf and return temporary directory path and the libpdf object.""" - tmpdir = tmpdir_factory.mktemp('full_features_pdf') + tmpdir = tmpdir_factory.mktemp("full_features_pdf") tmpdir_path = str(tmpdir) - save_figures = request.param if hasattr(request, 'param') else False + save_figures = request.param if hasattr(request, "param") else False return tmpdir_path, load( PDF_FULL_FEATURES, save_figures=save_figures, - figure_dir=os.path.join(tmpdir_path, 'figures'), + figure_dir=os.path.join(tmpdir_path, "figures"), ) diff --git a/tests/test_api.py b/tests/test_api.py index 0a19f36..3a42462 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -2,15 +2,14 @@ import logging -from libpdf import load - import pytest +from libpdf import load from tests.conftest import PDF_LOREM_IPSUM, PDF_TWO_COLUMNS @pytest.mark.parametrize( - 'path', + "path", [PDF_LOREM_IPSUM, PDF_TWO_COLUMNS], ) def test_api_ok(tmpdir, path): @@ -23,13 +22,14 @@ def test_api_ok(tmpdir, path): # TODO implement correctly def test_logging(tmpdir, monkeypatch): """Check if log messages appear in output.""" + # monkeypatch the failing extract function def mock_extract(*args, **kwargs): # delete unused variables to denote they are not yet used del args del kwargs - monkeypatch.setattr('libpdf.core.extract', mock_extract) + monkeypatch.setattr("libpdf.core.extract", mock_extract) logging.basicConfig() objects = load(PDF_LOREM_IPSUM, figure_dir=str(tmpdir)) assert objects is None diff --git a/tests/test_catalog.py b/tests/test_catalog.py index 785fed7..5987933 100644 --- a/tests/test_catalog.py +++ b/tests/test_catalog.py @@ -2,8 +2,11 @@ from click.testing import CliRunner import libpdf - -from tests.conftest import PDF_OUTLINE_NO_DEST, PDF_PYTHON_LOGGING, PDF_WITH_EMPTY_OUTLINE +from tests.conftest import ( + PDF_OUTLINE_NO_DEST, + PDF_PYTHON_LOGGING, + PDF_WITH_EMPTY_OUTLINE, +) def test_catalog_with_empty_outline(): @@ -25,7 +28,7 @@ def test_catalog_outline_no_dest(): assert objects.flattened.chapters # outline without destination to jump to in this pdf will not be extracted as chapter assert len(objects.flattened.chapters) == 11 - assert objects.flattened.chapters[-1].title == 'Create Curves' + assert objects.flattened.chapters[-1].title == "Create Curves" def test_catalog_outline_title(): @@ -33,4 +36,4 @@ def test_catalog_outline_title(): objects = libpdf.load(PDF_PYTHON_LOGGING) assert objects is not None # check outline title is correctly resolved - assert objects.flattened.chapters[0].title == 'Basic Logging Tutorial' + assert objects.flattened.chapters[0].title == "Basic Logging Tutorial" diff --git a/tests/test_cli.py b/tests/test_cli.py index 1146bf9..d2a6a8b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,20 +1,18 @@ """Initial test cases for CLI.""" +import pytest from click.testing import CliRunner from libpdf.core import main_cli - -import pytest - from tests.conftest import PDF_LOREM_IPSUM, PDF_TWO_COLUMNS @pytest.mark.parametrize( - 'path', + "path", [PDF_LOREM_IPSUM, PDF_TWO_COLUMNS], ) def test_cli_ok(path): """Check if CLI exits with code 0 when no errors occur.""" runner = CliRunner() - result = runner.invoke(main_cli, [path, '-o', 'out.yaml', '-f', 'yaml']) + result = runner.invoke(main_cli, [path, "-o", "out.yaml", "-f", "yaml"]) assert result.exception is None assert result.exit_code == 0 diff --git a/tests/test_details.py b/tests/test_details.py index 48a6d37..5a5c956 100644 --- a/tests/test_details.py +++ b/tests/test_details.py @@ -11,19 +11,22 @@ from libpdf.models.position import Position from libpdf.models.root import Root from libpdf.models.table import Cell, Table - from tests.conftest import PDF_LOREM_IPSUM def test_lorem_ipsum(): """Test if the library reads all content from input PDF correctly.""" - file = File(name='lorem-ipsum.pdf', path='/home/marco/ub/libpdf/tests/pdf/lorem-ipsum.pdf', page_count=2) + file = File( + name="lorem-ipsum.pdf", + path="/home/marco/ub/libpdf/tests/pdf/lorem-ipsum.pdf", + page_count=2, + ) file_meta = FileMeta( author=None, title=None, subject=None, - creator='LaTeX with hyperref package', - producer='pdfTeX-1.40.16', + creator="LaTeX with hyperref package", + producer="pdfTeX-1.40.16", keywords=None, creation_date=datetime(2017, 5, 9, 13, 57, 58), modified_date=datetime(2017, 5, 9, 13, 57, 58), @@ -43,7 +46,9 @@ def test_lorem_ipsum(): cell2_6 = Cell(2, 6, dummy_pos, dummy_links, textbox=dummy_hbox) cell6_2 = Cell(6, 2, dummy_pos, dummy_links, textbox=dummy_hbox) cell18_7 = Cell(18, 7, dummy_pos, dummy_links, textbox=dummy_hbox) - table1 = Table(idx=1, cells=[cell1_1, cell2_6, cell6_2, cell18_7], position=dummy_pos) + table1 = Table( + idx=1, cells=[cell1_1, cell2_6, cell6_2, cell18_7], position=dummy_pos + ) paragraph1 = Paragraph( idx=1, links=dummy_links, @@ -51,20 +56,20 @@ def test_lorem_ipsum(): textbox=dummy_hbox, ) chapter1 = Chapter( - title='Ipsum labore ut consectetur.', - number='1', + title="Ipsum labore ut consectetur.", + number="1", position=dummy_pos, textbox=dummy_hbox, ) chapter2 = Chapter( - title='Quiquia adipisci numquam tempora dolore magnam.', - number='2', + title="Quiquia adipisci numquam tempora dolore magnam.", + number="2", position=dummy_pos, textbox=dummy_hbox, ) chapter2_1 = Chapter( - title='Etincidunt consectetur porro velit sed quaerat.', - number='2.1', + title="Etincidunt consectetur porro velit sed quaerat.", + number="2.1", position=dummy_pos, textbox=dummy_hbox, ) diff --git a/tests/test_ds93_chapter.py b/tests/test_ds93_chapter.py index 4ff86dc..ce19688 100644 --- a/tests/test_ds93_chapter.py +++ b/tests/test_ds93_chapter.py @@ -1,6 +1,5 @@ """Test case for JIRA ticket DS-93.""" import libpdf - from tests.conftest import PDF_CHAPTER_DETECTION @@ -18,9 +17,9 @@ def test_chapter_detection(): # check chapter number and title # First chapter is "3.5.4 Franca-to-AUTOSAR Client Server Link" - assert chapters[0].title == 'Franca-to-AUTOSAR Client Server Link' - assert chapters[0].number == '3.5.4' + assert chapters[0].title == "Franca-to-AUTOSAR Client Server Link" + assert chapters[0].number == "3.5.4" # Second chapter is "9. The note composition of C Chord are C, E and G" - assert chapters[1].title == 'The note composition of C Chord are C, E and G' - assert chapters[1].number == '9.' + assert chapters[1].title == "The note composition of C Chord are C, E and G" + assert chapters[1].number == "9." diff --git a/tests/test_figures.py b/tests/test_figures.py index 40413b0..7054eb2 100644 --- a/tests/test_figures.py +++ b/tests/test_figures.py @@ -2,8 +2,11 @@ from click.testing import CliRunner import libpdf - -from tests.conftest import PDF_FIGURES_EXTRACTION, PDF_FIGURE_WITH_INVALID_BBOX, PDF_FULL_FEATURES +from tests.conftest import ( + PDF_FIGURE_WITH_INVALID_BBOX, + PDF_FIGURES_EXTRACTION, + PDF_FULL_FEATURES, +) def test_figures_extract_with_invalid_bbox(): @@ -16,12 +19,18 @@ def test_figures_extract_with_invalid_bbox(): assert objects is not None # extract figures only with valid bbox assert len(objects.pdfplumber.pages[0].figures) == 1 - assert objects.pdfplumber.pages[0].figures[0]['height'] == 0 - assert objects.pdfplumber.pages[0].figures[0]['y0'] == objects.pdfplumber.pages[0].figures[0]['y1'] + assert objects.pdfplumber.pages[0].figures[0]["height"] == 0 + assert ( + objects.pdfplumber.pages[0].figures[0]["y0"] + == objects.pdfplumber.pages[0].figures[0]["y1"] + ) assert len(objects.pdfplumber.pages[1].figures) == 1 - assert objects.pdfplumber.pages[1].figures[0]['height'] == 0 - assert objects.pdfplumber.pages[1].figures[0]['y0'] == objects.pdfplumber.pages[1].figures[0]['y1'] + assert objects.pdfplumber.pages[1].figures[0]["height"] == 0 + assert ( + objects.pdfplumber.pages[1].figures[0]["y0"] + == objects.pdfplumber.pages[1].figures[0]["y1"] + ) assert not objects.flattened.figures @@ -35,46 +44,47 @@ def test_figures_extraction(): assert len(objects.flattened.figures) == 2 # filter figure with negative position, partially outside page - assert objects.pdfplumber.figures[2]['x0'] < 0 + assert objects.pdfplumber.figures[2]["x0"] < 0 # check that figure exists no more assert objects.flattened.figures[0].position.x0 >= 0 assert objects.flattened.figures[1].position.x0 >= 0 # filter figures that are too small - assert objects.pdfplumber.figures[4]['width'] < 15 - assert objects.pdfplumber.figures[4]['height'] < 15 + assert objects.pdfplumber.figures[4]["width"] < 15 + assert objects.pdfplumber.figures[4]["height"] < 15 # check that figure exists no more for figure in objects.flattened.figures: assert figure.position.x1 - figure.position.x0 >= 15 assert figure.position.y1 - figure.position.y0 >= 15 # filter figures that are completely inside other figures - assert objects.pdfplumber.figures[1]['x0'] > objects.pdfplumber.figures[0]['x0'] - assert objects.pdfplumber.figures[1]['y0'] > objects.pdfplumber.figures[0]['y0'] - assert objects.pdfplumber.figures[1]['x1'] < objects.pdfplumber.figures[0]['x1'] - assert objects.pdfplumber.figures[1]['y1'] < objects.pdfplumber.figures[0]['y1'] + assert objects.pdfplumber.figures[1]["x0"] > objects.pdfplumber.figures[0]["x0"] + assert objects.pdfplumber.figures[1]["y0"] > objects.pdfplumber.figures[0]["y0"] + assert objects.pdfplumber.figures[1]["x1"] < objects.pdfplumber.figures[0]["x1"] + assert objects.pdfplumber.figures[1]["y1"] < objects.pdfplumber.figures[0]["y1"] # check that figure exists no more for figure in objects.flattened.figures: - assert abs(float(objects.pdfplumber.figures[1]['x0']) - figure.position.x0) > 1 - assert abs(float(objects.pdfplumber.figures[1]['y0']) - figure.position.y0) > 1 - assert abs(float(objects.pdfplumber.figures[1]['x1']) - figure.position.x1) > 1 - assert abs(float(objects.pdfplumber.figures[1]['y1']) - figure.position.y1) > 1 + assert abs(float(objects.pdfplumber.figures[1]["x0"]) - figure.position.x0) > 1 + assert abs(float(objects.pdfplumber.figures[1]["y0"]) - figure.position.y0) > 1 + assert abs(float(objects.pdfplumber.figures[1]["x1"]) - figure.position.x1) > 1 + assert abs(float(objects.pdfplumber.figures[1]["y1"]) - figure.position.y1) > 1 # filter figures that are partially overlap with other figure, remove the smaller figure - assert objects.pdfplumber.figures[3]['x0'] < objects.pdfplumber.figures[5]['x0'] - assert objects.pdfplumber.figures[3]['y0'] < objects.pdfplumber.figures[5]['y0'] - assert objects.pdfplumber.figures[3]['x1'] < objects.pdfplumber.figures[5]['x1'] - assert objects.pdfplumber.figures[3]['y1'] < objects.pdfplumber.figures[5]['y1'] + assert objects.pdfplumber.figures[3]["x0"] < objects.pdfplumber.figures[5]["x0"] + assert objects.pdfplumber.figures[3]["y0"] < objects.pdfplumber.figures[5]["y0"] + assert objects.pdfplumber.figures[3]["x1"] < objects.pdfplumber.figures[5]["x1"] + assert objects.pdfplumber.figures[3]["y1"] < objects.pdfplumber.figures[5]["y1"] assert ( - objects.pdfplumber.figures[3]['width'] * objects.pdfplumber.figures[3]['height'] - < objects.pdfplumber.figures[5]['width'] * objects.pdfplumber.figures[5]['height'] + objects.pdfplumber.figures[3]["width"] * objects.pdfplumber.figures[3]["height"] + < objects.pdfplumber.figures[5]["width"] + * objects.pdfplumber.figures[5]["height"] ) # check that figure exists no more for figure in objects.flattened.figures: - assert abs(float(objects.pdfplumber.figures[3]['x0']) - figure.position.x0) > 1 - assert abs(float(objects.pdfplumber.figures[3]['y0']) - figure.position.y0) > 1 - assert abs(float(objects.pdfplumber.figures[3]['x1']) - figure.position.x1) > 1 - assert abs(float(objects.pdfplumber.figures[3]['y1']) - figure.position.y1) > 1 + assert abs(float(objects.pdfplumber.figures[3]["x0"]) - figure.position.x0) > 1 + assert abs(float(objects.pdfplumber.figures[3]["y0"]) - figure.position.y0) > 1 + assert abs(float(objects.pdfplumber.figures[3]["x1"]) - figure.position.x1) > 1 + assert abs(float(objects.pdfplumber.figures[3]["y1"]) - figure.position.y1) > 1 def test_remove_figures_in_header_footer(): @@ -84,14 +94,14 @@ def test_remove_figures_in_header_footer(): assert len(objects.flattened.figures) == 2 # on page 1, there are two figures, one is in header - assert objects.pdfplumber.figures[0]['page_number'] == 1 + assert objects.pdfplumber.figures[0]["page_number"] == 1 # figures[0] on page 1 is not in header - assert float(objects.pdfplumber.figures[0]['y0']) == 239.15 - assert float(objects.pdfplumber.figures[0]['y1']) == 382.85 + assert float(objects.pdfplumber.figures[0]["y0"]) == 239.15 + assert float(objects.pdfplumber.figures[0]["y1"]) == 382.85 # figures[1] on page 1 is in header - assert objects.pdfplumber.figures[1]['page_number'] == 1 - assert float(objects.pdfplumber.figures[1]['y0']) == 719.4 - assert float(objects.pdfplumber.figures[1]['y1']) == 754.05 + assert objects.pdfplumber.figures[1]["page_number"] == 1 + assert float(objects.pdfplumber.figures[1]["y0"]) == 719.4 + assert float(objects.pdfplumber.figures[1]["y1"]) == 754.05 # libpdf extract_figures removed that figure in header, only one figure left on page 1 assert objects.flattened.figures[0].position.page.number == 1 diff --git a/tests/test_full_features.py b/tests/test_full_features.py index c019891..8d9d108 100644 --- a/tests/test_full_features.py +++ b/tests/test_full_features.py @@ -3,12 +3,11 @@ import os import sys +import pytest + import libpdf from libpdf.models.figure import Figure from libpdf.models.table import Table - -import pytest - from tests.conftest import PDF_FULL_FEATURES, PDF_SMART_HEADER_FOOTER_DETECTION @@ -22,34 +21,34 @@ def test_chapters(load_full_features_pdf): # check chapter title # first 2 chapters have no number in PDF, so a virtual number is generated considering the chapter nesting - assert chapters[0].title == 'Disclaimer' - assert chapters[0].number == 'virt.1' - assert chapters[1].title == 'Content of table' - assert chapters[1].number == 'virt.1.1' + assert chapters[0].title == "Disclaimer" + assert chapters[0].number == "virt.1" + assert chapters[1].title == "Content of table" + assert chapters[1].number == "virt.1.1" # the following chapters have numbers - assert chapters[2].title == 'Introduction' - assert chapters[2].number == '1' - assert chapters[3].title == 'Chapter Useful' - assert chapters[3].number == '2' - assert chapters[4].title == 'Meaningful' - assert chapters[4].number == '2.1' - assert chapters[5].title == 'Funny' - assert chapters[5].number == '2.2' - assert chapters[6].title == 'Surprise' - assert chapters[6].number == '3' - assert chapters[7].title == 'Example' - assert chapters[7].number == 'A' + assert chapters[2].title == "Introduction" + assert chapters[2].number == "1" + assert chapters[3].title == "Chapter Useful" + assert chapters[3].number == "2" + assert chapters[4].title == "Meaningful" + assert chapters[4].number == "2.1" + assert chapters[5].title == "Funny" + assert chapters[5].number == "2.2" + assert chapters[6].title == "Surprise" + assert chapters[6].number == "3" + assert chapters[7].title == "Example" + assert chapters[7].number == "A" # check chapter unique id - assert chapters[0].uid == 'chapter.virt.1' - assert chapters[1].uid == 'chapter.virt.1/chapter.virt.1.1' - assert chapters[2].uid == 'chapter.1' - assert chapters[3].uid == 'chapter.2' - assert chapters[4].uid == 'chapter.2/chapter.2.1' - assert chapters[5].uid == 'chapter.2/chapter.2.2' - assert chapters[6].uid == 'chapter.3' - assert chapters[7].uid == 'chapter.A' + assert chapters[0].uid == "chapter.virt.1" + assert chapters[1].uid == "chapter.virt.1/chapter.virt.1.1" + assert chapters[2].uid == "chapter.1" + assert chapters[3].uid == "chapter.2" + assert chapters[4].uid == "chapter.2/chapter.2.1" + assert chapters[5].uid == "chapter.2/chapter.2.2" + assert chapters[6].uid == "chapter.3" + assert chapters[7].uid == "chapter.A" # check chapter headline position assert chapters[0].position.page.number == 1 @@ -60,9 +59,11 @@ def test_chapters(load_full_features_pdf): # check chapter content assert chapters[1].content is not None - assert chapters[1].content[0].type == 'paragraph' - assert chapters[1].content[0].textbox.text.startswith('libpdf allows the extraction') - assert chapters[1].content[0].textbox.text.endswith('Figure or Table.') + assert chapters[1].content[0].type == "paragraph" + assert ( + chapters[1].content[0].textbox.text.startswith("libpdf allows the extraction") + ) + assert chapters[1].content[0].textbox.text.endswith("Figure or Table.") assert len(chapters[1].content[0].textbox.lines) == 3 @@ -86,18 +87,23 @@ def test_tables(load_full_features_pdf): assert tables[1].position.y1 < 654 # check table content - assert tables[1].cells[0].textbox.text == 'some' + assert tables[1].cells[0].textbox.text == "some" assert tables[1].columns[0][0] == tables[1].cells[0] - assert tables[1].rows[2][1].textbox.text == 'Henry\ncavill' - assert tables[1].rows[6][4].textbox.text == '3' + assert tables[1].rows[2][1].textbox.text == "Henry\ncavill" + assert tables[1].rows[6][4].textbox.text == "3" # check table unique id - assert tables[0].uid == 'table.1' - assert tables[1].uid == 'chapter.3/table.1' + assert tables[0].uid == "table.1" + assert tables[1].uid == "chapter.3/table.1" -@pytest.mark.skipif(sys.platform.startswith('win'), reason='saving figures: ImageMagick not installed on Win') -@pytest.mark.parametrize('load_full_features_pdf', [True], indirect=True) # save figures +@pytest.mark.skipif( + sys.platform.startswith("win"), + reason="saving figures: ImageMagick not installed on Win", +) +@pytest.mark.parametrize( + "load_full_features_pdf", [True], indirect=True +) # save figures def test_figures(load_full_features_pdf): """Check if API extract all the figures.""" tmpdir_path, objects = load_full_features_pdf @@ -110,7 +116,7 @@ def test_figures(load_full_features_pdf): assert isinstance(figure, Figure) # check extracted figures stored location - output_dir = os.path.join(tmpdir_path, 'figures') + output_dir = os.path.join(tmpdir_path, "figures") assert os.path.exists(output_dir) assert os.path.isdir(output_dir) # check output directory is not empty @@ -131,10 +137,10 @@ def test_figures(load_full_features_pdf): assert figures[1].position.y1 < 755 # check figure unique id - assert figures[0].uid == 'figure.1' + assert figures[0].uid == "figure.1" # figure.2 is header figure - assert figures[1].uid == 'figure.2' - assert figures[2].uid == 'chapter.1/figure.1' + assert figures[1].uid == "figure.2" + assert figures[2].uid == "chapter.1/figure.1" def test_content_structure(load_full_features_pdf): @@ -148,26 +154,32 @@ def test_content_structure(load_full_features_pdf): assert len(root.content) == 14 # content before first chapter - assert root.content[0].type == 'paragraph' - assert root.content[1].type == 'figure' - assert root.content[4].type == 'table' - assert root.content[7].type == 'figure' + assert root.content[0].type == "paragraph" + assert root.content[1].type == "figure" + assert root.content[4].type == "table" + assert root.content[7].type == "figure" # chapter Useful contains two sub-chapters - assert root.content[11].title == 'Chapter Useful' - assert root.content[11].content[0].title == 'Meaningful' - assert root.content[11].content[1].title == 'Funny' + assert root.content[11].title == "Chapter Useful" + assert root.content[11].content[0].title == "Meaningful" + assert root.content[11].content[1].title == "Funny" # sub-chapter contains a list of paragraphs, tables and figures including header/footer assert len(root.content[11].content[0].content) == 8 - assert root.content[11].content[0].content[0].type == 'paragraph' - assert root.content[11].content[0].content[7].textbox.text == 'Release snyder cut of justice league!!!' + assert root.content[11].content[0].content[0].type == "paragraph" + assert ( + root.content[11].content[0].content[7].textbox.text + == "Release snyder cut of justice league!!!" + ) # check paragraph unique id - assert root.content[0].uid == 'paragraph.1' - assert root.content[10].content[1].uid == 'chapter.1/paragraph.2' - assert root.content[11].content[0].content[0].uid == 'chapter.2/chapter.2.1/paragraph.1' - assert root.content[13].content[0].uid == 'chapter.A/paragraph.1' + assert root.content[0].uid == "paragraph.1" + assert root.content[10].content[1].uid == "chapter.1/paragraph.2" + assert ( + root.content[11].content[0].content[0].uid + == "chapter.2/chapter.2.1/paragraph.1" + ) + assert root.content[13].content[0].uid == "chapter.A/paragraph.1" # check paragraphs amounts assert len(objects.flattened.paragraphs) == 48 @@ -181,21 +193,23 @@ def test_smart_header_footer_detection(): assert len(objects.flattened.figures) == 2 # on page 1 and page 2 only 1 figure left and header figure is removed assert objects.flattened.figures[0].position.page.number == 1 - assert objects.flattened.figures[0].uid == 'figure.1' + assert objects.flattened.figures[0].uid == "figure.1" assert objects.flattened.figures[0].position.x0 > 200 assert objects.flattened.figures[0].position.x1 < 392 assert objects.flattened.figures[0].position.y0 > 239 assert objects.flattened.figures[0].position.y1 < 383 assert objects.flattened.figures[1].position.page.number == 2 - assert objects.flattened.figures[1].uid == 'chapter.1/figure.1' + assert objects.flattened.figures[1].uid == "chapter.1/figure.1" assert len(objects.flattened.tables) == 2 # 10 paragraphs in header/footer assert len(objects.flattened.paragraphs) == 38 - assert objects.flattened.paragraphs[0].uid == 'paragraph.2' - assert objects.flattened.paragraphs[0].textbox.text.startswith('libpdf allows the extraction') + assert objects.flattened.paragraphs[0].uid == "paragraph.2" + assert objects.flattened.paragraphs[0].textbox.text.startswith( + "libpdf allows the extraction" + ) # Check smart header/footer detection for pdf without outline objects = libpdf.load(PDF_SMART_HEADER_FOOTER_DETECTION) @@ -205,62 +219,92 @@ def test_smart_header_footer_detection(): # header/footer and at similar location smart_objects = libpdf.load(PDF_SMART_HEADER_FOOTER_DETECTION, smart_page_crop=True) assert len(smart_objects.flattened.paragraphs) == 30 - assert smart_objects.flattened.paragraphs[0].textbox.text == '1. Chapter title for header' - assert smart_objects.flattened.paragraphs[12].textbox.text == '1. Chapter test for footer' - assert smart_objects.flattened.paragraphs[13].textbox.text == '2. Chapter title for header' - assert smart_objects.flattened.paragraphs[17].textbox.text == '2. Chapter test for footer' - assert smart_objects.flattened.paragraphs[18].textbox.text == '3. Chapter title for header' - assert smart_objects.flattened.paragraphs[23].textbox.text == '3. Chapter test for footer' - assert smart_objects.flattened.paragraphs[24].textbox.text == '4. Chapter title for header' - assert smart_objects.flattened.paragraphs[29].textbox.text == '4. Chapter test for footer' + assert ( + smart_objects.flattened.paragraphs[0].textbox.text + == "1. Chapter title for header" + ) + assert ( + smart_objects.flattened.paragraphs[12].textbox.text + == "1. Chapter test for footer" + ) + assert ( + smart_objects.flattened.paragraphs[13].textbox.text + == "2. Chapter title for header" + ) + assert ( + smart_objects.flattened.paragraphs[17].textbox.text + == "2. Chapter test for footer" + ) + assert ( + smart_objects.flattened.paragraphs[18].textbox.text + == "3. Chapter title for header" + ) + assert ( + smart_objects.flattened.paragraphs[23].textbox.text + == "3. Chapter test for footer" + ) + assert ( + smart_objects.flattened.paragraphs[24].textbox.text + == "4. Chapter title for header" + ) + assert ( + smart_objects.flattened.paragraphs[29].textbox.text + == "4. Chapter test for footer" + ) -@pytest.mark.skipif(sys.platform.startswith('win'), reason='visual debugging: ImageMagick not installed on Win') +@pytest.mark.skipif( + sys.platform.startswith("win"), + reason="visual debugging: ImageMagick not installed on Win", +) def test_visual_debug_include_elements(tmpdir): """Test visual debug include visualized elements.""" - visual_debug_output_dir = os.path.join(tmpdir, 'visual_debug_libpdf') + visual_debug_output_dir = os.path.join(tmpdir, "visual_debug_libpdf") libpdf.load( PDF_FULL_FEATURES, visual_debug=True, visual_debug_output_dir=visual_debug_output_dir, visual_split_elements=True, - visual_debug_include_elements=['chapter'], + visual_debug_include_elements=["chapter"], ) # check visual debug output directory assert os.path.exists(visual_debug_output_dir) assert os.path.isdir(visual_debug_output_dir) # check visual debug included elements directory exist - included_elements_dir = os.path.join(visual_debug_output_dir, 'chapter') + included_elements_dir = os.path.join(visual_debug_output_dir, "chapter") assert os.path.exists(included_elements_dir) assert os.path.isdir(included_elements_dir) # check only one visual debug element directory assert len(os.listdir(visual_debug_output_dir)) == 1 -@pytest.mark.skipif(sys.platform.startswith('win'), reason='visual debugging: ImageMagick not installed on Win') +@pytest.mark.skipif( + sys.platform.startswith("win"), + reason="visual debugging: ImageMagick not installed on Win", +) def test_visual_debug_exclude_elements(tmpdir): """Test visual debug exclude visualized elements.""" - visual_debug_output_dir = os.path.join(tmpdir, 'visual_debug_libpdf') + visual_debug_output_dir = os.path.join(tmpdir, "visual_debug_libpdf") libpdf.load( PDF_FULL_FEATURES, visual_debug=True, visual_debug_output_dir=visual_debug_output_dir, visual_split_elements=True, - visual_debug_exclude_elements=['chapter', 'figure'], + visual_debug_exclude_elements=["chapter", "figure"], ) # check visual excluded elements directory not exist - excluded_elements_figure_dir = os.path.join(visual_debug_output_dir, 'figure') + excluded_elements_figure_dir = os.path.join(visual_debug_output_dir, "figure") assert not os.path.exists(excluded_elements_figure_dir) - excluded_elements_chapter_dir = os.path.join(visual_debug_output_dir, 'chapter') + excluded_elements_chapter_dir = os.path.join(visual_debug_output_dir, "chapter") assert not os.path.exists(excluded_elements_chapter_dir) # check visual debug visualized elements directory paragraph and table exist - included_elements_paragraph_dir = os.path.join(visual_debug_output_dir, 'paragraph') + included_elements_paragraph_dir = os.path.join(visual_debug_output_dir, "paragraph") assert os.path.exists(included_elements_paragraph_dir) assert os.path.isdir(included_elements_paragraph_dir) - included_elements_table_dir = os.path.join(visual_debug_output_dir, 'table') + included_elements_table_dir = os.path.join(visual_debug_output_dir, "table") assert os.path.exists(included_elements_table_dir) assert os.path.isdir(included_elements_table_dir) diff --git a/tests/test_tables.py b/tests/test_tables.py index 8416945..bc19a45 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -1,7 +1,6 @@ """Test tables extraction.""" import libpdf - from tests.conftest import PDF_LOREM_IPSUM @@ -12,17 +11,17 @@ def test_table_cells_words(): # check table 1 on page 1 table_1 = objects.flattened.tables[0] - assert table_1.uid == 'table.1' - assert table_1.position.page.id_ == 'page.1' + assert table_1.uid == "table.1" + assert table_1.position.page.id_ == "page.1" # check table 1 cell(1, 1) cell_1_1 = table_1.cells[0] assert cell_1_1.row == 1 assert cell_1_1.col == 1 - assert cell_1_1.textbox.text == 'Tempora co\nVoluptatem' + assert cell_1_1.textbox.text == "Tempora co\nVoluptatem" # check table 1 cell(3, 5) cell_3_5 = table_1.cells[14] assert cell_3_5.row == 3 assert cell_3_5.col == 5 - assert cell_3_5.textbox.text == 'Eius quaer Etincidunt' + assert cell_3_5.textbox.text == "Eius quaer Etincidunt"