Implement a mostly pdfminer.six-compatible layout analyzer #1

Merged 9 commits on Dec 30, 2024
5 changes: 5 additions & 0 deletions .flake8
@@ -0,0 +1,5 @@
[flake8]
max-line-length = 88
extend-ignore =
# See https://github.com/PyCQA/pycodestyle/issues/373
E203,
107 changes: 101 additions & 6 deletions README.md
@@ -15,17 +15,112 @@ data structures and algorithms.

There will be dependencies. Oh, there will be dependencies.

## Table of Contents

- [Installation](#installation)
- [License](#license)

## Installation

```console
pip install paves
```

## Workin' in a PDF mine

`pdfminer.six` is widely used for text extraction and layout analysis
due to its liberal licensing terms. Unfortunately it is quite slow
and contains many bugs. Now you can use PAVÉS instead:

```python
from paves.miner import extract, LAParams

laparams = LAParams()
for page in extract(path, laparams):
    ...  # do something
```

By default this will use all of your CPUs, which makes large documents
go fast and small ones somewhat slower. You can disable parallelism by
passing `max_workers=1`, or use only some of your CPUs by passing some
other value.

Even with a single CPU, it is often faster than `pdfminer.six`.
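For illustration only (this is a stdlib sketch, not PAVÉS internals): the `max_workers` parameter behaves like the familiar pool-map pattern, with a stand-in `analyze_page` function in place of real layout analysis.

```python
from concurrent.futures import ThreadPoolExecutor


def analyze_page(page_number: int) -> str:
    # Stand-in for real per-page layout analysis.
    return f"page {page_number} analyzed"


# max_workers=1 makes the pool effectively serial, mirroring
# extract(path, max_workers=1); larger values fan pages out to workers.
with ThreadPoolExecutor(max_workers=2) as pool:
    results = list(pool.map(analyze_page, range(3)))
# map() preserves page order regardless of worker count
```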

There are a few differences with `pdfminer.six` (some might call them
bug fixes):

- By default, if you do not pass the `laparams` argument to `extract`,
no layout analysis at all is done. This is different from
`extract_pages` in `pdfminer.six` which will set some default
parameters for you. If you don't see any `LTTextBox` items in your
`LTPage` then this is why!
- Rectangles are recognized correctly in some cases where
`pdfminer.six` thought they were "curves".
- Colours and colour spaces are the PLAYA versions, which do not
correspond to what `pdfminer.six` gives you, because what
`pdfminer.six` gives you is not useful and often wrong.
- You have access to the marked content section ID and tag in every
`LTComponent`, as the `mcs` attribute (but this is often `None` in
the case where there is no MCS, or there are multiple MCSes)
- Bounding boxes of rotated glyphs are the actual bounding box.

Probably more... but you didn't use any of that stuff anyway, you just
wanted to get `LTTextBoxes` to feed to your hallucination factories.

There is also a bug:

- The `size` of rotated glyphs is incorrect (this will get fixed in
PLAYA soon hopefully).

## PLAYA Bears

[PLAYA](https://github.com/dhdaines/playa) has a nice "lazy" API which
is efficient but does take a bit of work to use. If, on the other
hand, **you** are lazy, then you can use `paves.bears`, which will
flatten everything for you into a friendly dictionary representation
(a
[`TypedDict`](https://typing.readthedocs.io/en/latest/spec/typeddict.html#typeddict),
to be precise) that looks a lot like what `pdfplumber` gives you,
except possibly in a different coordinate space, as defined [in the PLAYA
documentation](https://github.com/dhdaines/playa#an-important-note-about-coordinate-spaces).
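To make the shape of those dictionaries concrete, here is a simplified sketch; the field names below are a hypothetical subset, and the real `LayoutDict` in PLAYA defines many more fields.

```python
from typing import Optional, TypedDict


# Hypothetical subset of the flat rows paves.bears yields;
# the real LayoutDict has many more fields.
class Row(TypedDict):
    object_type: str
    x0: float
    y0: float
    x1: float
    y1: float
    text: Optional[str]


row: Row = {
    "object_type": "char",
    "x0": 72.0,
    "y0": 700.0,
    "x1": 78.0,
    "y1": 712.0,
    "text": "H",
}
```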

```python
from paves.bears import extract

for dic in extract(path):
    print(f"it is a {dic['object_type']} at ({dic['x0']}, {dic['y0']})")
    print(f"  the color is {dic['stroking_color']}")
    print(f"  the text is {dic['text']}")
    print(f"  it is in MCS {dic['mcid']} which is a {dic['tag']}")
    print(f"  it is also in Form XObject {dic['xobjid']}")
```

This can be used to do machine learning of various sorts. For
instance, you can write the extracted rows to a CSV file:

```python
from csv import DictWriter

from paves.bears import FIELDNAMES, extract

writer = DictWriter(outfh, fieldnames=FIELDNAMES)
writer.writeheader()
for dic in extract(path):
    writer.writerow(dic)
```
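The same `DictWriter` pattern can be tried without PAVÉS installed; this self-contained sketch uses stand-in field names and rows (hypothetical values, not the real `FIELDNAMES` or `extract()` output).

```python
import csv
import io

# Stand-ins for paves.bears.FIELDNAMES and the rows extract() would yield.
fieldnames = ["object_type", "x0", "y0", "x1", "y1", "text"]
rows = [
    {"object_type": "char", "x0": 72.0, "y0": 700.0,
     "x1": 78.0, "y1": 712.0, "text": "H"},
]

buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(rows)
```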

You can also create a Pandas DataFrame:

```python
import pandas

from paves.bears import extract

df = pandas.DataFrame.from_records(extract(path))
```

or a Polars DataFrame or LazyFrame:

```python
import polars

from paves.bears import SCHEMA, extract

df = polars.DataFrame(extract(path), schema=SCHEMA)
```

As above, this will use all of your CPUs and return a possibly quite
large object.

## License

`PAVÉS` is distributed under the terms of the
[MIT](https://spdx.org/licenses/MIT.html) license.
42 changes: 42 additions & 0 deletions benchmarks/miner.py
@@ -0,0 +1,42 @@
"""Benchmark pdfminer.six against PAVÉS"""

import time
from typing import Union
from pdfminer.high_level import extract_pages
from paves.miner import extract, LAParams
from pathlib import Path


def benchmark_single(path: Path):
    for page in extract_pages(path):
        pass


def benchmark_multi(path: Path, ncpu: Union[int, None]):
    for page in extract(path, laparams=LAParams(), max_workers=ncpu):
        pass


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-n", "--ncpu", type=int, default=None)
    parser.add_argument("pdf", type=Path)
    args = parser.parse_args()

    start = time.time()
    benchmark_multi(args.pdf, args.ncpu)
    multi_time = time.time() - start
    print(
        "PAVÉS (%r CPUs) took %.2fs"
        % (
            "all" if args.ncpu is None else args.ncpu,
            multi_time,
        )
    )

    start = time.time()
    benchmark_single(args.pdf)
    single_time = time.time() - start
    print("pdfminer.six (single) took %.2fs" % (single_time,))
12 changes: 11 additions & 1 deletion pyproject.toml
@@ -25,7 +25,7 @@ classifiers = [
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
    "playa-pdf >= 0.2.6, < 0.3"  # not considered harmful as we depend on internals
]

[project.urls]
@@ -44,6 +44,16 @@ exclude = [
"/.github",
"/tests/contrib",
]

[tool.hatch.envs.hatch-test]
extra-dependencies = [ "pdfminer.six", "pandas", "polars-lts-cpu" ]

[tool.hatch.envs.default]
dependencies = [ "pytest", "mypy", "pdfminer.six", "pandas", "polars-lts-cpu" ]

[tool.hatch.envs.hatch-static-analysis]
config-path = "none" # Disable hatch's unreasonable ruff defaults

[tool.hatch.envs.types]
extra-dependencies = [
"mypy>=1.0.0",
50 changes: 43 additions & 7 deletions src/paves/playa.py → src/paves/bears.py
@@ -1,15 +1,19 @@
"""Reimplementation of PLAYA 0.2 `page.layout` in a more appropriate location.

Creates dictionaries appropriate for feeding to bears of different
sorts (pandas or polars, your choice).
"""

from functools import singledispatch
from multiprocessing.context import BaseContext
from pathlib import Path
import logging
import multiprocessing

from typing import cast, Iterator, List, Union
from playa.page import (
Page,
ContentObject,
LayoutDict,
PathObject,
ImageObject,
TextObject,
@@ -23,6 +27,8 @@
mult_matrix,
translate_matrix,
)
import playa
from playa import DeviceSpace, LayoutDict, fieldnames as FIELDNAMES, schema as SCHEMA # noqa: F401

LOG = logging.getLogger(__name__)

@@ -68,11 +74,13 @@ def make_path(
        non_stroking_colorspace=obj.gstate.ncs.name,
        non_stroking_color=obj.gstate.ncolor.values,
        non_stroking_pattern=obj.gstate.ncolor.pattern,
        page_index=0,
        page_label="0",
    )


@process_object.register
def _(obj: PathObject) -> Iterator[LayoutDict]:
    for path in obj:
        ops = []
        pts: List[Point] = []
@@ -176,6 +184,8 @@ def _(obj: ImageObject) -> Iterator[LayoutDict]:
        bits=obj.bits,
        image_colorspace=obj.colorspace,
        stream=stream_id,
        page_index=0,
        page_label="0",
    )


@@ -225,6 +235,8 @@ def _(obj: TextObject) -> Iterator[LayoutDict]:
        non_stroking_pattern=gstate.ncolor.pattern,
        mcid=None if obj.mcs is None else obj.mcs.mcid,
        tag=None if obj.mcs is None else obj.mcs.tag,
        page_index=0,
        page_label="0",
    )


@@ -236,7 +248,31 @@ def _(obj: XObjectObject) -> Iterator[LayoutDict]:
        yield layout


def extract_page(page: Page) -> List[LayoutDict]:
    """Extract LayoutDict items from a Page."""
    page_layout = []
    for obj in page:
        for dic in process_object(obj):
            dic = cast(LayoutDict, dic)  # ugh
            dic["page_index"] = page.page_idx
            dic["page_label"] = page.label
            page_layout.append(dic)
    return page_layout


def extract(
    path: Path,
    space: DeviceSpace = "screen",
    max_workers: Union[int, None] = None,
    mp_context: Union[BaseContext, None] = None,
) -> Iterator[LayoutDict]:
    """Extract LayoutDict items from a document."""
    if max_workers is None:
        max_workers = multiprocessing.cpu_count()
    with playa.open(
        path,
        max_workers=max_workers,
        mp_context=mp_context,
    ) as pdf:
        for page in pdf.pages.map(extract_page):
            yield from page