Skip to content

Commit

Permalink
[IMP]: add support for file_bytes argument with managed_file_context()
Browse files Browse the repository at this point in the history
  • Loading branch information
cscanlin-kwh authored and bosd committed Sep 19, 2024
1 parent e3c1115 commit bdc2ae7
Show file tree
Hide file tree
Showing 5 changed files with 185 additions and 91 deletions.
172 changes: 112 additions & 60 deletions camelot/handlers.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,72 @@
import io
import multiprocessing as mp
import os
import sys
from contextlib import contextmanager
from pathlib import Path
from typing import IO
from typing import Any
from typing import TypeVar
from typing import Union

from pypdf import PdfReader
from pypdf import PdfWriter
from pypdf._utils import StrByteType

from .core import TableList
from .parsers import Lattice
from .parsers import Stream
from .utils import InvalidArguments
from .utils import TemporaryDirectory
from .utils import download_url
from .utils import get_page_layout
from .utils import get_rotation
from .utils import get_text_objects
from .utils import get_url_bytes
from .utils import is_url


FilePathType = TypeVar("FilePathType", str, IO[Any], Path, None)


class PDFHandler:
"""Handles all operations like temp directory creation, splitting
file into single page PDFs, parsing each PDF and then removing the
temp directory.
Parameters
----------
filepath : str
Filepath or URL of the PDF file.
filepath : str | pathlib.Path, optional (default: None)
Filepath or URL of the PDF file. Required if file_bytes is not given.
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
password : str, optional (default: None)
Password for decryption.
file_bytes : io.IOBase, optional (default: None)
A file-like stream. Required if filepath is not given.
"""

def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None):
def __init__(
self, filepath: FilePathType = None, pages="1", password=None, file_bytes=None
):
if is_url(filepath):
filepath = download_url(filepath)
self.filepath: Union[StrByteType, Path] = filepath
file_bytes = get_url_bytes(filepath)

if not filepath and not file_bytes:
raise InvalidArguments("Either `filepath` or `file_bytes` is required")
if not filepath:
# filepath must either be passed, or taken from the name attribute
try:
filepath = getattr(file_bytes, "name")
except AttributeError:
msg = (
"Either pass a `filepath`, or give the "
"`file_bytes` argument a name attribute"
)
raise InvalidArguments(msg)
self.file_bytes = file_bytes # ok to be None

self.filepath = filepath
if isinstance(filepath, str) and not filepath.lower().endswith(".pdf"):
raise NotImplementedError("File format not supported")

Expand All @@ -52,13 +78,35 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None)
self.password = self.password.encode("ascii")
self.pages = self._get_pages(pages)

@contextmanager
def managed_file_context(self):
    """Yield a readable, seekable binary stream for this handler's PDF.

    Reads from ``self.file_bytes`` when it is set, otherwise opens
    ``self.filepath`` in binary mode.

    A stream that reports itself as non-seekable is read in full into an
    ``io.BytesIO``, which then replaces ``self.file_bytes`` so that later
    calls can rewind and re-read the same data. A stream taken from
    ``self.file_bytes`` is rewound to offset 0 before it is yielded and is
    left open on exit; only a handle opened here from ``self.filepath`` is
    closed on exit or error.

    Yields
    ------
    file_bytes : io.IOBase
        A readable, seekable, file-like object.
    """
    if self.file_bytes:
        stream = self.file_bytes
        # Every io.IOBase object has a ``seek`` attribute, even pipes and
        # sockets where calling it raises io.UnsupportedOperation, so the
        # attribute alone proves nothing. Ask ``seekable()`` when it is
        # available; duck-typed objects without it fall back to the
        # presence of ``seek``.
        seekable = getattr(stream, "seekable", None)
        if callable(seekable):
            can_seek = seekable()
        else:
            can_seek = hasattr(stream, "seek")
        if not can_seek:
            # Buffer the whole stream so it can be rewound on every call.
            stream = self.file_bytes = io.BytesIO(stream.read())
        stream.seek(0)
        yield stream
    else:
        with open(self.filepath, "rb") as file_bytes:
            yield file_bytes

def _get_pages(self, pages):
"""Converts pages string to list of ints.
Parameters
----------
filepath : str
Filepath or URL of the PDF file.
managed_file_context : io.IOBase
A readable, seekable, file-like object
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Expand All @@ -74,82 +122,85 @@ def _get_pages(self, pages):
if pages == "1":
page_numbers.append({"start": 1, "end": 1})
else:
infile = PdfReader(self.filepath, strict=False)
with self.managed_file_context() as f:
infile = PdfReader(f, strict=False)

if infile.is_encrypted:
infile.decrypt(self.password)
if infile.is_encrypted:
infile.decrypt(self.password)

if pages == "all":
page_numbers.append({"start": 1, "end": len(infile.pages)})
else:
for r in pages.split(","):
if "-" in r:
a, b = r.split("-")
if b == "end":
b = len(infile.pages)
page_numbers.append({"start": int(a), "end": int(b)})
else:
page_numbers.append({"start": int(r), "end": int(r)})
if pages == "all":
page_numbers.append({"start": 1, "end": len(infile.pages)})
else:
for r in pages.split(","):
if "-" in r:
a, b = r.split("-")
if b == "end":
b = len(infile.pages)
page_numbers.append({"start": int(a), "end": int(b)})
else:
page_numbers.append({"start": int(r), "end": int(r)})

result = []
for p in page_numbers:
result.extend(range(p["start"], p["end"] + 1))
return sorted(set(result))

def _save_page(self, filepath: Union[StrByteType, Path], page, temp):
def _save_page(self, filepath: FilePathType, page, temp):
"""Saves specified page from PDF into a temporary directory.
Parameters
----------
filepath : str
Filepath or URL of the PDF file.
managed_file_context : io.IOBase
A readable, seekable, file-like object
page : int
Page number.
temp : str
Tmp directory.
"""
infile = PdfReader(filepath, strict=False)
if infile.is_encrypted:
infile.decrypt(self.password)
fpath = os.path.join(temp, f"page-{page}.pdf")
froot, fext = os.path.splitext(fpath)
p = infile.pages[page - 1]
outfile = PdfWriter()
outfile.add_page(p)
with open(fpath, "wb") as f:
outfile.write(f)
layout, dim = get_page_layout(fpath)
# fix rotated PDF
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "":
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
os.rename(fpath, fpath_new)
instream = open(fpath_new, "rb")
infile = PdfReader(instream, strict=False)

with self.managed_file_context() as fileobj:
infile = PdfReader(fileobj, strict=False)
if infile.is_encrypted:
infile.decrypt(self.password)
fpath = os.path.join(temp, f"page-{page}.pdf")
froot, fext = os.path.splitext(fpath)
p = infile.pages[page - 1]
outfile = PdfWriter()
p = infile.pages[0]
if rotation == "anticlockwise":
p.rotate(90)
elif rotation == "clockwise":
p.rotate(-90)
outfile.add_page(p)
with open(fpath, "wb") as f:
outfile.write(f)
instream.close()
layout, dim = get_page_layout(fpath)
# fix rotated PDF
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "":
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
os.rename(fpath, fpath_new)
instream = open(fpath_new, "rb")
infile = PdfReader(instream, strict=False)
if infile.is_encrypted:
infile.decrypt(self.password)
outfile = PdfWriter()
p = infile.pages[0]
if rotation == "anticlockwise":
p.rotate(90)
elif rotation == "clockwise":
p.rotate(-90)
outfile.add_page(p)
with open(fpath, "wb") as f:
outfile.write(f)
instream.close()

def parse(
self,
flavor="lattice",
suppress_stdout=False,
parallel=False,
layout_kwargs=None,
**kwargs
**kwargs,
):
"""Extracts tables by calling parser.get_tables on all single
page PDFs.
Expand Down Expand Up @@ -189,7 +240,8 @@ def parse(
jobs = []
for p in self.pages:
j = pool.apply_async(
self._parse_page,(p, tempdir, parser, suppress_stdout, layout_kwargs)
self._parse_page,
(p, tempdir, parser, suppress_stdout, layout_kwargs),
)
jobs.append(j)

Expand All @@ -198,14 +250,14 @@ def parse(
tables.extend(t)
else:
for p in self.pages:
t = self._parse_page(p, tempdir, parser, suppress_stdout, layout_kwargs)
t = self._parse_page(
p, tempdir, parser, suppress_stdout, layout_kwargs
)
tables.extend(t)

return TableList(sorted(tables))

def _parse_page(
self, page, tempdir, parser, suppress_stdout, layout_kwargs
):
def _parse_page(self, page, tempdir, parser, suppress_stdout, layout_kwargs):
"""Extracts tables by calling parser.get_tables on a single
page PDF.
Expand All @@ -224,7 +276,7 @@ def _parse_page(
-------
tables : camelot.core.TableList
List of tables found in PDF.
"""
self._save_page(self.filepath, page, tempdir)
page_path = os.path.join(tempdir, f"page-{page}.pdf")
Expand Down
25 changes: 16 additions & 9 deletions camelot/io.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,24 @@
import warnings
from pathlib import Path
from typing import Union

from pypdf._utils import StrByteType
from .handlers import PDFHandler, FilePathType

from .handlers import PDFHandler
from .utils import remove_extra
from .utils import validate_input
from .utils import (
InvalidArguments,
validate_input,
remove_extra,
)


def read_pdf(
filepath: Union[StrByteType, Path],
filepath: FilePathType = None,
pages="1",
password=None,
flavor="lattice",
suppress_stdout=False,
parallel=False,
layout_kwargs=None,
file_bytes=None,
**kwargs
):
"""Read PDF and return extracted tables.
Expand All @@ -26,8 +28,8 @@ def read_pdf(
Parameters
----------
filepath : str, Path, IO
Filepath or URL of the PDF file.
filepath : str | pathlib.Path, optional (default: None)
Filepath or URL of the PDF file. Required if file_bytes is not given.
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Expand All @@ -40,6 +42,8 @@ def read_pdf(
Print all logs and warnings.
parallel : bool, optional (default: False)
Process pages in parallel using all available cpu cores.
file_bytes : io.IOBase, optional (default: None)
A file-like stream. Required if filepath is not given.
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams
<https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
Expand Down Expand Up @@ -115,12 +119,15 @@ def read_pdf(
"Unknown flavor specified." " Use either 'lattice' or 'stream'"
)

if not filepath and not file_bytes:
raise InvalidArguments('Either `filepath` or `file_bytes` is required')

with warnings.catch_warnings():
if suppress_stdout:
warnings.simplefilter("ignore")

validate_input(kwargs, flavor=flavor)
p = PDFHandler(filepath, pages=pages, password=password)
p = PDFHandler(filepath, pages=pages, password=password, file_bytes=file_bytes)
kwargs = remove_extra(kwargs, flavor=flavor)
tables = p.parse(
flavor=flavor,
Expand Down
Loading

0 comments on commit bdc2ae7

Please sign in to comment.