From bdc2ae7b8a69423ca8954d74903e43ce02552637 Mon Sep 17 00:00:00 2001 From: cscanlin Date: Fri, 8 Oct 2021 00:13:00 -0700 Subject: [PATCH] [IMP]: add support for file_bytes argument with managed_file_context() --- camelot/handlers.py | 172 ++++++++++++++++++++++++++++--------------- camelot/io.py | 25 ++++--- camelot/utils.py | 40 +++++----- tests/test_common.py | 21 +++++- tests/test_errors.py | 18 ++++- 5 files changed, 185 insertions(+), 91 deletions(-) diff --git a/camelot/handlers.py b/camelot/handlers.py index 74ddde7a..cebb0c8d 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -1,24 +1,32 @@ +import io import multiprocessing as mp import os import sys +from contextlib import contextmanager from pathlib import Path +from typing import IO +from typing import Any +from typing import TypeVar from typing import Union from pypdf import PdfReader from pypdf import PdfWriter -from pypdf._utils import StrByteType from .core import TableList from .parsers import Lattice from .parsers import Stream +from .utils import InvalidArguments from .utils import TemporaryDirectory -from .utils import download_url from .utils import get_page_layout from .utils import get_rotation from .utils import get_text_objects +from .utils import get_url_bytes from .utils import is_url +FilePathType = TypeVar("FilePathType", str, IO[Any], Path, None) + + class PDFHandler: """Handles all operations like temp directory creation, splitting file into single page PDFs, parsing each PDF and then removing the @@ -26,21 +34,39 @@ class PDFHandler: Parameters ---------- - filepath : str - Filepath or URL of the PDF file. + filepath : str | pathlib.Path, optional (default: None) + Filepath or URL of the PDF file. Required if file_bytes is not given pages : str, optional (default: '1') Comma-separated page numbers. Example: '1,3,4' or '1,4-end' or 'all'. password : str, optional (default: None) Password for decryption. 
+ file_bytes : io.IOBase, optional (default: None) + A file-like stream. Required if filepath is not given """ - def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None): + def __init__( + self, filepath: FilePathType = None, pages="1", password=None, file_bytes=None + ): if is_url(filepath): - filepath = download_url(filepath) - self.filepath: Union[StrByteType, Path] = filepath + file_bytes = get_url_bytes(filepath) + + if not filepath and not file_bytes: + raise InvalidArguments("Either `filepath` or `file_bytes` is required") + if not filepath: + # filepath must either be passed, or taken from the name attribute + try: + filepath = getattr(file_bytes, "name") + except AttributeError: + msg = ( + "Either pass a `filepath`, or give the " + "`file_bytes` argument a name attribute" + ) + raise InvalidArguments(msg) + self.file_bytes = file_bytes # ok to be None + self.filepath = filepath if isinstance(filepath, str) and not filepath.lower().endswith(".pdf"): raise NotImplementedError("File format not supported") @@ -52,13 +78,35 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None) self.password = self.password.encode("ascii") self.pages = self._get_pages(pages) + @contextmanager + def managed_file_context(self): + """Reads from either the `filepath` or `file_bytes` + attribute of this instance, to return a file-like object. + Closes any open file handles on exit or error. + + Returns + ------- + file_bytes : io.IOBase + A readable, seekable, file-like object + """ + if self.file_bytes: + # if we can't seek, write to a BytesIO object that can, + # then seek to the beginning before yielding + if not hasattr(self.file_bytes, "seek"): + self.file_bytes = io.BytesIO(self.file_bytes.read()) + self.file_bytes.seek(0) + yield self.file_bytes + else: + with open(self.filepath, "rb") as file_bytes: + yield file_bytes + def _get_pages(self, pages): """Converts pages string to list of ints. 
Parameters ---------- - filepath : str - Filepath or URL of the PDF file. + managed_file_context : io.IOBase + A readable, seekable, file-like object pages : str, optional (default: '1') Comma-separated page numbers. Example: '1,3,4' or '1,4-end' or 'all'. @@ -74,74 +122,77 @@ def _get_pages(self, pages): if pages == "1": page_numbers.append({"start": 1, "end": 1}) else: - infile = PdfReader(self.filepath, strict=False) + with self.managed_file_context() as f: + infile = PdfReader(f, strict=False) - if infile.is_encrypted: - infile.decrypt(self.password) + if infile.is_encrypted: + infile.decrypt(self.password) - if pages == "all": - page_numbers.append({"start": 1, "end": len(infile.pages)}) - else: - for r in pages.split(","): - if "-" in r: - a, b = r.split("-") - if b == "end": - b = len(infile.pages) - page_numbers.append({"start": int(a), "end": int(b)}) - else: - page_numbers.append({"start": int(r), "end": int(r)}) + if pages == "all": + page_numbers.append({"start": 1, "end": len(infile.pages)}) + else: + for r in pages.split(","): + if "-" in r: + a, b = r.split("-") + if b == "end": + b = len(infile.pages) + page_numbers.append({"start": int(a), "end": int(b)}) + else: + page_numbers.append({"start": int(r), "end": int(r)}) result = [] for p in page_numbers: result.extend(range(p["start"], p["end"] + 1)) return sorted(set(result)) - def _save_page(self, filepath: Union[StrByteType, Path], page, temp): + def _save_page(self, filepath: FilePathType, page, temp): """Saves specified page from PDF into a temporary directory. Parameters ---------- - filepath : str - Filepath or URL of the PDF file. + managed_file_context : io.IOBase + A readable, seekable, file-like object page : int Page number. temp : str Tmp directory. 
""" - infile = PdfReader(filepath, strict=False) - if infile.is_encrypted: - infile.decrypt(self.password) - fpath = os.path.join(temp, f"page-{page}.pdf") - froot, fext = os.path.splitext(fpath) - p = infile.pages[page - 1] - outfile = PdfWriter() - outfile.add_page(p) - with open(fpath, "wb") as f: - outfile.write(f) - layout, dim = get_page_layout(fpath) - # fix rotated PDF - chars = get_text_objects(layout, ltype="char") - horizontal_text = get_text_objects(layout, ltype="horizontal_text") - vertical_text = get_text_objects(layout, ltype="vertical_text") - rotation = get_rotation(chars, horizontal_text, vertical_text) - if rotation != "": - fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext]) - os.rename(fpath, fpath_new) - instream = open(fpath_new, "rb") - infile = PdfReader(instream, strict=False) + + with self.managed_file_context() as fileobj: + infile = PdfReader(fileobj, strict=False) if infile.is_encrypted: infile.decrypt(self.password) + fpath = os.path.join(temp, f"page-{page}.pdf") + froot, fext = os.path.splitext(fpath) + p = infile.pages[page - 1] outfile = PdfWriter() - p = infile.pages[0] - if rotation == "anticlockwise": - p.rotate(90) - elif rotation == "clockwise": - p.rotate(-90) outfile.add_page(p) with open(fpath, "wb") as f: outfile.write(f) - instream.close() + layout, dim = get_page_layout(fpath) + # fix rotated PDF + chars = get_text_objects(layout, ltype="char") + horizontal_text = get_text_objects(layout, ltype="horizontal_text") + vertical_text = get_text_objects(layout, ltype="vertical_text") + rotation = get_rotation(chars, horizontal_text, vertical_text) + if rotation != "": + fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext]) + os.rename(fpath, fpath_new) + instream = open(fpath_new, "rb") + infile = PdfReader(instream, strict=False) + if infile.is_encrypted: + infile.decrypt(self.password) + outfile = PdfWriter() + p = infile.pages[0] + if rotation == "anticlockwise": + p.rotate(90) + elif 
rotation == "clockwise": + p.rotate(-90) + outfile.add_page(p) + with open(fpath, "wb") as f: + outfile.write(f) + instream.close() def parse( self, @@ -149,7 +200,7 @@ def parse( suppress_stdout=False, parallel=False, layout_kwargs=None, - **kwargs + **kwargs, ): """Extracts tables by calling parser.get_tables on all single page PDFs. @@ -189,7 +240,8 @@ def parse( jobs = [] for p in self.pages: j = pool.apply_async( - self._parse_page,(p, tempdir, parser, suppress_stdout, layout_kwargs) + self._parse_page, + (p, tempdir, parser, suppress_stdout, layout_kwargs), ) jobs.append(j) @@ -198,14 +250,14 @@ def parse( tables.extend(t) else: for p in self.pages: - t = self._parse_page(p, tempdir, parser, suppress_stdout, layout_kwargs) + t = self._parse_page( + p, tempdir, parser, suppress_stdout, layout_kwargs + ) tables.extend(t) return TableList(sorted(tables)) - def _parse_page( - self, page, tempdir, parser, suppress_stdout, layout_kwargs - ): + def _parse_page(self, page, tempdir, parser, suppress_stdout, layout_kwargs): """Extracts tables by calling parser.get_tables on a single page PDF. @@ -224,7 +276,7 @@ def _parse_page( ------- tables : camelot.core.TableList List of tables found in PDF. 
- + """ self._save_page(self.filepath, page, tempdir) page_path = os.path.join(tempdir, f"page-{page}.pdf") diff --git a/camelot/io.py b/camelot/io.py index 12718828..9c5e6f62 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -1,22 +1,24 @@ import warnings from pathlib import Path -from typing import Union -from pypdf._utils import StrByteType +from .handlers import PDFHandler, FilePathType -from .handlers import PDFHandler -from .utils import remove_extra -from .utils import validate_input +from .utils import ( + InvalidArguments, + validate_input, + remove_extra, +) def read_pdf( - filepath: Union[StrByteType, Path], + filepath: FilePathType = None, pages="1", password=None, flavor="lattice", suppress_stdout=False, parallel=False, layout_kwargs=None, + file_bytes=None, **kwargs ): """Read PDF and return extracted tables. @@ -26,8 +28,8 @@ def read_pdf( Parameters ---------- - filepath : str, Path, IO - Filepath or URL of the PDF file. + filepath : str | pathlib.Path, optional (default: None) + Filepath or URL of the PDF file. Required if file_bytes is not given pages : str, optional (default: '1') Comma-separated page numbers. Example: '1,3,4' or '1,4-end' or 'all'. @@ -40,6 +42,8 @@ def read_pdf( Print all logs and warnings. parallel : bool, optional (default: False) Process pages in parallel using all available cpu cores. + file_bytes : io.IOBase, optional (default: None) + A file-like stream. Required if filepath is not given layout_kwargs : dict, optional (default: {}) A dict of `pdfminer.layout.LAParams `_ kwargs. @@ -115,12 +119,15 @@ def read_pdf( "Unknown flavor specified." 
" Use either 'lattice' or 'stream'" ) + if not filepath and not file_bytes: + raise InvalidArguments('Either `filepath` or `file_bytes` is required') + with warnings.catch_warnings(): if suppress_stdout: warnings.simplefilter("ignore") validate_input(kwargs, flavor=flavor) - p = PDFHandler(filepath, pages=pages, password=password) + p = PDFHandler(filepath, pages=pages, password=password, file_bytes=file_bytes) kwargs = remove_extra(kwargs, flavor=flavor) tables = p.parse( flavor=flavor, diff --git a/camelot/utils.py b/camelot/utils.py index fda56f54..aca25281 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -1,4 +1,4 @@ -import os +import io import random import re import shutil @@ -34,6 +34,10 @@ _VALID_URLS.discard("") +class InvalidArguments(Exception): + pass + + # https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py def is_url(url): """Check to see if a URL has a valid protocol. @@ -64,8 +68,8 @@ def random_string(length): return ret -def download_url(url): - """Download file from specified URL. +def get_url_bytes(url): + """Get a stream of bytes for url Parameters ---------- @@ -73,25 +77,21 @@ def download_url(url): Returns ------- - filepath : str or unicode - Temporary filepath. 
+ file_bytes : io.BytesIO + a file-like object that can be read """ - filename = f"{random_string(6)}.pdf" - with tempfile.NamedTemporaryFile("wb", delete=False) as f: - headers = { - "User-Agent": "Mozilla/5.0", - "Accept-Encoding": "gzip;q=1.0, deflate;q=0.9, br;q=0.8, compress;q=0.7, *;q=0.1" - } - request = Request(url, None, headers) - obj = urlopen(request) - content_type = obj.info().get_content_type() - if content_type != "application/pdf": - raise NotImplementedError("File format not supported") - f.write(obj.read()) - filepath = os.path.join(os.path.dirname(f.name), filename) - shutil.move(f.name, filepath) - return filepath + file_bytes = io.BytesIO() + file_bytes.name = url + headers = {"User-Agent": "Mozilla/5.0"} + request = Request(url, data=None, headers=headers) + obj = urlopen(request) + content_type = obj.info().get_content_type() + if content_type != "application/pdf": + raise NotImplementedError("File format not supported") + file_bytes.write(obj.read()) + file_bytes.seek(0) + return file_bytes stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"] diff --git a/tests/test_common.py b/tests/test_common.py index c3243174..b1ba3cfc 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -1,3 +1,4 @@ +import io import os from pathlib import Path @@ -189,7 +190,6 @@ def test_handler_with_pathlib(testdir): handler = PDFHandler(f) assert handler._get_pages("1") == [1] - def test_table_list_iter(): def _make_table(page, order): t = Table([], []) @@ -214,3 +214,22 @@ def _make_table(page, order): assert iterator_b is not None item_c = next(iterator_b) assert item_c is not None + +@skip_on_windows +def test_from_open(testdir): + filename = os.path.join(testdir, "foo.pdf") + with open(filename, "rb") as file_bytes: + tables = camelot.read_pdf(file_bytes=file_bytes) + assert repr(tables) == "" + assert repr(tables[0]) == "" + +@skip_on_windows +def test_from_bytes(testdir): + filename = os.path.join(testdir, "foo.pdf") + file_bytes = 
io.BytesIO() + file_bytes.name = filename + with open(filename, "rb") as f: + file_bytes.write(f.read()) # note that we didn't seek, done by PDFHandler + tables = camelot.read_pdf(file_bytes=file_bytes) + assert repr(tables) == "" + assert repr(tables[0]) == "
" diff --git a/tests/test_errors.py b/tests/test_errors.py index eaa720ec..b99c4848 100644 --- a/tests/test_errors.py +++ b/tests/test_errors.py @@ -1,11 +1,12 @@ import os +import io import warnings import pytest import camelot from tests.conftest import skip_on_windows - +from camelot.utils import InvalidArguments def test_unknown_flavor(foo_pdf): message = "Unknown flavor specified." " Use either 'lattice' or 'stream'" @@ -25,6 +26,21 @@ def test_unsupported_format(testdir): with pytest.raises(NotImplementedError, match=message): camelot.read_pdf(filename) +def test_no_file_or_bytes(testdir): + message = "Either `filepath` or `file_bytes` is required" + with pytest.raises(InvalidArguments, match=message): + camelot.read_pdf() + +@skip_on_windows +def test_no_filepath_or_name(testdir): + message = ('Either pass a `filepath`, or give the ' + '`file_bytes` argument a name attribute') + filename = os.path.join(testdir, "foo.pdf") + file_bytes = io.BytesIO() + with open(filename, "rb") as f: + file_bytes.write(f.read()) + with pytest.raises(InvalidArguments, match=message): + camelot.read_pdf(file_bytes=file_bytes) @skip_on_windows def test_no_tables_found_logs_suppressed(testdir):