diff --git a/camelot/handlers.py b/camelot/handlers.py index 66ee1697b..c03f16651 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -1,3 +1,5 @@ +from contextlib import contextmanager +import io import os import sys from pathlib import Path @@ -11,7 +13,8 @@ from .parsers import Lattice from .parsers import Stream from .utils import TemporaryDirectory -from .utils import download_url +from .utils import InvalidArguments +from .utils import get_url_bytes from .utils import get_page_layout from .utils import get_rotation from .utils import get_text_objects @@ -25,21 +28,36 @@ class PDFHandler: Parameters ---------- - filepath : str - Filepath or URL of the PDF file. + filepath : str | pathlib.Path, optional (default: None) + Filepath or URL of the PDF file. Required if file_bytes is not given pages : str, optional (default: '1') Comma-separated page numbers. Example: '1,3,4' or '1,4-end' or 'all'. password : str, optional (default: None) Password for decryption. + file_bytes : io.IOBase, optional (default: None) + A file-like stream. 
Required if filepath is not given """ - def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None): + def __init__(self, filepath: Union[StrByteType, Path, None], pages="1", password=None, file_bytes=None): if is_url(filepath): - filepath = download_url(filepath) + file_bytes = get_url_bytes(filepath) + + if not filepath and not file_bytes: + raise InvalidArguments('Either `filepath` or `file_bytes` is required') + if not filepath: + # filepath must either be passed, or taken from the name attribute + filepath = getattr(file_bytes, 'name', None) + if not filepath: + msg = ('Either pass a `filepath`, or give the ' + '`file_bytes` argument a name attribute') + raise InvalidArguments(msg) + self.file_bytes = file_bytes # ok to be None + + # self.filepath = filepath + # or self.filepath: Union[StrByteType, Path] = filepath - if isinstance(filepath, str) and not filepath.lower().endswith(".pdf"): raise NotImplementedError("File format not supported") @@ -51,6 +69,28 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None) self.password = self.password.encode("ascii") self.pages = self._get_pages(pages) + @contextmanager + def managed_file_context(self): + """Reads from either the `filepath` or `file_bytes` + attribute of this instance, to return a file-like object. + Closes any open file handles on exit or error. + + Returns + ------- + file_bytes : io.IOBase + A readable, seekable, file-like object + """ + if self.file_bytes: + # if we can't seek, write to a BytesIO object that can, + # then seek to the beginning before yielding + if not hasattr(self.file_bytes, 'seek'): + self.file_bytes = io.BytesIO(self.file_bytes.read()) + self.file_bytes.seek(0) + yield self.file_bytes + else: + with open(self.filepath, "rb") as file_bytes: + yield file_bytes + def _get_pages(self, pages): """Converts pages string to list of ints. 
@@ -73,29 +113,30 @@ def _get_pages(self, pages): if pages == "1": page_numbers.append({"start": 1, "end": 1}) else: - infile = PdfReader(self.filepath, strict=False) - - if infile.is_encrypted: - infile.decrypt(self.password) - - if pages == "all": - page_numbers.append({"start": 1, "end": len(infile.pages)}) - else: - for r in pages.split(","): - if "-" in r: - a, b = r.split("-") - if b == "end": - b = len(infile.pages) - page_numbers.append({"start": int(a), "end": int(b)}) - else: - page_numbers.append({"start": int(r), "end": int(r)}) + with self.managed_file_context() as f: + infile = PdfReader(f, strict=False) + + if infile.is_encrypted: + infile.decrypt(self.password) + + if pages == "all": + page_numbers.append({"start": 1, "end": len(infile.pages)}) + else: + for r in pages.split(","): + if "-" in r: + a, b = r.split("-") + if b == "end": + b = len(infile.pages) + page_numbers.append({"start": int(a), "end": int(b)}) + else: + page_numbers.append({"start": int(r), "end": int(r)}) result = [] for p in page_numbers: result.extend(range(p["start"], p["end"] + 1)) return sorted(set(result)) - def _save_page(self, filepath: Union[StrByteType, Path], page, temp): + def _save_page(self, filepath: Union[StrByteType, Path, None], page, temp): """Saves specified page from PDF into a temporary directory. Parameters @@ -108,39 +149,41 @@ def _save_page(self, filepath: Union[StrByteType, Path], page, temp): Tmp directory. 
""" - infile = PdfReader(filepath, strict=False) - if infile.is_encrypted: - infile.decrypt(self.password) - fpath = os.path.join(temp, f"page-{page}.pdf") - froot, fext = os.path.splitext(fpath) - p = infile.pages[page - 1] - outfile = PdfWriter() - outfile.add_page(p) - with open(fpath, "wb") as f: - outfile.write(f) - layout, dim = get_page_layout(fpath) - # fix rotated PDF - chars = get_text_objects(layout, ltype="char") - horizontal_text = get_text_objects(layout, ltype="horizontal_text") - vertical_text = get_text_objects(layout, ltype="vertical_text") - rotation = get_rotation(chars, horizontal_text, vertical_text) - if rotation != "": - fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext]) - os.rename(fpath, fpath_new) - instream = open(fpath_new, "rb") - infile = PdfReader(instream, strict=False) + + with self.managed_file_context() as fileobj: + infile = PdfReader(fileobj, strict=False) if infile.is_encrypted: infile.decrypt(self.password) + fpath = os.path.join(temp, f"page-{page}.pdf") + froot, fext = os.path.splitext(fpath) + p = infile.pages[page - 1] outfile = PdfWriter() - p = infile.pages[0] - if rotation == "anticlockwise": - p.rotate(90) - elif rotation == "clockwise": - p.rotate(-90) outfile.add_page(p) with open(fpath, "wb") as f: outfile.write(f) - instream.close() + layout, dim = get_page_layout(fpath) + # fix rotated PDF + chars = get_text_objects(layout, ltype="char") + horizontal_text = get_text_objects(layout, ltype="horizontal_text") + vertical_text = get_text_objects(layout, ltype="vertical_text") + rotation = get_rotation(chars, horizontal_text, vertical_text) + if rotation != "": + fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext]) + os.rename(fpath, fpath_new) + instream = open(fpath_new, "rb") + infile = PdfReader(instream, strict=False) + if infile.is_encrypted: + infile.decrypt(self.password) + outfile = PdfWriter() + p = infile.pages[0] + if rotation == "anticlockwise": + p.rotate(90) + elif 
rotation == "clockwise": + p.rotate(-90) + outfile.add_page(p) + with open(fpath, "wb") as f: + outfile.write(f) + instream.close() def parse( self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs diff --git a/camelot/io.py b/camelot/io.py index 78319bc90..2b8f7a575 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -5,17 +5,22 @@ from pypdf._utils import StrByteType from .handlers import PDFHandler -from .utils import remove_extra -from .utils import validate_input + +from .utils import ( + InvalidArguments, + validate_input, + remove_extra, +) def read_pdf( - filepath: Union[StrByteType, Path], + filepath=Union[StrByteType, Path], pages="1", password=None, flavor="lattice", suppress_stdout=False, layout_kwargs=None, + file_bytes=None, **kwargs ): """Read PDF and return extracted tables. @@ -25,8 +30,8 @@ def read_pdf( Parameters ---------- - filepath : str, Path, IO - Filepath or URL of the PDF file. + filepath : str | pathlib.Path, optional (default: None) + Filepath or URL of the PDF file. Required if file_bytes is not given pages : str, optional (default: '1') Comma-separated page numbers. Example: '1,3,4' or '1,4-end' or 'all'. @@ -37,6 +42,8 @@ def read_pdf( Lattice is used by default. suppress_stdout : bool, optional (default: True) Print all logs and warnings. + file_bytes : io.IOBase, optional (default: None) + A file-like stream. Required if filepath is not given layout_kwargs : dict, optional (default: {}) A dict of `pdfminer.layout.LAParams `_ kwargs. @@ -112,12 +119,15 @@ def read_pdf( "Unknown flavor specified." 
" Use either 'lattice' or 'stream'" ) + if not filepath and not file_bytes: + raise InvalidArguments('Either `filepath` or `file_bytes` is required') + with warnings.catch_warnings(): if suppress_stdout: warnings.simplefilter("ignore") validate_input(kwargs, flavor=flavor) - p = PDFHandler(filepath, pages=pages, password=password) + p = PDFHandler(filepath, pages=pages, password=password, file_bytes=file_bytes) kwargs = remove_extra(kwargs, flavor=flavor) tables = p.parse( flavor=flavor, diff --git a/camelot/utils.py b/camelot/utils.py index 29939f684..c93e21ff2 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -1,4 +1,4 @@ -import os +import io import random import re import shutil @@ -34,6 +34,10 @@ _VALID_URLS.discard("") +class InvalidArguments(Exception): + pass + + # https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py def is_url(url): """Check to see if a URL has a valid protocol. @@ -64,8 +68,8 @@ def random_string(length): return ret -def download_url(url): - """Download file from specified URL. +def get_url_bytes(url): + """Get a stream of bytes for url Parameters ---------- @@ -73,22 +77,21 @@ def download_url(url): Returns ------- - filepath : str or unicode - Temporary filepath. 
+ file_bytes : io.BytesIO + a file-like object that can be read """ - filename = f"{random_string(6)}.pdf" - with tempfile.NamedTemporaryFile("wb", delete=False) as f: - headers = {"User-Agent": "Mozilla/5.0"} - request = Request(url, None, headers) - obj = urlopen(request) - content_type = obj.info().get_content_type() - if content_type != "application/pdf": - raise NotImplementedError("File format not supported") - f.write(obj.read()) - filepath = os.path.join(os.path.dirname(f.name), filename) - shutil.move(f.name, filepath) - return filepath + file_bytes = io.BytesIO() + file_bytes.name = url + headers = {"User-Agent": "Mozilla/5.0"} + request = Request(url, data=None, headers=headers) + obj = urlopen(request) + content_type = obj.info().get_content_type() + if content_type != "application/pdf": + raise NotImplementedError("File format not supported") + file_bytes.write(obj.read()) + file_bytes.seek(0) + return file_bytes stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"] diff --git a/tests/test_common.py b/tests/test_common.py index ca9910d0d..0f2317e2b 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -1,3 +1,4 @@ +import io import os from pathlib import Path @@ -188,3 +189,21 @@ def test_handler_with_pathlib(testdir): with open(filename, "rb") as f: handler = PDFHandler(f) assert handler._get_pages("1") == [1] + +@skip_on_windows +def test_from_open(testdir): + filename = os.path.join(testdir, "foo.pdf") + with open(filename, "rb") as file_bytes: + tables = camelot.read_pdf(file_bytes=file_bytes) + assert repr(tables) == "" + assert repr(tables[0]) == "" + +@skip_on_windows +def test_from_bytes(testdir): + filename = os.path.join(testdir, "foo.pdf") + file_bytes = io.BytesIO() + with open(filename, "rb") as f: + file_bytes.write(f.read()) # note that we didn't seek, done by PDFHandler + tables = camelot.read_pdf(file_bytes=file_bytes) + assert repr(tables) == "" + assert repr(tables[0]) == "
"