From bdc2ae7b8a69423ca8954d74903e43ce02552637 Mon Sep 17 00:00:00 2001
From: cscanlin <chris.scanlin@kwhanalytics.com>
Date: Fri, 8 Oct 2021 00:13:00 -0700
Subject: [PATCH] [IMP]: add support for file_bytes argument with
 managed_file_context()

---
 camelot/handlers.py  | 172 ++++++++++++++++++++++++++++---------------
 camelot/io.py        |  25 ++++---
 camelot/utils.py     |  40 +++++-----
 tests/test_common.py |  21 +++++-
 tests/test_errors.py |  18 ++++-
 5 files changed, 185 insertions(+), 91 deletions(-)

diff --git a/camelot/handlers.py b/camelot/handlers.py
index 74ddde7a..cebb0c8d 100644
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@@ -1,24 +1,32 @@
+import io
 import multiprocessing as mp
 import os
 import sys
+from contextlib import contextmanager
 from pathlib import Path
+from typing import IO
+from typing import Any
+from typing import TypeVar
 from typing import Union
 
 from pypdf import PdfReader
 from pypdf import PdfWriter
-from pypdf._utils import StrByteType
 
 from .core import TableList
 from .parsers import Lattice
 from .parsers import Stream
+from .utils import InvalidArguments
 from .utils import TemporaryDirectory
-from .utils import download_url
 from .utils import get_page_layout
 from .utils import get_rotation
 from .utils import get_text_objects
+from .utils import get_url_bytes
 from .utils import is_url
 
 
+FilePathType = Union[str, IO[Any], Path, None]  # plain alias, not generic
+
+
 class PDFHandler:
     """Handles all operations like temp directory creation, splitting
     file into single page PDFs, parsing each PDF and then removing the
@@ -26,21 +34,39 @@ class PDFHandler:
 
     Parameters
     ----------
-    filepath : str
-        Filepath or URL of the PDF file.
+    filepath : str | pathlib.Path, optional (default: None)
+        Filepath or URL of the PDF file. Required if file_bytes is not given
     pages : str, optional (default: '1')
         Comma-separated page numbers.
         Example: '1,3,4' or '1,4-end' or 'all'.
     password : str, optional (default: None)
         Password for decryption.
+    file_bytes : io.IOBase, optional (default: None)
+        A file-like stream. Required if filepath is not given
 
     """
 
-    def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None):
+    def __init__(
+        self, filepath: FilePathType = None, pages="1", password=None, file_bytes=None
+    ):
-        if is_url(filepath):
-            filepath = download_url(filepath)
-        self.filepath: Union[StrByteType, Path] = filepath
-
-        if isinstance(filepath, str) and not filepath.lower().endswith(".pdf"):
-            raise NotImplementedError("File format not supported")
+        from_url = is_url(filepath)
+        if from_url:
+            file_bytes = get_url_bytes(filepath)
+        if not filepath and not file_bytes:
+            raise InvalidArguments("Either `filepath` or `file_bytes` is required")
+        if not filepath:
+            # filepath must either be passed, or taken from the name attribute
+            try:
+                filepath = file_bytes.name
+            except AttributeError:
+                raise InvalidArguments(
+                    "Either pass a `filepath`, or give the "
+                    "`file_bytes` argument a name attribute"
+                ) from None
+        self.file_bytes = file_bytes  # ok to be None
+        self.filepath = filepath
+        # URLs were vetted by Content-Type on download; skip the suffix test
+        if not from_url and isinstance(filepath, str):
+            if not filepath.lower().endswith(".pdf"):
+                raise NotImplementedError("File format not supported")
 
@@ -52,13 +78,35 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None)
                 self.password = self.password.encode("ascii")
         self.pages = self._get_pages(pages)
 
+    @contextmanager
+    def managed_file_context(self):
+        """Yield a readable, seekable binary stream of the PDF.
+
+        A `file_bytes` stream is rewound and yielded without being closed
+        (buffered into a BytesIO first if it cannot seek); otherwise
+        `filepath` is opened here and closed again on exit or error.
+        """
+        if self.file_bytes is None:
+            with open(self.filepath, "rb") as file_bytes:
+                yield file_bytes
+            return
+        # every io.IOBase has `seek`, even when unseekable: prefer seekable()
+        if hasattr(self.file_bytes, "seekable"):
+            can_seek = self.file_bytes.seekable()
+        else:
+            can_seek = hasattr(self.file_bytes, "seek")
+        if not can_seek:
+            self.file_bytes = io.BytesIO(self.file_bytes.read())
+        self.file_bytes.seek(0)
+        yield self.file_bytes
+
     def _get_pages(self, pages):
         """Converts pages string to list of ints.
 
         Parameters
         ----------
-        filepath : str
-            Filepath or URL of the PDF file.
+        managed_file_context : io.IOBase
+            A readable, seekable, file-like object
         pages : str, optional (default: '1')
             Comma-separated page numbers.
             Example: '1,3,4' or '1,4-end' or 'all'.
@@ -74,74 +122,77 @@ def _get_pages(self, pages):
         if pages == "1":
             page_numbers.append({"start": 1, "end": 1})
         else:
-            infile = PdfReader(self.filepath, strict=False)
+            with self.managed_file_context() as f:
+                infile = PdfReader(f, strict=False)
 
-            if infile.is_encrypted:
-                infile.decrypt(self.password)
+                if infile.is_encrypted:
+                    infile.decrypt(self.password)
 
-            if pages == "all":
-                page_numbers.append({"start": 1, "end": len(infile.pages)})
-            else:
-                for r in pages.split(","):
-                    if "-" in r:
-                        a, b = r.split("-")
-                        if b == "end":
-                            b = len(infile.pages)
-                        page_numbers.append({"start": int(a), "end": int(b)})
-                    else:
-                        page_numbers.append({"start": int(r), "end": int(r)})
+                if pages == "all":
+                    page_numbers.append({"start": 1, "end": len(infile.pages)})
+                else:
+                    for r in pages.split(","):
+                        if "-" in r:
+                            a, b = r.split("-")
+                            if b == "end":
+                                b = len(infile.pages)
+                            page_numbers.append({"start": int(a), "end": int(b)})
+                        else:
+                            page_numbers.append({"start": int(r), "end": int(r)})
 
         result = []
         for p in page_numbers:
             result.extend(range(p["start"], p["end"] + 1))
         return sorted(set(result))
 
-    def _save_page(self, filepath: Union[StrByteType, Path], page, temp):
+    def _save_page(self, filepath: FilePathType, page, temp):
         """Saves specified page from PDF into a temporary directory.
 
         Parameters
         ----------
-        filepath : str
-            Filepath or URL of the PDF file.
+        filepath : str | pathlib.Path
+            Unused here; the PDF is read through managed_file_context().
         page : int
             Page number.
         temp : str
             Tmp directory.
 
         """
-        infile = PdfReader(filepath, strict=False)
-        if infile.is_encrypted:
-            infile.decrypt(self.password)
-        fpath = os.path.join(temp, f"page-{page}.pdf")
-        froot, fext = os.path.splitext(fpath)
-        p = infile.pages[page - 1]
-        outfile = PdfWriter()
-        outfile.add_page(p)
-        with open(fpath, "wb") as f:
-            outfile.write(f)
-        layout, dim = get_page_layout(fpath)
-        # fix rotated PDF
-        chars = get_text_objects(layout, ltype="char")
-        horizontal_text = get_text_objects(layout, ltype="horizontal_text")
-        vertical_text = get_text_objects(layout, ltype="vertical_text")
-        rotation = get_rotation(chars, horizontal_text, vertical_text)
-        if rotation != "":
-            fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
-            os.rename(fpath, fpath_new)
-            instream = open(fpath_new, "rb")
-            infile = PdfReader(instream, strict=False)
+
+        with self.managed_file_context() as fileobj:
+            infile = PdfReader(fileobj, strict=False)
             if infile.is_encrypted:
                 infile.decrypt(self.password)
+            fpath = os.path.join(temp, f"page-{page}.pdf")
+            froot, fext = os.path.splitext(fpath)
+            p = infile.pages[page - 1]
             outfile = PdfWriter()
-            p = infile.pages[0]
-            if rotation == "anticlockwise":
-                p.rotate(90)
-            elif rotation == "clockwise":
-                p.rotate(-90)
             outfile.add_page(p)
             with open(fpath, "wb") as f:
                 outfile.write(f)
-            instream.close()
+            layout, dim = get_page_layout(fpath)
+            # fix rotated PDF
+            chars = get_text_objects(layout, ltype="char")
+            horizontal_text = get_text_objects(layout, ltype="horizontal_text")
+            vertical_text = get_text_objects(layout, ltype="vertical_text")
+            rotation = get_rotation(chars, horizontal_text, vertical_text)
+            if rotation != "":
+                fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
+                os.rename(fpath, fpath_new)
+                # `with` closes the renamed page even if re-writing it fails
+                with open(fpath_new, "rb") as instream:
+                    infile = PdfReader(instream, strict=False)
+                    if infile.is_encrypted:
+                        infile.decrypt(self.password)
+                    outfile = PdfWriter()
+                    p = infile.pages[0]
+                    if rotation == "anticlockwise":
+                        p.rotate(90)
+                    elif rotation == "clockwise":
+                        p.rotate(-90)
+                    outfile.add_page(p)
+                    with open(fpath, "wb") as f:
+                        outfile.write(f)
 
     def parse(
         self,
@@ -149,7 +200,7 @@ def parse(
         suppress_stdout=False,
         parallel=False,
         layout_kwargs=None,
-        **kwargs
+        **kwargs,
     ):
         """Extracts tables by calling parser.get_tables on all single
         page PDFs.
@@ -189,7 +240,8 @@ def parse(
                     jobs = []
                     for p in self.pages:
                         j = pool.apply_async(
-                            self._parse_page,(p, tempdir, parser, suppress_stdout, layout_kwargs)
+                            self._parse_page,
+                            (p, tempdir, parser, suppress_stdout, layout_kwargs),
                         )
                         jobs.append(j)
 
@@ -198,14 +250,14 @@ def parse(
                         tables.extend(t)
             else:
                 for p in self.pages:
-                    t = self._parse_page(p, tempdir, parser, suppress_stdout, layout_kwargs)
+                    t = self._parse_page(
+                        p, tempdir, parser, suppress_stdout, layout_kwargs
+                    )
                     tables.extend(t)
 
         return TableList(sorted(tables))
 
-    def _parse_page(
-        self, page, tempdir, parser, suppress_stdout, layout_kwargs
-    ):
+    def _parse_page(self, page, tempdir, parser, suppress_stdout, layout_kwargs):
         """Extracts tables by calling parser.get_tables on a single
         page PDF.
 
@@ -224,7 +276,7 @@ def _parse_page(
         -------
         tables : camelot.core.TableList
             List of tables found in PDF.
-        
+
         """
         self._save_page(self.filepath, page, tempdir)
         page_path = os.path.join(tempdir, f"page-{page}.pdf")
diff --git a/camelot/io.py b/camelot/io.py
index 12718828..9c5e6f62 100644
--- a/camelot/io.py
+++ b/camelot/io.py
@@ -1,22 +1,24 @@
 import warnings
 from pathlib import Path
-from typing import Union
 
-from pypdf._utils import StrByteType
+from .handlers import PDFHandler, FilePathType
 
-from .handlers import PDFHandler
-from .utils import remove_extra
-from .utils import validate_input
+from .utils import (
+    InvalidArguments,
+    validate_input,
+    remove_extra,
+)
 
 
 def read_pdf(
-    filepath: Union[StrByteType, Path],
+    filepath: FilePathType = None,
     pages="1",
     password=None,
     flavor="lattice",
     suppress_stdout=False,
     parallel=False,
     layout_kwargs=None,
+    file_bytes=None,
     **kwargs
 ):
     """Read PDF and return extracted tables.
@@ -26,8 +28,8 @@ def read_pdf(
 
     Parameters
     ----------
-    filepath : str, Path, IO
-        Filepath or URL of the PDF file.
+    filepath : str | pathlib.Path, optional (default: None)
+        Filepath or URL of the PDF file. Required if file_bytes is not given
     pages : str, optional (default: '1')
         Comma-separated page numbers.
         Example: '1,3,4' or '1,4-end' or 'all'.
@@ -40,6 +42,8 @@ def read_pdf(
         Print all logs and warnings.
     parallel : bool, optional (default: False)
         Process pages in parallel using all available cpu cores.
+    file_bytes : io.IOBase, optional (default: None)
+        A file-like stream. Required if filepath is not given
     layout_kwargs : dict, optional (default: {})
         A dict of `pdfminer.layout.LAParams
         <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
@@ -115,12 +119,15 @@ def read_pdf(
             "Unknown flavor specified." " Use either 'lattice' or 'stream'"
         )
 
+    if not filepath and not file_bytes:
+        raise InvalidArguments('Either `filepath` or `file_bytes` is required')
+
     with warnings.catch_warnings():
         if suppress_stdout:
             warnings.simplefilter("ignore")
 
         validate_input(kwargs, flavor=flavor)
-        p = PDFHandler(filepath, pages=pages, password=password)
+        p = PDFHandler(filepath, pages=pages, password=password, file_bytes=file_bytes)
         kwargs = remove_extra(kwargs, flavor=flavor)
         tables = p.parse(
             flavor=flavor,
diff --git a/camelot/utils.py b/camelot/utils.py
index fda56f54..aca25281 100644
--- a/camelot/utils.py
+++ b/camelot/utils.py
@@ -1,4 +1,4 @@
-import os
+import io
 import random
 import re
 import shutil
@@ -34,6 +34,10 @@
 _VALID_URLS.discard("")
 
 
+class InvalidArguments(Exception):
+    pass
+
+
 # https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
 def is_url(url):
     """Check to see if a URL has a valid protocol.
@@ -64,8 +68,8 @@ def random_string(length):
     return ret
 
 
-def download_url(url):
-    """Download file from specified URL.
+def get_url_bytes(url):
+    """Get a stream of bytes for url
 
     Parameters
     ----------
@@ -73,25 +77,21 @@ def download_url(url):
 
     Returns
     -------
-    filepath : str or unicode
-        Temporary filepath.
+    file_bytes : io.BytesIO
+        A file-like object that can be read
 
     """
-    filename = f"{random_string(6)}.pdf"
-    with tempfile.NamedTemporaryFile("wb", delete=False) as f:
-        headers = {
-            "User-Agent": "Mozilla/5.0",
-            "Accept-Encoding": "gzip;q=1.0, deflate;q=0.9, br;q=0.8, compress;q=0.7, *;q=0.1"
-        }
-        request = Request(url, None, headers)
-        obj = urlopen(request)
-        content_type = obj.info().get_content_type()
-        if content_type != "application/pdf":
-            raise NotImplementedError("File format not supported")
-        f.write(obj.read())
-    filepath = os.path.join(os.path.dirname(f.name), filename)
-    shutil.move(f.name, filepath)
-    return filepath
+    headers = {"User-Agent": "Mozilla/5.0"}
+    request = Request(url, data=None, headers=headers)
+    # `with` closes the HTTP response even when the type check raises
+    with urlopen(request) as obj:
+        content_type = obj.info().get_content_type()
+        if content_type != "application/pdf":
+            raise NotImplementedError("File format not supported")
+        file_bytes = io.BytesIO(obj.read())
+    # keep the source URL on the stream as its `name`
+    file_bytes.name = url
+    return file_bytes
 
 
 stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
diff --git a/tests/test_common.py b/tests/test_common.py
index c3243174..b1ba3cfc 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -1,3 +1,4 @@
+import io
 import os
 from pathlib import Path
 
@@ -189,7 +190,6 @@ def test_handler_with_pathlib(testdir):
         handler = PDFHandler(f)
         assert handler._get_pages("1") == [1]
 
-
 def test_table_list_iter():
     def _make_table(page, order):
         t = Table([], [])
@@ -214,3 +214,22 @@ def _make_table(page, order):
     assert iterator_b is not None
     item_c = next(iterator_b)
     assert item_c is not None
+
+@skip_on_windows
+def test_from_open(testdir):
+    filename = os.path.join(testdir, "foo.pdf")
+    with open(filename, "rb") as file_bytes:
+        tables = camelot.read_pdf(file_bytes=file_bytes)
+        assert repr(tables) == "<TableList n=1>"
+        assert repr(tables[0]) == "<Table shape=(7, 7)>"
+
+@skip_on_windows
+def test_from_bytes(testdir):
+    filename = os.path.join(testdir, "foo.pdf")
+    file_bytes = io.BytesIO()
+    file_bytes.name = filename
+    with open(filename, "rb") as f:
+        file_bytes.write(f.read())  # note that we didn't seek, done by PDFHandler
+    tables = camelot.read_pdf(file_bytes=file_bytes)
+    assert repr(tables) == "<TableList n=1>"
+    assert repr(tables[0]) == "<Table shape=(7, 7)>"
diff --git a/tests/test_errors.py b/tests/test_errors.py
index eaa720ec..b99c4848 100644
--- a/tests/test_errors.py
+++ b/tests/test_errors.py
@@ -1,11 +1,12 @@
 import os
+import io
 import warnings
 
 import pytest
 
 import camelot
 from tests.conftest import skip_on_windows
-
+from camelot.utils import InvalidArguments
 
 def test_unknown_flavor(foo_pdf):
     message = "Unknown flavor specified." " Use either 'lattice' or 'stream'"
@@ -25,6 +26,21 @@ def test_unsupported_format(testdir):
     with pytest.raises(NotImplementedError, match=message):
         camelot.read_pdf(filename)
 
+def test_no_file_or_bytes(testdir):
+    message = "Either `filepath` or `file_bytes` is required"
+    with pytest.raises(InvalidArguments, match=message):
+        camelot.read_pdf()
+
+@skip_on_windows
+def test_no_filepath_or_name(testdir):
+    message = ('Either pass a `filepath`, or give the '
+               '`file_bytes` argument a name attribute')
+    filename = os.path.join(testdir, "foo.pdf")
+    file_bytes = io.BytesIO()
+    with open(filename, "rb") as f:
+        file_bytes.write(f.read())
+    with pytest.raises(InvalidArguments, match=message):
+        camelot.read_pdf(file_bytes=file_bytes)
 
 @skip_on_windows
 def test_no_tables_found_logs_suppressed(testdir):