From 6c243e00cb630b3f92543794bee035a2f421d236 Mon Sep 17 00:00:00 2001 From: Erik Kastelec <34520112+erikkastelec@users.noreply.github.com> Date: Fri, 11 Sep 2020 13:48:00 +0200 Subject: [PATCH] fix for a bug introduced in the last commit. --- PDFScraper/__init__.py | 2 +- PDFScraper/core.py | 7 ++++--- README.md | 2 +- setup.py | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/PDFScraper/__init__.py b/PDFScraper/__init__.py index fe4d5a9..22cd97f 100644 --- a/PDFScraper/__init__.py +++ b/PDFScraper/__init__.py @@ -1,4 +1,4 @@ -__version__ = "1.1.7" +__version__ = "1.1.8" import logging diff --git a/PDFScraper/core.py b/PDFScraper/core.py index be88566..e2d4f4e 100644 --- a/PDFScraper/core.py +++ b/PDFScraper/core.py @@ -278,19 +278,21 @@ def convert_to_pdf(document: Document, tessdata_location: str, config_options="" logger.error(e) sys.exit(1) pdf_writer = PdfFileWriter() + pdf_files = [] for filename in pdf_pages: pdf_file = open(filename, 'rb') + pdf_files.append(pdf_file) pdf_reader = PdfFileReader(pdf_file) for i in range(pdf_reader.numPages): page = pdf_reader.getPage(i) pdf_writer.addPage(page) - pdf_file.close() with open(tempfile.gettempdir() + "/PDFScraper" + "/" + document.filename + ".pdf", 'w+b') as out: pdf_writer.write(out) out.close() document.ocr_path = tempfile.gettempdir() + "/PDFScraper" + "/" + document.filename + ".pdf" # cleanup temporary files - for filename in pdf_pages: + for file, filename in zip(pdf_files, pdf_pages): + file.close() os.remove(filename) @@ -497,4 +499,3 @@ def find_words_tables(tables, search_mode, search_words, match_score): if found: result.append(table) return result - diff --git a/README.md b/README.md index 60847d9..c1eff1f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # PDFScraper [![PyPI version](https://badge.fury.io/py/PDFScraper.svg)](https://badge.fury.io/py/PDFScraper) -CLI program for searching text and tables inside of PDF documents and displaying results in HTML. 
It combines [Pdfminer.six](https://github.com/pdfminer/pdfminer.six), [Camelot](https://github.com/camelot-dev/camelot) and [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) in a single program, which is simple to use. +CLI program and library for extracting PDF elements, with search functionality that outputs a summary in HTML format. It combines [Pdfminer.six](https://github.com/pdfminer/pdfminer.six), [Camelot](https://github.com/camelot-dev/camelot) and [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) in a single program, which is simple to use. # How to use ### Install using pip diff --git a/setup.py b/setup.py index 9afbfab..5cd2de2 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ "yattag==1.14.0", ], name="PDFScraper", - version="1.1.7", + version="1.1.8", author="Erik Kastelec", author_email="erikkastelec@gmail.com", description="PDF text and table search",