Skip to content

Commit

Permalink
added unique filename in preprocess_image
Browse files Browse the repository at this point in the history
  • Loading branch information
erikkastelec committed Sep 6, 2020
1 parent d93e526 commit b9b6339
Show file tree
Hide file tree
Showing 5 changed files with 12 additions and 54 deletions.
2 changes: 1 addition & 1 deletion PDFScraper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "1.1.3"
__version__ = "1.1.6"

import logging

Expand Down
48 changes: 7 additions & 41 deletions PDFScraper/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,7 @@ def process_doc(doc):

else:
logger.warning("Skipping parsing. Document is not exable.")
# logger.info('Parsed ' + str(progress_counter) + ' out of ' + str(len(docs)) + ' documents')
# progress_counter += 1

else:
logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
pdf_to_image(doc)
Expand All @@ -142,13 +141,16 @@ def cli():
# Extract information about PDFs
progress_counter = 1

# Multiprocessing
# Multiprocessing -- Improves speed of processing multiple documents significantly
# !! BAD PERFORMANCE OF OCR WITH MULTIPLE FILES
if args["multiprocessing"]:
pool = multiprocessing.Pool()
pool.map(process_doc, docs)
else:
for doc in docs:
process_doc(doc)
logger.info('Parsed ' + str(progress_counter) + ' out of ' + str(len(docs)) + ' documents')
progress_counter += 1
logger.info('Done parsing PDFs')
logger.info('Generating summary')
generate_html(output_path, docs, search_word, search_mode)
Expand All @@ -158,41 +160,5 @@ def cli():
sys.exit(0)


def process_doc(doc):
    """Extract text, table-of-contents, and (optionally) tables from one document.

    Presumably mutates *doc* in place via the parse/extract helpers (no value
    is returned) — confirm against the helpers in core.py.  Relies on
    module-level state (``logger``, ``tables_extract``, ``tessdata_location``)
    being initialized by the CLI before this is called.
    """
    # Gather document metadata and derive the filename used in output.
    extract_info(doc)
    get_filename(doc)
    if doc.is_pdf:
        pdf_object = get_pdf_object(doc)
        if doc.extractable:

            logger.debug('Document information:' + '\n' + doc.document_info_to_string())
            extract_table_of_contents(doc, pdf_object)
            logger.debug('Table of contents: \n' + doc.table_of_contents_to_string())
            page_layouts = extract_page_layouts(pdf_object)
            # table extraction is possible only for text based PDFs
            if tables_extract:
                extract_tables(doc)
            parse_layouts(doc, page_layouts)
            # No paragraphs came out of regular parsing — the PDF is likely
            # image-only, so fall back to OCR: rasterize, OCR back into a
            # searchable PDF, then re-run the same extraction pipeline.
            if len(doc.paragraphs) == 0:
                logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
                pdf_to_image(doc)
                convert_to_pdf(doc, tessdata_location)
                pdf_object = get_pdf_object(doc)
                page_layouts = extract_page_layouts(pdf_object)
                if tables_extract:
                    extract_tables(doc)
                parse_layouts(doc, page_layouts)

        else:
            # NOTE(review): "exable" looks like a typo for "extractable", but
            # this is a runtime log string, so it is left byte-identical here.
            logger.warning("Skipping parsing. Document is not exable.")
            # logger.info('Parsed ' + str(progress_counter) + ' out of ' + str(len(docs)) + ' documents')
            # progress_counter += 1
    else:
        # Input is not a PDF: go straight to the OCR pipeline.
        logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
        pdf_to_image(doc)
        convert_to_pdf(doc, tessdata_location)
        pdf_object = get_pdf_object(doc)
        page_layouts = extract_page_layouts(pdf_object)
        if tables_extract:
            extract_tables(doc)
        parse_layouts(doc, page_layouts)
# Script entry point: run the CLI when this module is executed directly.
if __name__ == "__main__":
    cli()
12 changes: 2 additions & 10 deletions PDFScraper/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import re
import sys
import tempfile
import uuid
from typing import TYPE_CHECKING

import camelot
Expand Down Expand Up @@ -228,11 +229,10 @@ def preprocess_image(image):
# image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
image = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
# save and reread to convert to scikit-image image type
temp_image_path = tempfile.gettempdir() + "/PDFScraper" + "/" + "deskew.jpg"
temp_image_path = tempfile.gettempdir() + "/PDFScraper" + "/" + str(uuid.uuid4()) + "deskew.jpg"
cv2.imwrite(temp_image_path, image)
image = io.imread(temp_image_path)
os.remove(temp_image_path)
# perform deskewing
image = deskew(image)
image = image * 255
io.imsave(temp_image_path, image.astype(np.uint8))
Expand Down Expand Up @@ -496,11 +496,3 @@ def find_words_tables(tables, search_mode, search_words, match_score):
result.append(table)
return result


# Ad-hoc script entry: parse a --path argument and wrap it in a Document.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--path', help='path to pdf file', required=True)
    cli_args = vars(parser.parse_args())
    doc = Document(cli_args["path"])
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ optional arguments:

`search_mode`, by default in 'and' mode, specifies whether all the search terms need to be contained inside paragraph. In 'or' mode, the paragraph is returned if any of the terms are contained. In 'and' mode, the paragraph is returned if all the terms are contained.

`multiprocessing`, by default True, runs process in multiple threads to speed up.
`multiprocessing`, by default True, runs process in multiple threads to speed up processing. **Should not be used with OCR as it significantly decreases performance**
### OCR

**tessdata pretrained language [files](https://github.com/tesseract-ocr/tessdata_best) need to be manually added to the tessdata directory.**
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
"yattag==1.14.0",
],
name="PDFScraper",
version="1.1.3",
version="1.1.6",
author="Erik Kastelec",
author_email="[email protected]",
description="PDF text and table search",
Expand Down

0 comments on commit b9b6339

Please sign in to comment.