Skip to content

Commit

Permalink
multiprocessing support
Browse files Browse the repository at this point in the history
  • Loading branch information
erikkastelec committed Sep 5, 2020
1 parent 04a50ac commit d93e526
Show file tree
Hide file tree
Showing 5 changed files with 171 additions and 104 deletions.
2 changes: 1 addition & 1 deletion PDFScraper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "1.1.2"
__version__ = "1.1.3"

import logging

Expand Down
133 changes: 92 additions & 41 deletions PDFScraper/cli.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import argparse
import logging
import multiprocessing
import os
import shutil
import signal
Expand Down Expand Up @@ -61,7 +62,7 @@ def search_mode_helper(v):
argumentParser.add_argument('--search_mode', type=search_mode_helper, help='And or Or search, when multiple '
'search words are provided',
default=True)

argumentParser.add_argument('--multiprocessing', type=str2bool, help='should multiprocessing be enabled', default=True)
args = vars(argumentParser.parse_args())
output_path = args["out"]
log_level = logger_switcher.get(args["log_level"])
Expand All @@ -86,6 +87,46 @@ def signal_handler(sign, frame):
signal.signal(signal.SIGINT, signal_handler)


def process_doc(doc):
    """Run the full extraction pipeline on a single document.

    Extracts metadata, table of contents, page layouts and (optionally)
    tables from *doc*, falling back to OCR whenever regular text
    extraction is impossible or yields no paragraphs.  Reads the
    module-level ``tables_extract`` flag and ``tessdata_location`` path,
    and mutates *doc* in place; returns ``None``.  Designed to be safe
    as a ``multiprocessing.Pool.map`` worker (one independent doc per call).
    """
    extract_info(doc)
    get_filename(doc)
    if doc.is_pdf:
        pdf_object = get_pdf_object(doc)
        if doc.extractable:
            logger.debug('Document information:' + '\n' + doc.document_info_to_string())
            extract_table_of_contents(doc, pdf_object)
            logger.debug('Table of contents: \n' + doc.table_of_contents_to_string())
            page_layouts = extract_page_layouts(pdf_object)
            # table extraction is possible only for text based PDFs
            if tables_extract:
                extract_tables(doc)
            parse_layouts(doc, page_layouts)
            if len(doc.paragraphs) == 0:
                # Text-based parsing found nothing (e.g. a scanned PDF):
                # retry the whole extraction via OCR.
                _ocr_fallback(doc)
        else:
            # Typo fixed in original message ("exable" -> "extractable").
            logger.warning("Skipping parsing. Document is not extractable.")
    else:
        # Non-PDF input goes straight to the OCR pipeline.
        _ocr_fallback(doc)
    logger.debug('Paragraphs: \n' + '\n'.join(doc.paragraphs))


def _ocr_fallback(doc):
    """Rasterize *doc*, OCR it with tesseract, then re-run extraction.

    Shared fallback used both for non-PDF inputs and for PDFs whose
    text layer yielded no paragraphs.  Mutates *doc* in place.
    """
    logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
    pdf_to_image(doc)
    convert_to_pdf(doc, tessdata_location)
    pdf_object = get_pdf_object(doc)
    page_layouts = extract_page_layouts(pdf_object)
    if tables_extract:
        extract_tables(doc)
    parse_layouts(doc, page_layouts)

def cli():
path = os.path.abspath(args["path"])
logger.info('Finding PDFs in ' + path)
Expand All @@ -100,48 +141,58 @@ def cli():
logger.info('Parsing ' + str(len(docs)) + ' documents')
# Extract information about PDFs
progress_counter = 1
for doc in docs:
extract_info(doc)
get_filename(doc)
if doc.is_pdf:
pdf_object = get_pdf_object(doc)
if doc.extractable:

logger.debug('Document information:' + '\n' + doc.document_info_to_string())
extract_table_of_contents(doc, pdf_object)
logger.debug('Table of contents: \n' + doc.table_of_contents_to_string())
page_layouts = extract_page_layouts(pdf_object)
# table extraction is possible only for text based PDFs
if tables_extract:
extract_tables(doc)
parse_layouts(doc, page_layouts)
if len(doc.paragraphs) == 0:
logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
pdf_to_image(doc)
convert_to_pdf(doc, tessdata_location)
pdf_object = get_pdf_object(doc)
page_layouts = extract_page_layouts(pdf_object)
if tables_extract:
extract_tables(doc)
parse_layouts(doc, page_layouts)

else:
logger.warning("Skipping parsing. Document is not exable.")
logger.info('Parsed ' + str(progress_counter) + ' out of ' + str(len(docs)) + ' documents')
progress_counter += 1
else:
logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
pdf_to_image(doc)
convert_to_pdf(doc, tessdata_location)
pdf_object = get_pdf_object(doc)
page_layouts = extract_page_layouts(pdf_object)
if tables_extract:
extract_tables(doc)
parse_layouts(doc, page_layouts)
logger.debug('Paragraphs: \n' + '\n'.join(doc.paragraphs))

# Multiprocessing
if args["multiprocessing"]:
pool = multiprocessing.Pool()
pool.map(process_doc, docs)
else:
for doc in docs:
process_doc(doc)
logger.info('Done parsing PDFs')
logger.info('Stopping')
logger.info('Generating summary')
generate_html(output_path, docs, search_word, search_mode)
# clean up temporary directory
logger.info('Stopping')
shutil.rmtree(tempfile.gettempdir() + "/PDFScraper", ignore_errors=True)
sys.exit(0)


def process_doc(doc):
    """Extract metadata, text, and (optionally) tables from one document.

    Factored out of the old sequential per-document loop so it can be
    handed to ``multiprocessing.Pool.map``.  Reads the module-level
    ``tables_extract`` flag and ``tessdata_location`` path; mutates
    *doc* in place and returns ``None``.
    """
    extract_info(doc)
    get_filename(doc)
    if doc.is_pdf:
        pdf_object = get_pdf_object(doc)
        if doc.extractable:
            logger.debug('Document information:' + '\n' + doc.document_info_to_string())
            extract_table_of_contents(doc, pdf_object)
            logger.debug('Table of contents: \n' + doc.table_of_contents_to_string())
            page_layouts = extract_page_layouts(pdf_object)
            # table extraction is possible only for text based PDFs
            if tables_extract:
                extract_tables(doc)
            parse_layouts(doc, page_layouts)
            if len(doc.paragraphs) == 0:
                # Text-layer parsing yielded nothing -> retry via OCR.
                logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
                pdf_to_image(doc)
                convert_to_pdf(doc, tessdata_location)
                pdf_object = get_pdf_object(doc)
                page_layouts = extract_page_layouts(pdf_object)
                if tables_extract:
                    extract_tables(doc)
                parse_layouts(doc, page_layouts)
        else:
            # NOTE(review): "exable" looks like a typo for "extractable" — confirm.
            logger.warning("Skipping parsing. Document is not exable.")
        # Progress logging from the old sequential loop, disabled for the pool:
        # logger.info('Parsed ' + str(progress_counter) + ' out of ' + str(len(docs)) + ' documents')
        # progress_counter += 1
    else:
        # Non-PDF input: go straight to the OCR pipeline.
        logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
        pdf_to_image(doc)
        convert_to_pdf(doc, tessdata_location)
        pdf_object = get_pdf_object(doc)
        page_layouts = extract_page_layouts(pdf_object)
        if tables_extract:
            extract_tables(doc)
        parse_layouts(doc, page_layouts)
Loading

0 comments on commit d93e526

Please sign in to comment.