Skip to content

Commit

Permalink
multiprocessing support
Browse files Browse the repository at this point in the history
  • Loading branch information
erikkastelec committed Sep 5, 2020
1 parent 04a50ac commit d93e526
Show file tree
Hide file tree
Showing 5 changed files with 171 additions and 104 deletions.
2 changes: 1 addition & 1 deletion PDFScraper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "1.1.2"
__version__ = "1.1.3"

import logging

Expand Down
133 changes: 92 additions & 41 deletions PDFScraper/cli.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import argparse
import logging
import multiprocessing
import os
import shutil
import signal
Expand Down Expand Up @@ -61,7 +62,7 @@ def search_mode_helper(v):
argumentParser.add_argument('--search_mode', type=search_mode_helper, help='And or Or search, when multiple '
'search words are provided',
default=True)

argumentParser.add_argument('--multiprocessing', type=str2bool, help='should multiprocessing be enabled', default=True)
args = vars(argumentParser.parse_args())
output_path = args["out"]
log_level = logger_switcher.get(args["log_level"])
Expand All @@ -86,6 +87,46 @@ def signal_handler(sign, frame):
signal.signal(signal.SIGINT, signal_handler)


def process_doc(doc):
    """Run the full extraction pipeline on a single document.

    Extracts metadata, table of contents, page layouts and (optionally)
    tables from *doc*, falling back to OCR whenever regular text
    extraction is impossible or yields no paragraphs.  Reads the
    module-level ``tables_extract`` flag and ``tessdata_location`` path,
    and mutates *doc* in place; returns ``None``.  Designed to be safe
    as a ``multiprocessing.Pool.map`` worker (one independent doc per call).
    """
    extract_info(doc)
    get_filename(doc)
    if doc.is_pdf:
        pdf_object = get_pdf_object(doc)
        if doc.extractable:
            logger.debug('Document information:' + '\n' + doc.document_info_to_string())
            extract_table_of_contents(doc, pdf_object)
            logger.debug('Table of contents: \n' + doc.table_of_contents_to_string())
            page_layouts = extract_page_layouts(pdf_object)
            # table extraction is possible only for text based PDFs
            if tables_extract:
                extract_tables(doc)
            parse_layouts(doc, page_layouts)
            if len(doc.paragraphs) == 0:
                # Text-based parsing found nothing (e.g. a scanned PDF):
                # retry the whole extraction via OCR.
                _ocr_fallback(doc)
        else:
            # Typo fixed in original message ("exable" -> "extractable").
            logger.warning("Skipping parsing. Document is not extractable.")
    else:
        # Non-PDF input goes straight to the OCR pipeline.
        _ocr_fallback(doc)
    logger.debug('Paragraphs: \n' + '\n'.join(doc.paragraphs))


def _ocr_fallback(doc):
    """Rasterize *doc*, OCR it with tesseract, then re-run extraction.

    Shared fallback used both for non-PDF inputs and for PDFs whose
    text layer yielded no paragraphs.  Mutates *doc* in place.
    """
    logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
    pdf_to_image(doc)
    convert_to_pdf(doc, tessdata_location)
    pdf_object = get_pdf_object(doc)
    page_layouts = extract_page_layouts(pdf_object)
    if tables_extract:
        extract_tables(doc)
    parse_layouts(doc, page_layouts)

def cli():
path = os.path.abspath(args["path"])
logger.info('Finding PDFs in ' + path)
Expand All @@ -100,48 +141,58 @@ def cli():
logger.info('Parsing ' + str(len(docs)) + ' documents')
# Extract information about PDFs
progress_counter = 1
for doc in docs:
extract_info(doc)
get_filename(doc)
if doc.is_pdf:
pdf_object = get_pdf_object(doc)
if doc.extractable:

logger.debug('Document information:' + '\n' + doc.document_info_to_string())
extract_table_of_contents(doc, pdf_object)
logger.debug('Table of contents: \n' + doc.table_of_contents_to_string())
page_layouts = extract_page_layouts(pdf_object)
# table extraction is possible only for text based PDFs
if tables_extract:
extract_tables(doc)
parse_layouts(doc, page_layouts)
if len(doc.paragraphs) == 0:
logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
pdf_to_image(doc)
convert_to_pdf(doc, tessdata_location)
pdf_object = get_pdf_object(doc)
page_layouts = extract_page_layouts(pdf_object)
if tables_extract:
extract_tables(doc)
parse_layouts(doc, page_layouts)

else:
logger.warning("Skipping parsing. Document is not exable.")
logger.info('Parsed ' + str(progress_counter) + ' out of ' + str(len(docs)) + ' documents')
progress_counter += 1
else:
logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
pdf_to_image(doc)
convert_to_pdf(doc, tessdata_location)
pdf_object = get_pdf_object(doc)
page_layouts = extract_page_layouts(pdf_object)
if tables_extract:
extract_tables(doc)
parse_layouts(doc, page_layouts)
logger.debug('Paragraphs: \n' + '\n'.join(doc.paragraphs))

# Multiprocessing
if args["multiprocessing"]:
pool = multiprocessing.Pool()
pool.map(process_doc, docs)
else:
for doc in docs:
process_doc(doc)
logger.info('Done parsing PDFs')
logger.info('Stopping')
logger.info('Generating summary')
generate_html(output_path, docs, search_word, search_mode)
# clean up temporary directory
logger.info('Stopping')
shutil.rmtree(tempfile.gettempdir() + "/PDFScraper", ignore_errors=True)
sys.exit(0)


def process_doc(doc):
    """Extract metadata, text, and (optionally) tables from one document.

    Factored out of the old sequential per-document loop so it can be
    handed to ``multiprocessing.Pool.map``.  Reads the module-level
    ``tables_extract`` flag and ``tessdata_location`` path; mutates
    *doc* in place and returns ``None``.
    """
    extract_info(doc)
    get_filename(doc)
    if doc.is_pdf:
        pdf_object = get_pdf_object(doc)
        if doc.extractable:
            logger.debug('Document information:' + '\n' + doc.document_info_to_string())
            extract_table_of_contents(doc, pdf_object)
            logger.debug('Table of contents: \n' + doc.table_of_contents_to_string())
            page_layouts = extract_page_layouts(pdf_object)
            # table extraction is possible only for text based PDFs
            if tables_extract:
                extract_tables(doc)
            parse_layouts(doc, page_layouts)
            if len(doc.paragraphs) == 0:
                # Text-layer parsing yielded nothing -> retry via OCR.
                logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
                pdf_to_image(doc)
                convert_to_pdf(doc, tessdata_location)
                pdf_object = get_pdf_object(doc)
                page_layouts = extract_page_layouts(pdf_object)
                if tables_extract:
                    extract_tables(doc)
                parse_layouts(doc, page_layouts)
        else:
            # NOTE(review): "exable" looks like a typo for "extractable" — confirm.
            logger.warning("Skipping parsing. Document is not exable.")
        # Progress logging from the old sequential loop, disabled for the pool:
        # logger.info('Parsed ' + str(progress_counter) + ' out of ' + str(len(docs)) + ' documents')
        # progress_counter += 1
    else:
        # Non-PDF input: go straight to the OCR pipeline.
        logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
        pdf_to_image(doc)
        convert_to_pdf(doc, tessdata_location)
        pdf_object = get_pdf_object(doc)
        page_layouts = extract_page_layouts(pdf_object)
        if tables_extract:
            extract_tables(doc)
        parse_layouts(doc, page_layouts)
Loading

0 comments on commit d93e526

Please sign in to comment.