Skip to content

Commit

Permalink
added unique filename in preprocess_image
Browse files Browse the repository at this point in the history
  • Loading branch information
erikkastelec committed Sep 6, 2020
1 parent d93e526 commit b9b6339
Show file tree
Hide file tree
Showing 5 changed files with 12 additions and 54 deletions.
2 changes: 1 addition & 1 deletion PDFScraper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "1.1.3"
__version__ = "1.1.6"

import logging

Expand Down
48 changes: 7 additions & 41 deletions PDFScraper/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,7 @@ def process_doc(doc):

else:
logger.warning("Skipping parsing. Document is not exable.")
# logger.info('Parsed ' + str(progress_counter) + ' out of ' + str(len(docs)) + ' documents')
# progress_counter += 1

else:
logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
pdf_to_image(doc)
Expand All @@ -142,13 +141,16 @@ def cli():
# Extract information about PDFs
progress_counter = 1

# Multiprocessing
# Multiprocessing -- Improves speed of processing multiple documents significantly
# !! BAD PERFORMANCE OF OCR WITH MULTIPLE FILES
if args["multiprocessing"]:
pool = multiprocessing.Pool()
pool.map(process_doc, docs)
else:
for doc in docs:
process_doc(doc)
logger.info('Parsed ' + str(progress_counter) + ' out of ' + str(len(docs)) + ' documents')
progress_counter += 1
logger.info('Done parsing PDFs')
logger.info('Generating summary')
generate_html(output_path, docs, search_word, search_mode)
Expand All @@ -158,41 +160,5 @@ def cli():
sys.exit(0)


def process_doc(doc):
    """Extract text, table-of-contents, and (optionally) tables from one document.

    Presumably mutates *doc* in place via the parse/extract helpers (no value
    is returned) — confirm against the helpers in core.py.  Relies on
    module-level state (``logger``, ``tables_extract``, ``tessdata_location``)
    being initialized by the CLI before this is called.
    """
    # Gather document metadata and derive the filename used in output.
    extract_info(doc)
    get_filename(doc)
    if doc.is_pdf:
        pdf_object = get_pdf_object(doc)
        if doc.extractable:

            logger.debug('Document information:' + '\n' + doc.document_info_to_string())
            extract_table_of_contents(doc, pdf_object)
            logger.debug('Table of contents: \n' + doc.table_of_contents_to_string())
            page_layouts = extract_page_layouts(pdf_object)
            # table extraction is possible only for text based PDFs
            if tables_extract:
                extract_tables(doc)
            parse_layouts(doc, page_layouts)
            # No paragraphs came out of regular parsing — the PDF is likely
            # image-only, so fall back to OCR: rasterize, OCR back into a
            # searchable PDF, then re-run the same extraction pipeline.
            if len(doc.paragraphs) == 0:
                logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
                pdf_to_image(doc)
                convert_to_pdf(doc, tessdata_location)
                pdf_object = get_pdf_object(doc)
                page_layouts = extract_page_layouts(pdf_object)
                if tables_extract:
                    extract_tables(doc)
                parse_layouts(doc, page_layouts)

        else:
            # NOTE(review): "exable" looks like a typo for "extractable", but
            # this is a runtime log string, so it is left byte-identical here.
            logger.warning("Skipping parsing. Document is not exable.")
            # logger.info('Parsed ' + str(progress_counter) + ' out of ' + str(len(docs)) + ' documents')
            # progress_counter += 1
    else:
        # Input is not a PDF: go straight to the OCR pipeline.
        logger.info("Regular text extraction is not possible. Trying to extract text using OCR")
        pdf_to_image(doc)
        convert_to_pdf(doc, tessdata_location)
        pdf_object = get_pdf_object(doc)
        page_layouts = extract_page_layouts(pdf_object)
        if tables_extract:
            extract_tables(doc)
        parse_layouts(doc, page_layouts)
# Script entry point: run the CLI when this module is executed directly.
if __name__ == "__main__":
    cli()
12 changes: 2 additions & 10 deletions PDFScraper/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import re
import sys
import tempfile
import uuid
from typing import TYPE_CHECKING

import camelot
Expand Down Expand Up @@ -228,11 +229,10 @@ def preprocess_image(image):
# image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
image = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
# save and reread to convert to scikit-image image type
temp_image_path = tempfile.gettempdir() + "/PDFScraper" + "/" + "deskew.jpg"
temp_image_path = tempfile.gettempdir() + "/PDFScraper" + "/" + str(uuid.uuid4()) + "deskew.jpg"
cv2.imwrite(temp_image_path, image)
image = io.imread(temp_image_path)
os.remove(temp_image_path)
# perform deskewing
image = deskew(image)
image = image * 255
io.imsave(temp_image_path, image.astype(np.uint8))
Expand Down Expand Up @@ -496,11 +496,3 @@ def find_words_tables(tables, search_mode, search_words, match_score):
result.append(table)
return result


# Ad-hoc script entry: parse a --path argument and wrap it in a Document.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--path', help='path to pdf file', required=True)
    cli_args = vars(parser.parse_args())
    doc = Document(cli_args["path"])
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ optional arguments:

`search_mode`, by default in 'and' mode, specifies whether all the search terms need to be contained inside paragraph. In 'or' mode, the paragraph is returned if any of the terms are contained. In 'and' mode, the paragraph is returned if all the terms are contained.

`multiprocessing`, by default True, runs process in multiple threads to speed up.
`multiprocessing`, by default True, runs process in multiple threads to speed up processing. **Should not be used with OCR as it significantly decreases performance**
### OCR

**tessdata pretrained language [files](https://github.com/tesseract-ocr/tessdata_best) need to be manually added to the tessdata directory.**
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
"yattag==1.14.0",
],
name="PDFScraper",
version="1.1.3",
version="1.1.6",
author="Erik Kastelec",
author_email="[email protected]",
description="PDF text and table search",
Expand Down

0 comments on commit b9b6339

Please sign in to comment.