-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmain.py
66 lines (52 loc) · 2.3 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import argparse
import cv2
import utils
from PIL import Image
from image_processing import crop_text_region, fix_orientation
from document_processing import extract_fields, identify_doc_type
from ocr_engine import ocr_easyocr, draw_ocr_results
def main(image_path):
    """Run the OCR pipeline on one image and print the extracted fields.

    The image is loaded, orientation-corrected, cropped to its text region,
    passed through EasyOCR, classified by document type, and finally reduced
    to the fields returned by ``extract_fields``. Three debug images are
    written to the current working directory along the way:
    ``cropped.png``, ``raw_ocr_result.png`` and ``filtered_ocr_result.png``.

    Args:
        image_path: Path of the image file to process.

    Raises:
        OSError: If OpenCV cannot read ``image_path``.
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with the offending path rather than inside cvtColor.
    if img is None:
        raise OSError(f'Could not read image file: {image_path!r}')
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # TODO: fix document orientation
    print('Fixing document orientation')
    img = fix_orientation(img)

    print('Cropping text region')
    cropped_img, _, _ = crop_text_region(img)
    # NOTE(review): img was converted to RGB above, while cv2.imwrite expects
    # BGR channel order. If the helpers preserve channel order, the saved
    # debug images have red and blue swapped -- confirm.
    cv2.imwrite('cropped.png', cropped_img)

    # ocr, extract all texts
    # TODO: add pytesseract, etc.
    print('Performing OCR')
    ocr_results = ocr_easyocr(cropped_img)
    ocr_img = draw_ocr_results(cropped_img, ocr_results)
    cv2.imwrite('raw_ocr_result.png', ocr_img)

    # TODO: post-process ocr results (misspelling correction)

    # TODO: identify doc type based on ocr results
    doc_type = identify_doc_type(cropped_img, ocr_results)
    print('Detected document type:', utils.DOC_TYPE_TH[doc_type])

    # TODO: field extraction
    # - read field_positions
    # - do some smart filtering for each file type
    # - line height detection for variable lines?
    extracted_fields = extract_fields(cropped_img, ocr_results, doc_type)

    # print final ocr results
    print(f'Extracted text from {image_path}:')
    filtered_ocr_results = []
    for field in extracted_fields:
        # Each field is indexed as (name, ocr_result); the recognised text is
        # read from index 1 of the OCR result.
        field_name = field[0]
        field_ocr_result = field[1]
        print(f'{field_name}: {field_ocr_result[1]}')
        filtered_ocr_results.append(field_ocr_result)

    # create image with bbox of extracted fields only
    filtered_ocr_img = draw_ocr_results(cropped_img, filtered_ocr_results)
    cv2.imwrite('filtered_ocr_result.png', filtered_ocr_img)
if __name__ == '__main__':
    # Command-line entry point: parse the image path, validate its extension,
    # and run the OCR pipeline on it.
    parser = argparse.ArgumentParser(description='Chamomind OCR')
    parser.add_argument('image_file', type=str,
                        help='Path to the image file (must be .png).')
    # TODO: add a -v/--verbose flag to enable verbose output.
    args = parser.parse_args()

    if not args.image_file.lower().endswith('.png'):
        # parser.error prints usage plus the message to stderr and exits with
        # status 2, so a rejected input is reported as a failure to the shell
        # (a bare exit() would report success).
        parser.error('Please provide a PNG image file.')

    main(args.image_file)