forked from senderle/tess
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtess.py
130 lines (109 loc) · 3.86 KB
/
tess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import subprocess
import uuid
import os
import shlex
import argparse
import tempfile
# Concurrency limit consulted by convert_files() and tess_files() when they
# decide whether to wait for a child process before launching more.
maxproc = 8

# Debug switch; nothing in the visible code reads it.
debug = False

# Command template for the ImageMagick `convert` call. `{doc}` is replaced by
# the input document and `{tempfile}` by the image path to write.
conv = ' '.join([
    'convert -density 300 {doc} -depth 8',
    '-strip -background white -alpha off {tempfile}',
])

# Command template for the `tesseract` call. `{infile}` is the image to read,
# `{outfile}` the output base name and `{lang}` the language code.
# Alternative form kept for reference:
# tess = ('tesseract {infile} -l {lang} '
#         '-c preserve_interword_spaces=1 {outfile}')
tess = 'tesseract {infile} {outfile} -l {lang}'
def poll_and_popitem(running_ps):
    """Remove one process from ``running_ps`` and return it with its key.

    The entries are scanned in order and the first one whose ``poll()``
    reports an exit status is taken. The scan is linear, which is acceptable
    because the tracked processes are expected to be heavily IO-bound.

    When nothing has exited yet, the entry handed out by ``dict.popitem()``
    is taken instead (the most recently inserted one on Python 3.7+); the
    caller is expected to wait on it.

    Args:
        running_ps: Dict mapping an id (the output filename) to a process
            object exposing ``poll()``. It is modified in place.

    Returns:
        A ``(key, process)`` tuple for the removed entry.

    Raises:
        KeyError: If ``running_ps`` is empty.
    """
    finished_key = next(
        (key for key, proc in running_ps.items() if proc.poll() is not None),
        None,
    )
    if finished_key is not None:
        return finished_key, running_ps.pop(finished_key)
    # Nothing has terminated: hand back whichever entry popitem() yields.
    return running_ps.popitem()
def wait_for_ps(running_ps, infiles, outfiles):
    """Wait for one tracked process and report how it went.

    A process is taken out of ``running_ps`` via ``poll_and_popitem`` and
    waited on. A zero exit status records the result in ``outfiles`` and
    prints a success message; any other status prints an error message and
    records nothing.

    Args:
        running_ps: Dict mapping output filename to its running process;
            one entry is removed.
        infiles: Dict mapping output filename to the input it was made from.
        outfiles: Dict updated in place with ``input -> output filename``
            for a successful process.
    """
    target, proc = poll_and_popitem(running_ps)
    source = infiles[target]

    if proc.wait() != 0:
        # The first element of the argument list is the program name.
        print('Error: `{}` failed for file\n\t{}'.format(proc.args[0], source))
        print()
        return

    outfiles[source] = target
    print('File\n\t{}\nconverted to\n\t{}\nusing `{}`'.format(
        source, target, proc.args[0]))
    print()
def convert_files(files, tempdir):  # instead of tempdir, make_outfile
    """Convert every input document to a TIFF image inside ``tempdir``.

    One command built from the module-level ``conv`` template is started per
    document. Once ``maxproc`` children are running, one of them is waited on
    before the next is started; all remaining children are waited on before
    returning.

    Args:
        files: Iterable of paths of the documents to convert.
        tempdir: Directory in which the uniquely named TIFFs are created.

    Returns:
        Dict mapping each document whose conversion exited with status 0 to
        the path of its TIFF. Failed conversions are reported by
        ``wait_for_ps`` and left out.

    Raises:
        OSError: If the conversion program cannot be started.
    """
    # Tokenise the template once, *before* substituting the paths, so that a
    # path containing spaces or quotes stays a single command-line argument.
    template = shlex.split(conv)

    running_ps = {}
    docfiles = {}
    outfiles = {}
    for doc in files:
        # Pick a fresh random name; retry on the (unlikely) chance it exists.
        tiff_path = None  # make_outfile
        while tiff_path is None or os.path.exists(tiff_path):
            tiff_path = os.path.join(tempdir, str(uuid.uuid4()) + '.tiff')
        docfiles[tiff_path] = doc

        args = [part.format(doc=doc, tempfile=tiff_path) for part in template]
        print(args)
        running_ps[tiff_path] = subprocess.Popen(args)

        # `>=` caps the number of live children at `maxproc`; `>` would let
        # one extra process run.
        if len(running_ps) >= maxproc:
            wait_for_ps(running_ps, docfiles, outfiles)

    # Drain whatever is still running.
    while running_ps:
        wait_for_ps(running_ps, docfiles, outfiles)
    return outfiles
def tess_files(infiles, language):  # add make_outfile
    """Run the ``tess`` command over every converted image.

    The output base name handed to the command is the original document path
    with its extension removed. Once ``maxproc`` children are running, one of
    them is waited on before the next is started; all remaining children are
    waited on before returning.

    Args:
        infiles: Dict mapping each original document path to the image that
            should be read for it (the shape ``convert_files`` returns).
        language: Value substituted for ``{lang}`` in the ``tess`` template.

    Returns:
        Dict mapping each image whose run exited with status 0 to the output
        base name used for it. Failed runs are reported by ``wait_for_ps``
        and left out.

    Raises:
        OSError: If the OCR program cannot be started.
    """
    # Tokenise the template once, *before* substituting the paths, so that a
    # path containing spaces or quotes stays a single command-line argument.
    template = shlex.split(tess)

    running_ps = {}
    stepfiles = {}
    outfiles = {}
    for infile, stepfile in infiles.items():
        outfile = os.path.splitext(infile)[0]  # make_outfile

        # Documents that differ only in extension share one output base and
        # therefore one dictionary key. Let the earlier run finish first so
        # its process handle is not replaced (and left un-waited). The later
        # run still writes to the same output base.
        while outfile in running_ps:
            wait_for_ps(running_ps, stepfiles, outfiles)

        stepfiles[outfile] = stepfile
        args = [part.format(infile=stepfile, lang=language, outfile=outfile)
                for part in template]
        print(args)
        running_ps[outfile] = subprocess.Popen(args)

        # `>=` caps the number of live children at `maxproc`; `>` would let
        # one extra process run.
        if len(running_ps) >= maxproc:
            wait_for_ps(running_ps, stepfiles, outfiles)

    # Drain whatever is still running.
    while running_ps:
        wait_for_ps(running_ps, stepfiles, outfiles)
    return outfiles
if __name__ == '__main__':
    # Command-line interface: an optional tesseract language code plus one
    # or more input files.
    cli = argparse.ArgumentParser(
        description='OCR workflow for a range of file types.'
    )
    language_help = (
        'The tesseract language ID code for the language you would '
        'like to use. Default is `eng` (English).'
    )
    cli.add_argument('--language', '-l', default='eng', help=language_help)
    cli.add_argument('files', nargs='+',
                     help='One or more image files to process.')
    options = cli.parse_args()

    try:
        # Intermediate images live in a scratch directory that is removed
        # when the `with` block exits.
        with tempfile.TemporaryDirectory() as scratch:
            converted = convert_files(options.files, scratch)
            tess_files(converted, options.language)
    except OSError as exc:
        # Each inner list is one paragraph of the report; a blank line is
        # printed after every paragraph.
        report = [
            ['Either the `convert` or the `tesseract` command could not '
             'be found.'],
            ['Make sure that you have installed both ImageMagick and '
             'Tesseract, and',
             'that the `convert` and `tesseract` executables are on the '
             'system path.'],
            ['Here is the original exception message:'],
            [exc],
        ]
        for paragraph in report:
            for line in paragraph:
                print(line)
            print()