Skip to content

Commit

Permalink
use (py)tesseract to extract text from images
Browse files Browse the repository at this point in the history
  • Loading branch information
generic-github-user committed Jul 21, 2022
1 parent e0b2837 commit d481791
Showing 1 changed file with 31 additions and 0 deletions.
31 changes: 31 additions & 0 deletions ap/ap.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@

import psutil
import hashlib
from PIL import Image
from PIL import UnidentifiedImageError
import pytesseract

# Filesystem path of a .pickle file; presumably the on-disk database that
# the rest of this module loads and saves -- TODO confirm against callers.
dbpath = '/home/alex/Desktop/ap.pickle'
# Numeric limit consumed elsewhere in the file; its unit is not visible
# here (presumably seconds) -- TODO confirm.
timelimit = 20
Expand Down Expand Up @@ -233,3 +236,31 @@ def tagfiles(n=0):
if not hasattr(anode, 'tags'): setattr(anode, 'tags', [])
anode.print()
save()
def mayhave(obj, attr):
    """Return the attribute ``attr`` of ``obj``, or ``None`` if it is absent.

    Args:
        obj: Any object.
        attr: Name of the attribute to look up.

    Returns:
        The attribute value when ``obj`` has ``attr``; otherwise ``None``.
        An attribute that exists with the value ``None`` is therefore
        indistinguishable from a missing attribute.
    """
    # One lookup with a default replaces the hasattr/getattr pair.
    return getattr(obj, attr, None)

def extracttext(n=0):
    """Run Tesseract OCR over unprocessed image files and store the text.

    Iterates ``data['files']`` and selects entries that have a ``tags``
    attribute containing ``'image'`` and no truthy ``processed`` attribute.
    For each selected entry the recognized text is stored on ``anode.text``,
    a whitespace-condensed copy is logged, and the entry is flagged as
    ``processed``. ``save()`` is called once after the loop.

    Args:
        n: Maximum number of matching files to handle in this call. It is
            passed to ``itertools.islice`` as the stop value, so the default
            of 0 selects no files at all.
    """
    log('Running OCR (Tesseract)')

    # Lazy selection: tagged as an image and not yet flagged as processed.
    pending = (
        node for node in data['files']
        if hasattr(node, 'tags')
        and 'image' in node.tags
        and not getattr(node, 'processed', None)
    )

    for anode in itertools.islice(pending, n):
        log('', 1)
        log(anode.path, 1)
        try:
            # Image.open keeps the file handle open; the context manager
            # releases it as soon as OCR has finished with the image.
            with Image.open(anode.path) as image:
                imgcontent = pytesseract.image_to_string(image)
        except UnidentifiedImageError as exception:
            # The file could not be decoded as an image; report and move on.
            print(exception)
        else:
            anode.text = imgcontent
            # Collapse every run of newlines/spaces into a single space.
            # Newlines are replaced (not deleted) so words on adjacent lines
            # stay separated, and no flag is passed in re.sub's positional
            # ``count`` slot, so all runs are collapsed rather than the
            # first eight.
            text = re.sub(r'[\n ]+', ' ', imgcontent)
            log(f"Result (condensed): {text}", 1)
        # NOTE(review): the entry is flagged whether or not OCR succeeded so
        # unreadable files are not retried on every run -- confirm that this
        # matches the intended nesting of the original statement.
        anode.processed = True
    save()

0 comments on commit d481791

Please sign in to comment.