-
Notifications
You must be signed in to change notification settings - Fork 30
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
28 additions
and
61 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,68 +1,35 @@ | ||
from json import loads | ||
from pkg_resources import resource_string | ||
from tempfile import NamedTemporaryFile | ||
from pathlib import Path | ||
from os.path import join | ||
|
||
from PIL import Image | ||
|
||
from typing import Optional | ||
from ocrd.processor.ocrd_page_result import OcrdPageResult | ||
from ocrd_models import OcrdPage | ||
from ocrd import Processor | ||
from ocrd_modelfactory import page_from_file, exif_from_filename | ||
from ocrd_models import OcrdFile, OcrdExif | ||
from ocrd_models.ocrd_page import to_xml | ||
from ocrd_utils import ( | ||
getLogger, | ||
MIMETYPE_PAGE, | ||
assert_file_grp_cardinality, | ||
make_file_id | ||
) | ||
|
||
from .eynollah import Eynollah | ||
from .utils.pil_cv2 import pil2cv | ||
|
||
OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) | ||
|
||
class EynollahProcessor(Processor): | ||
|
||
def __init__(self, *args, **kwargs): | ||
kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-eynollah-segment'] | ||
kwargs['version'] = OCRD_TOOL['version'] | ||
super().__init__(*args, **kwargs) | ||
@property | ||
def metadata_location(self) -> str: | ||
return 'eynollah/ocrd-tool.json' | ||
|
||
def process(self): | ||
LOG = getLogger('eynollah') | ||
assert_file_grp_cardinality(self.input_file_grp, 1) | ||
assert_file_grp_cardinality(self.output_file_grp, 1) | ||
for n, input_file in enumerate(self.input_files): | ||
page_id = input_file.pageId or input_file.ID | ||
LOG.info("INPUT FILE %s (%d/%d) ", page_id, n + 1, len(self.input_files)) | ||
pcgts = page_from_file(self.workspace.download_file(input_file)) | ||
LOG.debug('width %s height %s', pcgts.get_Page().imageWidth, pcgts.get_Page().imageHeight) | ||
self.add_metadata(pcgts) | ||
page = pcgts.get_Page() | ||
# XXX loses DPI information | ||
# page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') | ||
image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename | ||
eynollah_kwargs = { | ||
'dir_models': self.resolve_resource(self.parameter['models']), | ||
'allow_enhancement': False, | ||
'curved_line': self.parameter['curved_line'], | ||
'full_layout': self.parameter['full_layout'], | ||
'allow_scaling': self.parameter['allow_scaling'], | ||
'headers_off': self.parameter['headers_off'], | ||
'tables': self.parameter['tables'], | ||
'override_dpi': self.parameter['dpi'], | ||
'logger': LOG, | ||
'pcgts': pcgts, | ||
'image_filename': image_filename | ||
} | ||
Eynollah(**eynollah_kwargs).run() | ||
file_id = make_file_id(input_file, self.output_file_grp) | ||
pcgts.set_pcGtsId(file_id) | ||
self.workspace.add_file( | ||
ID=file_id, | ||
file_grp=self.output_file_grp, | ||
pageId=page_id, | ||
mimetype=MIMETYPE_PAGE, | ||
local_filename=join(self.output_file_grp, file_id) + '.xml', | ||
content=to_xml(pcgts)) | ||
def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: | ||
assert input_pcgts | ||
assert input_pcgts[0] | ||
pcgts = input_pcgts[0] | ||
page = pcgts.get_Page() | ||
# XXX loses DPI information | ||
# page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') | ||
image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(local_filename=page.imageFilename))).local_filename | ||
Eynollah( | ||
dir_models=self.resolve_resource(self.parameter['models']), | ||
allow_enhancement=False, | ||
curved_line=self.parameter['curved_line'], | ||
full_layout=self.parameter['full_layout'], | ||
allow_scaling=self.parameter['allow_scaling'], | ||
headers_off=self.parameter['headers_off'], | ||
tables=self.parameter['tables'], | ||
override_dpi=self.parameter['dpi'], | ||
logger=self.logger, | ||
pcgts=pcgts, | ||
image_filename=image_filename | ||
).run() | ||
return OcrdPageResult(pcgts) |