From 4f98e6d1804aee55cc3754d8afda127235f50f8a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 6 Jul 2024 18:48:07 +0200 Subject: [PATCH 01/34] adapt to ocrd v3 Processor init (automatic ocrd-tool.json loading) --- ocrd_tesserocr/binarize.py | 11 +++-------- ocrd_tesserocr/config.py | 4 ---- ocrd_tesserocr/crop.py | 11 +++-------- ocrd_tesserocr/deskew.py | 11 +++-------- ocrd_tesserocr/fontshape.py | 11 +++-------- ocrd_tesserocr/recognize.py | 28 ++++++++++++++-------------- ocrd_tesserocr/segment.py | 12 +++++------- ocrd_tesserocr/segment_line.py | 12 +++++------- ocrd_tesserocr/segment_region.py | 12 +++++------- ocrd_tesserocr/segment_table.py | 12 +++++------- ocrd_tesserocr/segment_word.py | 12 +++++------- 11 files changed, 51 insertions(+), 85 deletions(-) delete mode 100644 ocrd_tesserocr/config.py diff --git a/ocrd_tesserocr/binarize.py b/ocrd_tesserocr/binarize.py index 84a5e5e..180fb4c 100644 --- a/ocrd_tesserocr/binarize.py +++ b/ocrd_tesserocr/binarize.py @@ -19,17 +19,12 @@ to_xml ) -from .config import OCRD_TOOL from .recognize import TesserocrRecognize -TOOL = 'ocrd-tesserocr-binarize' - class TesserocrBinarize(TesserocrRecognize): - def __init__(self, *args, **kwargs): - kwargs.setdefault('ocrd_tool', OCRD_TOOL['tools'][TOOL]) - super().__init__(*args, **kwargs) - if hasattr(self, 'parameter'): - self.logger = getLogger('processor.TesserocrBinarize') + @property + def executable(self): + return 'ocrd-tesserocr-binarize' def process(self): """Performs binarization of the region / line with Tesseract on the workspace. 
diff --git a/ocrd_tesserocr/config.py b/ocrd_tesserocr/config.py deleted file mode 100644 index 01e0b23..0000000 --- a/ocrd_tesserocr/config.py +++ /dev/null @@ -1,4 +0,0 @@ -import json -from pkg_resources import resource_string - -OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) diff --git a/ocrd_tesserocr/crop.py b/ocrd_tesserocr/crop.py index 47ccabc..7e7af8e 100644 --- a/ocrd_tesserocr/crop.py +++ b/ocrd_tesserocr/crop.py @@ -24,17 +24,12 @@ to_xml ) -from .config import OCRD_TOOL from .recognize import TesserocrRecognize, polygon_for_parent -TOOL = 'ocrd-tesserocr-crop' - class TesserocrCrop(TesserocrRecognize): - def __init__(self, *args, **kwargs): - kwargs.setdefault('ocrd_tool', OCRD_TOOL['tools'][TOOL]) - super().__init__(*args, **kwargs) - if hasattr(self, 'parameter'): - self.logger = getLogger('processor.TesserocrCrop') + @property + def executable(self): + return 'ocrd-tesserocr-crop' def process(self): """Performs page cropping with Tesseract on the workspace. diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py index a76e203..cacf632 100644 --- a/ocrd_tesserocr/deskew.py +++ b/ocrd_tesserocr/deskew.py @@ -24,17 +24,12 @@ to_xml ) -from .config import OCRD_TOOL from .recognize import TesserocrRecognize -TOOL = 'ocrd-tesserocr-deskew' - class TesserocrDeskew(TesserocrRecognize): - def __init__(self, *args, **kwargs): - kwargs.setdefault('ocrd_tool', OCRD_TOOL['tools'][TOOL]) - super().__init__(*args, **kwargs) - if hasattr(self, 'parameter'): - self.logger = getLogger('processor.TesserocrDeskew') + @property + def executable(self): + return 'ocrd-tesserocr-deskew' def process(self): """Performs deskewing of the page / region with Tesseract on the workspace. 
diff --git a/ocrd_tesserocr/fontshape.py b/ocrd_tesserocr/fontshape.py index bfac399..06d762c 100644 --- a/ocrd_tesserocr/fontshape.py +++ b/ocrd_tesserocr/fontshape.py @@ -19,17 +19,12 @@ to_xml) from ocrd_modelfactory import page_from_file -from .config import OCRD_TOOL from .recognize import TesserocrRecognize -TOOL = 'ocrd-tesserocr-fontshape' - class TesserocrFontShape(TesserocrRecognize): - def __init__(self, *args, **kwargs): - kwargs.setdefault('ocrd_tool', OCRD_TOOL['tools'][TOOL]) - super().__init__(*args, **kwargs) - if hasattr(self, 'parameter'): - self.logger = getLogger('processor.TesserocrFontShape') + @property + def executable(self): + return 'ocrd-tesserocr-fontshape' def process(self): """Detect font shapes via rule-based OCR with Tesseract on the workspace. diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py index d4299a6..f591eed 100644 --- a/ocrd_tesserocr/recognize.py +++ b/ocrd_tesserocr/recognize.py @@ -30,6 +30,7 @@ points_from_polygon, xywh_from_polygon, MIMETYPE_PAGE, + VERSION as OCRD_VERSION, membername ) from ocrd_models.ocrd_page import ( @@ -62,10 +63,6 @@ from ocrd_modelfactory import page_from_file from ocrd import Processor -from .config import OCRD_TOOL - -TOOL = 'ocrd-tesserocr-recognize' - CHOICE_THRESHOLD_NUM = 10 # maximum number of choices to query and annotate CHOICE_THRESHOLD_CONF = 1 # maximum score drop from best choice to query and annotate # (ChoiceIterator usually rounds to 0.0 for non-best, so this better be maximum) @@ -125,17 +122,24 @@ def __exit__(self, exc_type, exc_val, exc_trace): return None class TesserocrRecognize(Processor): - def __init__(self, *args, **kwargs): - kwargs.setdefault('ocrd_tool', OCRD_TOOL['tools'][TOOL]) - kwargs.setdefault('version', OCRD_TOOL['version'] + ' (' + tesseract_version().split('\n')[0] + ')') - super().__init__(*args, **kwargs) - if hasattr(self, 'parameter'): - self.logger = getLogger('processor.TesserocrRecognize') + @property + def executable(self): + 
return 'ocrd-tesserocr-recognize' + + def show_version(self): + tess_version = tesseract_version().split('\n')[0] + print(f"Version {self.version}, {tess_version}, ocrd/core {OCRD_VERSION}") @property def moduledir(self): return get_languages()[0] + def setup(self): + self.logger = getLogger('processor.' + self.__class__.__name__) + self.logger.debug("TESSDATA: %s, installed Tesseract models: %s", *get_languages()) + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Perform layout segmentation and/or text recognition with Tesseract. @@ -262,10 +266,6 @@ def process(self): model (among the models given in ``model``), enable ``auto_model``. To constrain models by type (called OCR engine mode), use ``oem``. """ - self.logger.debug("TESSDATA: %s, installed Tesseract models: %s", *get_languages()) - - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) inlevel = self.parameter['segmentation_level'] outlevel = self.parameter['textequiv_level'] diff --git a/ocrd_tesserocr/segment.py b/ocrd_tesserocr/segment.py index f60e913..ae45677 100644 --- a/ocrd_tesserocr/segment.py +++ b/ocrd_tesserocr/segment.py @@ -3,23 +3,21 @@ from ocrd_utils import getLogger from ocrd_validators import ParameterValidator -from .config import OCRD_TOOL from .recognize import TesserocrRecognize -TOOL = 'ocrd-tesserocr-segment' -BASE_TOOL = 'ocrd-tesserocr-recognize' - class TesserocrSegment(TesserocrRecognize): + @property + def executable(self): + return 'ocrd-tesserocr-segment' + def __init__(self, *args, **kwargs): - kwargs.setdefault('ocrd_tool', OCRD_TOOL['tools'][TOOL]) super().__init__(*args, **kwargs) if hasattr(self, 'parameter'): self.parameter['overwrite_segments'] = True self.parameter['segmentation_level'] = "region" self.parameter['textequiv_level'] = "none" # add default params - assert 
ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid - self.logger = getLogger('processor.TesserocrSegment') + assert ParameterValidator(self.metadata['tools']['ocrd-tesserocr-recognize']).validate(self.parameter).is_valid def process(self): """Performs region and line segmentation with Tesseract on the workspace. diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py index 626f859..09a3445 100644 --- a/ocrd_tesserocr/segment_line.py +++ b/ocrd_tesserocr/segment_line.py @@ -3,15 +3,14 @@ from ocrd_utils import getLogger from ocrd_validators import ParameterValidator -from .config import OCRD_TOOL from .recognize import TesserocrRecognize -TOOL = 'ocrd-tesserocr-segment-line' -BASE_TOOL = 'ocrd-tesserocr-recognize' - class TesserocrSegmentLine(TesserocrRecognize): + @property + def executable(self): + return 'ocrd-tesserocr-segment-line' + def __init__(self, *args, **kwargs): - kwargs.setdefault('ocrd_tool', OCRD_TOOL['tools'][TOOL]) super().__init__(*args, **kwargs) if hasattr(self, 'parameter'): self.parameter['overwrite_segments'] = self.parameter['overwrite_lines'] @@ -19,8 +18,7 @@ def __init__(self, *args, **kwargs): self.parameter['segmentation_level'] = "line" self.parameter['textequiv_level'] = "line" # add default params - assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid - self.logger = getLogger('processor.TesserocrSegmentLine') + assert ParameterValidator(self.metadata['tools']['ocrd-tesserocr-recognize']).validate(self.parameter).is_valid def process(self): """Performs (text) line segmentation with Tesseract on the workspace. 
diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index f04b0dd..2b4aa95 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -3,15 +3,14 @@ from ocrd_utils import getLogger from ocrd_validators import ParameterValidator -from .config import OCRD_TOOL from .recognize import TesserocrRecognize -TOOL = 'ocrd-tesserocr-segment-region' -BASE_TOOL = 'ocrd-tesserocr-recognize' - class TesserocrSegmentRegion(TesserocrRecognize): + @property + def executable(self): + return 'ocrd-tesserocr-segment-region' + def __init__(self, *args, **kwargs): - kwargs.setdefault('ocrd_tool', OCRD_TOOL['tools'][TOOL]) super().__init__(*args, **kwargs) if hasattr(self, 'parameter'): self.parameter['overwrite_segments'] = self.parameter['overwrite_regions'] @@ -21,8 +20,7 @@ def __init__(self, *args, **kwargs): self.parameter['block_polygons'] = self.parameter['crop_polygons'] del self.parameter['crop_polygons'] # add default params - assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid - self.logger = getLogger('processor.TesserocrSegmentRegion') + assert ParameterValidator(self.metadata['tools']['ocrd-tesserocr-recognize']).validate(self.parameter).is_valid def process(self): """Performs region segmentation with Tesseract on the workspace. 
diff --git a/ocrd_tesserocr/segment_table.py b/ocrd_tesserocr/segment_table.py index 3aa753c..72dfe38 100644 --- a/ocrd_tesserocr/segment_table.py +++ b/ocrd_tesserocr/segment_table.py @@ -3,15 +3,14 @@ from ocrd_utils import getLogger from ocrd_validators import ParameterValidator -from .config import OCRD_TOOL from .recognize import TesserocrRecognize -TOOL = 'ocrd-tesserocr-segment-table' -BASE_TOOL = 'ocrd-tesserocr-recognize' - class TesserocrSegmentTable(TesserocrRecognize): + @property + def executable(self): + return 'ocrd-tesserocr-segment-table' + def __init__(self, *args, **kwargs): - kwargs.setdefault('ocrd_tool', OCRD_TOOL['tools'][TOOL]) super().__init__(*args, **kwargs) if hasattr(self, 'parameter'): self.parameter['overwrite_segments'] = self.parameter['overwrite_cells'] @@ -19,8 +18,7 @@ def __init__(self, *args, **kwargs): self.parameter['segmentation_level'] = "cell" self.parameter['textequiv_level'] = "cell" # add default params - assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid - self.logger = getLogger('processor.TesserocrSegmentTable') + assert ParameterValidator(self.metadata['tools']['ocrd-tesserocr-recognize']).validate(self.parameter).is_valid def process(self): """Performs table cell segmentation with Tesseract on the workspace. 
diff --git a/ocrd_tesserocr/segment_word.py b/ocrd_tesserocr/segment_word.py index ebe8f49..7e4fb45 100644 --- a/ocrd_tesserocr/segment_word.py +++ b/ocrd_tesserocr/segment_word.py @@ -3,15 +3,14 @@ from ocrd_utils import getLogger from ocrd_validators import ParameterValidator -from .config import OCRD_TOOL from .recognize import TesserocrRecognize -TOOL = 'ocrd-tesserocr-segment-word' -BASE_TOOL = 'ocrd-tesserocr-recognize' - class TesserocrSegmentWord(TesserocrRecognize): + @property + def executable(self): + return 'ocrd-tesserocr-segment-word' + def __init__(self, *args, **kwargs): - kwargs.setdefault('ocrd_tool', OCRD_TOOL['tools'][TOOL]) super().__init__(*args, **kwargs) if hasattr(self, 'parameter'): self.parameter['overwrite_segments'] = self.parameter['overwrite_words'] @@ -19,8 +18,7 @@ def __init__(self, *args, **kwargs): self.parameter['segmentation_level'] = "word" self.parameter['textequiv_level'] = "word" # add default params - assert ParameterValidator(OCRD_TOOL['tools'][BASE_TOOL]).validate(self.parameter).is_valid - self.logger = getLogger('processor.TesserocrSegmentWord') + assert ParameterValidator(self.metadata['tools']['ocrd-tesserocr-recognize']).validate(self.parameter).is_valid def process(self): """Performs word segmentation with Tesseract on the workspace. 
From a9168e0d60e7e89de41d6aa3e28180290ead6125 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 6 Jul 2024 18:48:49 +0200 Subject: [PATCH 02/34] tests: adapt to ocrd v3 init (setup only via run_processor) --- test/test_recognize.py | 171 +++++++++++++++++------------------- test/test_segment_line.py | 28 +++--- test/test_segment_region.py | 43 +++++---- test/test_segment_table.py | 44 +++++----- test/test_segment_word.py | 28 +++--- 5 files changed, 148 insertions(+), 166 deletions(-) diff --git a/test/test_recognize.py b/test/test_recognize.py index a7f47ae..2c00c55 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -1,5 +1,6 @@ import os +from ocrd import run_processor from ocrd_models.constants import NAMESPACES from ocrd_modelfactory import page_from_file from ocrd_utils import MIMETYPE_PAGE @@ -11,33 +12,28 @@ from ocrd_tesserocr import TesserocrFontShape def test_run_modular(workspace_kant_binarized): - TesserocrSegmentRegion( - workspace_kant_binarized, - input_file_grp="OCR-D-IMG", - output_file_grp="OCR-D-SEG-BLOCK" - ).process() - TesserocrSegmentLine( - workspace_kant_binarized, - input_file_grp="OCR-D-SEG-BLOCK", - output_file_grp="OCR-D-SEG-LINE" - ).process() - TesserocrRecognize( - workspace_kant_binarized, - input_file_grp="OCR-D-SEG-LINE", - output_file_grp="OCR-D-OCR-TESS", - parameter={'textequiv_level': 'line', 'model': 'Fraktur'} - ).process() - TesserocrSegmentWord( - workspace_kant_binarized, - input_file_grp="OCR-D-SEG-LINE", - output_file_grp="OCR-D-SEG-WORD" - ).process() - TesserocrRecognize( - workspace_kant_binarized, - input_file_grp="OCR-D-SEG-WORD", - output_file_grp="OCR-D-OCR-TESS-W2C", - parameter={'segmentation_level': 'glyph', 'textequiv_level': 'glyph', 'model': 'Fraktur'} - ).process() + run_processor(TesserocrSegmentRegion, + workspace=workspace_kant_binarized, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-SEG-BLOCK") + run_processor(TesserocrSegmentLine, + 
workspace=workspace_kant_binarized, + input_file_grp="OCR-D-SEG-BLOCK", + output_file_grp="OCR-D-SEG-LINE") + run_processor(TesserocrRecognize, + workspace=workspace_kant_binarized, + input_file_grp="OCR-D-SEG-LINE", + output_file_grp="OCR-D-OCR-TESS", + parameter={'textequiv_level': 'line', 'model': 'Fraktur'}) + run_processor(TesserocrSegmentWord, + workspace=workspace_kant_binarized, + input_file_grp="OCR-D-SEG-LINE", + output_file_grp="OCR-D-SEG-WORD") + run_processor(TesserocrRecognize, + workspace=workspace_kant_binarized, + input_file_grp="OCR-D-SEG-WORD", + output_file_grp="OCR-D-OCR-TESS-W2C", + parameter={'segmentation_level': 'glyph', 'textequiv_level': 'glyph', 'model': 'Fraktur'}) workspace_kant_binarized.save_mets() assert os.path.isdir(os.path.join(workspace_kant_binarized.directory, 'OCR-D-OCR-TESS-W2C')) results = workspace_kant_binarized.find_files(file_grp='OCR-D-OCR-TESS-W2C', mimetype=MIMETYPE_PAGE) @@ -48,39 +44,36 @@ def test_run_modular(workspace_kant_binarized): assert len(text0) > 0 def test_run_modular_full(workspace_kant_binarized): - TesserocrDeskew( - workspace_kant_binarized, - input_file_grp="OCR-D-IMG", - output_file_grp="OCR-D-DESK", - parameter={"operation_level": "page"} - ).process() - TesserocrSegmentRegion( - workspace_kant_binarized, - input_file_grp="OCR-D-DESK", - output_file_grp="OCR-D-SEG-BLOCK" - ).process() - TesserocrDeskew( - workspace_kant_binarized, - input_file_grp="OCR-D-SEG-BLOCK", - output_file_grp="OCR-D-DESK-BLOCK", - parameter={"operation_level": "region"} - ).process() - TesserocrSegmentLine( - workspace_kant_binarized, - input_file_grp="OCR-D-DESK-BLOCK", - output_file_grp="OCR-D-SEG-LINE" - ).process() - TesserocrRecognize( - workspace_kant_binarized, - input_file_grp="OCR-D-SEG-LINE", - output_file_grp="OCR-D-OCR-TESS", - parameter={'textequiv_level': 'word', 'raw_lines': True, 'xpath_model': {'starts-with(@script,"Latn")': 'deu+eng', 'starts-with(@script,"Latf")': 'Fraktur'}, 'model': 'Fraktur+deu+eng'} 
- ).process() - TesserocrFontShape( - workspace_kant_binarized, - input_file_grp="OCR-D-OCR-TESS", - output_file_grp="OCR-D-OCR-STYLE" - ).process() + run_processor(TesserocrDeskew, + workspace=workspace_kant_binarized, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-DESK", + parameter={"operation_level": "page"}) + run_processor(TesserocrSegmentRegion, + workspace=workspace_kant_binarized, + input_file_grp="OCR-D-DESK", + output_file_grp="OCR-D-SEG-BLOCK") + run_processor(TesserocrDeskew, + workspace=workspace_kant_binarized, + input_file_grp="OCR-D-SEG-BLOCK", + output_file_grp="OCR-D-DESK-BLOCK", + parameter={"operation_level": "region"}) + run_processor(TesserocrSegmentLine, + workspace=workspace_kant_binarized, + input_file_grp="OCR-D-DESK-BLOCK", + output_file_grp="OCR-D-SEG-LINE") + run_processor(TesserocrRecognize, + workspace=workspace_kant_binarized, + input_file_grp="OCR-D-SEG-LINE", + output_file_grp="OCR-D-OCR-TESS", + parameter={'textequiv_level': 'word', 'raw_lines': True, + 'xpath_model': {'starts-with(@script,"Latn")': 'deu+eng', + 'starts-with(@script,"Latf")': 'Fraktur'}, + 'model': 'Fraktur+deu+eng'}) + run_processor(TesserocrFontShape, + workspace=workspace_kant_binarized, + input_file_grp="OCR-D-OCR-TESS", + output_file_grp="OCR-D-OCR-STYLE") workspace_kant_binarized.save_mets() assert os.path.isdir(os.path.join(workspace_kant_binarized.directory, 'OCR-D-OCR-STYLE')) results = workspace_kant_binarized.find_files(file_grp='OCR-D-OCR-STYLE', mimetype=MIMETYPE_PAGE) @@ -93,12 +86,11 @@ def test_run_modular_full(workspace_kant_binarized): assert len(style0) > 0 def test_run_allinone(workspace_kant_binarized): - TesserocrRecognize( - workspace_kant_binarized, - input_file_grp="OCR-D-IMG", - output_file_grp="OCR-D-OCR-TESS-W2C", - parameter={'segmentation_level': 'region', 'textequiv_level': 'glyph', 'model': 'Fraktur'} - ).process() + run_processor(TesserocrRecognize, + workspace=workspace_kant_binarized, + input_file_grp="OCR-D-IMG", + 
output_file_grp="OCR-D-OCR-TESS-W2C", + parameter={'segmentation_level': 'region', 'textequiv_level': 'glyph', 'model': 'Fraktur'}) workspace_kant_binarized.save_mets() assert os.path.isdir(os.path.join(workspace_kant_binarized.directory, 'OCR-D-OCR-TESS-W2C')) results = workspace_kant_binarized.find_files(file_grp='OCR-D-OCR-TESS-W2C', mimetype=MIMETYPE_PAGE) @@ -109,38 +101,37 @@ def test_run_allinone(workspace_kant_binarized): assert len(text0) > 0 def test_run_allinone_shrink(workspace_kant_binarized): - TesserocrRecognize( - workspace_kant_binarized, - input_file_grp="OCR-D-IMG", - output_file_grp="OCR-D-OCR-TESS-W2C", - parameter={'segmentation_level': 'region', 'textequiv_level': 'glyph', 'shrink_polygons': True, 'model': 'Fraktur'} - ).process() + run_processor(TesserocrRecognize, + workspace=workspace_kant_binarized, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OCR-TESS-W2C", + parameter={'segmentation_level': 'region', 'textequiv_level': 'glyph', 'shrink_polygons': True, + 'model': 'Fraktur'}) workspace_kant_binarized.save_mets() def test_run_allinone_sparse(workspace_kant_binarized): - TesserocrRecognize( - workspace_kant_binarized, - input_file_grp="OCR-D-IMG", - output_file_grp="OCR-D-OCR-TESS-W2C", - parameter={'segmentation_level': 'region', 'textequiv_level': 'glyph', 'sparse_text': True, 'model': 'Fraktur'} - ).process() + run_processor(TesserocrRecognize, + workspace=workspace_kant_binarized, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OCR-TESS-W2C", + parameter={'segmentation_level': 'region', 'textequiv_level': 'glyph', 'sparse_text': True, + 'model': 'Fraktur'}) workspace_kant_binarized.save_mets() def test_run_allineone_multimodel(workspace_kant_binarized): - TesserocrRecognize( - workspace_kant_binarized, - input_file_grp="OCR-D-IMG", - output_file_grp="OCR-D-OCR-TESS-W2C", - parameter={'segmentation_level': 'region', 'textequiv_level': 'glyph', 'model': 'Fraktur+eng+deu'} - ).process() + run_processor(TesserocrRecognize, 
+ workspace=workspace_kant_binarized, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OCR-TESS-W2C", + parameter={'segmentation_level': 'region', 'textequiv_level': 'glyph', 'model': 'Fraktur+eng+deu'}) workspace_kant_binarized.save_mets() # @skip def test_run_allinone_automodel(workspace_kant_binarized): - TesserocrRecognize( - workspace_kant_binarized, - input_file_grp="OCR-D-IMG", - output_file_grp="OCR-D-OCR-TESS-W2C", - parameter={'segmentation_level': 'region', 'textequiv_level': 'glyph', 'auto_model': True, 'model': 'Fraktur+eng+deu'} - ).process() + run_processor(TesserocrRecognize, + workspace=workspace_kant_binarized, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OCR-TESS-W2C", + parameter={'segmentation_level': 'region', 'textequiv_level': 'glyph', 'auto_model': True, + 'model': 'Fraktur+eng+deu'}) workspace_kant_binarized.save_mets() diff --git a/test/test_segment_line.py b/test/test_segment_line.py index f63bc6a..1352cf4 100644 --- a/test/test_segment_line.py +++ b/test/test_segment_line.py @@ -1,3 +1,4 @@ +from ocrd import run_processor from ocrd_tesserocr import TesserocrSegmentRegion from ocrd_tesserocr import TesserocrSegmentLine from ocrd_tesserocr import TesserocrSegment @@ -5,17 +6,15 @@ from ocrd_utils import MIMETYPE_PAGE def test_run_modular(workspace_herold_small): - TesserocrSegmentRegion( - workspace_herold_small, - input_file_grp="OCR-D-IMG", - output_file_grp="OCR-D-SEG-BLOCK" - ).process() + run_processor(TesserocrSegmentRegion, + workspace=workspace_herold_small, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-SEG-BLOCK") # workspace.save_mets() - TesserocrSegmentLine( - workspace_herold_small, - input_file_grp="OCR-D-SEG-BLOCK", - output_file_grp="OCR-D-SEG-LINE" - ).process() + run_processor(TesserocrSegmentLine, + workspace=workspace_herold_small, + input_file_grp="OCR-D-SEG-BLOCK", + output_file_grp="OCR-D-SEG-LINE") out_files = list(workspace_herold_small.find_files( fileGrp="OCR-D-SEG-LINE", 
pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) assert len(out_files) @@ -26,11 +25,10 @@ def test_run_modular(workspace_herold_small): workspace_herold_small.save_mets() def test_run_allinone(workspace_herold_small): - TesserocrSegment( - workspace_herold_small, - input_file_grp="OCR-D-IMG", - output_file_grp="OCR-D-SEG" - ).process() + run_processor(TesserocrSegment, + workspace=workspace_herold_small, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-SEG") out_files = list(workspace_herold_small.find_files( fileGrp="OCR-D-SEG", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) assert len(out_files) diff --git a/test/test_segment_region.py b/test/test_segment_region.py index 50f7ed4..2c8f3e4 100644 --- a/test/test_segment_region.py +++ b/test/test_segment_region.py @@ -1,13 +1,13 @@ +from ocrd import run_processor from ocrd_tesserocr import TesserocrSegmentRegion from ocrd_modelfactory import page_from_file from ocrd_utils import MIMETYPE_PAGE def test_run(workspace_herold_small): - TesserocrSegmentRegion( - workspace_herold_small, - input_file_grp="OCR-D-IMG", - output_file_grp="OCR-D-SEG-BLOCK" - ).process() + run_processor(TesserocrSegmentRegion, + workspace=workspace_herold_small, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-SEG-BLOCK") out_files = list(workspace_herold_small.find_files( fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) assert len(out_files) @@ -18,12 +18,11 @@ def test_run(workspace_herold_small): workspace_herold_small.save_mets() def test_run_shrink(workspace_herold_small): - TesserocrSegmentRegion( - workspace_herold_small, - input_file_grp="OCR-D-IMG", - output_file_grp="OCR-D-SEG-BLOCK", - parameter={'shrink_polygons': True} - ).process() + run_processor(TesserocrSegmentRegion, + workspace=workspace_herold_small, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-SEG-BLOCK", + parameter={'shrink_polygons': True}) out_files = list(workspace_herold_small.find_files( fileGrp="OCR-D-SEG-BLOCK", 
pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) assert len(out_files) @@ -34,12 +33,11 @@ def test_run_shrink(workspace_herold_small): workspace_herold_small.save_mets() def test_run_sparse(workspace_herold_small): - TesserocrSegmentRegion( - workspace_herold_small, - input_file_grp="OCR-D-IMG", - output_file_grp="OCR-D-SEG-BLOCK", - parameter={'sparse_text': True} - ).process() + run_processor(TesserocrSegmentRegion, + workspace=workspace_herold_small, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-SEG-BLOCK", + parameter={'sparse_text': True}) out_files = list(workspace_herold_small.find_files( fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) assert len(out_files) @@ -50,12 +48,11 @@ def test_run_sparse(workspace_herold_small): workspace_herold_small.save_mets() def test_run_staves(workspace_herold_small): - TesserocrSegmentRegion( - workspace_herold_small, - input_file_grp="OCR-D-IMG", - output_file_grp="OCR-D-SEG-BLOCK", - parameter={'find_staves': True, 'find_tables': False} - ).process() + run_processor(TesserocrSegmentRegion, + workspace=workspace_herold_small, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-SEG-BLOCK", + parameter={'find_staves': True, 'find_tables': False}) out_files = list(workspace_herold_small.find_files( fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) assert len(out_files) diff --git a/test/test_segment_table.py b/test/test_segment_table.py index 34ecfca..039337b 100644 --- a/test/test_segment_table.py +++ b/test/test_segment_table.py @@ -1,19 +1,18 @@ +from ocrd import run_processor from ocrd_tesserocr import TesserocrSegment, TesserocrSegmentRegion, TesserocrSegmentTable from ocrd_modelfactory import page_from_file from ocrd_utils import MIMETYPE_PAGE def test_run_modular(workspace_gutachten): - TesserocrSegmentRegion( - workspace_gutachten, - input_file_grp="IMG", - output_file_grp="OCR-D-SEG-BLOCK", - parameter={'find_tables': True, 'overwrite_regions': True} - ).process() 
- TesserocrSegmentTable( - workspace_gutachten, - input_file_grp="OCR-D-SEG-BLOCK", - output_file_grp="OCR-D-SEG-CELL" - ).process() + run_processor(TesserocrSegmentRegion, + workspace=workspace_gutachten, + input_file_grp="IMG", + output_file_grp="OCR-D-SEG-BLOCK", + parameter={'find_tables': True, 'overwrite_regions': True}) + run_processor(TesserocrSegmentTable, + workspace=workspace_gutachten, + input_file_grp="OCR-D-SEG-BLOCK", + output_file_grp="OCR-D-SEG-CELL") out_files = list(workspace_gutachten.find_files( fileGrp="OCR-D-SEG-CELL", pageId="PHYS_1", mimetype=MIMETYPE_PAGE)) assert len(out_files) @@ -24,18 +23,17 @@ def test_run_modular(workspace_gutachten): workspace_gutachten.save_mets() def test_run_allinone(workspace_gutachten): - TesserocrSegment( - workspace_gutachten, - input_file_grp="IMG", - output_file_grp="OCR-D-SEG", - parameter={'find_tables': True} # , 'textequiv_level': 'cell' - ).process() - TesserocrSegmentTable( - workspace_gutachten, - input_file_grp="OCR-D-SEG", - output_file_grp="OCR-D-SEG-CELL", - parameter={'overwrite_cells': True} - ).process() + run_processor(TesserocrSegment, + workspace=workspace_gutachten, + input_file_grp="IMG", + output_file_grp="OCR-D-SEG", + parameter={'find_tables': True} # , 'textequiv_level': 'cell' + ) + run_processor(TesserocrSegmentTable, + workspace=workspace_gutachten, + input_file_grp="OCR-D-SEG", + output_file_grp="OCR-D-SEG-CELL", + parameter={'overwrite_cells': True}) out_files = list(workspace_gutachten.find_files( fileGrp="OCR-D-SEG-CELL", pageId="PHYS_1", mimetype=MIMETYPE_PAGE)) assert len(out_files) diff --git a/test/test_segment_word.py b/test/test_segment_word.py index 86fc28d..6776fff 100644 --- a/test/test_segment_word.py +++ b/test/test_segment_word.py @@ -1,3 +1,4 @@ +from ocrd import run_processor from ocrd_tesserocr import TesserocrSegmentRegion from ocrd_tesserocr import TesserocrSegmentLine from ocrd_tesserocr import TesserocrSegmentWord @@ -5,21 +6,18 @@ from ocrd_utils import 
MIMETYPE_PAGE def test_run_modular(workspace_kant_binarized): - TesserocrSegmentRegion( - workspace_kant_binarized, - input_file_grp="OCR-D-IMG", - output_file_grp="OCR-D-SEG-BLOCK" - ).process() - TesserocrSegmentLine( - workspace_kant_binarized, - input_file_grp="OCR-D-SEG-BLOCK", - output_file_grp="OCR-D-SEG-LINE" - ).process() - TesserocrSegmentWord( - workspace_kant_binarized, - input_file_grp="OCR-D-SEG-LINE", - output_file_grp="OCR-D-SEG-WORD" - ).process() + run_processor(TesserocrSegmentRegion, + workspace=workspace_kant_binarized, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-SEG-BLOCK") + run_processor(TesserocrSegmentLine, + workspace=workspace_kant_binarized, + input_file_grp="OCR-D-SEG-BLOCK", + output_file_grp="OCR-D-SEG-LINE") + run_processor(TesserocrSegmentWord, + workspace=workspace_kant_binarized, + input_file_grp="OCR-D-SEG-LINE", + output_file_grp="OCR-D-SEG-WORD") out_files = list(workspace_kant_binarized.find_files( fileGrp="OCR-D-SEG-WORD", pageId="P_0017", mimetype=MIMETYPE_PAGE)) assert len(out_files) From eb661f4f38032c56c8430e938b0e81e3c6087ef1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 7 Jul 2024 11:57:58 +0200 Subject: [PATCH 03/34] =?UTF-8?q?adapt=20to=20ocrd=20v3=20(process?= =?UTF-8?q?=E2=86=92process=5Fpage=5Fpcgts)=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - separate `setup` / `_init` from processing - inherit/override `_init` - init Tesseract API once, keep as class attribute - define `process_page_pcgts` instead of `process`: - no input file loop - no METS action - just return tuples for derived images - move shared utility functions into `common` module - generate etree mapping by exporting PAGE model --- .pylintrc | 3 - ocrd_tesserocr/binarize.py | 183 +++--- ocrd_tesserocr/common.py | 396 +++++++++++ ocrd_tesserocr/crop.py | 136 ++-- ocrd_tesserocr/deskew.py | 182 +++--- ocrd_tesserocr/fontshape.py | 137 ++-- ocrd_tesserocr/recognize.py | 
1049 ++++++++++-------------------- ocrd_tesserocr/segment_line.py | 6 +- ocrd_tesserocr/segment_region.py | 6 +- ocrd_tesserocr/segment_table.py | 6 +- ocrd_tesserocr/segment_word.py | 6 +- test/test_cli.py | 2 +- 12 files changed, 1013 insertions(+), 1099 deletions(-) create mode 100644 ocrd_tesserocr/common.py diff --git a/.pylintrc b/.pylintrc index dfcd216..fc8552f 100644 --- a/.pylintrc +++ b/.pylintrc @@ -21,8 +21,5 @@ disable = wrong-import-order, duplicate-code -# allow indented whitespace (as required by interpreter): -no-space-check=empty-line - # allow non-snake-case identifiers: good-names=n,i diff --git a/ocrd_tesserocr/binarize.py b/ocrd_tesserocr/binarize.py index 180fb4c..0b916ba 100644 --- a/ocrd_tesserocr/binarize.py +++ b/ocrd_tesserocr/binarize.py @@ -6,17 +6,9 @@ PSM, RIL ) -from ocrd_utils import ( - getLogger, - assert_file_grp_cardinality, - make_file_id, - MIMETYPE_PAGE -) -from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( AlternativeImageType, TextRegionType, - to_xml ) from .recognize import TesserocrRecognize @@ -26,10 +18,14 @@ class TesserocrBinarize(TesserocrRecognize): def executable(self): return 'ocrd-tesserocr-binarize' - def process(self): + def _init(self): + # use default model (eng) with vanilla tesserocr API + self.tessapi = PyTessBaseAPI() + + def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): """Performs binarization of the region / line with Tesseract on the workspace. - Open and deserialize PAGE input files and their respective images, + Open and deserialize PAGE input file and its respective images, then iterate over the element hierarchy down to the requested level. Set up Tesseract to recognize the segment image's layout, and get @@ -42,109 +38,92 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) sepmask = self.parameter['tiseg'] oplevel = self.parameter['operation_level'] - with PyTessBaseAPI() as tessapi: - for n, input_file in enumerate(self.input_files): - file_id = make_file_id(input_file, self.output_file_grp) - page_id = input_file.pageId or input_file.ID - self.logger.info("INPUT FILE %i / %s", n, page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page = pcgts.get_Page() - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id) - if self.parameter['dpi'] > 0: - dpi = self.parameter['dpi'] - self.logger.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi) - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi = round(dpi * 2.54) - self.logger.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi) - else: - dpi = 0 - self.logger.info("Page '%s' images will use DPI estimated from segmentation", page_id) - tessapi.SetVariable('user_defined_dpi', str(dpi)) - self.logger.info("Binarizing on '%s' level in page '%s'", oplevel, page_id) + page = pcgts.get_Page() + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id) + if self.parameter['dpi'] > 0: + dpi = self.parameter['dpi'] + self.logger.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi) + elif page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) + self.logger.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi) + else: + dpi = 0 + self.logger.info("Page '%s' images will use DPI estimated from segmentation", page_id) + self.tessapi.SetVariable('user_defined_dpi', str(dpi)) + self.logger.info("Binarizing on '%s' level in 
page '%s'", oplevel, page_id) + + if oplevel == 'page': + image = self._process_segment(-1, page, page_image, page_xywh, + page_id, output_file_id) + if image: + return [pcgts, image] + else: + return pcgts - if oplevel == 'page': - tessapi.SetPageSegMode(PSM.AUTO_ONLY) - tessapi.SetImage(page_image) - if sepmask: - # will trigger FindLines() → SegmentPage() → AutoPageSeg() - # → SetupPageSegAndDetectOrientation() → FindAndRemoveLines() + FindImages() - tessapi.AnalyseLayout() - page_image_bin = tessapi.GetThresholdedImage() - if page_image_bin: - # update METS (add the image file): - file_path = self.workspace.save_image_file(page_image_bin, - file_id + '.IMG-BIN', - page_id=input_file.pageId, - file_grp=self.output_file_grp) - # update PAGE (reference the image file): - features = page_xywh['features'] + ",binarized" - if sepmask: - features += ",clipped" - page.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=features)) - else: - self.logger.error('Cannot binarize %s', "page '%s'" % page_id) - else: - regions = page.get_TextRegion() + page.get_TableRegion() - if not regions: - self.logger.warning("Page '%s' contains no text regions", page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh) - if oplevel == 'region': - tessapi.SetPageSegMode(PSM.SINGLE_BLOCK) - self._process_segment(tessapi, RIL.BLOCK, region, region_image, region_xywh, - "region '%s'" % region.id, input_file.pageId, - file_id + '_' + region.id) - elif isinstance(region, TextRegionType): - lines = region.get_TextLine() - if not lines: - self.logger.warning("Page '%s' region '%s' contains no text lines", - page_id, region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh) - tessapi.SetPageSegMode(PSM.SINGLE_LINE) - self._process_segment(tessapi, RIL.TEXTLINE, line, line_image, line_xywh, - "line '%s'" % line.id, input_file.pageId, 
- file_id + '_' + region.id + '_' + line.id) + result = [pcgts] + regions = page.get_AllRegions(classes=['Text', 'Table']) + if not regions: + self.logger.warning("Page '%s' contains no text regions", page_id) + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh) + if oplevel == 'region': + image = self._process_segment(RIL.BLOCK, region, region_image, region_xywh, + "region '%s'" % region.id, + output_file_id + '_' + region.id) + if image: + result.append(image) + elif isinstance(region, TextRegionType): + lines = region.get_TextLine() + if not lines: + self.logger.warning("Page '%s' region '%s' contains no text lines", + page_id, region.id) + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment( + line, region_image, region_xywh) + image = self._process_segment(RIL.TEXTLINE, line, line_image, line_xywh, + "line '%s'" % line.id, + output_file_id + '_' + region.id + '_' + line.id) + if image: + result.append(image) - file_id = make_file_id(input_file, self.output_file_grp) - pcgts.set_pcGtsId(file_id) - self.workspace.add_file( - file_id=file_id, - file_grp=self.output_file_grp, - page_id=input_file.pageId, - mimetype=MIMETYPE_PAGE, - local_filename=os.path.join(self.output_file_grp, - file_id + '.xml'), - content=to_xml(pcgts)) + return result - def _process_segment(self, tessapi, ril, segment, image, xywh, where, page_id, file_id): - tessapi.SetImage(image) + def _process_segment(self, ril, segment, image, xywh, where, file_id): + self.tessapi.SetImage(image) + features = xywh['features'] + ",binarized" image_bin = None - layout = tessapi.AnalyseLayout() - if layout: - image_bin = layout.GetBinaryImage(ril) + if ril == -1: + # page level + self.tessapi.SetPageSegMode(PSM.AUTO_ONLY) + if self.parameter['tiseg']: + features += ",clipped" + # will trigger FindLines() → SegmentPage() → AutoPageSeg() + # → SetupPageSegAndDetectOrientation() → FindAndRemoveLines() + 
FindImages() + self.tessapi.AnalyseLayout() + image_bin = self.tessapi.GetThresholdedImage() + else: + if ril == RIL.BLOCK: + self.tessapi.SetPageSegMode(PSM.SINGLE_BLOCK) + if ril == RIL.TEXTLINE: + self.tessapi.SetPageSegMode(PSM.SINGLE_LINE) + layout = self.tessapi.AnalyseLayout() + if layout: + image_bin = layout.GetBinaryImage(ril) if not image_bin: self.logger.error('Cannot binarize %s', where) - return + return False # update METS (add the image file): - file_path = self.workspace.save_image_file(image_bin, - file_id + '.IMG-BIN', - page_id=page_id, - file_grp=self.output_file_grp) + file_id += '.IMG-BIN' + file_path = os.path.join(self.output_file_grp, file_id + '.png') # update PAGE (reference the image file): - features = xywh['features'] + ",binarized" segment.add_AlternativeImage(AlternativeImageType( filename=file_path, comments=features)) + return image_bin, file_id, file_path diff --git a/ocrd_tesserocr/common.py b/ocrd_tesserocr/common.py new file mode 100644 index 0000000..9023a0a --- /dev/null +++ b/ocrd_tesserocr/common.py @@ -0,0 +1,396 @@ +import itertools +from PIL import Image, ImageStat + +import numpy as np +from scipy.sparse.csgraph import minimum_spanning_tree +from shapely.geometry import Polygon, LineString +from shapely.ops import unary_union, nearest_points, orient +from shapely import set_precision + + +from ocrd_utils import ( + getLogger, + polygon_from_points, + points_from_polygon, +) +from ocrd_models.ocrd_page import ( + ReadingOrderType, + RegionRefType, + RegionRefIndexedType, + OrderedGroupType, + OrderedGroupIndexedType, + UnorderedGroupType, + UnorderedGroupIndexedType, + PageType, + TextEquivType, +) +from ocrd_models.ocrd_page_generateds import ( + ReadingDirectionSimpleType, + TextLineOrderSimpleType, +) + + +def page_element_unicode0(element): + """Get Unicode string of the first text result.""" + if element.get_TextEquiv(): + return element.get_TextEquiv()[0].Unicode or '' + else: + return '' + +def 
page_element_conf0(element): + """Get confidence (as float value) of the first text result.""" + if element.get_TextEquiv(): + # generateDS does not convert simpleType for attributes (yet?) + return float(element.get_TextEquiv()[0].conf or "1.0") + return 1.0 + +def page_get_reading_order(ro, rogroup): + """Add all elements from the given reading order group to the given dictionary. + + Given a dict ``ro`` from layout element IDs to ReadingOrder element objects, + and an object ``rogroup`` with additional ReadingOrder element objects, + add all references to the dict, traversing the group recursively. + """ + regionrefs = list() + if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): + regionrefs = (rogroup.get_RegionRefIndexed() + + rogroup.get_OrderedGroupIndexed() + + rogroup.get_UnorderedGroupIndexed()) + if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): + regionrefs = (rogroup.get_RegionRef() + + rogroup.get_OrderedGroup() + + rogroup.get_UnorderedGroup()) + for elem in regionrefs: + ro[elem.get_regionRef()] = elem + if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): + page_get_reading_order(ro, elem) + +def page_update_higher_textequiv_levels(level, pcgts, overwrite=True): + """Update the TextEquivs of all PAGE-XML hierarchy levels above ``level`` for consistency. + + Starting with the lowest hierarchy level chosen for processing, + join all first TextEquiv.Unicode (by the rules governing the respective level) + into TextEquiv.Unicode of the next higher level, replacing them. + If ``overwrite`` is false and the higher level already has text, keep it. + + When two successive elements appear in a ``Relation`` of type ``join``, + then join them directly (without their respective white space). + + Likewise, average all first TextEquiv.conf into TextEquiv.conf of the next higher level. 
+ + In the process, traverse the words and lines in their respective ``readingDirection``, + the (text) regions which contain lines in their respective ``textLineOrder``, and + the (text) regions which contain text regions in their ``ReadingOrder`` + (if they appear there as an ``OrderedGroup``). + Where no direction/order can be found, use XML ordering. + + Follow regions recursively, but make sure to traverse them in a depth-first strategy. + """ + page = pcgts.get_Page() + relations = page.get_Relations() # get RelationsType + if relations: + relations = relations.get_Relation() # get list of RelationType + else: + relations = [] + joins = list() # + for relation in relations: + if relation.get_type() == 'join': # ignore 'link' type here + joins.append((relation.get_SourceRegionRef().get_regionRef(), + relation.get_TargetRegionRef().get_regionRef())) + reading_order = dict() + ro = page.get_ReadingOrder() + if ro: + page_get_reading_order(reading_order, ro.get_OrderedGroup() or ro.get_UnorderedGroup()) + if level != 'region': + for region in page.get_AllRegions(classes=['Text']): + # order is important here, because regions can be recursive, + # and we want to concatenate by depth first; + # typical recursion structures would be: + # - TextRegion/@type=paragraph inside TextRegion + # - TextRegion/@type=drop-capital followed by TextRegion/@type=paragraph inside TextRegion + # - any region (including TableRegion or TextRegion) inside a TextRegion/@type=footnote + # - TextRegion inside TableRegion + subregions = region.get_TextRegion() + if subregions: # already visited in earlier iterations + # do we have a reading order for these? + # TODO: what if at least some of the subregions are in reading_order? + if (all(subregion.id in reading_order for subregion in subregions) and + isinstance(reading_order[subregions[0].id], # all have .index? 
+ (OrderedGroupType, OrderedGroupIndexedType))): + subregions = sorted(subregions, key=lambda subregion: + reading_order[subregion.id].index) + region_unicode = page_element_unicode0(subregions[0]) + for subregion, next_subregion in zip(subregions, subregions[1:]): + if (subregion.id, next_subregion.id) not in joins: + region_unicode += '\n' # or '\f'? + region_unicode += page_element_unicode0(next_subregion) + region_conf = sum(page_element_conf0(subregion) for subregion in subregions) + region_conf /= len(subregions) + else: # TODO: what if a TextRegion has both TextLine and TextRegion children? + lines = region.get_TextLine() + if ((region.get_textLineOrder() or + page.get_textLineOrder()) == + TextLineOrderSimpleType.BOTTOMTOTOP): + lines = list(reversed(lines)) + if level != 'line': + for line in lines: + words = line.get_Word() + if ((line.get_readingDirection() or + region.get_readingDirection() or + page.get_readingDirection()) == + ReadingDirectionSimpleType.RIGHTTOLEFT): + words = list(reversed(words)) + if level != 'word': + for word in words: + glyphs = word.get_Glyph() + if ((word.get_readingDirection() or + line.get_readingDirection() or + region.get_readingDirection() or + page.get_readingDirection()) == + ReadingDirectionSimpleType.RIGHTTOLEFT): + glyphs = list(reversed(glyphs)) + word_unicode = ''.join(page_element_unicode0(glyph) for glyph in glyphs) + word_conf = sum(page_element_conf0(glyph) for glyph in glyphs) + if glyphs: + word_conf /= len(glyphs) + if not word.get_TextEquiv() or overwrite: + word.set_TextEquiv( # replace old, if any + [TextEquivType(Unicode=word_unicode, conf=word_conf)]) + line_unicode = ' '.join(page_element_unicode0(word) for word in words) + line_conf = sum(page_element_conf0(word) for word in words) + if words: + line_conf /= len(words) + if not line.get_TextEquiv() or overwrite: + line.set_TextEquiv( # replace old, if any + [TextEquivType(Unicode=line_unicode, conf=line_conf)]) + region_unicode = '' + region_conf = 0 
+ if lines: + region_unicode = page_element_unicode0(lines[0]) + for line, next_line in zip(lines, lines[1:]): + words = line.get_Word() + next_words = next_line.get_Word() + if not (words and next_words and (words[-1].id, next_words[0].id) in joins): + region_unicode += '\n' + region_unicode += page_element_unicode0(next_line) + region_conf = sum(page_element_conf0(line) for line in lines) + region_conf /= len(lines) + if not region.get_TextEquiv() or overwrite: + region.set_TextEquiv( # replace old, if any + [TextEquivType(Unicode=region_unicode, conf=region_conf)]) + +def page_shrink_higher_coordinate_levels(maxlevel, minlevel, pcgts): + """Project the coordinate hull of all PAGE-XML hierarchy levels above ``minlevel`` up to ``maxlevel``. + + Starting with the lowest hierarchy level chosen for processing, + join all segments into a convex hull for the next higher level, + replacing the parent coordinates, respectively. + + Follow regions recursively, but make sure to traverse them in a depth-first strategy. 
+ """ + LOG = getLogger('processor.TesserocrRecognize') + page = pcgts.get_Page() + regions = page.get_AllRegions(classes=['Text']) + if minlevel != 'region': + for region in regions: + lines = region.get_TextLine() + if minlevel != 'line': + for line in lines: + words = line.get_Word() + if minlevel != 'word': + for word in words: + glyphs = word.get_Glyph() + if maxlevel in ['region', 'line', 'word', 'glyph'] and glyphs: + joint_polygon = join_segments(glyphs) + LOG.debug("setting hull for word '%s' from %d vertices", + word.id, len(joint_polygon)) + word.get_Coords().set_points(points_from_polygon(joint_polygon)) + if maxlevel in ['region', 'line', 'word'] and words: + joint_polygon = join_segments(words) + LOG.debug("setting hull for line '%s' from %d vertices", + line.id, len(joint_polygon)) + line.get_Coords().set_points(points_from_polygon(joint_polygon)) + if maxlevel in ['region', 'line'] and lines: + joint_polygon = join_segments(lines) + LOG.debug("setting hull for region '%s' from %d vertices", + region.id, len(joint_polygon)) + region.get_Coords().set_points(points_from_polygon(joint_polygon)) + +def join_segments(segments): + return join_polygons([polygon_from_points(segment.get_Coords().points) + for segment in segments]) + +def join_polygons(polygons, scale=20): + """construct concave hull (alpha shape) from input polygons by connecting their pairwise nearest points""" + return make_join([make_valid(Polygon(poly)) for poly in polygons], scale=scale).exterior.coords[:-1] + +def make_join(polygons, scale=20): + """construct concave hull (alpha shape) from input polygons by connecting their pairwise nearest points""" + # ensure input polygons are simply typed and all oriented equally + polygons = [orient(poly) + for poly in itertools.chain.from_iterable( + [poly.geoms + if poly.geom_type in ['MultiPolygon', 'GeometryCollection'] + else [poly] + for poly in polygons])] + npoly = len(polygons) + if npoly == 1: + return polygons[0] + # find min-dist path 
through all polygons (travelling salesman) + pairs = itertools.combinations(range(npoly), 2) + dists = np.zeros((npoly, npoly), dtype=float) + for i, j in pairs: + dist = polygons[i].distance(polygons[j]) + if dist < 1e-5: + dist = 1e-5 # if pair merely touches, we still need to get an edge + dists[i, j] = dist + dists[j, i] = dist + dists = minimum_spanning_tree(dists, overwrite=True) + # add bridge polygons (where necessary) + for prevp, nextp in zip(*dists.nonzero()): + prevp = polygons[prevp] + nextp = polygons[nextp] + nearest = nearest_points(prevp, nextp) + bridgep = LineString(nearest).buffer(max(1, scale/5), resolution=1) + polygons.append(bridgep) + jointp = unary_union(polygons) + assert jointp.geom_type == 'Polygon', jointp.wkt + # follow-up calculations will necessarily be integer; + # so anticipate rounding here and then ensure validity + jointp2 = set_precision(jointp, 1.0) + if jointp2.geom_type != 'Polygon' or not jointp2.is_valid: + jointp2 = Polygon(np.round(jointp.exterior.coords)) + jointp2 = make_valid(jointp2) + assert jointp2.geom_type == 'Polygon', jointp2.wkt + return jointp2 + +def pad_image(image, padding): + # TODO: input padding can create extra edges if not binarized; at least try to smooth + stat = ImageStat.Stat(image) + # workaround for Pillow#4925 + if len(stat.bands) > 1: + background = tuple(stat.median) + else: + background = stat.median[0] + padded = Image.new(image.mode, + (image.width + 2 * padding, + image.height + 2 * padding), + background) + padded.paste(image, (padding, padding)) + return padded + +def polygon_for_parent(polygon, parent): + """Clip polygon to parent polygon range. + + (Should be moved to ocrd_utils.coordinates_for_segment.) 
+ """ + childp = Polygon(polygon) + if isinstance(parent, PageType): + if parent.get_Border(): + parentp = Polygon(polygon_from_points(parent.get_Border().get_Coords().points)) + else: + parentp = Polygon([[0, 0], [0, parent.get_imageHeight()], + [parent.get_imageWidth(), parent.get_imageHeight()], + [parent.get_imageWidth(), 0]]) + else: + parentp = Polygon(polygon_from_points(parent.get_Coords().points)) + # ensure input coords have valid paths (without self-intersection) + # (this can happen when shapes valid in floating point are rounded) + childp = make_valid(childp) + parentp = make_valid(parentp) + if not childp.is_valid: + return None + if not parentp.is_valid: + return None + # check if clipping is necessary + if childp.within(parentp): + return childp.exterior.coords[:-1] + # clip to parent + interp = make_intersection(childp, parentp) + if not interp: + return None + return interp.exterior.coords[:-1] # keep open + +def make_intersection(poly1, poly2): + interp = poly1.intersection(poly2) + # post-process + if interp.is_empty or interp.area == 0.0: + # this happens if Tesseract "finds" something + # outside of the valid Border of a deskewed/cropped page + # (empty corners created by masking); will be ignored + return None + if interp.geom_type == 'GeometryCollection': + # heterogeneous result: filter zero-area shapes (LineString, Point) + interp = unary_union([geom for geom in interp.geoms if geom.area > 0]) + if interp.geom_type == 'MultiPolygon': + # homogeneous result: construct convex hull to connect + interp = make_join(interp.geoms) + if interp.minimum_clearance < 1.0: + # follow-up calculations will necessarily be integer; + # so anticipate rounding here and then ensure validity + interp = Polygon(np.round(interp.exterior.coords)) + interp = make_valid(interp) + return interp + +def make_valid(polygon): + points = list(polygon.exterior.coords) + for split in range(1, len(points)): + if polygon.is_valid or polygon.simplify(polygon.area).is_valid: + 
break + # simplification may not be possible (at all) due to ordering + # in that case, try another starting point + polygon = Polygon(points[-split:]+points[:-split]) + # try by simplification + for tolerance in range(int(polygon.area + 1.5)): + if polygon.is_valid: + break + # simplification may require a larger tolerance + polygon = polygon.simplify(tolerance + 1) + # try by enlarging + for tolerance in range(1, int(polygon.area + 2.5)): + if polygon.is_valid: + break + # enlargement may require a larger tolerance + polygon = polygon.buffer(tolerance) + assert polygon.is_valid, polygon.wkt + return polygon + +def iterate_level(it, ril, parent=None): + LOG = getLogger('processor.TesserocrRecognize') + # improves over tesserocr.iterate_level by + # honouring multi-level semantics so iterators + # can be combined across levels + if parent is None: + parent = ril - 1 + pos = 0 + while it and not it.Empty(ril): + yield it + # With upstream Tesseract, these assertions may fail: + # if ril > 0 and it.IsAtFinalElement(parent, ril): + # for level in range(parent, ril): + # assert it.IsAtFinalElement(parent, level), \ + # "level %d iterator at %d is final w.r.t. %d but level %d is not" % ( + # ril, pos, parent, level) + # Hence the following workaround avails itself: + if ril > 0 and all(it.IsAtFinalElement(parent, level) + for level in range(parent, ril + 1)): + break + if not it.Next(ril): + break + while it.Empty(ril) and not it.Empty(0): + # This happens when + # - on RIL.PARA, RIL.TEXTLINE and RIL.WORD, + # empty non-text (pseudo-) blocks intervene + # - on RIL.SYMBOL, a word has no cblobs at all + # (because they have all been rejected) + # We must _not_ yield these (as they have strange + # properties and bboxes). But most importantly, + # they will have met IsAtFinalElement prematurely + # (hence the similar loop above). + # Since this may happen multiple consecutive times, + # enclose this in a while loop. 
+ LOG.warning("level %d iterator at %d needs to skip empty segment", + ril, pos) + if not it.Next(ril): + break + pos += 1 diff --git a/ocrd_tesserocr/crop.py b/ocrd_tesserocr/crop.py index 7e7af8e..c3100f8 100644 --- a/ocrd_tesserocr/crop.py +++ b/ocrd_tesserocr/crop.py @@ -2,8 +2,8 @@ import os.path import tesserocr + from ocrd_utils import ( - getLogger, crop_image, coordinates_for_segment, coordinates_of_segment, @@ -12,29 +12,34 @@ polygon_from_bbox, points_from_polygon, bbox_from_xywh, - make_file_id, - assert_file_grp_cardinality, - MIMETYPE_PAGE ) -from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( CoordsType, AlternativeImageType, BorderType, - to_xml ) -from .recognize import TesserocrRecognize, polygon_for_parent +from .recognize import TesserocrRecognize +from .common import polygon_for_parent class TesserocrCrop(TesserocrRecognize): @property def executable(self): return 'ocrd-tesserocr-crop' - def process(self): + def _init(self): + # use default model (eng) with vanilla tesserocr API + self.tessapi = tesserocr.PyTessBaseAPI() + # disable table detection here (tables count as text blocks), + # because we do not want to risk confusing the spine with + # a column separator and thus creeping into a neighbouring + # page: + self.tessapi.SetVariable("textord_tabfind_find_tables", "0") + + def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): """Performs page cropping with Tesseract on the workspace. - Open and deserialize PAGE input files and their respective images. + Open and deserialize PAGE input file and its respective images. Set up Tesseract to detect text blocks on each page, and find the largest coordinate extent spanning all of them. Use this extent in defining a Border, and add that to the page. @@ -48,81 +53,59 @@ def process(self): Produce new output files by serialising the resulting hierarchy. 
""" - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - - with tesserocr.PyTessBaseAPI() as tessapi: - # disable table detection here (tables count as text blocks), - # because we do not want to risk confusing the spine with - # a column separator and thus creeping into a neighbouring - # page: - tessapi.SetVariable("textord_tabfind_find_tables", "0") - for (n, input_file) in enumerate(self.input_files): - file_id = make_file_id(input_file, self.output_file_grp) - page_id = input_file.pageId or input_file.ID - self.logger.info("INPUT FILE %i / %s", n, page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page = pcgts.get_Page() - - # warn of existing Border: - border = page.get_Border() - if border: - left, top, right, bottom = bbox_from_points(border.get_Coords().points) - self.logger.warning('Overwriting existing Border: %i:%i,%i:%i', - left, top, right, bottom) + page = pcgts.get_Page() - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id, - # image must not have been cropped already, - # abort if no such image can be produced: - feature_filter='cropped') - if self.parameter['dpi'] > 0: - dpi = self.parameter['dpi'] - self.logger.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi) - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi = round(dpi * 2.54) - self.logger.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi) - else: - dpi = 0 - self.logger.info("Page '%s' images will use DPI estimated from segmentation", page_id) - tessapi.SetVariable('user_defined_dpi', str(dpi)) - if dpi: - zoom = 300 / dpi - else: - zoom = 1 + # warn of existing Border: + border = page.get_Border() + if border: + left, top, right, bottom = bbox_from_points(border.get_Coords().points) + 
self.logger.warning('Overwriting existing Border: %i:%i,%i:%i', + left, top, right, bottom) - bounds = self.estimate_bounds(page, page_image, tessapi, zoom) - self.process_page(page, page_image, page_xywh, bounds, file_id, input_file.pageId) + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id, + # image must not have been cropped already, + # abort if no such image can be produced: + feature_filter='cropped') + if self.parameter['dpi'] > 0: + dpi = self.parameter['dpi'] + self.logger.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi) + elif page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) + self.logger.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi) + else: + dpi = 0 + self.logger.info("Page '%s' images will use DPI estimated from segmentation", page_id) + self.tessapi.SetVariable('user_defined_dpi', str(dpi)) + if dpi: + zoom = 300 / dpi + else: + zoom = 1 - pcgts.set_pcGtsId(file_id) - self.workspace.add_file( - file_id=file_id, - file_grp=self.output_file_grp, - page_id=input_file.pageId, - mimetype=MIMETYPE_PAGE, - local_filename=os.path.join(self.output_file_grp, - file_id + '.xml'), - content=to_xml(pcgts)) + bounds = self._estimate_bounds(page, page_image, zoom) + cropped = self._process_page(page, page_image, page_xywh, bounds, output_file_id) + if cropped: + return [pcgts, cropped] + return pcgts - def estimate_bounds(self, page, page_image, tessapi, zoom=1.0): + def _estimate_bounds(self, page, page_image, zoom=1.0): """Get outer bounds of all (existing or detected) regions.""" all_left = page_image.width all_top = page_image.height all_right = 0 all_bottom = 0 self.logger.info("Cropping with Tesseract") - tessapi.SetImage(page_image) + self.tessapi.SetImage(page_image) # PSM.SPARSE_TEXT: get as much text as possible in no particular order # PSM.AUTO (default): includes 
tables (dangerous) # PSM.SPARSE_TEXT_OSD: sparse but all orientations - tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT) + self.tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT) # # iterate over all text blocks and compare their # bbox extent to the running min and max values - for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True): + for component in self.tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True): image, xywh, index, _ = component # # the region reference in the reading order element @@ -165,12 +148,12 @@ def estimate_bounds(self, page, page_image, tessapi, zoom=1.0): all_left, all_right, all_top, all_bottom) return all_left, all_top, all_right, all_bottom - def process_page(self, page, page_image, page_xywh, bounds, file_id, page_id): + def _process_page(self, page, page_image, page_xywh, bounds, file_id): """Set the identified page border, if valid.""" left, top, right, bottom = bounds if left >= right or top >= bottom: - self.logger.error("Cannot find valid extent for page '%s'", page_id) - return + self.logger.error("Cannot find valid extent for page") + return False padding = self.parameter['padding'] # add padding: left = max(left - padding, 0) @@ -183,7 +166,7 @@ def process_page(self, page, page_image, page_xywh, bounds, file_id, page_id): polygon = polygon_for_parent(polygon, page) if polygon is None: self.logger.error("Ignoring extant border") - return + return False border = BorderType(Coords=CoordsType( points_from_polygon(polygon))) # intersection with parent could have changed bbox, @@ -194,10 +177,9 @@ def process_page(self, page, page_image, page_xywh, bounds, file_id, page_id): # update METS (add the image file): page_image = crop_image(page_image, box=bbox) page_xywh['features'] += ',cropped' - file_path = self.workspace.save_image_file( - page_image, file_id + '.IMG-CROP', - page_id=page_id, file_grp=self.output_file_grp) + page_image_id = file_id + '.IMG-CROP' + page_image_path = os.path.join(self.output_file_grp, 
page_image_id + '.png') # update PAGE (reference the image file): page.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=page_xywh['features'])) - + filename=page_image_path, comments=page_xywh['features'])) + return page_image, page_image_id, page_image_path diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py index cacf632..c6797d4 100644 --- a/ocrd_tesserocr/deskew.py +++ b/ocrd_tesserocr/deskew.py @@ -10,28 +10,29 @@ TextlineOrder ) -from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, - membername, - MIMETYPE_PAGE -) -from ocrd_modelfactory import page_from_file +from ocrd_utils import membername from ocrd_models.ocrd_page import ( AlternativeImageType, TextLineType, TextRegionType, PageType, - to_xml ) from .recognize import TesserocrRecognize + class TesserocrDeskew(TesserocrRecognize): @property def executable(self): return 'ocrd-tesserocr-deskew' - def process(self): + def _init(self): + # use default model (eng) with vanilla tesserocr API + self.tessapi = PyTessBaseAPI(lang="osd", # osd required for legacy init! + oem=OEM.TESSERACT_LSTM_COMBINED, # legacy required for OSD! + psm=PSM.AUTO_OSD) + if self.parameter['operation_level'] == 'line': + self.tessapi.SetVariable("min_characters_to_try", "15") + + def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): """Performs deskewing of the page / region with Tesseract on the workspace. Open and deserialise PAGE input files and their respective images, @@ -49,97 +50,83 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) oplevel = self.parameter['operation_level'] - with PyTessBaseAPI( - lang="osd", # osd required for legacy init! - oem=OEM.TESSERACT_LSTM_COMBINED, # legacy required for OSD! 
- psm=PSM.AUTO_OSD - ) as tessapi: - if oplevel == 'line': - tessapi.SetVariable("min_characters_to_try", "15") - for n, input_file in enumerate(self.input_files): - file_id = make_file_id(input_file, self.output_file_grp) - page_id = input_file.pageId or input_file.ID - self.logger.info("INPUT FILE %i / %s", n, page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) - pcgts.set_pcGtsId(file_id) - self.add_metadata(pcgts) - page = pcgts.get_Page() - - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id, - # image must not have been rotated already, - # (we will overwrite @orientation anyway,) - # abort if no such image can be produced: - feature_filter='deskewed' if oplevel == 'page' else '') - if self.parameter['dpi'] > 0: - dpi = self.parameter['dpi'] - self.logger.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi) - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi = round(dpi * 2.54) - self.logger.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi) - else: - dpi = 0 - self.logger.info("Page '%s' images will use DPI estimated from segmentation", page_id) - tessapi.SetVariable('user_defined_dpi', str(dpi)) - - self.logger.info("Deskewing on '%s' level in page '%s'", oplevel, page_id) + page = pcgts.get_Page() - if oplevel == 'page': - self._process_segment(tessapi, page, page_image, page_xywh, - "page '%s'" % page_id, input_file.pageId, - file_id) - else: - regions = page.get_AllRegions(classes=['Text', 'Table']) - if not regions: - self.logger.warning("Page '%s' contains no text regions", page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh, - # image must not have been rotated already, - # (we will overwrite @orientation anyway,) - # abort if no such image can be produced: - feature_filter='deskewed') - 
if oplevel == 'region': - self._process_segment(tessapi, region, region_image, region_xywh, - "region '%s'" % region.id, input_file.pageId, - file_id + '_' + region.id) - elif isinstance(region, TextRegionType): - lines = region.get_TextLine() - if not lines: - self.logger.warning("Page '%s' region '%s' contains no lines", page_id, region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh) - self._process_segment(tessapi, line, line_image, line_xywh, - "line '%s'" % line.id, input_file.pageId, - file_id + '_' + region.id + '_' + line.id) + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id, + # image must not have been rotated already, + # (we will overwrite @orientation anyway,) + # abort if no such image can be produced: + feature_filter='deskewed' if oplevel == 'page' else '') + if self.parameter['dpi'] > 0: + dpi = self.parameter['dpi'] + self.logger.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi) + elif page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) + self.logger.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi) + else: + dpi = 0 + self.logger.info("Page '%s' images will use DPI estimated from segmentation", page_id) + self.tessapi.SetVariable('user_defined_dpi', str(dpi)) - self.workspace.add_file( - file_id=file_id, - file_grp=self.output_file_grp, - page_id=input_file.pageId, - mimetype=MIMETYPE_PAGE, - local_filename=os.path.join(self.output_file_grp, file_id + '.xml'), - content=to_xml(pcgts)) - - def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_id): + self.logger.info("Deskewing on '%s' level in page '%s'", oplevel, page_id) + + if oplevel == 'page': + image = self._process_segment(page, page_image, page_xywh, + "page '%s'" % page_id, + output_file_id) + if image: + return 
[pcgts, image] + else: + return pcgts + + result = [pcgts] + regions = page.get_AllRegions(classes=['Text', 'Table']) + if not regions: + self.logger.warning("Page '%s' contains no text regions", page_id) + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh, + # image must not have been rotated already, + # (we will overwrite @orientation anyway,) + # abort if no such image can be produced: + feature_filter='deskewed') + if oplevel == 'region': + image = self._process_segment(region, region_image, region_xywh, + "region '%s'" % region.id, + output_file_id + '_' + region.id) + if image: + result.append(image) + elif isinstance(region, TextRegionType): + lines = region.get_TextLine() + if not lines: + self.logger.warning("Page '%s' region '%s' contains no lines", page_id, region.id) + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment( + line, region_image, region_xywh) + image = self._process_segment(line, line_image, line_xywh, + "line '%s'" % line.id, + output_file_id + '_' + region.id + '_' + line.id) + if image: + result.append(image) + return result + + def _process_segment(self, segment, image, xywh, where, file_id): if not image.width or not image.height: self.logger.warning("Skipping %s with zero size", where) - return + return False angle0 = xywh['angle'] # deskewing (w.r.t. top image) already applied to image angle = 0. 
# additional angle to be applied at current level - tessapi.SetImage(image) - #tessapi.SetPageSegMode(PSM.AUTO_OSD) + self.tessapi.SetImage(image) + #self.tessapi.SetPageSegMode(PSM.AUTO_OSD) # # orientation/script # - osr = tessapi.DetectOrientationScript() + osr = self.tessapi.DetectOrientationScript() if osr: assert not math.isnan(osr['orient_conf']), \ "orientation detection failed (Tesseract probably compiled without legacy OEM, or osd model not installed)" @@ -207,14 +194,14 @@ def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_i else: self.logger.warning('no OSD result in %s', where) if isinstance(segment, TextLineType): - return + return False # # orientation/skew # - layout = tessapi.AnalyseLayout() + layout = self.tessapi.AnalyseLayout() if not layout: self.logger.warning('no result iterator in %s', where) - return + return False orientation, writing_direction, textline_order, deskew_angle = layout.Orientation() if isinstance(segment, (TextRegionType, PageType)): segment.set_readingDirection({ @@ -270,7 +257,7 @@ def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_i # We can delegate to OCR-D core for reflection, deskewing and re-cropping: if isinstance(segment, PageType): image, xywh, _ = self.workspace.image_from_page( - segment, page_id, + segment, where, fill='background', transparency=True) else: image, xywh = self.workspace.image_from_segment( @@ -283,10 +270,9 @@ def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_i xywh['features'] += ',deskewed' features = xywh['features'] # features already applied to image # update METS (add the image file): - file_path = self.workspace.save_image_file( - image, file_id + '.IMG-DESKEW', - page_id=page_id, - file_grp=self.output_file_grp) + file_id += '.IMG-DESKEW' + file_path = os.path.join(self.output_file_grp, file_id + '.png') # update PAGE (reference the image file): segment.add_AlternativeImage(AlternativeImageType( 
filename=file_path, comments=features)) + return image, file_id, file_path diff --git a/ocrd_tesserocr/fontshape.py b/ocrd_tesserocr/fontshape.py index 06d762c..b8eb00c 100644 --- a/ocrd_tesserocr/fontshape.py +++ b/ocrd_tesserocr/fontshape.py @@ -8,28 +8,31 @@ get_languages ) -from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, - MIMETYPE_PAGE -) -from ocrd_models.ocrd_page import ( - TextStyleType, - to_xml) -from ocrd_modelfactory import page_from_file +from ocrd_models.ocrd_page import TextStyleType from .recognize import TesserocrRecognize +from .common import pad_image class TesserocrFontShape(TesserocrRecognize): @property def executable(self): return 'ocrd-tesserocr-fontshape' - def process(self): + def _init(self): + model = self.parameter['model'] + if model not in get_languages()[1]: + raise Exception("model " + model + " (needed for font style detection) is not installed") + # use vanilla tesserocr API + self.tessapi = PyTessBaseAPI(oem=OEM.TESSERACT_ONLY, # legacy required for OSD or WordFontAttributes! + #oem=OEM.TESSERACT_LSTM_COMBINED, + lang=model) + self.logger.info("Using model '%s' in %s for recognition at the word level", + model, get_languages()[0]) + + def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): """Detect font shapes via rule-based OCR with Tesseract on the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the line level. Set up Tesseract to recognise each word's image (either from @@ -41,61 +44,34 @@ def process(self): Produce new output files by serialising the resulting hierarchy. 
""" - self.logger.debug("TESSDATA: %s, installed Tesseract models: %s", *get_languages()) - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) + page = pcgts.get_Page() - model = self.parameter['model'] - if model not in get_languages()[1]: - raise Exception("model " + model + " (needed for font style detection) is not installed") - - with PyTessBaseAPI(#oem=OEM.TESSERACT_LSTM_COMBINED, # legacy required for OSD or WordFontAttributes! - oem=OEM.TESSERACT_ONLY, # legacy required for OSD or WordFontAttributes! - lang=model) as tessapi: - self.logger.info("Using model '%s' in %s for recognition at the word level", - model, get_languages()[0]) - for (n, input_file) in enumerate(self.input_files): - page_id = input_file.pageId or input_file.ID - self.logger.info("INPUT FILE %i / %s", n, page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page = pcgts.get_Page() - - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id) - if self.parameter['dpi'] > 0: - dpi = self.parameter['dpi'] - self.logger.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi) - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi = round(dpi * 2.54) - self.logger.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi) - else: - dpi = 0 - self.logger.info("Page '%s' images will use DPI estimated from segmentation", page_id) - tessapi.SetVariable('user_defined_dpi', str(dpi)) - - self.logger.info("Processing page '%s'", page_id) - regions = page.get_AllRegions(classes=['Text']) - if not regions: - self.logger.warning("Page '%s' contains no text regions", page_id) - else: - self._process_regions(tessapi, regions, page_image, page_coords) - - file_id = make_file_id(input_file, self.output_file_grp) - pcgts.set_pcGtsId(file_id) - 
self.workspace.add_file( - file_id=file_id, - file_grp=self.output_file_grp, - page_id=input_file.pageId, - mimetype=MIMETYPE_PAGE, - local_filename=os.path.join(self.output_file_grp, - file_id + '.xml'), - content=to_xml(pcgts)) + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id) + if self.parameter['dpi'] > 0: + dpi = self.parameter['dpi'] + self.logger.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi) + elif page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) + self.logger.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi) + else: + dpi = 0 + self.logger.info("Page '%s' images will use DPI estimated from segmentation", page_id) + self.tessapi.SetVariable('user_defined_dpi', str(dpi)) + + self.logger.info("Processing page '%s'", page_id) + regions = page.get_AllRegions(classes=['Text']) + if not regions: + self.logger.warning("Page '%s' contains no text regions", page_id) + else: + self._process_regions(regions, page_image, page_coords) + + return pcgts - def _process_regions(self, tessapi, regions, page_image, page_coords): + def _process_regions(self, regions, page_image, page_coords): for region in regions: region_image, region_coords = self.workspace.image_from_segment( region, page_image, page_coords) @@ -103,9 +79,9 @@ def _process_regions(self, tessapi, regions, page_image, page_coords): if not textlines: self.logger.warning("Region '%s' contains no text lines", region.id) else: - self._process_lines(tessapi, textlines, region_image, region_coords) + self._process_lines(textlines, region_image, region_coords) - def _process_lines(self, tessapi, textlines, region_image, region_coords): + def _process_lines(self, textlines, region_image, region_coords): for line in textlines: line_image, line_coords = self.workspace.image_from_segment( line, region_image, region_coords) @@ 
-114,20 +90,20 @@ def _process_lines(self, tessapi, textlines, region_image, region_coords): if not words: self.logger.warning("Line '%s' contains no words", line.id) else: - self._process_words(tessapi, words, line_image, line_coords) + self._process_words(words, line_image, line_coords) - def _process_words(self, tessapi, words, line_image, line_coords): + def _process_words(self, words, line_image, line_coords): for word in words: word_image, word_coords = self.workspace.image_from_segment( word, line_image, line_coords) if self.parameter['padding']: - tessapi.SetImage(pad_image(word_image, self.parameter['padding'])) + self.tessapi.SetImage(pad_image(word_image, self.parameter['padding'])) else: - tessapi.SetImage(word_image) - tessapi.SetPageSegMode(PSM.SINGLE_WORD) - #tessapi.SetPageSegMode(PSM.RAW_LINE) - tessapi.Recognize() - result_it = tessapi.GetIterator() + self.tessapi.SetImage(word_image) + self.tessapi.SetPageSegMode(PSM.SINGLE_WORD) + #self.tessapi.SetPageSegMode(PSM.RAW_LINE) + self.tessapi.Recognize() + result_it = self.tessapi.GetIterator() if not result_it or result_it.Empty(RIL.WORD): self.logger.warning("No text in word '%s'", word.id) continue @@ -159,16 +135,3 @@ def _process_words(self, tessapi, words, line_image, line_coords): if 'serif' in word_attributes else None) word.set_TextStyle(word_style) # (or somewhere in custom attribute?) 
-def pad_image(image, padding): - stat = ImageStat.Stat(image) - # workaround for Pillow#4925 - if len(stat.bands) > 1: - background = tuple(stat.median) - else: - background = stat.median[0] - padded = Image.new(image.mode, - (image.width + 2 * padding, - image.height + 2 * padding), - background) - padded.paste(image, (padding, padding)) - return padded diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py index f591eed..d609979 100644 --- a/ocrd_tesserocr/recognize.py +++ b/ocrd_tesserocr/recognize.py @@ -1,14 +1,8 @@ from __future__ import absolute_import + from os.path import join -from pathlib import Path import math -import itertools -from PIL import Image, ImageStat import numpy as np -from scipy.sparse.csgraph import minimum_spanning_tree -from shapely.geometry import Polygon, LineString -from shapely.ops import unary_union, nearest_points, orient -from shapely import set_precision from tesserocr import ( RIL, PSM, PT, OEM, @@ -21,15 +15,12 @@ from ocrd_utils import ( getLogger, - make_file_id, assert_file_grp_cardinality, shift_coordinates, coordinates_for_segment, polygon_from_x0y0x1y1, - polygon_from_points, points_from_polygon, xywh_from_polygon, - MIMETYPE_PAGE, VERSION as OCRD_VERSION, membername ) @@ -54,21 +45,19 @@ GlyphType, TextEquivType, AlternativeImageType, - to_xml) -from ocrd_models.ocrd_page_generateds import ( - ReadingDirectionSimpleType, - TextLineOrderSimpleType, - TextTypeSimpleType ) -from ocrd_modelfactory import page_from_file +from ocrd_models.ocrd_page_generateds import TextTypeSimpleType from ocrd import Processor +from .common import * + + CHOICE_THRESHOLD_NUM = 10 # maximum number of choices to query and annotate CHOICE_THRESHOLD_CONF = 1 # maximum score drop from best choice to query and annotate # (ChoiceIterator usually rounds to 0.0 for non-best, so this better be maximum) -# monkey-patch the tesserocr base class so have at least some state class TessBaseAPI(PyTessBaseAPI): + """wraps the tesserocr base 
class so have some state (for parameter/model switching)""" parameters = {} psm = PSM.AUTO image = None @@ -139,11 +128,163 @@ def setup(self): self.logger.debug("TESSDATA: %s, installed Tesseract models: %s", *get_languages()) assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) + self._init() + + def _init(self): + model = "eng" + if 'model' in self.parameter: + model = self.parameter['model'] + for sub_model in model.split('+'): + if sub_model.endswith('.traineddata'): + self.logger.warning("Model '%s' has a .traineddata extension, removing. Please use model names without .traineddata extension" % sub_model) + sub_model = sub_model.replace('.traineddata', '') + if sub_model not in get_languages()[1]: + raise Exception("configured model " + sub_model + " is not installed") + self.logger.info("Using model '%s' in %s for recognition at the %s level", + model, get_languages()[0], self.parameter['textequiv_level']) + self.tessapi = TessBaseAPI(init=False) + # Set init-time parameters + # self.SetVariable("debug_file", "") # show debug output (default: /dev/null) + if self.parameter['textequiv_level'] == 'glyph': + # populate GetChoiceIterator() with LSTM models, too: + self.tessapi.SetVariable("lstm_choice_mode", "2") # aggregate symbols + self.tessapi.SetVariable("lstm_choice_iterations", "15") # squeeze out more best paths + self.tessapi.SetVariable("pageseg_apply_music_mask", "1" if self.parameter['find_staves'] else "0") + # TODO: maybe warn/raise when illegal combinations or characters not in the model unicharset? 
+ if self.parameter['char_whitelist']: + self.tessapi.SetVariable("tessedit_char_whitelist", self.parameter['char_whitelist']) + if self.parameter['char_blacklist']: + self.tessapi.SetVariable("tessedit_char_blacklist", self.parameter['char_blacklist']) + if self.parameter['char_unblacklist']: + self.tessapi.SetVariable("tessedit_char_unblacklist", self.parameter['char_unblacklist']) + # todo: determine relevancy of these variables: + # tessedit_preserve_min_wd_len 2 + # tessedit_prefer_joined_punct 0 + # tessedit_write_rep_codes 0 + # tessedit_parallelize 0 + # tessedit_zero_rejection 0 + # tessedit_zero_kelvin_rejection 0 + # tessedit_reject_mode 0 + # tessedit_use_reject_spaces 1 + # tessedit_fix_fuzzy_spaces 1 + # tessedit_char_blacklist + # tessedit_char_whitelist + # chs_leading_punct ('`" + # chs_trailing_punct1 ).,;:?! + # chs_trailing_punct2 )'`" + # numeric_punctuation ., + # unrecognised_char | + # ok_repeated_ch_non_alphanum_wds -?*= + # conflict_set_I_l_1 Il1[] + # preserve_interword_spaces 0 + # tessedit_enable_dict_correction 0 + # tessedit_enable_bigram_correction 1 + # stopper_smallword_size 2 + # wordrec_max_join_chunks 4 + # suspect_space_level 100 + # suspect_short_words 2 + # language_model_ngram_on 0 + # language_model_ngram_order 8 + # language_model_min_compound_length 3 + # language_model_penalty_non_freq_dict_word 0.1 + # language_model_penalty_non_dict_word 0.15 + # language_model_penalty_punc 0.2 + # language_model_penalty_case 0.1 + # language_model_penalty_script 0.5 + # language_model_penalty_chartype 0.3 + # language_model_penalty_spacing 0.05 + # textord_max_noise_size 7 + # enable_noise_removal 1 + # classify_bln_numeric_mode 0 + # lstm_use_matrix 1 + # user_words_file + # user_patterns_file + for variable, value in self.parameter['tesseract_parameters'].items(): + self.tessapi.SetVariable(variable, value) + # Initialize Tesseract (loading model) + self.tessapi.InitFull(lang=model, oem=getattr(OEM, self.parameter['oem'])) + + def 
_reinit(self, segment, mapping): + """Reset Tesseract API to initial state, and apply API-level settings for the given segment. + + If ``xpath_parameters`` is used, try each XPath expression against ``segment``, + and in case of a match, apply given parameters, respectively. + + If ``xpath_model`` is used, try each XPath expression against ``segment``, + and in case of a match, load the given language/model, respectively. + + If ``auto_model`` is used, and no ``xpath_model`` was applied yet, + try each given language/model individually on ``segment``, compare + their confidences, and load the best-scoring language/model. + + Before returning, store all previous settings (to catch by the next call). + """ + # Tesseract API is stateful but does not allow copy constructors + # for segment-by-segment configuration we therefore need to + # re-initialize the API with the currently loaded settings, + # and add some custom choices + node = mapping.get(id(segment), None) + tag = segment.__class__.__name__[:-4] + if hasattr(segment, 'id'): + at_ident = 'id' + else: + at_ident = 'imageFilename' + ident = getattr(segment, at_ident) + with self.tessapi: + # apply temporary changes + if self.parameter['xpath_parameters']: + if node is not None and node.attrib.get(at_ident, None) == ident: + ns = {'re': 'http://exslt.org/regular-expressions', + 'pc': node.nsmap[node.prefix], + node.prefix: node.nsmap[node.prefix]} + for xpath, params in self.parameter['xpath_parameters'].items(): + if node.xpath(xpath, namespaces=ns): + self.logger.info("Found '%s' in '%s', setting '%s'", + xpath, ident, params) + for name, val in params.items(): + self.tessapi.SetVariable(name, val) + else: + self.logger.error("Cannot find segment '%s' in etree mapping, " + "ignoring xpath_parameters", ident) + if self.parameter['xpath_model']: + if node is not None and node.attrib.get(at_ident, None) == ident: + ns = {'re': 'http://exslt.org/regular-expressions', + 'pc': node.nsmap[node.prefix], + node.prefix: 
node.nsmap[node.prefix]} + models = [] + for xpath, model in self.parameter['xpath_model'].items(): + if node.xpath(xpath, namespaces=ns): + self.logger.info("Found '%s' in '%s', reloading with '%s'", + xpath, ident, model) + models.append(model) + if models: + model = '+'.join(models) + self.logger.debug("Reloading model '%s' for %s '%s'", model, tag, ident) + self.tessapi.Reset(lang=model) + return + else: + self.logger.error("Cannot find segment '%s' in etree mapping, " + "ignoring xpath_model", ident) + if self.parameter['auto_model']: + models = self.parameter['model'].split('+') + if len(models) > 1: + confs = list() + for model in models: + self.tessapi.Reset(lang=model) + self.tessapi.Recognize() + confs.append(self.tessapi.MeanTextConf()) + model = models[np.argmax(confs)] + self.logger.debug("Reloading best model '%s' for %s '%s'", model, tag, ident) + self.tessapi.Reset(lang=model) + return + if self.parameter['xpath_model'] or self.parameter['auto_model']: + # default: undo all settings from previous calls (reset to init-state) + self.tessapi.Reset() - def process(self): + def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): """Perform layout segmentation and/or text recognition with Tesseract. - Open and deserialise each PAGE input file and its respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the requested ``textequiv_level`` if it exists and if ``segmentation_level`` is lower (i.e. more granular) or ``none``. @@ -270,214 +411,125 @@ def process(self): inlevel = self.parameter['segmentation_level'] outlevel = self.parameter['textequiv_level'] segment_only = outlevel == 'none' or not self.parameter.get('model', '') - - model = "eng" - if 'model' in self.parameter: - model = self.parameter['model'] - for sub_model in model.split('+'): - if sub_model.endswith('.traineddata'): - self.logger.warning("Model '%s' has a .traineddata extension, removing. 
Please use model names without .traineddata extension" % sub_model) - sub_model = sub_model.replace('.traineddata', '') - if sub_model not in get_languages()[1]: - raise Exception("configured model " + sub_model + " is not installed") - self.logger.info("Using model '%s' in %s for recognition at the %s level", - model, get_languages()[0], outlevel) - - with TessBaseAPI(init=False) as tessapi: - # Set init-time parameters - # self.SetVariable("debug_file", "") # show debug output (default: /dev/null) - if outlevel == 'glyph': - # populate GetChoiceIterator() with LSTM models, too: - tessapi.SetVariable("lstm_choice_mode", "2") # aggregate symbols - tessapi.SetVariable("lstm_choice_iterations", "15") # squeeze out more best paths - tessapi.SetVariable("pageseg_apply_music_mask", "1" if self.parameter['find_staves'] else "0") - # TODO: maybe warn/raise when illegal combinations or characters not in the model unicharset? - if self.parameter['char_whitelist']: - tessapi.SetVariable("tessedit_char_whitelist", self.parameter['char_whitelist']) - if self.parameter['char_blacklist']: - tessapi.SetVariable("tessedit_char_blacklist", self.parameter['char_blacklist']) - if self.parameter['char_unblacklist']: - tessapi.SetVariable("tessedit_char_unblacklist", self.parameter['char_unblacklist']) - # todo: determine relevancy of these variables: - # tessedit_preserve_min_wd_len 2 - # tessedit_prefer_joined_punct 0 - # tessedit_write_rep_codes 0 - # tessedit_parallelize 0 - # tessedit_zero_rejection 0 - # tessedit_zero_kelvin_rejection 0 - # tessedit_reject_mode 0 - # tessedit_use_reject_spaces 1 - # tessedit_fix_fuzzy_spaces 1 - # tessedit_char_blacklist - # tessedit_char_whitelist - # chs_leading_punct ('`" - # chs_trailing_punct1 ).,;:?! 
- # chs_trailing_punct2 )'`" - # numeric_punctuation ., - # unrecognised_char | - # ok_repeated_ch_non_alphanum_wds -?*= - # conflict_set_I_l_1 Il1[] - # preserve_interword_spaces 0 - # tessedit_enable_dict_correction 0 - # tessedit_enable_bigram_correction 1 - # stopper_smallword_size 2 - # wordrec_max_join_chunks 4 - # suspect_space_level 100 - # suspect_short_words 2 - # language_model_ngram_on 0 - # language_model_ngram_order 8 - # language_model_min_compound_length 3 - # language_model_penalty_non_freq_dict_word 0.1 - # language_model_penalty_non_dict_word 0.15 - # language_model_penalty_punc 0.2 - # language_model_penalty_case 0.1 - # language_model_penalty_script 0.5 - # language_model_penalty_chartype 0.3 - # language_model_penalty_spacing 0.05 - # textord_max_noise_size 7 - # enable_noise_removal 1 - # classify_bln_numeric_mode 0 - # lstm_use_matrix 1 - # user_words_file - # user_patterns_file - tesseract_params = self.parameter['tesseract_parameters'] - for variable in tesseract_params: - tessapi.SetVariable(variable, tesseract_params[variable]) - # Initialize Tesseract (loading model) - tessapi.InitFull(lang=model, - oem=getattr(OEM, self.parameter['oem'])) - # Iterate input files - for (n, input_file) in enumerate(self.input_files): - file_id = make_file_id(input_file, self.output_file_grp) - page_id = input_file.pageId or input_file.ID - self.logger.info("INPUT FILE %i / %s", n, page_id) - pcgts, _, pcgts_mapping, _ = page_from_file(self.workspace.download_file(input_file), - with_tree=True) - pcgts.set_pcGtsId(file_id) - self.add_metadata(pcgts) - page = pcgts.get_Page() - - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id) - if self.parameter['dpi'] > 0: - dpi = self.parameter['dpi'] - self.logger.info("Page '%s' images will use %d DPI from parameter override", - page_id, dpi) - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi = 
round(dpi * 2.54) - self.logger.info("Page '%s' images will use %d DPI from image meta-data", - page_id, dpi) - else: - dpi = 0 - self.logger.info("Page '%s' images will use DPI estimated from segmentation", - page_id) - tessapi.SetVariable('user_defined_dpi', str(dpi)) - - self.logger.info("Processing page '%s'", page_id) - # FIXME: We should somehow _mask_ existing regions in order to annotate incrementally (not redundantly). - # Currently segmentation_level=region also means removing regions, - # but we could have an independent setting for that, and attempt - # to detect regions only where nothing exists yet (by clipping to - # background before, or by removing clashing predictions after - # detection). - regions = page.get_AllRegions(classes=['Text']) - if inlevel == 'region' and ( - not regions or self.parameter['overwrite_segments']): - for regiontype in [ - 'AdvertRegion', - 'ChartRegion', - 'ChemRegion', - 'GraphicRegion', - 'ImageRegion', - 'LineDrawingRegion', - 'MathsRegion', - 'MusicRegion', - 'NoiseRegion', - 'SeparatorRegion', - 'TableRegion', - 'TextRegion', - 'UnknownRegion']: - if getattr(page, 'get_' + regiontype)(): - self.logger.info('Removing existing %ss on page %s', regiontype, page_id) - getattr(page, 'set_' + regiontype)([]) - page.set_ReadingOrder(None) - # prepare Tesseract - if self.parameter['find_tables']: - if outlevel == 'region' and self.parameter.get('model', ''): - raise Exception("When segmentation_level is region and find_tables is enabled, textequiv_level must be at least cell, because text results cannot be annotated on tables directly.") - tessapi.SetVariable("textord_tabfind_find_tables", "1") # (default) - # this should yield additional blocks within the table blocks - # from the page iterator, but does not in fact (yet?): - # (and it can run into assertion errors when the table structure - # does not meet certain homogeneity expectations) - #tessapi.SetVariable("textord_tablefind_recognize_tables", "1") - else: - # 
disable table detection here, so tables will be - # analysed as independent text/line blocks: - tessapi.SetVariable("textord_tabfind_find_tables", "0") - if not segment_only: - self._reinit(tessapi, page, pcgts_mapping) - tessapi.SetImage(page_image) # is already cropped to Border - tessapi.SetPageSegMode(PSM.SPARSE_TEXT - if self.parameter['sparse_text'] - else PSM.AUTO) - if segment_only: - self.logger.debug("Detecting regions in page '%s'", page_id) - tessapi.AnalyseLayout() - else: - self.logger.debug("Recognizing text in page '%s'", page_id) - tessapi.Recognize() - page_image_bin = tessapi.GetThresholdedImage() - file_path = self.workspace.save_image_file( - page_image_bin, file_id + '.IMG-BIN', - page_id=input_file.pageId, - file_grp=self.output_file_grp) - # update PAGE (reference the image file): - page.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=page_coords['features'] + ',binarized,clipped')) - self._process_regions_in_page(tessapi.GetIterator(), page, page_coords, pcgts_mapping, dpi) - elif inlevel == 'cell': - # Tables are obligatorily recursive regions; - # they might have existing text regions (cells), - # which will be processed in the next branch - # (because the iterator is recursive to depth), - # or be empty. This is independent of whether - # or not they should be segmented into cells. 
- if outlevel == 'region': - raise Exception("When segmentation_level is cell, textequiv_level must be at least cell too, because text results cannot be annotated on tables directly.") - # disable table detection here, so tables will be - # analysed as independent text/line blocks: - tessapi.SetVariable("textord_tabfind_find_tables", "0") - tables = page.get_AllRegions(classes=['Table']) - if not tables: - self.logger.warning("Page '%s' contains no table regions (but segmentation is off)", - page_id) - else: - self._process_existing_tables(tessapi, tables, page, page_image, page_coords, pcgts_mapping) - elif regions: - self._process_existing_regions(tessapi, regions, page_image, page_coords, pcgts_mapping) - else: - self.logger.warning("Page '%s' contains no text regions (but segmentation is off)", - page_id) - # post-processing - # bottom-up text concatenation - if outlevel != 'none' and self.parameter.get('model', ''): - page_update_higher_textequiv_levels(outlevel, pcgts, self.parameter['overwrite_text']) - # bottom-up polygonal outline projection - # if inlevel != 'none' and self.parameter['shrink_polygons']: - # page_shrink_higher_coordinate_levels(inlevel, outlevel, pcgts) + pcgts_mapping = dict() + _ = pcgts.to_etree(mapping_=pcgts_mapping) + page = pcgts.get_Page() + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id) + if self.parameter['dpi'] > 0: + dpi = self.parameter['dpi'] + self.logger.info("Page '%s' images will use %d DPI from parameter override", + page_id, dpi) + elif page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) + self.logger.info("Page '%s' images will use %d DPI from image meta-data", + page_id, dpi) + else: + dpi = 0 + self.logger.info("Page '%s' images will use DPI estimated from segmentation", + page_id) + self.tessapi.SetVariable('user_defined_dpi', str(dpi)) + + self.logger.info("Processing page '%s'", 
page_id) + result = [pcgts] + # FIXME: We should somehow _mask_ existing regions in order to annotate incrementally (not redundantly). + # Currently segmentation_level=region also means removing regions, + # but we could have an independent setting for that, and attempt + # to detect regions only where nothing exists yet (by clipping to + # background before, or by removing clashing predictions after + # detection). + regions = page.get_AllRegions(classes=['Text']) + if inlevel == 'region' and ( + not regions or self.parameter['overwrite_segments']): + for regiontype in [ + 'AdvertRegion', + 'ChartRegion', + 'ChemRegion', + 'GraphicRegion', + 'ImageRegion', + 'LineDrawingRegion', + 'MathsRegion', + 'MusicRegion', + 'NoiseRegion', + 'SeparatorRegion', + 'TableRegion', + 'TextRegion', + 'UnknownRegion']: + if getattr(page, 'get_' + regiontype)(): + self.logger.info('Removing existing %ss on page %s', regiontype, page_id) + getattr(page, 'set_' + regiontype)([]) + page.set_ReadingOrder(None) + # prepare Tesseract + if self.parameter['find_tables']: + if outlevel == 'region' and self.parameter.get('model', ''): + raise Exception("When segmentation_level is region and find_tables is enabled, textequiv_level must be at least cell, because text results cannot be annotated on tables directly.") + self.tessapi.SetVariable("textord_tabfind_find_tables", "1") # (default) + # this should yield additional blocks within the table blocks + # from the page iterator, but does not in fact (yet?): + # (and it can run into assertion errors when the table structure + # does not meet certain homogeneity expectations) + #self.tessapi.SetVariable("textord_tablefind_recognize_tables", "1") + else: + # disable table detection here, so tables will be + # analysed as independent text/line blocks: + self.tessapi.SetVariable("textord_tabfind_find_tables", "0") + if not segment_only: + self._reinit(page, pcgts_mapping) + self.tessapi.SetImage(page_image) # is already cropped to Border + 
self.tessapi.SetPageSegMode(PSM.SPARSE_TEXT + if self.parameter['sparse_text'] + else PSM.AUTO) + if segment_only: + self.logger.debug("Detecting regions in page '%s'", page_id) + self.tessapi.AnalyseLayout() + else: + self.logger.debug("Recognizing text in page '%s'", page_id) + self.tessapi.Recognize() + page_image_bin = self.tessapi.GetThresholdedImage() + page_image_bin_id = output_file_id + '.IMG-BIN' + page_image_bin_path = join(self.output_file_grp, page_image_bin_id + '.png') + # update METS (reference the image file) and store image file: + result.append((page_image_bin, page_image_bin_id, page_image_bin_path)) + # update PAGE (reference the image file): + page.add_AlternativeImage(AlternativeImageType( + filename=page_image_bin_path, comments=page_coords['features'] + ',binarized,clipped')) + self._process_regions_in_page(self.tessapi.GetIterator(), page, page_coords, pcgts_mapping, dpi) + elif inlevel == 'cell': + # Tables are obligatorily recursive regions; + # they might have existing text regions (cells), + # which will be processed in the next branch + # (because the iterator is recursive to depth), + # or be empty. This is independent of whether + # or not they should be segmented into cells. 
+ if outlevel == 'region': + raise Exception("When segmentation_level is cell, textequiv_level must be at least cell too, because text results cannot be annotated on tables directly.") + # disable table detection here, so tables will be + # analysed as independent text/line blocks: + self.tessapi.SetVariable("textord_tabfind_find_tables", "0") + tables = page.get_AllRegions(classes=['Table']) + if not tables: + self.logger.warning("Page '%s' contains no table regions (but segmentation is off)", + page_id) + else: + self._process_existing_tables(tables, page, page_image, page_coords, pcgts_mapping) + elif regions: + self._process_existing_regions(regions, page_image, page_coords, pcgts_mapping) + else: + self.logger.warning("Page '%s' contains no text regions (but segmentation is off)", + page_id) + + # post-processing + # bottom-up text concatenation + if outlevel != 'none' and self.parameter.get('model', ''): + page_update_higher_textequiv_levels(outlevel, pcgts, self.parameter['overwrite_text']) + # bottom-up polygonal outline projection + # if inlevel != 'none' and self.parameter['shrink_polygons']: + # page_shrink_higher_coordinate_levels(inlevel, outlevel, pcgts) - self.workspace.add_file( - file_id=file_id, - file_grp=self.output_file_grp, - page_id=input_file.pageId, - mimetype=MIMETYPE_PAGE, - local_filename=join(self.output_file_grp, - file_id + '.xml'), - content=to_xml(pcgts)) + return result def _process_regions_in_page(self, result_it, page, page_coords, mapping, dpi): index = 0 @@ -810,7 +862,7 @@ def _process_glyphs_in_word(self, result_it, word, coords, mapping): Unicode=alternative_text, conf=alternative_conf)) - def _process_existing_tables(self, tessapi, tables, page, page_image, page_coords, mapping): + def _process_existing_tables(self, tables, page, page_image, page_coords, mapping): # prepare dict of reading order reading_order = dict() ro = page.get_ReadingOrder() @@ -826,7 +878,7 @@ def _process_existing_tables(self, tessapi, tables, page, 
page_image, page_coord cells = table.get_TextRegion() if cells: if not self.parameter['overwrite_segments']: - self._process_existing_regions(tessapi, cells, page_image, page_coords, mapping) + self._process_existing_regions(cells, page_image, page_coords, mapping) continue self.logger.info('Removing existing TextRegion cells in table %s', table.id) for cell in table.get_TextRegion(): @@ -876,24 +928,24 @@ def _process_existing_tables(self, tessapi, tables, page, page_image, page_coord self.logger.warning("Skipping table region '%s' with zero size", table.id) continue if not segment_only: - self._reinit(tessapi, table, mapping) + self._reinit(table, mapping) if self.parameter['padding']: - tessapi.SetImage(pad_image(table_image, self.parameter['padding'])) + self.tessapi.SetImage(pad_image(table_image, self.parameter['padding'])) table_coords['transform'] = shift_coordinates( table_coords['transform'], 2*[self.parameter['padding']]) else: - tessapi.SetImage(table_image) - tessapi.SetPageSegMode(PSM.SPARSE_TEXT) # retrieve "cells" + self.tessapi.SetImage(table_image) + self.tessapi.SetPageSegMode(PSM.SPARSE_TEXT) # retrieve "cells" # TODO: we should XY-cut the sparse cells in regroup them into consistent cells if segment_only: self.logger.debug("Detecting cells in table '%s'", table.id) - tessapi.AnalyseLayout() + self.tessapi.AnalyseLayout() else: self.logger.debug("Recognizing text in table '%s'", table.id) - tessapi.Recognize() - self._process_cells_in_table(tessapi.GetIterator(), table, roelem, table_coords, mapping) + self.tessapi.Recognize() + self._process_cells_in_table(self.tessapi.GetIterator(), table, roelem, table_coords, mapping) - def _process_existing_regions(self, tessapi, regions, page_image, page_coords, mapping): + def _process_existing_regions(self, regions, page_image, page_coords, mapping): if self.parameter['textequiv_level'] in ['region', 'cell'] and not self.parameter.get('model', ''): return segment_only = self.parameter['textequiv_level'] 
== 'none' or not self.parameter.get('model', '') @@ -904,22 +956,22 @@ def _process_existing_regions(self, tessapi, regions, page_image, page_coords, m self.logger.warning("Skipping text region '%s' with zero size", region.id) continue if not segment_only: - self._reinit(tessapi, region, mapping) + self._reinit(region, mapping) if (region.get_TextEquiv() and not self.parameter['overwrite_text'] if self.parameter['textequiv_level'] in ['region', 'cell'] else self.parameter['segmentation_level'] != 'line'): pass # image not used here elif self.parameter['padding']: region_image = pad_image(region_image, self.parameter['padding']) - tessapi.SetImage(region_image) + self.tessapi.SetImage(region_image) region_coords['transform'] = shift_coordinates( region_coords['transform'], 2*[self.parameter['padding']]) else: - tessapi.SetImage(region_image) - tessapi.SetPageSegMode(PSM.SINGLE_BLOCK) + self.tessapi.SetImage(region_image) + self.tessapi.SetPageSegMode(PSM.SINGLE_BLOCK) # cell (region in table): we could enter from existing_tables or top-level existing regions if self.parameter['textequiv_level'] in ['region', 'cell']: - #if region.get_primaryScript() not in tessapi.GetLoadedLanguages()... + #if region.get_primaryScript() not in self.tessapi.GetLoadedLanguages()... 
if region.get_TextEquiv(): if not self.parameter['overwrite_text']: continue @@ -928,9 +980,9 @@ def _process_existing_regions(self, tessapi, regions, page_image, page_coords, m self.logger.debug("Recognizing text in region '%s'", region.id) # todo: consider SetParagraphSeparator region.add_TextEquiv(TextEquivType( - Unicode=tessapi.GetUTF8Text().rstrip("\n\f"), + Unicode=self.tessapi.GetUTF8Text().rstrip("\n\f"), # iterator scores are arithmetic averages, too - conf=tessapi.MeanTextConf()/100.0)) + conf=self.tessapi.MeanTextConf()/100.0)) continue # next region (to avoid indentation below) ## line, word, or glyph level: textlines = region.get_TextLine() @@ -941,18 +993,18 @@ def _process_existing_regions(self, tessapi, regions, page_image, page_coords, m region.set_TextLine([]) if segment_only: self.logger.debug("Detecting lines in region '%s'", region.id) - tessapi.AnalyseLayout() + self.tessapi.AnalyseLayout() else: self.logger.debug("Recognizing text in region '%s'", region.id) - tessapi.Recognize() - self._process_lines_in_region(tessapi.GetIterator(), region, region_coords, mapping) + self.tessapi.Recognize() + self._process_lines_in_region(self.tessapi.GetIterator(), region, region_coords, mapping) elif textlines: - self._process_existing_lines(tessapi, textlines, region_image, region_coords, mapping) + self._process_existing_lines(textlines, region_image, region_coords, mapping) else: self.logger.warning("Region '%s' contains no text lines (but segmentation is off)", region.id) - def _process_existing_lines(self, tessapi, textlines, region_image, region_coords, mapping): + def _process_existing_lines(self, textlines, region_image, region_coords, mapping): if self.parameter['textequiv_level'] == 'line' and not self.parameter.get('model', ''): return segment_only = self.parameter['textequiv_level'] == 'none' or not self.parameter.get('model', '') @@ -963,23 +1015,23 @@ def _process_existing_lines(self, tessapi, textlines, region_image, region_coord 
self.logger.warning("Skipping text line '%s' with zero size", line.id) continue if not segment_only: - self._reinit(tessapi, line, mapping) + self._reinit(line, mapping) if (line.get_TextEquiv() and not self.parameter['overwrite_text'] if self.parameter['textequiv_level'] == 'line' else self.parameter['segmentation_level'] != 'word'): pass # image not used here elif self.parameter['padding']: line_image = pad_image(line_image, self.parameter['padding']) - tessapi.SetImage(line_image) + self.tessapi.SetImage(line_image) line_coords['transform'] = shift_coordinates( line_coords['transform'], 2*[self.parameter['padding']]) else: - tessapi.SetImage(line_image) + self.tessapi.SetImage(line_image) if self.parameter['raw_lines']: - tessapi.SetPageSegMode(PSM.RAW_LINE) + self.tessapi.SetPageSegMode(PSM.RAW_LINE) else: - tessapi.SetPageSegMode(PSM.SINGLE_LINE) - #if line.get_primaryScript() not in tessapi.GetLoadedLanguages()... + self.tessapi.SetPageSegMode(PSM.SINGLE_LINE) + #if line.get_primaryScript() not in self.tessapi.GetLoadedLanguages()... 
if self.parameter['textequiv_level'] == 'line': if line.get_TextEquiv(): if not self.parameter['overwrite_text']: @@ -989,9 +1041,9 @@ def _process_existing_lines(self, tessapi, textlines, region_image, region_coord self.logger.debug("Recognizing text in line '%s'", line.id) # todo: consider BlankBeforeWord, SetLineSeparator line.add_TextEquiv(TextEquivType( - Unicode=tessapi.GetUTF8Text().rstrip("\n\f"), + Unicode=self.tessapi.GetUTF8Text().rstrip("\n\f"), # iterator scores are arithmetic averages, too - conf=tessapi.MeanTextConf()/100.0)) + conf=self.tessapi.MeanTextConf()/100.0)) continue # next line (to avoid indentation below) ## word, or glyph level: words = line.get_Word() @@ -1002,21 +1054,21 @@ def _process_existing_lines(self, tessapi, textlines, region_image, region_coord line.set_Word([]) if segment_only: self.logger.debug("Detecting words in line '%s'", line.id) - tessapi.AnalyseLayout() + self.tessapi.AnalyseLayout() else: self.logger.debug("Recognizing text in line '%s'", line.id) - tessapi.Recognize() + self.tessapi.Recognize() ## internal word and glyph layout: - self._process_words_in_line(tessapi.GetIterator(), line, line_coords, mapping) + self._process_words_in_line(self.tessapi.GetIterator(), line, line_coords, mapping) elif words: ## external word layout: self.logger.warning("Line '%s' contains words already, recognition might be suboptimal", line.id) - self._process_existing_words(tessapi, words, line_image, line_coords, mapping) + self._process_existing_words(words, line_image, line_coords, mapping) else: self.logger.warning("Line '%s' contains no words (but segmentation is off)", line.id) - def _process_existing_words(self, tessapi, words, line_image, line_coords, mapping): + def _process_existing_words(self, words, line_image, line_coords, mapping): if self.parameter['textequiv_level'] == 'word' and not self.parameter.get('model', ''): return segment_only = self.parameter['textequiv_level'] == 'none' or not self.parameter.get('model', '') 
@@ -1027,19 +1079,19 @@ def _process_existing_words(self, tessapi, words, line_image, line_coords, mappi self.logger.warning("Skipping word '%s' with zero size", word.id) continue if not segment_only: - self._reinit(tessapi, word, mapping) + self._reinit(word, mapping) if (word.get_TextEquiv() and not self.parameter['overwrite_text'] if self.parameter['textequiv_level'] == 'word' else self.parameter['segmentation_level'] != 'glyph'): pass # image not used here elif self.parameter['padding']: word_image = pad_image(word_image, self.parameter['padding']) - tessapi.SetImage(word_image) + self.tessapi.SetImage(word_image) word_coords['transform'] = shift_coordinates( word_coords['transform'], 2*[self.parameter['padding']]) else: - tessapi.SetImage(word_image) - tessapi.SetPageSegMode(PSM.SINGLE_WORD) + self.tessapi.SetImage(word_image) + self.tessapi.SetPageSegMode(PSM.SINGLE_WORD) if self.parameter['textequiv_level'] == 'word': if word.get_TextEquiv(): if not self.parameter['overwrite_text']: @@ -1047,9 +1099,9 @@ def _process_existing_words(self, tessapi, words, line_image, line_coords, mappi self.logger.warning("Word '%s' already contained text results", word.id) word.set_TextEquiv([]) self.logger.debug("Recognizing text in word '%s'", word.id) - word_conf = tessapi.AllWordConfidences() + word_conf = self.tessapi.AllWordConfidences() word.add_TextEquiv(TextEquivType( - Unicode=tessapi.GetUTF8Text().rstrip("\n\f"), + Unicode=self.tessapi.GetUTF8Text().rstrip("\n\f"), conf=word_conf[0]/100.0 if word_conf else 0.0)) continue # next word (to avoid indentation below) ## glyph level: @@ -1061,21 +1113,21 @@ def _process_existing_words(self, tessapi, words, line_image, line_coords, mappi word.set_Glyph([]) if segment_only: self.logger.debug("Detecting glyphs in word '%s'", word.id) - tessapi.AnalyseLayout() + self.tessapi.AnalyseLayout() else: self.logger.debug("Recognizing text in word '%s'", word.id) - tessapi.Recognize() + self.tessapi.Recognize() ## internal glyph 
layout: - self._process_glyphs_in_word(tessapi.GetIterator(), word, word_coords, mapping) + self._process_glyphs_in_word(self.tessapi.GetIterator(), word, word_coords, mapping) elif glyphs: ## external glyph layout: self.logger.warning("Word '%s' contains glyphs already, recognition might be suboptimal", word.id) - self._process_existing_glyphs(tessapi, glyphs, word_image, word_coords, mapping) + self._process_existing_glyphs(glyphs, word_image, word_coords, mapping) else: self.logger.warning("Word '%s' contains no glyphs (but segmentation is off)", word.id) - def _process_existing_glyphs(self, tessapi, glyphs, word_image, word_xywh, mapping): + def _process_existing_glyphs(self, glyphs, word_image, word_xywh, mapping): if not self.parameter.get('model', ''): return for glyph in glyphs: @@ -1084,29 +1136,29 @@ def _process_existing_glyphs(self, tessapi, glyphs, word_image, word_xywh, mappi if not glyph_image.width or not glyph_image.height: self.logger.warning("Skipping glyph '%s' with zero size", glyph.id) continue - self._reinit(tessapi, glyph, mapping) + self._reinit(glyph, mapping) if glyph.get_TextEquiv() and not self.parameter['overwrite_text']: pass # image not used here elif self.parameter['padding']: - tessapi.SetImage(pad_image(glyph_image, self.parameter['padding'])) + self.tessapi.SetImage(pad_image(glyph_image, self.parameter['padding'])) else: - tessapi.SetImage(glyph_image) - tessapi.SetPageSegMode(PSM.SINGLE_CHAR) + self.tessapi.SetImage(glyph_image) + self.tessapi.SetPageSegMode(PSM.SINGLE_CHAR) if glyph.get_TextEquiv(): if not self.parameter['overwrite_text']: continue self.logger.warning("Glyph '%s' already contained text results", glyph.id) glyph.set_TextEquiv([]) self.logger.debug("Recognizing text in glyph '%s'", glyph.id) - glyph_text = tessapi.GetUTF8Text().rstrip("\n\f") - glyph_conf = tessapi.AllWordConfidences() + glyph_text = self.tessapi.GetUTF8Text().rstrip("\n\f") + glyph_conf = self.tessapi.AllWordConfidences() glyph_conf = 
glyph_conf[0]/100.0 if glyph_conf else 1.0 #self.logger.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf) glyph.add_TextEquiv(TextEquivType( index=0, Unicode=glyph_text, conf=glyph_conf)) - result_it = tessapi.GetIterator() + result_it = self.tessapi.GetIterator() if not result_it or result_it.Empty(RIL.SYMBOL): self.logger.error("No text in glyph '%s'", glyph.id) continue @@ -1175,444 +1227,3 @@ def _add_orientation(self, result_it, region, coords): TextlineOrder.TOP_TO_BOTTOM: 'top-to-bottom' }.get(textline_order, 'bottom-to-top')) - def _reinit(self, tessapi, segment, mapping): - """Reset Tesseract API to initial state, and apply API-level settings for the given segment. - - If ``xpath_parameters`` is used, try each XPath expression against ``segment``, - and in case of a match, apply given parameters, respectively. - - If ``xpath_model`` is used, try each XPath expression against ``segment``, - and in case of a match, load the given language/model, respectively. - - If ``auto_model`` is used, and no ``xpath_model`` was applied yet, - try each given language/model individually on ``segment``, compare - their confidences, and load the best-scoring language/model. - - Before returning, store all previous settings (to catch by the next call). 
- """ - # Tesseract API is stateful but does not allow copy constructors - # for segment-by-segment configuration we therefore need to - # re-initialize the API with the currently loaded settings, - # and add some custom choices - node = mapping.get(id(segment), None) - tag = segment.__class__.__name__[:-4] - if hasattr(segment, 'id'): - at_ident = 'id' - else: - at_ident = 'imageFilename' - ident = getattr(segment, at_ident) - with tessapi: - # apply temporary changes - if self.parameter['xpath_parameters']: - if node is not None and node.attrib.get(at_ident, None) == ident: - ns = {'re': 'http://exslt.org/regular-expressions', - 'pc': node.nsmap[node.prefix], - node.prefix: node.nsmap[node.prefix]} - for xpath, params in self.parameter['xpath_parameters'].items(): - if node.xpath(xpath, namespaces=ns): - self.logger.info("Found '%s' in '%s', setting '%s'", - xpath, ident, params) - for name, val in params.items(): - tessapi.SetVariable(name, val) - else: - self.logger.error("Cannot find segment '%s' in etree mapping, " - "ignoring xpath_parameters", ident) - if self.parameter['xpath_model']: - if node is not None and node.attrib.get(at_ident, None) == ident: - ns = {'re': 'http://exslt.org/regular-expressions', - 'pc': node.nsmap[node.prefix], - node.prefix: node.nsmap[node.prefix]} - models = [] - for xpath, model in self.parameter['xpath_model'].items(): - if node.xpath(xpath, namespaces=ns): - self.logger.info("Found '%s' in '%s', reloading with '%s'", - xpath, ident, model) - models.append(model) - if models: - model = '+'.join(models) - self.logger.debug("Reloading model '%s' for %s '%s'", model, tag, ident) - tessapi.Reset(lang=model) - return - else: - self.logger.error("Cannot find segment '%s' in etree mapping, " - "ignoring xpath_model", ident) - if self.parameter['auto_model']: - models = self.parameter['model'].split('+') - if len(models) > 1: - confs = list() - for model in models: - tessapi.Reset(lang=model) - tessapi.Recognize() - 
confs.append(tessapi.MeanTextConf()) - model = models[np.argmax(confs)] - self.logger.debug("Reloading best model '%s' for %s '%s'", model, tag, ident) - tessapi.Reset(lang=model) - return - if self.parameter['xpath_model'] or self.parameter['auto_model']: - # default: undo all settings from previous calls (reset to init-state) - tessapi.Reset() - -def page_element_unicode0(element): - """Get Unicode string of the first text result.""" - if element.get_TextEquiv(): - return element.get_TextEquiv()[0].Unicode or '' - else: - return '' - -def page_element_conf0(element): - """Get confidence (as float value) of the first text result.""" - if element.get_TextEquiv(): - # generateDS does not convert simpleType for attributes (yet?) - return float(element.get_TextEquiv()[0].conf or "1.0") - return 1.0 - -def page_get_reading_order(ro, rogroup): - """Add all elements from the given reading order group to the given dictionary. - - Given a dict ``ro`` from layout element IDs to ReadingOrder element objects, - and an object ``rogroup`` with additional ReadingOrder element objects, - add all references to the dict, traversing the group recursively. - """ - regionrefs = list() - if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): - regionrefs = (rogroup.get_RegionRefIndexed() + - rogroup.get_OrderedGroupIndexed() + - rogroup.get_UnorderedGroupIndexed()) - if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): - regionrefs = (rogroup.get_RegionRef() + - rogroup.get_OrderedGroup() + - rogroup.get_UnorderedGroup()) - for elem in regionrefs: - ro[elem.get_regionRef()] = elem - if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): - page_get_reading_order(ro, elem) - -def page_update_higher_textequiv_levels(level, pcgts, overwrite=True): - """Update the TextEquivs of all PAGE-XML hierarchy levels above ``level`` for consistency. 
- - Starting with the lowest hierarchy level chosen for processing, - join all first TextEquiv.Unicode (by the rules governing the respective level) - into TextEquiv.Unicode of the next higher level, replacing them. - If ``overwrite`` is false and the higher level already has text, keep it. - - When two successive elements appear in a ``Relation`` of type ``join``, - then join them directly (without their respective white space). - - Likewise, average all first TextEquiv.conf into TextEquiv.conf of the next higher level. - - In the process, traverse the words and lines in their respective ``readingDirection``, - the (text) regions which contain lines in their respective ``textLineOrder``, and - the (text) regions which contain text regions in their ``ReadingOrder`` - (if they appear there as an ``OrderedGroup``). - Where no direction/order can be found, use XML ordering. - - Follow regions recursively, but make sure to traverse them in a depth-first strategy. - """ - page = pcgts.get_Page() - relations = page.get_Relations() # get RelationsType - if relations: - relations = relations.get_Relation() # get list of RelationType - else: - relations = [] - joins = list() # - for relation in relations: - if relation.get_type() == 'join': # ignore 'link' type here - joins.append((relation.get_SourceRegionRef().get_regionRef(), - relation.get_TargetRegionRef().get_regionRef())) - reading_order = dict() - ro = page.get_ReadingOrder() - if ro: - page_get_reading_order(reading_order, ro.get_OrderedGroup() or ro.get_UnorderedGroup()) - if level != 'region': - for region in page.get_AllRegions(classes=['Text']): - # order is important here, because regions can be recursive, - # and we want to concatenate by depth first; - # typical recursion structures would be: - # - TextRegion/@type=paragraph inside TextRegion - # - TextRegion/@type=drop-capital followed by TextRegion/@type=paragraph inside TextRegion - # - any region (including TableRegion or TextRegion) inside a 
TextRegion/@type=footnote - # - TextRegion inside TableRegion - subregions = region.get_TextRegion() - if subregions: # already visited in earlier iterations - # do we have a reading order for these? - # TODO: what if at least some of the subregions are in reading_order? - if (all(subregion.id in reading_order for subregion in subregions) and - isinstance(reading_order[subregions[0].id], # all have .index? - (OrderedGroupType, OrderedGroupIndexedType))): - subregions = sorted(subregions, key=lambda subregion: - reading_order[subregion.id].index) - region_unicode = page_element_unicode0(subregions[0]) - for subregion, next_subregion in zip(subregions, subregions[1:]): - if (subregion.id, next_subregion.id) not in joins: - region_unicode += '\n' # or '\f'? - region_unicode += page_element_unicode0(next_subregion) - region_conf = sum(page_element_conf0(subregion) for subregion in subregions) - region_conf /= len(subregions) - else: # TODO: what if a TextRegion has both TextLine and TextRegion children? 
- lines = region.get_TextLine() - if ((region.get_textLineOrder() or - page.get_textLineOrder()) == - TextLineOrderSimpleType.BOTTOMTOTOP): - lines = list(reversed(lines)) - if level != 'line': - for line in lines: - words = line.get_Word() - if ((line.get_readingDirection() or - region.get_readingDirection() or - page.get_readingDirection()) == - ReadingDirectionSimpleType.RIGHTTOLEFT): - words = list(reversed(words)) - if level != 'word': - for word in words: - glyphs = word.get_Glyph() - if ((word.get_readingDirection() or - line.get_readingDirection() or - region.get_readingDirection() or - page.get_readingDirection()) == - ReadingDirectionSimpleType.RIGHTTOLEFT): - glyphs = list(reversed(glyphs)) - word_unicode = ''.join(page_element_unicode0(glyph) for glyph in glyphs) - word_conf = sum(page_element_conf0(glyph) for glyph in glyphs) - if glyphs: - word_conf /= len(glyphs) - if not word.get_TextEquiv() or overwrite: - word.set_TextEquiv( # replace old, if any - [TextEquivType(Unicode=word_unicode, conf=word_conf)]) - line_unicode = ' '.join(page_element_unicode0(word) for word in words) - line_conf = sum(page_element_conf0(word) for word in words) - if words: - line_conf /= len(words) - if not line.get_TextEquiv() or overwrite: - line.set_TextEquiv( # replace old, if any - [TextEquivType(Unicode=line_unicode, conf=line_conf)]) - region_unicode = '' - region_conf = 0 - if lines: - region_unicode = page_element_unicode0(lines[0]) - for line, next_line in zip(lines, lines[1:]): - words = line.get_Word() - next_words = next_line.get_Word() - if not (words and next_words and (words[-1].id, next_words[0].id) in joins): - region_unicode += '\n' - region_unicode += page_element_unicode0(next_line) - region_conf = sum(page_element_conf0(line) for line in lines) - region_conf /= len(lines) - if not region.get_TextEquiv() or overwrite: - region.set_TextEquiv( # replace old, if any - [TextEquivType(Unicode=region_unicode, conf=region_conf)]) - -def 
page_shrink_higher_coordinate_levels(maxlevel, minlevel, pcgts): - """Project the coordinate hull of all PAGE-XML hierarchy levels above ``minlevel`` up to ``maxlevel``. - - Starting with the lowest hierarchy level chosen for processing, - join all segments into a convex hull for the next higher level, - replacing the parent coordinates, respectively. - - Follow regions recursively, but make sure to traverse them in a depth-first strategy. - """ - LOG = getLogger('processor.TesserocrRecognize') - page = pcgts.get_Page() - regions = page.get_AllRegions(classes=['Text']) - if minlevel != 'region': - for region in regions: - lines = region.get_TextLine() - if minlevel != 'line': - for line in lines: - words = line.get_Word() - if minlevel != 'word': - for word in words: - glyphs = word.get_Glyph() - if maxlevel in ['region', 'line', 'word', 'glyph'] and glyphs: - joint_polygon = join_segments(glyphs) - LOG.debug("setting hull for word '%s' from %d vertices", - word.id, len(joint_polygon)) - word.get_Coords().set_points(points_from_polygon(joint_polygon)) - if maxlevel in ['region', 'line', 'word'] and words: - joint_polygon = join_segments(words) - LOG.debug("setting hull for line '%s' from %d vertices", - line.id, len(joint_polygon)) - line.get_Coords().set_points(points_from_polygon(joint_polygon)) - if maxlevel in ['region', 'line'] and lines: - joint_polygon = join_segments(lines) - LOG.debug("setting hull for region '%s' from %d vertices", - region.id, len(joint_polygon)) - region.get_Coords().set_points(points_from_polygon(joint_polygon)) - -def join_segments(segments): - return join_polygons([polygon_from_points(segment.get_Coords().points) - for segment in segments]) - -def join_polygons(polygons, scale=20): - """construct concave hull (alpha shape) from input polygons by connecting their pairwise nearest points""" - return make_join([make_valid(Polygon(poly)) for poly in polygons], scale=scale).exterior.coords[:-1] - -def make_join(polygons, scale=20): - 
"""construct concave hull (alpha shape) from input polygons by connecting their pairwise nearest points""" - # ensure input polygons are simply typed and all oriented equally - polygons = [orient(poly) - for poly in itertools.chain.from_iterable( - [poly.geoms - if poly.geom_type in ['MultiPolygon', 'GeometryCollection'] - else [poly] - for poly in polygons])] - npoly = len(polygons) - if npoly == 1: - return polygons[0] - # find min-dist path through all polygons (travelling salesman) - pairs = itertools.combinations(range(npoly), 2) - dists = np.zeros((npoly, npoly), dtype=float) - for i, j in pairs: - dist = polygons[i].distance(polygons[j]) - if dist < 1e-5: - dist = 1e-5 # if pair merely touches, we still need to get an edge - dists[i, j] = dist - dists[j, i] = dist - dists = minimum_spanning_tree(dists, overwrite=True) - # add bridge polygons (where necessary) - for prevp, nextp in zip(*dists.nonzero()): - prevp = polygons[prevp] - nextp = polygons[nextp] - nearest = nearest_points(prevp, nextp) - bridgep = LineString(nearest).buffer(max(1, scale/5), resolution=1) - polygons.append(bridgep) - jointp = unary_union(polygons) - assert jointp.geom_type == 'Polygon', jointp.wkt - # follow-up calculations will necessarily be integer; - # so anticipate rounding here and then ensure validity - jointp2 = set_precision(jointp, 1.0) - if jointp2.geom_type != 'Polygon' or not jointp2.is_valid: - jointp2 = Polygon(np.round(jointp.exterior.coords)) - jointp2 = make_valid(jointp2) - assert jointp2.geom_type == 'Polygon', jointp2.wkt - return jointp2 - -def pad_image(image, padding): - # TODO: input padding can create extra edges if not binarized; at least try to smooth - stat = ImageStat.Stat(image) - # workaround for Pillow#4925 - if len(stat.bands) > 1: - background = tuple(stat.median) - else: - background = stat.median[0] - padded = Image.new(image.mode, - (image.width + 2 * padding, - image.height + 2 * padding), - background) - padded.paste(image, (padding, padding)) 
- return padded - -def polygon_for_parent(polygon, parent): - """Clip polygon to parent polygon range. - - (Should be moved to ocrd_utils.coordinates_for_segment.) - """ - childp = Polygon(polygon) - if isinstance(parent, PageType): - if parent.get_Border(): - parentp = Polygon(polygon_from_points(parent.get_Border().get_Coords().points)) - else: - parentp = Polygon([[0, 0], [0, parent.get_imageHeight()], - [parent.get_imageWidth(), parent.get_imageHeight()], - [parent.get_imageWidth(), 0]]) - else: - parentp = Polygon(polygon_from_points(parent.get_Coords().points)) - # ensure input coords have valid paths (without self-intersection) - # (this can happen when shapes valid in floating point are rounded) - childp = make_valid(childp) - parentp = make_valid(parentp) - if not childp.is_valid: - return None - if not parentp.is_valid: - return None - # check if clipping is necessary - if childp.within(parentp): - return childp.exterior.coords[:-1] - # clip to parent - interp = make_intersection(childp, parentp) - if not interp: - return None - return interp.exterior.coords[:-1] # keep open - -def make_intersection(poly1, poly2): - interp = poly1.intersection(poly2) - # post-process - if interp.is_empty or interp.area == 0.0: - # this happens if Tesseract "finds" something - # outside of the valid Border of a deskewed/cropped page - # (empty corners created by masking); will be ignored - return None - if interp.geom_type == 'GeometryCollection': - # heterogeneous result: filter zero-area shapes (LineString, Point) - interp = unary_union([geom for geom in interp.geoms if geom.area > 0]) - if interp.geom_type == 'MultiPolygon': - # homogeneous result: construct convex hull to connect - interp = make_join(interp.geoms) - if interp.minimum_clearance < 1.0: - # follow-up calculations will necessarily be integer; - # so anticipate rounding here and then ensure validity - interp = Polygon(np.round(interp.exterior.coords)) - interp = make_valid(interp) - return interp - -def 
make_valid(polygon): - points = list(polygon.exterior.coords) - for split in range(1, len(points)): - if polygon.is_valid or polygon.simplify(polygon.area).is_valid: - break - # simplification may not be possible (at all) due to ordering - # in that case, try another starting point - polygon = Polygon(points[-split:]+points[:-split]) - # try by simplification - for tolerance in range(int(polygon.area + 1.5)): - if polygon.is_valid: - break - # simplification may require a larger tolerance - polygon = polygon.simplify(tolerance + 1) - # try by enlarging - for tolerance in range(1, int(polygon.area + 2.5)): - if polygon.is_valid: - break - # enlargement may require a larger tolerance - polygon = polygon.buffer(tolerance) - assert polygon.is_valid, polygon.wkt - return polygon - -def iterate_level(it, ril, parent=None): - LOG = getLogger('processor.TesserocrRecognize') - # improves over tesserocr.iterate_level by - # honouring multi-level semantics so iterators - # can be combined across levels - if parent is None: - parent = ril - 1 - pos = 0 - while it and not it.Empty(ril): - yield it - # With upstream Tesseract, these assertions may fail: - # if ril > 0 and it.IsAtFinalElement(parent, ril): - # for level in range(parent, ril): - # assert it.IsAtFinalElement(parent, level), \ - # "level %d iterator at %d is final w.r.t. %d but level %d is not" % ( - # ril, pos, parent, level) - # Hence the following workaround avails itself: - if ril > 0 and all(it.IsAtFinalElement(parent, level) - for level in range(parent, ril + 1)): - break - if not it.Next(ril): - break - while it.Empty(ril) and not it.Empty(0): - # This happens when - # - on RIL.PARA, RIL.TEXTLINE and RIL.WORD, - # empty non-text (pseudo-) blocks intervene - # - on RIL.SYMBOL, a word has no cblobs at all - # (because they have all been rejected) - # We must _not_ yield these (as they have strange - # properties and bboxes). 
But most importantly, - # they will have met IsAtFinalElement prematurely - # (hence the similar loop above). - # Since this may happen multiple consecutive times, - # enclose this in a while loop. - LOG.warning("level %d iterator at %d needs to skip empty segment", - ril, pos) - if not it.Next(ril): - break - pos += 1 diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py index 09a3445..27d4634 100644 --- a/ocrd_tesserocr/segment_line.py +++ b/ocrd_tesserocr/segment_line.py @@ -20,10 +20,10 @@ def __init__(self, *args, **kwargs): # add default params assert ParameterValidator(self.metadata['tools']['ocrd-tesserocr-recognize']).validate(self.parameter).is_valid - def process(self): + def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): """Performs (text) line segmentation with Tesseract on the workspace. - Open and deserialize PAGE input files and their respective images, + Open and deserialize PAGE input file and its respective images, then iterate over the element hierarchy down to the (text) region level, and remove any existing TextLine elements (unless ``overwrite_lines`` is False). @@ -39,4 +39,4 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - super(TesserocrSegmentLine, self).process() + return super().process_page_pcgts(pcgts, output_file_id=output_file_id, page_id=page_id) diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index 2b4aa95..3381a7d 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -22,10 +22,10 @@ def __init__(self, *args, **kwargs): # add default params assert ParameterValidator(self.metadata['tools']['ocrd-tesserocr-recognize']).validate(self.parameter).is_valid - def process(self): + def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): """Performs region segmentation with Tesseract on the workspace. 
- Open and deserialize PAGE input files and their respective images, + Open and deserialize PAGE input file and its respective images, and remove any existing Region and ReadingOrder elements (unless ``overwrite_regions`` is False). @@ -47,4 +47,4 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - super(TesserocrSegmentRegion, self).process() + return super().process_page_pcgts(pcgts, output_file_id=output_file_id, page_id=page_id) diff --git a/ocrd_tesserocr/segment_table.py b/ocrd_tesserocr/segment_table.py index 72dfe38..ea6ce2f 100644 --- a/ocrd_tesserocr/segment_table.py +++ b/ocrd_tesserocr/segment_table.py @@ -20,10 +20,10 @@ def __init__(self, *args, **kwargs): # add default params assert ParameterValidator(self.metadata['tools']['ocrd-tesserocr-recognize']).validate(self.parameter).is_valid - def process(self): + def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): """Performs table cell segmentation with Tesseract on the workspace. - Open and deserialize PAGE input files and their respective images, + Open and deserialize PAGE input file and its respective images, then iterate over the element hierarchy down to the region level for table regions, and remove any existing TextRegion elements (unless ``overwrite_cells`` is False). @@ -35,4 +35,4 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - super(TesserocrSegmentTable, self).process() + return super().process_page_pcgts(pcgts, output_file_id=output_file_id, page_id=page_id) diff --git a/ocrd_tesserocr/segment_word.py b/ocrd_tesserocr/segment_word.py index 7e4fb45..534b3e6 100644 --- a/ocrd_tesserocr/segment_word.py +++ b/ocrd_tesserocr/segment_word.py @@ -20,10 +20,10 @@ def __init__(self, *args, **kwargs): # add default params assert ParameterValidator(self.metadata['tools']['ocrd-tesserocr-recognize']).validate(self.parameter).is_valid - def process(self): + def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): """Performs word segmentation with Tesseract on the workspace. - Open and deserialize PAGE input files and their respective images, + Open and deserialize PAGE input file and its respective images, then iterate over the element hierarchy down to the textline level, and remove any existing Word elements. @@ -38,4 +38,4 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - super(TesserocrSegmentWord, self).process() + return super().process_page_pcgts(pcgts, output_file_id=output_file_id, page_id=page_id) diff --git a/test/test_cli.py b/test/test_cli.py index c679209..7fcc879 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -27,6 +27,6 @@ def test_list_all_resources(tmpdir, monkeypatch): # envvars influence tesserocr's module initialization from ocrd_tesserocr.cli import ocrd_tesserocr_recognize r = runner.invoke(ocrd_tesserocr_recognize, ['-L']) - assert not r.exit_code + assert not r.exit_code, r.output # XXX same problem # assert r.output == str(samplefile) + '\n' From 95d2837039642d18fe69b8ac12eb1f914ed7fe97 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 7 Jul 2024 12:12:13 +0200 Subject: [PATCH 04/34] require ocrd>=3.0 --- CHANGELOG.md | 4 ++++ ocrd_tesserocr/ocrd-tool.json | 2 +- requirements.txt | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 06eb752..fb3ab93 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +Changed: + + * adapt to ocrd 3.0, #216 + ## [0.19.1] - 2024-07-01 Fixed: diff --git a/ocrd_tesserocr/ocrd-tool.json b/ocrd_tesserocr/ocrd-tool.json index a8621f7..cff6e31 100644 --- a/ocrd_tesserocr/ocrd-tool.json +++ b/ocrd_tesserocr/ocrd-tool.json @@ -1,5 +1,5 @@ { - "version": "0.19.1", + "version": "0.20.0", "git_url": "https://github.com/OCR-D/ocrd_tesserocr", "dockerhub": "ocrd/tesserocr", "tools": { diff --git a/requirements.txt b/requirements.txt index 4a5e4bc..2d8b8b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -ocrd >= 2.53 +ocrd >= 3.0 click tesserocr >= 2.5.2 shapely >= 2.0 From 47dee366c765d8d176b11c6095828f806bc85fc8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:11:26 +0200 Subject: [PATCH 05/34] ocrd-tool.json: add cardinality specs --- ocrd_tesserocr/ocrd-tool.json | 101 +++++++--------------------------- 1 file changed, 20 insertions(+), 81 deletions(-) diff --git a/ocrd_tesserocr/ocrd-tool.json b/ocrd_tesserocr/ocrd-tool.json index cff6e31..71c1d22 100644 --- a/ocrd_tesserocr/ocrd-tool.json +++ b/ocrd_tesserocr/ocrd-tool.json @@ -7,13 +7,8 @@ "executable": "ocrd-tesserocr-deskew", "categories": ["Image preprocessing"], "description": "Detect script, orientation and skew angle for pages or regions", - "input_file_grp": [ - "OCR-D-IMG", - "OCR-D-SEG-BLOCK" - ], - "output_file_grp": [ - "OCR-D-DESKEW-BLOCK" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["preprocessing/optimization/deskewing"], "parameters": { "dpi": { @@ -40,13 +35,8 @@ "executable": "ocrd-tesserocr-fontshape", "categories": ["Text recognition and optimization"], "description": "Recognize font shapes (family/monospace/bold/italic) and size in segmented words with Tesseract (using annotated derived images, or masking and cropping images from coordinate polygons), annotating TextStyle", - "input_file_grp": [ - "OCR-D-SEG-WORD", - 
"OCR-D-OCR" - ], - "output_file_grp": [ - "OCR-D-OCR-FONTSTYLE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["recognition/font-identification"], "parameters": { "dpi": { @@ -74,21 +64,8 @@ "executable": "ocrd-tesserocr-recognize", "categories": ["Text recognition and optimization"], "description": "Segment and/or recognize text with Tesseract (using annotated derived images, or masking and cropping images from coordinate polygons) on any level of the PAGE hierarchy.", - "input_file_grp": [ - "OCR-D-SEG-PAGE", - "OCR-D-SEG-REGION", - "OCR-D-SEG-TABLE", - "OCR-D-SEG-LINE", - "OCR-D-SEG-WORD" - ], - "output_file_grp": [ - "OCR-D-SEG-REGION", - "OCR-D-SEG-TABLE", - "OCR-D-SEG-LINE", - "OCR-D-SEG-WORD", - "OCR-D-SEG-GLYPH", - "OCR-D-OCR-TESS" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": [ "layout/segmentation/region", "layout/segmentation/line", @@ -300,14 +277,8 @@ "executable": "ocrd-tesserocr-segment", "categories": ["Layout analysis"], "description": "Segment page into regions and lines with Tesseract", - "input_file_grp": [ - "OCR-D-IMG", - "OCR-D-SEG-PAGE", - "OCR-D-GT-SEG-PAGE" - ], - "output_file_grp": [ - "OCR-D-SEG-LINE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["layout/segmentation/region", "layout/segmentation/line"], "parameters": { "dpi": { @@ -353,14 +324,8 @@ "executable": "ocrd-tesserocr-segment-region", "categories": ["Layout analysis"], "description": "Segment page into regions with Tesseract", - "input_file_grp": [ - "OCR-D-IMG", - "OCR-D-SEG-PAGE", - "OCR-D-GT-SEG-PAGE" - ], - "output_file_grp": [ - "OCR-D-SEG-BLOCK" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["layout/segmentation/region"], "parameters": { "dpi": { @@ -411,13 +376,8 @@ "executable": "ocrd-tesserocr-segment-table", "categories": ["Layout analysis"], "description": "Segment table regions into cell text regions with 
Tesseract", - "input_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-GT-SEG-BLOCK" - ], - "output_file_grp": [ - "OCR-D-SEG-BLOCK" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["layout/segmentation/region"], "parameters": { "dpi": { @@ -448,13 +408,8 @@ "executable": "ocrd-tesserocr-segment-line", "categories": ["Layout analysis"], "description": "Segment regions into lines with Tesseract", - "input_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-GT-SEG-BLOCK" - ], - "output_file_grp": [ - "OCR-D-SEG-LINE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["layout/segmentation/line"], "parameters": { "dpi": { @@ -485,13 +440,8 @@ "executable": "ocrd-tesserocr-segment-word", "categories": ["Layout analysis"], "description": "Segment lines into words with Tesseract", - "input_file_grp": [ - "OCR-D-SEG-LINE", - "OCR-D-GT-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-WORD" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["layout/segmentation/word"], "parameters": { "dpi": { @@ -522,12 +472,8 @@ "executable": "ocrd-tesserocr-crop", "categories": ["Image preprocessing"], "description": "Poor man's cropping via region segmentation", - "input_file_grp": [ - "OCR-D-IMG" - ], - "output_file_grp": [ - "OCR-D-SEG-PAGE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["preprocessing/optimization/cropping"], "parameters" : { "dpi": { @@ -548,15 +494,8 @@ "executable": "ocrd-tesserocr-binarize", "categories": ["Image preprocessing"], "description": "Binarize regions or lines with Tesseract's global Otsu", - "input_file_grp": [ - "OCR-D-IMG", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-BIN-BLOCK", - "OCR-D-BIN-LINE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "steps": ["preprocessing/optimization/binarization"], "parameters": { "dpi": { From 
e9d562bae5e9a83db521847458bdcde2b4f68a88 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 14 Aug 2024 00:26:46 +0200 Subject: [PATCH 06/34] require ocrd 3.0 prerelease --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2d8b8b3..9191406 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -ocrd >= 3.0 +ocrd >= 3.0.0a1 click tesserocr >= 2.5.2 shapely >= 2.0 From f6c5ea0f04a5953faa04221b1ae4311dd3f2570b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:29:54 +0200 Subject: [PATCH 07/34] binarize: use final v3 API --- ocrd_tesserocr/binarize.py | 39 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/ocrd_tesserocr/binarize.py b/ocrd_tesserocr/binarize.py index 0b916ba..1a4b001 100644 --- a/ocrd_tesserocr/binarize.py +++ b/ocrd_tesserocr/binarize.py @@ -1,5 +1,6 @@ from __future__ import absolute_import +from typing import Optional import os.path from tesserocr import ( PyTessBaseAPI, @@ -9,7 +10,9 @@ from ocrd_models.ocrd_page import ( AlternativeImageType, TextRegionType, + OcrdPage ) +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from .recognize import TesserocrRecognize @@ -22,7 +25,7 @@ def _init(self): # use default model (eng) with vanilla tesserocr API self.tessapi = PyTessBaseAPI() - def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Performs binarization of the region / line with Tesseract on the workspace. 
Open and deserialize PAGE input file and its respective images, @@ -42,6 +45,8 @@ def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): sepmask = self.parameter['tiseg'] oplevel = self.parameter['operation_level'] + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) page = pcgts.get_Page() page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) @@ -60,14 +65,11 @@ def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): self.logger.info("Binarizing on '%s' level in page '%s'", oplevel, page_id) if oplevel == 'page': - image = self._process_segment(-1, page, page_image, page_xywh, - page_id, output_file_id) + image = self._process_segment(-1, page, page_image, page_xywh, page_id) if image: - return [pcgts, image] - else: - return pcgts + result.images.append(image) + return result - result = [pcgts] regions = page.get_AllRegions(classes=['Text', 'Table']) if not regions: self.logger.warning("Page '%s' contains no text regions", page_id) @@ -76,10 +78,9 @@ def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): region, page_image, page_xywh) if oplevel == 'region': image = self._process_segment(RIL.BLOCK, region, region_image, region_xywh, - "region '%s'" % region.id, - output_file_id + '_' + region.id) + "region '%s'" % region.id) if image: - result.append(image) + result.images.append(image) elif isinstance(region, TextRegionType): lines = region.get_TextLine() if not lines: @@ -89,14 +90,13 @@ def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh) image = self._process_segment(RIL.TEXTLINE, line, line_image, line_xywh, - "line '%s'" % line.id, - output_file_id + '_' + region.id + '_' + line.id) + "line '%s'" % line.id) if image: - result.append(image) + result.images.append(image) return result - def _process_segment(self, ril, segment, image, xywh, where, file_id): + def 
_process_segment(self, ril, segment, image, xywh, where) -> Optional[OcrdPageResultImage]: self.tessapi.SetImage(image) features = xywh['features'] + ",binarized" image_bin = None @@ -119,11 +119,8 @@ def _process_segment(self, ril, segment, image, xywh, where, file_id): image_bin = layout.GetBinaryImage(ril) if not image_bin: self.logger.error('Cannot binarize %s', where) - return False - # update METS (add the image file): - file_id += '.IMG-BIN' - file_path = os.path.join(self.output_file_grp, file_id + '.png') + return None # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=features)) - return image_bin, file_id, file_path + image_ref = AlternativeImageType(comments=features) + segment.add_AlternativeImage(image_ref) + return OcrdPageResultImage(image_bin, segment.id + '.IMG-BIN', image_ref) From 3fd8265978649b404f5fcecc6b79ea1ccde19ea7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:34:57 +0200 Subject: [PATCH 08/34] crop: adapt to final v3 API --- ocrd_tesserocr/crop.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/ocrd_tesserocr/crop.py b/ocrd_tesserocr/crop.py index c3100f8..4fb3a91 100644 --- a/ocrd_tesserocr/crop.py +++ b/ocrd_tesserocr/crop.py @@ -1,5 +1,7 @@ from __future__ import absolute_import + import os.path +from typing import Optional import tesserocr @@ -17,7 +19,9 @@ CoordsType, AlternativeImageType, BorderType, + OcrdPage ) +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from .recognize import TesserocrRecognize from .common import polygon_for_parent @@ -36,7 +40,7 @@ def _init(self): # page: self.tessapi.SetVariable("textord_tabfind_find_tables", "0") - def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = None) -> OcrdPageResult: 
"""Performs page cropping with Tesseract on the workspace. Open and deserialize PAGE input file and its respective images. @@ -53,6 +57,8 @@ def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): Produce new output files by serialising the resulting hierarchy. """ + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) page = pcgts.get_Page() # warn of existing Border: @@ -85,10 +91,10 @@ def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): zoom = 1 bounds = self._estimate_bounds(page, page_image, zoom) - cropped = self._process_page(page, page_image, page_xywh, bounds, output_file_id) + cropped = self._process_page(page, page_image, page_xywh, bounds) if cropped: - return [pcgts, cropped] - return pcgts + result.images.append(cropped) + return result def _estimate_bounds(self, page, page_image, zoom=1.0): """Get outer bounds of all (existing or detected) regions.""" @@ -148,12 +154,12 @@ def _estimate_bounds(self, page, page_image, zoom=1.0): all_left, all_right, all_top, all_bottom) return all_left, all_top, all_right, all_bottom - def _process_page(self, page, page_image, page_xywh, bounds, file_id): + def _process_page(self, page, page_image, page_xywh, bounds) -> Optional[OcrdPageResultImage]: """Set the identified page border, if valid.""" left, top, right, bottom = bounds if left >= right or top >= bottom: self.logger.error("Cannot find valid extent for page") - return False + return None padding = self.parameter['padding'] # add padding: left = max(left - padding, 0) @@ -166,7 +172,7 @@ def _process_page(self, page, page_image, page_xywh, bounds, file_id): polygon = polygon_for_parent(polygon, page) if polygon is None: self.logger.error("Ignoring extant border") - return False + return None border = BorderType(Coords=CoordsType( points_from_polygon(polygon))) # intersection with parent could have changed bbox, @@ -177,9 +183,7 @@ def _process_page(self, page, page_image, page_xywh, bounds, file_id): # update METS (add 
the image file): page_image = crop_image(page_image, box=bbox) page_xywh['features'] += ',cropped' - page_image_id = file_id + '.IMG-CROP' - page_image_path = os.path.join(self.output_file_grp, page_image_id + '.png') # update PAGE (reference the image file): - page.add_AlternativeImage(AlternativeImageType( - filename=page_image_path, comments=page_xywh['features'])) - return page_image, page_image_id, page_image_path + alt_image = AlternativeImageType(comments=page_xywh['features']) + page.add_AlternativeImage(alt_image) + return OcrdPageResultImage(page_image, '.IMG-CROP', alt_image) From a66fbbe0df8dea76c2b553bf7d016489a04fa1cb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:41:58 +0200 Subject: [PATCH 09/34] deskew: adapt to final v3 API --- ocrd_tesserocr/deskew.py | 49 ++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py index c6797d4..d2ff970 100644 --- a/ocrd_tesserocr/deskew.py +++ b/ocrd_tesserocr/deskew.py @@ -1,7 +1,9 @@ from __future__ import absolute_import +from typing import Optional import os.path import math + from tesserocr import ( PyTessBaseAPI, PSM, OEM, @@ -13,8 +15,12 @@ from ocrd_utils import membername from ocrd_models.ocrd_page import ( AlternativeImageType, - TextLineType, TextRegionType, PageType, + TextLineType, + TextRegionType, + PageType, + OcrdPage ) +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from .recognize import TesserocrRecognize @@ -32,7 +38,7 @@ def _init(self): if self.parameter['operation_level'] == 'line': self.tessapi.SetVariable("min_characters_to_try", "15") - def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Performs deskewing of the page / region with Tesseract on the workspace. 
Open and deserialise PAGE input files and their respective images, @@ -51,8 +57,9 @@ def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): Produce a new output file by serialising the resulting hierarchy. """ oplevel = self.parameter['operation_level'] - + pcgts = input_pcgts[0] page = pcgts.get_Page() + result = OcrdPageResult(pcgts) page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, @@ -77,14 +84,11 @@ def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): if oplevel == 'page': image = self._process_segment(page, page_image, page_xywh, - "page '%s'" % page_id, - output_file_id) + "page '%s'" % page_id) if image: - return [pcgts, image] - else: - return pcgts + result.images.append(image) + return result - result = [pcgts] regions = page.get_AllRegions(classes=['Text', 'Table']) if not regions: self.logger.warning("Page '%s' contains no text regions", page_id) @@ -97,10 +101,9 @@ def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): feature_filter='deskewed') if oplevel == 'region': image = self._process_segment(region, region_image, region_xywh, - "region '%s'" % region.id, - output_file_id + '_' + region.id) + "region '%s'" % region.id) if image: - result.append(image) + result.images.append(image) elif isinstance(region, TextRegionType): lines = region.get_TextLine() if not lines: @@ -109,16 +112,15 @@ def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh) image = self._process_segment(line, line_image, line_xywh, - "line '%s'" % line.id, - output_file_id + '_' + region.id + '_' + line.id) + "line '%s'" % line.id) if image: - result.append(image) + result.images.append(image) return result - def _process_segment(self, segment, image, xywh, where, file_id): + def _process_segment(self, segment, image, xywh, where): if not image.width or not image.height: 
self.logger.warning("Skipping %s with zero size", where) - return False + return None angle0 = xywh['angle'] # deskewing (w.r.t. top image) already applied to image angle = 0. # additional angle to be applied at current level self.tessapi.SetImage(image) @@ -194,14 +196,14 @@ def _process_segment(self, segment, image, xywh, where, file_id): else: self.logger.warning('no OSD result in %s', where) if isinstance(segment, TextLineType): - return False + return None # # orientation/skew # layout = self.tessapi.AnalyseLayout() if not layout: self.logger.warning('no result iterator in %s', where) - return False + return None orientation, writing_direction, textline_order, deskew_angle = layout.Orientation() if isinstance(segment, (TextRegionType, PageType)): segment.set_readingDirection({ @@ -269,10 +271,7 @@ def _process_segment(self, segment, image, xywh, where, file_id): # workflow had deskewing xywh['features'] += ',deskewed' features = xywh['features'] # features already applied to image - # update METS (add the image file): - file_id += '.IMG-DESKEW' - file_path = os.path.join(self.output_file_grp, file_id + '.png') # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=features)) - return image, file_id, file_path + alternative = AlternativeImageType(comments=features) + segment.add_AlternativeImage(alternative) + return OcrdPageResultImage(image, segment.id + '.IMG-DESKEW', alternative) From ae10667a54afdc368371586920b4940564b482b7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:44:05 +0200 Subject: [PATCH 10/34] fontshape: adapt to final v3 API --- ocrd_tesserocr/fontshape.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ocrd_tesserocr/fontshape.py b/ocrd_tesserocr/fontshape.py index b8eb00c..37c648d 100644 --- a/ocrd_tesserocr/fontshape.py +++ b/ocrd_tesserocr/fontshape.py @@ -1,4 +1,6 @@ 
from __future__ import absolute_import + +from typing import Optional import os.path from PIL import Image, ImageStat @@ -8,7 +10,8 @@ get_languages ) -from ocrd_models.ocrd_page import TextStyleType +from ocrd_models.ocrd_page import TextStyleType, OcrdPage +from ocrd.processor import OcrdPageResult from .recognize import TesserocrRecognize from .common import pad_image @@ -29,7 +32,7 @@ def _init(self): self.logger.info("Using model '%s' in %s for recognition at the word level", model, get_languages()[0]) - def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Detect font shapes via rule-based OCR with Tesseract on the workspace. Open and deserialise PAGE input file and its respective images, @@ -44,8 +47,9 @@ def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): Produce new output files by serialising the resulting hierarchy. """ - + pcgts = input_pcgts[0] page = pcgts.get_Page() + result = OcrdPageResult(pcgts) page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id) @@ -69,7 +73,7 @@ def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): else: self._process_regions(regions, page_image, page_coords) - return pcgts + return result def _process_regions(self, regions, page_image, page_coords): for region in regions: From 4c222458bc51b1db6069cc9cf0c87294f80c629f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:50:55 +0200 Subject: [PATCH 11/34] recognize: use final v3 API --- ocrd_tesserocr/recognize.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py index d609979..59d4daf 100644 --- a/ocrd_tesserocr/recognize.py +++ b/ocrd_tesserocr/recognize.py @@ -1,9 +1,10 @@ from __future__ import 
absolute_import +from typing import Optional from os.path import join import math -import numpy as np +import numpy as np from tesserocr import ( RIL, PSM, PT, OEM, Orientation, @@ -15,7 +16,6 @@ from ocrd_utils import ( getLogger, - assert_file_grp_cardinality, shift_coordinates, coordinates_for_segment, polygon_from_x0y0x1y1, @@ -45,9 +45,11 @@ GlyphType, TextEquivType, AlternativeImageType, + OcrdPage ) from ocrd_models.ocrd_page_generateds import TextTypeSimpleType from ocrd import Processor +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from .common import * @@ -126,8 +128,6 @@ def moduledir(self): def setup(self): self.logger = getLogger('processor.' + self.__class__.__name__) self.logger.debug("TESSDATA: %s, installed Tesseract models: %s", *get_languages()) - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) self._init() def _init(self): @@ -281,7 +281,7 @@ def _reinit(self, segment, mapping): # default: undo all settings from previous calls (reset to init-state) self.tessapi.Reset() - def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Perform layout segmentation and/or text recognition with Tesseract. Open and deserialise PAGE input file and its respective images, @@ -407,7 +407,7 @@ def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): model (among the models given in ``model``), enable ``auto_model``. To constrain models by type (called OCR engine mode), use ``oem``. 
""" - + pcgts = input_pcgts[0] inlevel = self.parameter['segmentation_level'] outlevel = self.parameter['textequiv_level'] segment_only = outlevel == 'none' or not self.parameter.get('model', '') @@ -434,7 +434,7 @@ def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): self.tessapi.SetVariable('user_defined_dpi', str(dpi)) self.logger.info("Processing page '%s'", page_id) - result = [pcgts] + result = OcrdPageResult(pcgts) # FIXME: We should somehow _mask_ existing regions in order to annotate incrementally (not redundantly). # Currently segmentation_level=region also means removing regions, # but we could have an independent setting for that, and attempt @@ -489,13 +489,10 @@ def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): self.logger.debug("Recognizing text in page '%s'", page_id) self.tessapi.Recognize() page_image_bin = self.tessapi.GetThresholdedImage() - page_image_bin_id = output_file_id + '.IMG-BIN' - page_image_bin_path = join(self.output_file_grp, page_image_bin_id + '.png') - # update METS (reference the image file) and store image file: - result.append((page_image_bin, page_image_bin_id, page_image_bin_path)) # update PAGE (reference the image file): - page.add_AlternativeImage(AlternativeImageType( - filename=page_image_bin_path, comments=page_coords['features'] + ',binarized,clipped')) + page_image_ref = AlternativeImageType(comments=page_coords['features'] + ',binarized,clipped') + page.add_AlternativeImage(page_image_ref) + result.images.append(OcrdPageResultImage(page_image_bin, '.IMG-BIN', page_image_ref)) self._process_regions_in_page(self.tessapi.GetIterator(), page, page_coords, pcgts_mapping, dpi) elif inlevel == 'cell': # Tables are obligatorily recursive regions; From 491003ffb640cd76790f01a2fd4691964304620a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:06:54 +0200 Subject: [PATCH 12/34] segment: adapt to final v3 API --- 
ocrd_tesserocr/segment.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ocrd_tesserocr/segment.py b/ocrd_tesserocr/segment.py index ae45677..4a3f10d 100644 --- a/ocrd_tesserocr/segment.py +++ b/ocrd_tesserocr/segment.py @@ -1,7 +1,11 @@ from __future__ import absolute_import +from typing import Optional + from ocrd_utils import getLogger from ocrd_validators import ParameterValidator +from ocrd_models import OcrdPage +from ocrd.processor import OcrdPageResult from .recognize import TesserocrRecognize @@ -19,10 +23,10 @@ def __init__(self, *args, **kwargs): # add default params assert ParameterValidator(self.metadata['tools']['ocrd-tesserocr-recognize']).validate(self.parameter).is_valid - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Performs region and line segmentation with Tesseract on the workspace. - Open and deserialize PAGE input files and their respective images, + Open and deserialize PAGE input file and its respective images, and remove any existing Region and ReadingOrder elements. Set up Tesseract to detect blocks, and add each one to the page @@ -57,4 +61,4 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - super(TesserocrSegment, self).process() + return super().process_page_pcgts(*input_pcgts, page_id=page_id) From 0adfdee00890b1187d8d12fa95797849cc105b78 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:08:22 +0200 Subject: [PATCH 13/34] segment_line: adapt to final v3 API --- ocrd_tesserocr/segment_line.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py index 27d4634..f6cfadc 100644 --- a/ocrd_tesserocr/segment_line.py +++ b/ocrd_tesserocr/segment_line.py @@ -1,7 +1,11 @@ from __future__ import absolute_import +from typing import Optional + from ocrd_utils import getLogger from ocrd_validators import ParameterValidator +from ocrd_models import OcrdPage +from ocrd.processor import OcrdPageResult from .recognize import TesserocrRecognize @@ -20,7 +24,7 @@ def __init__(self, *args, **kwargs): # add default params assert ParameterValidator(self.metadata['tools']['ocrd-tesserocr-recognize']).validate(self.parameter).is_valid - def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Performs (text) line segmentation with Tesseract on the workspace. Open and deserialize PAGE input file and its respective images, @@ -39,4 +43,4 @@ def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): Produce a new output file by serialising the resulting hierarchy. 
""" - return super().process_page_pcgts(pcgts, output_file_id=output_file_id, page_id=page_id) + return super().process_page_pcgts(*input_pcgts, page_id=page_id) From 1d7efa52236768c2e44999142a308bded752eb28 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:09:27 +0200 Subject: [PATCH 14/34] segment_region: adapt to final v3 API --- ocrd_tesserocr/segment_region.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index 3381a7d..8da52f6 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -1,7 +1,11 @@ from __future__ import absolute_import +from typing import Optional + from ocrd_utils import getLogger from ocrd_validators import ParameterValidator +from ocrd_models import OcrdPage +from ocrd.processor import OcrdPageResult from .recognize import TesserocrRecognize @@ -22,7 +26,7 @@ def __init__(self, *args, **kwargs): # add default params assert ParameterValidator(self.metadata['tools']['ocrd-tesserocr-recognize']).validate(self.parameter).is_valid - def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Performs region segmentation with Tesseract on the workspace. Open and deserialize PAGE input file and its respective images, @@ -47,4 +51,4 @@ def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): Produce a new output file by serialising the resulting hierarchy. 
""" - return super().process_page_pcgts(pcgts, output_file_id=output_file_id, page_id=page_id) + return super().process_page_pcgts(*input_pcgts, page_id=page_id) From aadd01b6b8121986ad6cca61e18b5088aa499f2f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:12:46 +0200 Subject: [PATCH 15/34] segment_table: adapt to final v3 API --- ocrd_tesserocr/segment_table.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ocrd_tesserocr/segment_table.py b/ocrd_tesserocr/segment_table.py index ea6ce2f..0750d7e 100644 --- a/ocrd_tesserocr/segment_table.py +++ b/ocrd_tesserocr/segment_table.py @@ -1,7 +1,11 @@ from __future__ import absolute_import +from typing import Optional + from ocrd_utils import getLogger from ocrd_validators import ParameterValidator +from ocrd_models import OcrdPage +from ocrd.processor import OcrdPageResult from .recognize import TesserocrRecognize @@ -20,7 +24,7 @@ def __init__(self, *args, **kwargs): # add default params assert ParameterValidator(self.metadata['tools']['ocrd-tesserocr-recognize']).validate(self.parameter).is_valid - def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Performs table cell segmentation with Tesseract on the workspace. Open and deserialize PAGE input file and its respective images, @@ -35,4 +39,4 @@ def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): Produce a new output file by serialising the resulting hierarchy. 
""" - return super().process_page_pcgts(pcgts, output_file_id=output_file_id, page_id=page_id) + return super().process_page_pcgts(*input_pcgts, page_id=page_id) From f5099c728c67529d63c29a1addbdab8997524a38 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:13:39 +0200 Subject: [PATCH 16/34] segment_word: adapt to final v3 API --- ocrd_tesserocr/segment_word.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ocrd_tesserocr/segment_word.py b/ocrd_tesserocr/segment_word.py index 534b3e6..fd78a69 100644 --- a/ocrd_tesserocr/segment_word.py +++ b/ocrd_tesserocr/segment_word.py @@ -1,7 +1,11 @@ from __future__ import absolute_import +from typing import Optional + from ocrd_utils import getLogger from ocrd_validators import ParameterValidator +from ocrd_models import OcrdPage +from ocrd.processor import OcrdPageResult from .recognize import TesserocrRecognize @@ -20,7 +24,7 @@ def __init__(self, *args, **kwargs): # add default params assert ParameterValidator(self.metadata['tools']['ocrd-tesserocr-recognize']).validate(self.parameter).is_valid - def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Performs word segmentation with Tesseract on the workspace. Open and deserialize PAGE input file and its respective images, @@ -38,4 +42,4 @@ def process_page_pcgts(self, pcgts, output_file_id=None, page_id=None): Produce a new output file by serialising the resulting hierarchy. 
""" - return super().process_page_pcgts(pcgts, output_file_id=output_file_id, page_id=page_id) + return super().process_page_pcgts(*input_pcgts, page_id=page_id) From 013de28da3c8b8074e3bc937a9562d8a09c18994 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:44:54 +0200 Subject: [PATCH 17/34] deskew: no segment.id for suffix on page level --- ocrd_tesserocr/deskew.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py index d2ff970..ef5524d 100644 --- a/ocrd_tesserocr/deskew.py +++ b/ocrd_tesserocr/deskew.py @@ -261,10 +261,12 @@ def _process_segment(self, segment, image, xywh, where): image, xywh, _ = self.workspace.image_from_page( segment, where, fill='background', transparency=True) + suffix = '.IMG-DESKEW' else: image, xywh = self.workspace.image_from_segment( segment, image, xywh, fill='background', transparency=True) + suffix = segment.id + '.IMG-DESKEW' if not angle: # zero rotation does not change coordinates, # but assures consuming processors that the @@ -274,4 +276,4 @@ def _process_segment(self, segment, image, xywh, where): # update PAGE (reference the image file): alternative = AlternativeImageType(comments=features) segment.add_AlternativeImage(alternative) - return OcrdPageResultImage(image, segment.id + '.IMG-DESKEW', alternative) + return OcrdPageResultImage(image, suffix, alternative) From ff258a3ee7a31e4ab8355c9835419f4a5bce1853 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 03:01:17 +0200 Subject: [PATCH 18/34] CI: ex py37, in py311 --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 2dd45dc..628b60d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -49,7 +49,7 @@ workflows: - test-python: matrix: parameters: - python-version: ['3.7', '3.8', 
'3.9', '3.10'] + python-version: ['3.8', '3.9', '3.10', '3.11'] deploy: jobs: - deploy-docker: From 276735b35cac0107f1af58fcf6b49672344aa1ba Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 25 Aug 2024 11:11:01 +0200 Subject: [PATCH 19/34] adapt to v3 b1, replace inheritance w/ proxy pattern --- ocrd_tesserocr/recognize.py | 56 +++++++++++++++----------------- ocrd_tesserocr/segment.py | 50 +++++++++++++++------------- ocrd_tesserocr/segment_line.py | 41 +++++++++++++---------- ocrd_tesserocr/segment_region.py | 49 +++++++++++++++------------- ocrd_tesserocr/segment_table.py | 40 +++++++++++++---------- ocrd_tesserocr/segment_word.py | 41 +++++++++++++---------- 6 files changed, 150 insertions(+), 127 deletions(-) diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py index 59d4daf..b119f3b 100644 --- a/ocrd_tesserocr/recognize.py +++ b/ocrd_tesserocr/recognize.py @@ -48,8 +48,7 @@ OcrdPage ) from ocrd_models.ocrd_page_generateds import TextTypeSimpleType -from ocrd import Processor -from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage from .common import * @@ -126,7 +125,6 @@ def moduledir(self): return get_languages()[0] def setup(self): - self.logger = getLogger('processor.' + self.__class__.__name__) self.logger.debug("TESSDATA: %s, installed Tesseract models: %s", *get_languages()) self._init() @@ -283,47 +281,47 @@ def _reinit(self, segment, mapping): def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Perform layout segmentation and/or text recognition with Tesseract. - + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the requested ``textequiv_level`` if it exists and if ``segmentation_level`` is lower (i.e. more granular) or ``none``. - + Otherwise stop before (i.e. above) ``segmentation_level``. 
If any segmentation exists at that level already, and ``overwrite_segments`` is false, then descend into these segments, else remove them. - + Set up Tesseract to recognise each segment's image (either from AlternativeImage or cropping the bounding box rectangle and masking it from the polygon outline) with the appropriate segmentation mode and recognition ``model``. (If no ``model`` is given, then only layout analysis will be performed.) - + Next, if there still is a gap between the current level in the PAGE hierarchy and the requested ``textequiv_level``, then iterate down the result hierarchy, adding new segments at each level (as well as reading order references, text line order, reading direction and orientation at the region/table level). - + Then, at ``textequiv_level``, remove any existing TextEquiv, unless ``overwrite_text`` is false, and add text and confidence results, unless ``model`` is empty. - + The special value ``textequiv_level=none`` behaves like ``glyph``, except that no actual text recognition will be performed, only layout analysis (so no ``model`` is needed, and new segmentation is created down to the glyph level). - + The special value ``segmentation_level=none`` likewise is lowest, i.e. no actual layout analysis will be performed, only text recognition (so existing segmentation is needed down to ``textequiv_level``). - + Finally, make all higher levels consistent with these text results by concatenation, ordering according to each level's respective readingDirection, textLineOrder, and ReadingOrder, and joining by whitespace as appropriate for each level and according to its Relation/join status. - + Produce a new output file by serialising the resulting hierarchy. \b @@ -358,7 +356,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional recognize text in the glyphs and annotate it. Regardless, continue with the next glyph. Otherwise... - (i.e. ``none``) annotate no text and be done. 
- + Note that ``cell`` is an _optional_ level that is only relevant for table regions, not text or other regions. Also, when segmenting tables in the same run that detects them @@ -366,42 +364,42 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional just be 'paragraphs'. In contrast, when segmenting tables that already exist (via ``segmentation_level=cell``), cells will be detected in ``sparse_text`` mode, i.e. as single-line text regions. - + Thus, ``segmentation_level`` is the entry point level for layout analysis, and setting it to ``none`` makes this processor behave as recognition-only. Whereas ``textequiv_level`` selects the exit point level for segmentation, and setting it to ``none`` makes this processor behave as segmentation-only, as does omitting ``model``. - + All segments above ``segmentation_level`` must already exist, and no segments below ``textequiv_level`` will be newly created. - + If ``find_tables``, then during region segmentation, also try to detect table blocks and add them as TableRegion, then query the page iterator for paragraphs and add them as TextRegion cells. - + If ``find_staves``, then during region segmentation, also try to detect sheet music blocks and suppress them during page layout analysis. - + If ``block_polygons``, then during region segmentation, query Tesseract for polygon outlines instead of bounding boxes for each region. (This is more precise, but due to some path representation errors does not always yield accurate/valid polygons.) - + If ``shrink_polygons``, then during segmentation (on any level), query Tesseract for all symbols/glyphs of each segment and calculate the convex hull for them. Annotate the resulting polygon instead of the coarse bounding box. (This is more precise and helps avoid overlaps between neighbours, especially when not segmenting all levels at once.) 
- + If ``sparse_text``, then during region segmentation, attempt to find single-line text blocks in no particular order (Tesseract's page segmentation mode ``SPARSE_TEXT``). - + If ``tesseract_parameters`` is given, setup each of its key-value pairs as run-time parameters in Tesseract. For local (per-segment) parameter selection based on XPath queries into the input PAGE, use ``xpath_parameters``. - + Similarly, for local (per-segment) OCR model selection based on XPath queries into the input PAGE, use ``xpath_model``. For auto-detection of the best performing model (among the models given in ``model``), enable ``auto_model``. To constrain @@ -412,8 +410,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional outlevel = self.parameter['textequiv_level'] segment_only = outlevel == 'none' or not self.parameter.get('model', '') - pcgts_mapping = dict() - _ = pcgts.to_etree(mapping_=pcgts_mapping) page = pcgts.get_Page() page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id) @@ -477,7 +473,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional # analysed as independent text/line blocks: self.tessapi.SetVariable("textord_tabfind_find_tables", "0") if not segment_only: - self._reinit(page, pcgts_mapping) + self._reinit(page, pcgts.mapping) self.tessapi.SetImage(page_image) # is already cropped to Border self.tessapi.SetPageSegMode(PSM.SPARSE_TEXT if self.parameter['sparse_text'] @@ -493,7 +489,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page_image_ref = AlternativeImageType(comments=page_coords['features'] + ',binarized,clipped') page.add_AlternativeImage(page_image_ref) result.images.append(OcrdPageResultImage(page_image_bin, '.IMG-BIN', page_image_ref)) - self._process_regions_in_page(self.tessapi.GetIterator(), page, page_coords, pcgts_mapping, dpi) + self._process_regions_in_page(self.tessapi.GetIterator(), page, page_coords, 
pcgts.mapping, dpi) elif inlevel == 'cell': # Tables are obligatorily recursive regions; # they might have existing text regions (cells), @@ -511,9 +507,9 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional self.logger.warning("Page '%s' contains no table regions (but segmentation is off)", page_id) else: - self._process_existing_tables(tables, page, page_image, page_coords, pcgts_mapping) + self._process_existing_tables(tables, page, page_image, page_coords, pcgts.mapping) elif regions: - self._process_existing_regions(regions, page_image, page_coords, pcgts_mapping) + self._process_existing_regions(regions, page_image, page_coords, pcgts.mapping) else: self.logger.warning("Page '%s' contains no text regions (but segmentation is off)", page_id) @@ -690,7 +686,7 @@ def _process_regions_in_page(self, result_it, page, page_coords, mapping, dpi): not og.get_UnorderedGroupIndexed()): # schema forbids empty OrderedGroup ro.set_OrderedGroup(None) - + def _process_cells_in_table(self, result_it, region, rogroup, page_coords, mapping): if self.parameter['segmentation_level'] == 'cell': ril = RIL.BLOCK # for sparse_text mode @@ -941,7 +937,7 @@ def _process_existing_tables(self, tables, page, page_image, page_coords, mappin self.logger.debug("Recognizing text in table '%s'", table.id) self.tessapi.Recognize() self._process_cells_in_table(self.tessapi.GetIterator(), table, roelem, table_coords, mapping) - + def _process_existing_regions(self, regions, page_image, page_coords, mapping): if self.parameter['textequiv_level'] in ['region', 'cell'] and not self.parameter.get('model', ''): return diff --git a/ocrd_tesserocr/segment.py b/ocrd_tesserocr/segment.py index 4a3f10d..c45c908 100644 --- a/ocrd_tesserocr/segment.py +++ b/ocrd_tesserocr/segment.py @@ -2,10 +2,8 @@ from typing import Optional -from ocrd_utils import getLogger -from ocrd_validators import ParameterValidator from ocrd_models import OcrdPage -from ocrd.processor import 
OcrdPageResult +from ocrd import OcrdPageResult from .recognize import TesserocrRecognize @@ -14,51 +12,59 @@ class TesserocrSegment(TesserocrRecognize): def executable(self): return 'ocrd-tesserocr-segment' - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if hasattr(self, 'parameter'): - self.parameter['overwrite_segments'] = True - self.parameter['segmentation_level'] = "region" - self.parameter['textequiv_level'] = "none" - # add default params - assert ParameterValidator(self.metadata['tools']['ocrd-tesserocr-recognize']).validate(self.parameter).is_valid + def setup(self): + # don't run super().setup(self) - helper will + parameter = dict(self.parameter) + # we already did validate and default-expand + parameter['overwrite_segments'] = True + parameter['segmentation_level'] = "region" + parameter['textequiv_level'] = "none" + # this will validate and default-expand, then call helper's setup() + self.helper = TesserocrRecognize(None, parameter=parameter) + self.helper.logger = self.logger def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: - """Performs region and line segmentation with Tesseract on the workspace. - + """Performs region and line segmentation with Tesseract. + Open and deserialize PAGE input file and its respective images, and remove any existing Region and ReadingOrder elements. - + Set up Tesseract to detect blocks, and add each one to the page as a region according to BlockType at the detected coordinates (bounding boxes). - + If ``find_tables`` is True, try to detect table blocks and add them as TableRegion, then query the page iterator for paragraphs and add them as TextRegion cells. - + If ``block_polygons``, then query Tesseract for polygon outlines instead of bounding boxes for each region. (This is more precise, but due to some path representation errors does not always yield accurate/valid polygons.) 
- + If ``shrink_polygons``, then query Tesseract for all symbols/glyphs of each segment and calculate the convex hull for them. Annotate the resulting polygon instead of the coarse bounding box. (This is more precise and helps avoid overlaps between neighbours, especially when not segmenting all levels at once.) - + If ``sparse_text``, then attempt to find single-line text blocks only, in no particular order. - + Next, query the page iterator for text lines inside the text regions, and add each one to the region according to the detected coordinates (bounding boxes). - + Finally, query the page iterator for words inside the text lines, and add each one to the line according to the detected coordinates (bounding boxes). - + Produce a new output file by serialising the resulting hierarchy. """ - return super().process_page_pcgts(*input_pcgts, page_id=page_id) + # delegate implementation to helper tool + self.helper.workspace = self.workspace + self.helper.page_id = self.page_id + self.helper.input_file_grp = self.input_file_grp + self.helper.output_file_grp = self.output_file_grp + return self.helper.process_page_pcgts(*input_pcgts, page_id=page_id) + diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py index f6cfadc..5fe8005 100644 --- a/ocrd_tesserocr/segment_line.py +++ b/ocrd_tesserocr/segment_line.py @@ -2,10 +2,8 @@ from typing import Optional -from ocrd_utils import getLogger -from ocrd_validators import ParameterValidator from ocrd_models import OcrdPage -from ocrd.processor import OcrdPageResult +from ocrd import OcrdPageResult from .recognize import TesserocrRecognize @@ -14,33 +12,40 @@ class TesserocrSegmentLine(TesserocrRecognize): def executable(self): return 'ocrd-tesserocr-segment-line' - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if hasattr(self, 'parameter'): - self.parameter['overwrite_segments'] = self.parameter['overwrite_lines'] - del self.parameter['overwrite_lines'] - 
self.parameter['segmentation_level'] = "line" - self.parameter['textequiv_level'] = "line" - # add default params - assert ParameterValidator(self.metadata['tools']['ocrd-tesserocr-recognize']).validate(self.parameter).is_valid + def setup(self): + # don't run super().setup(self) - helper will + parameter = dict(self.parameter) + # we already did validate and default-expand + parameter['overwrite_segments'] = parameter['overwrite_lines'] + del parameter['overwrite_lines'] + parameter['segmentation_level'] = "line" + parameter['textequiv_level'] = "line" + # this will validate and default-expand, then call helper's setup() + self.helper = TesserocrRecognize(None, parameter=parameter) + self.helper.logger = self.logger def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: - """Performs (text) line segmentation with Tesseract on the workspace. - + """Performs (text) line segmentation with Tesseract. + Open and deserialize PAGE input file and its respective images, then iterate over the element hierarchy down to the (text) region level, and remove any existing TextLine elements (unless ``overwrite_lines`` is False). - + Set up Tesseract to detect lines, and add each one to the region at the detected coordinates. - + If ``shrink_polygons``, then during segmentation (on any level), query Tesseract for all symbols/glyphs of each segment and calculate the convex hull for them. Annotate the resulting polygon instead of the coarse bounding box. (This is more precise and helps avoid overlaps between neighbours, especially when not segmenting all levels at once.) - + Produce a new output file by serialising the resulting hierarchy. 
""" - return super().process_page_pcgts(*input_pcgts, page_id=page_id) + # delegate implementation to helper tool + self.helper.workspace = self.workspace + self.helper.page_id = self.page_id + self.helper.input_file_grp = self.input_file_grp + self.helper.output_file_grp = self.output_file_grp + return self.helper.process_page_pcgts(*input_pcgts, page_id=page_id) diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index 8da52f6..691b651 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -2,53 +2,58 @@ from typing import Optional -from ocrd_utils import getLogger -from ocrd_validators import ParameterValidator from ocrd_models import OcrdPage -from ocrd.processor import OcrdPageResult +from ocrd import OcrdPageResult from .recognize import TesserocrRecognize class TesserocrSegmentRegion(TesserocrRecognize): @property - def executable(self): + def executable(self) -> str: return 'ocrd-tesserocr-segment-region' - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if hasattr(self, 'parameter'): - self.parameter['overwrite_segments'] = self.parameter['overwrite_regions'] - del self.parameter['overwrite_regions'] - self.parameter['segmentation_level'] = "region" - self.parameter['textequiv_level'] = "region" - self.parameter['block_polygons'] = self.parameter['crop_polygons'] - del self.parameter['crop_polygons'] - # add default params - assert ParameterValidator(self.metadata['tools']['ocrd-tesserocr-recognize']).validate(self.parameter).is_valid + def setup(self): + # don't run super().setup(self) - helper will + parameter = dict(self.parameter) + # we already did validate and default-expand + parameter['overwrite_segments'] = parameter['overwrite_regions'] + del parameter['overwrite_regions'] + parameter['segmentation_level'] = "region" + parameter['textequiv_level'] = "region" + parameter['block_polygons'] = parameter['crop_polygons'] + del parameter['crop_polygons'] + # this 
will validate and default-expand, then call helper's setup() + self.helper = TesserocrRecognize(None, parameter=parameter) + self.helper.logger = self.logger def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: - """Performs region segmentation with Tesseract on the workspace. - + """Performs region segmentation with Tesseract. + Open and deserialize PAGE input file and its respective images, and remove any existing Region and ReadingOrder elements (unless ``overwrite_regions`` is False). - + Set up Tesseract to detect blocks, and add each one to the page as a region according to BlockType at the detected coordinates. If ``find_tables`` is True, try to detect table blocks and add them as (atomic) TableRegion. - + If ``crop_polygons`` is True, then query polygon outlines instead of bounding boxes from Tesseract for each region. (This is more precise, but due to some path representation errors does not always yield accurate/valid polygons.) - + If ``shrink_polygons``, then query Tesseract for all symbols/glyphs of each segment and calculate the convex hull for them. Annotate the resulting polygon instead of the coarse bounding box. (This is more precise and helps avoid overlaps between neighbours, especially when not segmenting all levels at once.) - + Produce a new output file by serialising the resulting hierarchy. 
""" - return super().process_page_pcgts(*input_pcgts, page_id=page_id) + # delegate implementation to helper tool + self.helper.workspace = self.workspace + self.helper.page_id = self.page_id + self.helper.input_file_grp = self.input_file_grp + self.helper.output_file_grp = self.output_file_grp + return self.helper.process_page_pcgts(*input_pcgts, page_id=page_id) diff --git a/ocrd_tesserocr/segment_table.py b/ocrd_tesserocr/segment_table.py index 0750d7e..263c550 100644 --- a/ocrd_tesserocr/segment_table.py +++ b/ocrd_tesserocr/segment_table.py @@ -2,10 +2,8 @@ from typing import Optional -from ocrd_utils import getLogger -from ocrd_validators import ParameterValidator from ocrd_models import OcrdPage -from ocrd.processor import OcrdPageResult +from ocrd import OcrdPageResult from .recognize import TesserocrRecognize @@ -14,29 +12,37 @@ class TesserocrSegmentTable(TesserocrRecognize): def executable(self): return 'ocrd-tesserocr-segment-table' - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if hasattr(self, 'parameter'): - self.parameter['overwrite_segments'] = self.parameter['overwrite_cells'] - del self.parameter['overwrite_cells'] - self.parameter['segmentation_level'] = "cell" - self.parameter['textequiv_level'] = "cell" - # add default params - assert ParameterValidator(self.metadata['tools']['ocrd-tesserocr-recognize']).validate(self.parameter).is_valid + def setup(self): + # don't run super().setup(self) - helper will + parameter = dict(self.parameter) + # we already did validate and default-expand + parameter['overwrite_segments'] = parameter['overwrite_cells'] + del parameter['overwrite_cells'] + parameter['segmentation_level'] = "cell" + parameter['textequiv_level'] = "cell" + # this will validate and default-expand, then call helper's setup() + self.helper = TesserocrRecognize(None, parameter=parameter) + self.helper.logger = self.logger def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = 
None) -> OcrdPageResult: - """Performs table cell segmentation with Tesseract on the workspace. - + """Performs table cell segmentation with Tesseract. + Open and deserialize PAGE input file and its respective images, then iterate over the element hierarchy down to the region level for table regions, and remove any existing TextRegion elements (unless ``overwrite_cells`` is False). - + Set up Tesseract to detect text blocks (as table cells). (This is not Tesseract's internal table structure recognition, but the general page segmentation in sparse mode.) Add each block to the table at the detected coordinates. - + Produce a new output file by serialising the resulting hierarchy. """ - return super().process_page_pcgts(*input_pcgts, page_id=page_id) + # delegate implementation to helper tool + self.helper.workspace = self.workspace + self.helper.page_id = self.page_id + self.helper.input_file_grp = self.input_file_grp + self.helper.output_file_grp = self.output_file_grp + return self.helper.process_page_pcgts(*input_pcgts, page_id=page_id) + diff --git a/ocrd_tesserocr/segment_word.py b/ocrd_tesserocr/segment_word.py index fd78a69..dcace6b 100644 --- a/ocrd_tesserocr/segment_word.py +++ b/ocrd_tesserocr/segment_word.py @@ -2,10 +2,8 @@ from typing import Optional -from ocrd_utils import getLogger -from ocrd_validators import ParameterValidator from ocrd_models import OcrdPage -from ocrd.processor import OcrdPageResult +from ocrd import OcrdPageResult from .recognize import TesserocrRecognize @@ -14,32 +12,39 @@ class TesserocrSegmentWord(TesserocrRecognize): def executable(self): return 'ocrd-tesserocr-segment-word' - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if hasattr(self, 'parameter'): - self.parameter['overwrite_segments'] = self.parameter['overwrite_words'] - del self.parameter['overwrite_words'] - self.parameter['segmentation_level'] = "word" - self.parameter['textequiv_level'] = "word" - # add default params - assert 
ParameterValidator(self.metadata['tools']['ocrd-tesserocr-recognize']).validate(self.parameter).is_valid + def setup(self): + # don't run super().setup(self) - helper will + parameter = dict(self.parameter) + # we already did validate and default-expand + parameter['overwrite_segments'] = parameter['overwrite_words'] + del parameter['overwrite_words'] + parameter['segmentation_level'] = "word" + parameter['textequiv_level'] = "word" + # this will validate and default-expand, then call helper's setup() + self.helper = TesserocrRecognize(None, parameter=parameter) + self.helper.logger = self.logger def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: - """Performs word segmentation with Tesseract on the workspace. - + """Performs word segmentation with Tesseract. + Open and deserialize PAGE input file and its respective images, then iterate over the element hierarchy down to the textline level, and remove any existing Word elements. - + Set up Tesseract to detect words, and add each one to the line at the detected coordinates. - + If ``shrink_polygons``, then during segmentation (on any level), query Tesseract for all symbols/glyphs of each segment and calculate the convex hull for them. Annotate the resulting polygon instead of the coarse bounding box. (This is more precise and helps avoid overlaps between neighbours, especially when not segmenting all levels at once.) - + Produce a new output file by serialising the resulting hierarchy. 
""" - return super().process_page_pcgts(*input_pcgts, page_id=page_id) + # delegate implementation to helper tool + self.helper.workspace = self.workspace + self.helper.page_id = self.page_id + self.helper.input_file_grp = self.input_file_grp + self.helper.output_file_grp = self.output_file_grp + return self.helper.process_page_pcgts(*input_pcgts, page_id=page_id) From 7ae25a32735a6147703191c80144ebb6420760d7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 25 Aug 2024 11:11:34 +0200 Subject: [PATCH 20/34] tests: adapt to etree in v3 b1 --- repo/tesseract | 2 +- test/test_recognize.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/repo/tesseract b/repo/tesseract index 2991d36..5d5a633 160000 --- a/repo/tesseract +++ b/repo/tesseract @@ -1 +1 @@ -Subproject commit 2991d36a8b92454cc413c8347a7decc7daf33877 +Subproject commit 5d5a633a5d7abfb155a605be90f8033f82e9744f diff --git a/test/test_recognize.py b/test/test_recognize.py index 2c00c55..5d200a5 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -39,8 +39,8 @@ def test_run_modular(workspace_kant_binarized): results = workspace_kant_binarized.find_files(file_grp='OCR-D-OCR-TESS-W2C', mimetype=MIMETYPE_PAGE) result0 = next(results, False) assert result0 - _, result0, _, _ = page_from_file(result0, with_tree=True) - text0 = result0.xpath('//page:Glyph/page:TextEquiv/page:Unicode', namespaces=NAMESPACES) + result0 = page_from_file(result0) + text0 = result0.etree.xpath('//page:Glyph/page:TextEquiv/page:Unicode', namespaces=NAMESPACES) assert len(text0) > 0 def test_run_modular_full(workspace_kant_binarized): @@ -79,10 +79,10 @@ def test_run_modular_full(workspace_kant_binarized): results = workspace_kant_binarized.find_files(file_grp='OCR-D-OCR-STYLE', mimetype=MIMETYPE_PAGE) result0 = next(results, False) assert result0 - _, result0, _, _ = page_from_file(result0, with_tree=True) - text0 = result0.xpath('//page:Word/page:TextEquiv/page:Unicode', 
namespaces=NAMESPACES) + result0 = page_from_file(result0) + text0 = result0.etree.xpath('//page:Word/page:TextEquiv/page:Unicode', namespaces=NAMESPACES) assert len(text0) > 0 - style0 = result0.xpath('//page:Word/page:TextStyle', namespaces=NAMESPACES) + style0 = result0.etree.xpath('//page:Word/page:TextStyle', namespaces=NAMESPACES) assert len(style0) > 0 def test_run_allinone(workspace_kant_binarized): @@ -96,8 +96,8 @@ def test_run_allinone(workspace_kant_binarized): results = workspace_kant_binarized.find_files(file_grp='OCR-D-OCR-TESS-W2C', mimetype=MIMETYPE_PAGE) result0 = next(results, False) assert result0 - _, result0, _, _ = page_from_file(result0, with_tree=True) - text0 = result0.xpath('//page:Glyph/page:TextEquiv/page:Unicode', namespaces=NAMESPACES) + result0 = page_from_file(result0) + text0 = result0.etree.xpath('//page:Glyph/page:TextEquiv/page:Unicode', namespaces=NAMESPACES) assert len(text0) > 0 def test_run_allinone_shrink(workspace_kant_binarized): From ef099953de91e8840ed44b582ba7b8bc4260c21b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 26 Aug 2024 11:38:17 +0200 Subject: [PATCH 21/34] require ocrd>=3.0.0b1 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9191406..cd87094 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -ocrd >= 3.0.0a1 +ocrd >= 3.0.0b1 click tesserocr >= 2.5.2 shapely >= 2.0 From 972ac7684eed468486d73a3e73fd8ec036962b1f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 23:05:06 +0200 Subject: [PATCH 22/34] test_recognize: also test with METS Server and METS caching --- test/conftest.py | 32 ++++++++++++++++++++++++++++++-- test/test_recognize.py | 36 ++++++++++++++++++++++-------------- 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index fd09f74..6cdac13 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,13 +1,41 @@ -from 
ocrd.resolver import Resolver -from ocrd_utils import pushd_popd, initLogging +from multiprocessing import Process +from time import sleep from pytest import fixture +from ocrd import Resolver, Workspace, OcrdMetsServer +from ocrd_utils import pushd_popd, initLogging, config + from test.assets import assets as assets METS_KANT_BINARIZED = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml') METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml') METS_GUTACHTEN = assets.url_of('gutachten/data/mets.xml') +CONFIGS = ['', 'pageparallel', 'metscache', 'pageparallel+metscache'] + +@fixture(params=CONFIGS) +def configsettings(request): + if 'metscache' in request.param: + config.OCRD_METS_CACHING = True + print("enabled METS caching") + if 'pageparallel' in request.param: + config.OCRD_MAX_PARALLEL_PAGES = 4 + print("enabled page-parallel processing") + def _start_mets_server(*args, **kwargs): + server = OcrdMetsServer(*args, **kwargs) + server.startup() + workspace = Workspace(Resolver(), '.') + process = Process(target=_start_mets_server, + kwargs={'workspace': workspace, 'url': 'mets.sock'}) + process.start() + sleep(1) + workspace = Workspace(Resolver(), '.', mets_server_url='mets.sock') + yield 'mets.sock', workspace + process.terminate() + else: + yield () + config.reset_defaults() + @fixture def workspace_kant_binarized(tmpdir): initLogging() diff --git a/test/test_recognize.py b/test/test_recognize.py index 5d200a5..306cc3f 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -11,32 +11,40 @@ from ocrd_tesserocr import TesserocrRecognize from ocrd_tesserocr import TesserocrFontShape -def test_run_modular(workspace_kant_binarized): +def test_run_modular(workspace_kant_binarized, configsettings): + ws = workspace_kant_binarized + if len(configsettings): + print("running with METS server") + mets_server_url, ws = configsettings + kwargs = {'workspace': ws, + 'mets_server_url': mets_server_url} + else: + kwargs = 
{'workspace': ws} run_processor(TesserocrSegmentRegion, - workspace=workspace_kant_binarized, input_file_grp="OCR-D-IMG", - output_file_grp="OCR-D-SEG-BLOCK") + output_file_grp="OCR-D-SEG-BLOCK", + **kwargs) run_processor(TesserocrSegmentLine, - workspace=workspace_kant_binarized, input_file_grp="OCR-D-SEG-BLOCK", - output_file_grp="OCR-D-SEG-LINE") + output_file_grp="OCR-D-SEG-LINE", + **kwargs) run_processor(TesserocrRecognize, - workspace=workspace_kant_binarized, input_file_grp="OCR-D-SEG-LINE", output_file_grp="OCR-D-OCR-TESS", - parameter={'textequiv_level': 'line', 'model': 'Fraktur'}) + parameter={'textequiv_level': 'line', 'model': 'Fraktur'}, + **kwargs) run_processor(TesserocrSegmentWord, - workspace=workspace_kant_binarized, input_file_grp="OCR-D-SEG-LINE", - output_file_grp="OCR-D-SEG-WORD") + output_file_grp="OCR-D-SEG-WORD", + **kwargs) run_processor(TesserocrRecognize, - workspace=workspace_kant_binarized, input_file_grp="OCR-D-SEG-WORD", output_file_grp="OCR-D-OCR-TESS-W2C", - parameter={'segmentation_level': 'glyph', 'textequiv_level': 'glyph', 'model': 'Fraktur'}) - workspace_kant_binarized.save_mets() - assert os.path.isdir(os.path.join(workspace_kant_binarized.directory, 'OCR-D-OCR-TESS-W2C')) - results = workspace_kant_binarized.find_files(file_grp='OCR-D-OCR-TESS-W2C', mimetype=MIMETYPE_PAGE) + parameter={'segmentation_level': 'glyph', 'textequiv_level': 'glyph', 'model': 'Fraktur'}, + **kwargs) + ws.save_mets() + assert os.path.isdir(os.path.join(ws.directory, 'OCR-D-OCR-TESS-W2C')) + results = ws.find_files(file_grp='OCR-D-OCR-TESS-W2C', mimetype=MIMETYPE_PAGE) result0 = next(results, False) assert result0 result0 = page_from_file(result0) From a0d7ffaca756e338e208242c8902ddb507c40a7f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 23:05:59 +0200 Subject: [PATCH 23/34] limit max_workers=1 (libtesseract is not thread-safe) --- ocrd_tesserocr/recognize.py | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py index b119f3b..0f8f038 100644 --- a/ocrd_tesserocr/recognize.py +++ b/ocrd_tesserocr/recognize.py @@ -112,6 +112,9 @@ def __exit__(self, exc_type, exc_val, exc_trace): return None class TesserocrRecognize(Processor): + # Tesseract API is not thread-safe + max_workers = 1 + @property def executable(self): return 'ocrd-tesserocr-recognize' From a4064005cee83b2ffee04f16c1a28842d5f1e43c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 30 Aug 2024 15:23:52 +0200 Subject: [PATCH 24/34] conftest: simplify --- test/conftest.py | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index 6cdac13..5730f8b 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -3,13 +3,31 @@ from pytest import fixture from ocrd import Resolver, Workspace, OcrdMetsServer -from ocrd_utils import pushd_popd, initLogging, config +from ocrd_utils import pushd_popd, initLogging, setOverrideLogLevel, config from test.assets import assets as assets -METS_KANT_BINARIZED = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml') -METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml') -METS_GUTACHTEN = assets.url_of('gutachten/data/mets.xml') +@fixture +def workspace(tmpdir, pytestconfig): + def _make_workspace(workspace_path): + initLogging() + if pytestconfig.getoption('verbose') > 0: + setOverrideLogLevel('DEBUG') + with pushd_popd(tmpdir): + yield Resolver().workspace_from_url(workspace_path, dst_dir=tmpdir, download=True) + return _make_workspace + +@fixture +def workspace_kant_binarized(workspace): + yield from workspace(assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')) + +@fixture +def workspace_herold_small(workspace): + yield from workspace(assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')) + +@fixture +def workspace_gutachten(workspace): + yield from 
workspace(assets.url_of('gutachten/data/mets.xml')) CONFIGS = ['', 'pageparallel', 'metscache', 'pageparallel+metscache'] @@ -35,22 +53,3 @@ def _start_mets_server(*args, **kwargs): else: yield () config.reset_defaults() - -@fixture -def workspace_kant_binarized(tmpdir): - initLogging() - with pushd_popd(tmpdir): - yield Resolver().workspace_from_url(METS_KANT_BINARIZED, dst_dir=tmpdir, download=True) - -@fixture -def workspace_herold_small(tmpdir): - initLogging() - with pushd_popd(tmpdir): - yield Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=tmpdir, download=True) - -@fixture -def workspace_gutachten(tmpdir): - initLogging() - with pushd_popd(tmpdir): - yield Resolver().workspace_from_url(METS_GUTACHTEN, dst_dir=tmpdir, download=True) - From 81fe66fe1303ff1aea4a1edb2762c992a8779869 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 30 Aug 2024 15:35:00 +0200 Subject: [PATCH 25/34] require ocrd>=3.0.0b3 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index cd87094..098b0ff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -ocrd >= 3.0.0b1 +ocrd >= 3.0.0b3 click tesserocr >= 2.5.2 shapely >= 2.0 From 4e7fa70bcc990551ca02cf18470c925da6d23adf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 31 Aug 2024 02:27:01 +0200 Subject: [PATCH 26/34] test_cli: use subprocess CLI instead of monkeypatching env for TESSDATA_PREFIX --- test/test_cli.py | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/test/test_cli.py b/test/test_cli.py index 7fcc879..1b7f59a 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -1,32 +1,24 @@ -from click.testing import CliRunner - from pathlib import Path +from os import environ +from subprocess import run -runner = CliRunner() def test_show_resource(tmpdir, monkeypatch): samplefile = Path(tmpdir, 'bar.traineddata') samplefile.write_text('bar') # simulate a Tesseract 
compiled with custom tessdata dir - monkeypatch.setenv('TESSDATA_PREFIX', str(tmpdir)) - # does not work (thus, tesserocr must not have been loaded already): - #monkeypatch.delitem(sys.modules, 'tesserocr') - # envvars influence tesserocr's module initialization - from ocrd_tesserocr.cli import ocrd_tesserocr_recognize - r = runner.invoke(ocrd_tesserocr_recognize, ['-C', 'bar.traineddata']) - assert not r.exit_code, r.output - # XXX doesn't work because shutil.copyfileobj to stdout won't be captured - # by self.invoke_cli Not sure why it does not work :( - # assert r.output == 'bar' + env = dict(environ) + env.update(TESSDATA_PREFIX=str(tmpdir)) + r = run(['ocrd-tesserocr-recognize', '-C', 'bar.traineddata'], + env=env, text=True, capture_output=True) + assert not r.returncode, r.stdout + r.stderr def test_list_all_resources(tmpdir, monkeypatch): samplefile = Path(tmpdir, 'foo.traineddata') samplefile.write_text('foo') # simulate a Tesseract compiled with custom tessdata dir - monkeypatch.setenv('TESSDATA_PREFIX', str(tmpdir)) - # envvars influence tesserocr's module initialization - from ocrd_tesserocr.cli import ocrd_tesserocr_recognize - r = runner.invoke(ocrd_tesserocr_recognize, ['-L']) - assert not r.exit_code, r.output - # XXX same problem - # assert r.output == str(samplefile) + '\n' + env = dict(environ) + env.update(TESSDATA_PREFIX=str(tmpdir)) + r = run(['ocrd-tesserocr-recognize', '-L'], + env=env, text=True, capture_output=True) + assert not r.returncode, r.stdout + r.stderr From b76a4f50b7f730a62356c01644b38fb6934d2df6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 31 Aug 2024 02:27:54 +0200 Subject: [PATCH 27/34] test: all in pytest call --- Makefile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Makefile b/Makefile index e097214..9a2a5f5 100644 --- a/Makefile +++ b/Makefile @@ -142,10 +142,7 @@ install: deps # Run unit tests test: test/assets deps-test @# declare -p HTTP_PROXY - #$(PYTHON) -m pytest -n auto
--continue-on-collection-errors test $(PYTEST_ARGS) - # workaround for pytest-xdist not isolating setenv calls in click.CliRunner from each other: - $(PYTHON) -m pytest --continue-on-collection-errors test/test_cli.py $(PYTEST_ARGS) - $(PYTHON) -m pytest --continue-on-collection-errors test/test_{segment_{region,table,line,word},recognize}.py $(PYTEST_ARGS) + $(PYTHON) -m pytest test --durations=0 --continue-on-collection-errors $(PYTEST_ARGS) # Run unit tests and determine test coverage coverage: From c9b8f3a7a9b986c22c62252b72fd9b8c1bbc5fdc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 31 Aug 2024 02:28:45 +0200 Subject: [PATCH 28/34] test: do not skip failured pages --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 9a2a5f5..f2eb0d3 100644 --- a/Makefile +++ b/Makefile @@ -139,6 +139,8 @@ install: deps ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata ocrd resmgr download ocrd-tesserocr-recognize equ.traineddata +test test-cli coverage: export OCRD_MISSING_OUTPUT := ABORT + # Run unit tests test: test/assets deps-test @# declare -p HTTP_PROXY From 6d26cf096e96ad783cebf376c30680f33dbe7535 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Sep 2024 11:46:40 +0200 Subject: [PATCH 29/34] require ocrd>=3.0.0b4 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 098b0ff..ba85530 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -ocrd >= 3.0.0b3 +ocrd >= 3.0.0b4 click tesserocr >= 2.5.2 shapely >= 2.0 From 6ca668e036e245d254db593127f37aa33c847612 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 29 Oct 2024 18:36:44 +0100 Subject: [PATCH 30/34] require ocrd>=3.0.0b6 (mp), unlimit max_workers --- ocrd_tesserocr/recognize.py | 3 --- requirements.txt | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py index 
0f8f038..b119f3b 100644 --- a/ocrd_tesserocr/recognize.py +++ b/ocrd_tesserocr/recognize.py @@ -112,9 +112,6 @@ def __exit__(self, exc_type, exc_val, exc_trace): return None class TesserocrRecognize(Processor): - # Tesseract API is not thread-safe - max_workers = 1 - @property def executable(self): return 'ocrd-tesserocr-recognize' diff --git a/requirements.txt b/requirements.txt index ba85530..44ef17a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -ocrd >= 3.0.0b4 +ocrd >= 3.0.0b6 click tesserocr >= 2.5.2 shapely >= 2.0 From 2a8b23b86a3b11080968b6f1e3fbb0a89f362c0c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 29 Oct 2024 18:37:29 +0100 Subject: [PATCH 31/34] test: simplify, use all configs in all tests --- test/conftest.py | 55 +++++++++++++++++++++--------------------- test/test_recognize.py | 32 ++++++++++-------------- 2 files changed, 40 insertions(+), 47 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index 5730f8b..ef16e71 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -7,14 +7,38 @@ from test.assets import assets as assets -@fixture -def workspace(tmpdir, pytestconfig): +CONFIGS = ['', 'pageparallel', 'metscache', 'pageparallel+metscache'] + +@fixture(params=CONFIGS) +def workspace(tmpdir, pytestconfig, request): def _make_workspace(workspace_path): initLogging() if pytestconfig.getoption('verbose') > 0: setOverrideLogLevel('DEBUG') with pushd_popd(tmpdir): - yield Resolver().workspace_from_url(workspace_path, dst_dir=tmpdir, download=True) + if 'metscache' in request.param: + config.OCRD_METS_CACHING = True + print("enabled METS caching") + directory = str(tmpdir) + resolver = Resolver() + workspace = resolver.workspace_from_url(workspace_path, dst_dir=directory, download=True) + if 'pageparallel' in request.param: + config.OCRD_MAX_PARALLEL_PAGES = 2 + print("enabled page-parallel processing") + def _start_mets_server(*args, **kwargs): + print("running with METS server") + server = 
OcrdMetsServer(*args, **kwargs) + server.startup() + process = Process(target=_start_mets_server, + kwargs={'workspace': workspace, 'url': 'mets.sock'}) + process.start() + sleep(1) + workspace = Workspace(resolver, directory, mets_server_url='mets.sock') + yield workspace + process.terminate() + else: + yield workspace + config.reset_defaults() return _make_workspace @fixture @@ -28,28 +52,3 @@ def workspace_herold_small(workspace): @fixture def workspace_gutachten(workspace): yield from workspace(assets.url_of('gutachten/data/mets.xml')) - -CONFIGS = ['', 'pageparallel', 'metscache', 'pageparallel+metscache'] - -@fixture(params=CONFIGS) -def configsettings(request): - if 'metscache' in request.param: - config.OCRD_METS_CACHING = True - print("enabled METS caching") - if 'pageparallel' in request.param: - config.OCRD_MAX_PARALLEL_PAGES = 4 - print("enabled page-parallel processing") - def _start_mets_server(*args, **kwargs): - server = OcrdMetsServer(*args, **kwargs) - server.startup() - workspace = Workspace(Resolver(), '.') - process = Process(target=_start_mets_server, - kwargs={'workspace': workspace, 'url': 'mets.sock'}) - process.start() - sleep(1) - workspace = Workspace(Resolver(), '.', mets_server_url='mets.sock') - yield 'mets.sock', workspace - process.terminate() - else: - yield () - config.reset_defaults() diff --git a/test/test_recognize.py b/test/test_recognize.py index 306cc3f..e24dcbd 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -11,37 +11,31 @@ from ocrd_tesserocr import TesserocrRecognize from ocrd_tesserocr import TesserocrFontShape -def test_run_modular(workspace_kant_binarized, configsettings): - ws = workspace_kant_binarized - if len(configsettings): - print("running with METS server") - mets_server_url, ws = configsettings - kwargs = {'workspace': ws, - 'mets_server_url': mets_server_url} - else: - kwargs = {'workspace': ws} +def test_run_modular(workspace_kant_binarized): run_processor(TesserocrSegmentRegion, + 
workspace=workspace_kant_binarized, input_file_grp="OCR-D-IMG", - output_file_grp="OCR-D-SEG-BLOCK", - **kwargs) + output_file_grp="OCR-D-SEG-BLOCK") run_processor(TesserocrSegmentLine, + workspace=workspace_kant_binarized, input_file_grp="OCR-D-SEG-BLOCK", - output_file_grp="OCR-D-SEG-LINE", - **kwargs) + output_file_grp="OCR-D-SEG-LINE") run_processor(TesserocrRecognize, + workspace=workspace_kant_binarized, input_file_grp="OCR-D-SEG-LINE", output_file_grp="OCR-D-OCR-TESS", - parameter={'textequiv_level': 'line', 'model': 'Fraktur'}, - **kwargs) + parameter={'textequiv_level': 'line', 'model': 'Fraktur'}) run_processor(TesserocrSegmentWord, + workspace=workspace_kant_binarized, input_file_grp="OCR-D-SEG-LINE", - output_file_grp="OCR-D-SEG-WORD", - **kwargs) + output_file_grp="OCR-D-SEG-WORD") run_processor(TesserocrRecognize, + workspace=workspace_kant_binarized, input_file_grp="OCR-D-SEG-WORD", output_file_grp="OCR-D-OCR-TESS-W2C", - parameter={'segmentation_level': 'glyph', 'textequiv_level': 'glyph', 'model': 'Fraktur'}, - **kwargs) + parameter={'segmentation_level': 'glyph', 'textequiv_level': 'glyph', + 'model': 'Fraktur'}) + ws = workspace_kant_binarized ws.save_mets() assert os.path.isdir(os.path.join(ws.directory, 'OCR-D-OCR-TESS-W2C')) results = ws.find_files(file_grp='OCR-D-OCR-TESS-W2C', mimetype=MIMETYPE_PAGE) From 23d7f7f28972790372b9217a995afdc65afbae3b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 30 Oct 2024 23:50:25 +0100 Subject: [PATCH 32/34] CI: add RAM, more verbose --- .circleci/config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index be175fe..a5fb885 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -22,12 +22,13 @@ jobs: - run: make install - run: make deps-test - run: mkdir test-results - - run: make test PYTEST_ARGS=--junitxml=test-results/test.xml + - run: make test PYTEST_ARGS="-vv --junitxml=test-results/test.xml" - store_test_results: 
path: test-results - run: make test-cli - run: make coverage - codecov/upload + resource_class: large build-docker: docker: From 1a157a53cef7b3416ebe19f826ff4c3e80b85370 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 20 Jan 2025 14:37:45 +0100 Subject: [PATCH 33/34] require core >= 3 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 44ef17a..4dc149b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -ocrd >= 3.0.0b6 +ocrd >= 3.0.0 click tesserocr >= 2.5.2 shapely >= 2.0 From e0e5e4dbc202c3caff6149d540037bb9267e1c21 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 20 Jan 2025 14:37:55 +0100 Subject: [PATCH 34/34] update tesser{act,ocr} --- repo/tesseract | 2 +- repo/tesserocr | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/repo/tesseract b/repo/tesseract index 5d5a633..3157ff0 160000 --- a/repo/tesseract +++ b/repo/tesseract @@ -1 +1 @@ -Subproject commit 5d5a633a5d7abfb155a605be90f8033f82e9744f +Subproject commit 3157ff0e741ea5c85e16fbd1c6edf20f30eccbd3 diff --git a/repo/tesserocr b/repo/tesserocr index c4307f0..bbe0fb8 160000 --- a/repo/tesserocr +++ b/repo/tesserocr @@ -1 +1 @@ -Subproject commit c4307f0e499422c70e4684caf24e047eb75c2938 +Subproject commit bbe0fb8edabdcc990f1e6fa9334c0747c2ac76ee