From aa5d76ccb549a5001eb9f6e154354add9864c064 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 25 Sep 2024 17:39:03 +0200 Subject: [PATCH 1/3] adapt to ocrd>=3.0 --- ocrd_page_to_alto/ocrd-tool.json | 6 +- ocrd_page_to_alto/ocrd_processor.py | 96 ++++++++++++----------------- requirements.txt | 2 +- 3 files changed, 42 insertions(+), 62 deletions(-) diff --git a/ocrd_page_to_alto/ocrd-tool.json b/ocrd_page_to_alto/ocrd-tool.json index e98f02a..969a111 100644 --- a/ocrd_page_to_alto/ocrd-tool.json +++ b/ocrd_page_to_alto/ocrd-tool.json @@ -6,9 +6,9 @@ "executable": "ocrd-page2alto-transform", "categories": ["Layout analysis"], "description": "Transform PAGE-XML to ALTO", - "input_file_grp": ["OBSOLETE"], - "output_file_grp": ["ALSO-OBSOLETE"], - "steps": ["whatevs"], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "steps": ["format conversion"], "parameters": { "check_border": { "type": "boolean", diff --git a/ocrd_page_to_alto/ocrd_processor.py b/ocrd_page_to_alto/ocrd_processor.py index c162426..9004734 100644 --- a/ocrd_page_to_alto/ocrd_processor.py +++ b/ocrd_page_to_alto/ocrd_processor.py @@ -1,66 +1,46 @@ -from json import loads -from pkg_resources import resource_string -from os.path import join - +from typing import Optional from ocrd import Processor -from ocrd_modelfactory import page_from_file +from ocrd_models.ocrd_file import OcrdFileType from ocrd_utils import ( - getLogger, - assert_file_grp_cardinality, - make_file_id + make_file_id, + MIMETYPE_PAGE, ) from .convert import OcrdPageAltoConverter -OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) - -# @dataclass() -# class OcrdPageResult(): -# pcgts : OcrdPage -# images : List = field(default_factory=list) class Page2AltoProcessor(Processor): - - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-page2alto-transform'] - kwargs['version'] = OCRD_TOOL['version'] - super().__init__(*args, **kwargs) - self.log = getLogger('ocrd.processor.page2alto') - - - def process(self): - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - assert isinstance(self.parameter, dict) - for n, input_file in enumerate(self.input_files): - page_id = input_file.pageId or input_file.ID - self.log.info("INPUT FILE %s (%d/%d) ", page_id, n + 1, len(self.input_files)) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.log.debug('width %s height %s', pcgts.get_Page().imageWidth, pcgts.get_Page().imageHeight) - self.add_metadata(pcgts) - page = pcgts.get_Page() - converter = OcrdPageAltoConverter( - page_filename=input_file.local_filename, - alto_version=self.parameter["alto_version"].replace('v', ''), - check_words=self.parameter["check_words"], - timestamp_src=self.parameter["timestamp_src"], - check_border=self.parameter["check_border"], - skip_empty_lines=self.parameter["skip_empty_lines"], - trailing_dash_to_hyp=self.parameter["trailing_dash_to_hyp"], - dummy_textline=self.parameter["dummy_textline"], - dummy_word=self.parameter["dummy_word"], - textequiv_index=self.parameter["textequiv_index"], - textequiv_fallback_strategy=self.parameter["textequiv_fallback_strategy"], - region_order=self.parameter["region_order"], - textline_order=self.parameter["textline_order"], - ) - converter.convert() - file_id = make_file_id(input_file, self.output_file_grp) - pcgts.set_pcGtsId(file_id) - self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=page_id, - mimetype='application/alto+xml', - local_filename=join(self.output_file_grp, file_id) + '.xml', - content=str(converter)) + @property + def executable(self): + return 'ocrd-page2alto-transform' + + def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: + input_file = input_files[0] + assert input_file + assert input_file.local_filename + assert input_file.mimetype == MIMETYPE_PAGE + self.logger.debug("converting file %s", input_file.local_filename) + converter = OcrdPageAltoConverter( + page_filename=input_file.local_filename, + alto_version=self.parameter["alto_version"].replace('v', ''), + check_words=self.parameter["check_words"], + timestamp_src=self.parameter["timestamp_src"], + check_border=self.parameter["check_border"], + skip_empty_lines=self.parameter["skip_empty_lines"], + trailing_dash_to_hyp=self.parameter["trailing_dash_to_hyp"], + dummy_textline=self.parameter["dummy_textline"], + dummy_word=self.parameter["dummy_word"], + textequiv_index=self.parameter["textequiv_index"], + textequiv_fallback_strategy=self.parameter["textequiv_fallback_strategy"], + region_order=self.parameter["region_order"], + textline_order=self.parameter["textline_order"], + ) + converter.convert() + file_id = make_file_id(input_file, self.output_file_grp) + self.workspace.add_file( + file_id=file_id, + file_grp=self.output_file_grp, + pageId=input_file.pageId, + mimetype='application/alto+xml', + local_filename=join(self.output_file_grp, file_id) + '.xml', + content=str(converter)) diff --git a/requirements.txt b/requirements.txt index 7aaeeb7..bb535fb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -ocrd >= 2.23.2 +ocrd >= 3.0.0b5 lxml langcodes >= 3.4.0 packaging From 5248d62185888f938120cacb732c5b46a51c5d5e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 25 Sep 2024 18:05:49 +0200 Subject: [PATCH 2/3] forgot os.path.join --- ocrd_page_to_alto/ocrd_processor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocrd_page_to_alto/ocrd_processor.py b/ocrd_page_to_alto/ocrd_processor.py index 9004734..712a551 100644 --- a/ocrd_page_to_alto/ocrd_processor.py +++ b/ocrd_page_to_alto/ocrd_processor.py @@ -1,4 +1,6 @@ from typing import Optional +from os.path import join + from ocrd import Processor from ocrd_models.ocrd_file import OcrdFileType from ocrd_utils import ( From eeadb7c40327bf6a19daaeda3a72dc17b752c405 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 25 Oct 2024 13:49:20 +0200 Subject: [PATCH 3/3] v3 API -> new major 2 --- src/ocrd_page_to_alto/ocrd-tool.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd_page_to_alto/ocrd-tool.json b/src/ocrd_page_to_alto/ocrd-tool.json index dff65ed..5d27cff 100644 --- a/src/ocrd_page_to_alto/ocrd-tool.json +++ b/src/ocrd_page_to_alto/ocrd-tool.json @@ -1,5 +1,5 @@ { - "version": "1.4.1", + "version": "2.0.0", "git_url": "https://github.com/kba/page-to-alto", "tools": { "ocrd-page2alto-transform": {