📦 0.1.9 (extract*: expose param feature_filter)

OCR-D · Feb 10, 2021 · 03c38d9
1 parent 5a7a9dc
commit 03c38d9
Show file tree

Hide file tree

Showing 6 changed files with 65 additions and 17 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,12 @@ Versioned according to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
 
+## [0.1.9]
+
+Changed:
+
+ * extract-regions/lines/words/glyphs: add `feature_filter` param
+
 ## [0.1.8]
 
 Fixed:
@@ -89,6 +95,7 @@ Changed:
   * further improve README
 
 <!-- link-labels -->
+[0.1.9]: ../../compare/v0.1.8...v0.1.9
 [0.1.8]: ../../compare/v0.1.7...v0.1.8
 [0.1.7]: ../../compare/v0.1.6...v0.1.7
 [0.1.6]: ../../compare/v0.1.5...v0.1.6

diff --git a/ocrd_segment/extract_glyphs.py b/ocrd_segment/extract_glyphs.py
@@ -34,6 +34,9 @@ def process(self):
         Extract an image for each glyph (which depending on the workflow
         can already be deskewed, dewarped, binarized etc.), cropped to its
         minimal bounding box, and masked by the coordinate polygon outline.
+        Apply ``feature_filter`` (a comma-separated list of image features,
+        cf. :py:func:`ocrd.workspace.Workspace.image_from_page`) to skip
+        specific features when retrieving derived images.
         If ``transparency`` is true, then also add an alpha channel which is
         fully transparent outside of the mask.
         
@@ -75,6 +78,7 @@ def process(self):
             page = pcgts.get_Page()
             page_image, page_coords, page_image_info = self.workspace.image_from_page(
                 page, page_id,
+                feature_filter=self.parameter['feature_filter'],
                 transparency=self.parameter['transparency'])
             if page_image_info.resolution != 1:
                 dpi = page_image_info.resolution
@@ -92,6 +96,7 @@ def process(self):
             for region in regions:
                 region_image, region_coords = self.workspace.image_from_segment(
                     region, page_image, page_coords,
+                    feature_filter=self.parameter['feature_filter'],
                     transparency=self.parameter['transparency'])
                 rtype = region.get_type()
 
@@ -101,20 +106,23 @@ def process(self):
                 for line in lines:
                     line_image, line_coords = self.workspace.image_from_segment(
                         line, region_image, region_coords,
+                        feature_filter=self.parameter['feature_filter'],
                         transparency=self.parameter['transparency'])
                     words = line.get_Word()
                     if not words:
                         LOG.warning("Line '%s' contains no words", line.id)
                     for word in words:
                         word_image, word_coords = self.workspace.image_from_segment(
                             word, line_image, line_coords,
+                            feature_filter=self.parameter['feature_filter'],
                             transparency=self.parameter['transparency'])
                         glyphs = word.get_Glyph()
                         if not glyphs:
                             LOG.warning("Word '%s' contains no glyphs", word.id)
                         for glyph in glyphs:
                             glyph_image, glyph_coords = self.workspace.image_from_segment(
                                 glyph, word_image, word_coords,
+                                feature_filter=self.parameter['feature_filter'],
                                 transparency=self.parameter['transparency'])
                             lpolygon_rel = coordinates_of_segment(
                                 glyph, glyph_image, glyph_coords).tolist()

diff --git a/ocrd_segment/extract_lines.py b/ocrd_segment/extract_lines.py
@@ -1,7 +1,6 @@
 from __future__ import absolute_import
 
 import json
-import itertools
 
 from ocrd_utils import (
     getLogger,
@@ -34,6 +33,9 @@ def process(self):
         Extract an image for each textline (which depending on the workflow
         can already be deskewed, dewarped, binarized etc.), cropped to its
         minimal bounding box, and masked by the coordinate polygon outline.
+        Apply ``feature_filter`` (a comma-separated list of image features,
+        cf. :py:func:`ocrd.workspace.Workspace.image_from_page`) to skip
+        specific features when retrieving derived images.
         If ``transparency`` is true, then also add an alpha channel which is
         fully transparent outside of the mask.
         
@@ -75,6 +77,7 @@ def process(self):
             page = pcgts.get_Page()
             page_image, page_coords, page_image_info = self.workspace.image_from_page(
                 page, page_id,
+                feature_filter=self.parameter['feature_filter'],
                 transparency=self.parameter['transparency'])
             if page_image_info.resolution != 1:
                 dpi = page_image_info.resolution
@@ -84,14 +87,13 @@ def process(self):
                 dpi = None
             ptype = page.get_type()
 
-            regions = itertools.chain.from_iterable(
-                [page.get_TextRegion()] +
-                [subregion.get_TextRegion() for subregion in page.get_TableRegion()])
+            regions = page.get_AllRegions(classes=['Text'], order='reading-order')
             if not regions:
                 LOG.warning("Page '%s' contains no text regions", page_id)
             for region in regions:
                 region_image, region_coords = self.workspace.image_from_segment(
                     region, page_image, page_coords,
+                    feature_filter=self.parameter['feature_filter'],
                     transparency=self.parameter['transparency'])
                 rtype = region.get_type()
 
@@ -101,6 +103,7 @@ def process(self):
                 for line in lines:
                     line_image, line_coords = self.workspace.image_from_segment(
                         line, region_image, region_coords,
+                        feature_filter=self.parameter['feature_filter'],
                         transparency=self.parameter['transparency'])
                     lpolygon_rel = coordinates_of_segment(
                         line, line_image, line_coords).tolist()

diff --git a/ocrd_segment/extract_pages.py b/ocrd_segment/extract_pages.py
@@ -180,7 +180,11 @@ def process(self):
         # pylint: disable=attribute-defined-outside-init
         for n, input_file in enumerate(self.input_files):
             page_id = input_file.pageId or input_file.ID
-            num_page_id = int(page_id.strip(page_id.strip("0123456789")))
+            try:
+                # separate non-numeric part of page ID to retain the numeric part
+                num_page_id = int(page_id.strip(page_id.strip("0123456789")))
+            except Exception:
+                num_page_id = n
             LOG.info("INPUT FILE %i / %s", n, page_id)
             pcgts = page_from_file(self.workspace.download_file(input_file))
             self.add_metadata(pcgts)

diff --git a/ocrd_segment/extract_words.py b/ocrd_segment/extract_words.py
@@ -34,6 +34,9 @@ def process(self):
         Extract an image for each word (which depending on the workflow
         can already be deskewed, dewarped, binarized etc.), cropped to its
         minimal bounding box, and masked by the coordinate polygon outline.
+        Apply ``feature_filter`` (a comma-separated list of image features,
+        cf. :py:func:`ocrd.workspace.Workspace.image_from_page`) to skip
+        specific features when retrieving derived images.
         If ``transparency`` is true, then also add an alpha channel which is
         fully transparent outside of the mask.
         
@@ -75,6 +78,7 @@ def process(self):
             page = pcgts.get_Page()
             page_image, page_coords, page_image_info = self.workspace.image_from_page(
                 page, page_id,
+                feature_filter=self.parameter['feature_filter'],
                 transparency=self.parameter['transparency'])
             if page_image_info.resolution != 1:
                 dpi = page_image_info.resolution
@@ -92,6 +96,7 @@ def process(self):
             for region in regions:
                 region_image, region_coords = self.workspace.image_from_segment(
                     region, page_image, page_coords,
+                    feature_filter=self.parameter['feature_filter'],
                     transparency=self.parameter['transparency'])
                 rtype = region.get_type()
 
@@ -101,13 +106,15 @@ def process(self):
                 for line in lines:
                     line_image, line_coords = self.workspace.image_from_segment(
                         line, region_image, region_coords,
+                        feature_filter=self.parameter['feature_filter'],
                         transparency=self.parameter['transparency'])
                     words = line.get_Word()
                     if not words:
                         LOG.warning("Line '%s' contains no words", line.id)
                     for word in words:
                         word_image, word_coords = self.workspace.image_from_segment(
                             word, line_image, line_coords,
+                            feature_filter=self.parameter['feature_filter'],
                             transparency=self.parameter['transparency'])
                         lpolygon_rel = coordinates_of_segment(
                             word, word_image, word_coords).tolist()

diff --git a/ocrd_segment/ocrd-tool.json b/ocrd_segment/ocrd-tool.json
@@ -1,5 +1,5 @@
 {
-  "version": "0.1.8",
+  "version": "0.1.9",
   "git_url": "https://github.com/OCR-D/ocrd_segment",
   "tools": {
     "ocrd-segment-repair": {
@@ -131,7 +131,7 @@
         "OCR-D-GT-SEG-BLOCK"
       ],
       "output_file_grp": [
-        "OCR-D-IMG-CROP"
+        "OCR-D-IMG-PAGE"
       ],
       "steps": ["layout/analysis"],
       "parameters": {
@@ -231,10 +231,15 @@
         "OCR-D-GT-SEG-BLOCK"
       ],
       "output_file_grp": [
-        "OCR-D-IMG-CROP"
+        "OCR-D-IMG-REGION"
       ],
       "steps": ["layout/analysis"],
       "parameters": {
+        "feature_filter": {
+          "type": "string",
+          "default": "",
+          "description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)."
+        },
         "mimetype": {
           "type": "string",
           "enum": ["image/bmp", "application/postscript", "image/gif", "image/jpeg", "image/jp2", "image/png", "image/x-portable-pixmap", "image/tiff"],
@@ -257,10 +262,15 @@
         "OCR-D-GT-SEG-LINE"
       ],
       "output_file_grp": [
-        "OCR-D-IMG-CROP"
+        "OCR-D-IMG-LINE"
       ],
       "steps": ["layout/analysis"],
       "parameters": {
+        "feature_filter": {
+          "type": "string",
+          "default": "",
+          "description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)."
+        },
         "mimetype": {
           "type": "string",
           "enum": ["image/bmp", "application/postscript", "image/gif", "image/jpeg", "image/jp2", "image/png", "image/x-portable-pixmap", "image/tiff"],
@@ -283,10 +293,15 @@
         "OCR-D-GT-SEG-WORD"
       ],
       "output_file_grp": [
-        "OCR-D-IMG-CROP"
+        "OCR-D-IMG-WORD"
       ],
       "steps": ["layout/analysis"],
       "parameters": {
+        "feature_filter": {
+          "type": "string",
+          "default": "",
+          "description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)."
+        },
         "mimetype": {
           "type": "string",
           "enum": ["image/bmp", "application/postscript", "image/gif", "image/jpeg", "image/jp2", "image/png", "image/x-portable-pixmap", "image/tiff"],
@@ -309,10 +324,15 @@
         "OCR-D-GT-SEG-GLYPH"
       ],
       "output_file_grp": [
-        "OCR-D-IMG-CROP"
+        "OCR-D-IMG-GLYPH"
       ],
       "steps": ["layout/analysis"],
       "parameters": {
+        "feature_filter": {
+          "type": "string",
+          "default": "",
+          "description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)."
+        },
         "mimetype": {
           "type": "string",
           "enum": ["image/bmp", "application/postscript", "image/gif", "image/jpeg", "image/jp2", "image/png", "image/x-portable-pixmap", "image/tiff"],
@@ -336,20 +356,19 @@
         "OCR-D-OCR"
       ],
       "output_file_grp": [
-        "OCR-D-SEG-CROP",
-        "OCR-D-IMG-CROP"
+        "OCR-D-SEG-CROP"
       ],
       "steps": ["layout/analysis"],
       "parameters": {
           "feature_selector": {
               "type": "string",
               "default": "",
-              "description": "comma-separated list of required image features (e.g. binarized,despeckled)"
+              "description": "Comma-separated list of required image features (e.g. `binarized,despeckled`)."
           },
           "feature_filter": {
               "type": "string",
               "default": "",
-              "description": "comma-separated list of forbidden image features (e.g. binarized,despeckled)"
+              "description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)."
           },
           "transform_coordinates": {
               "type": "boolean",
@@ -368,8 +387,8 @@
         "OCR-D-OCR"
       ],
       "output_file_grp": [
-        "OCR-D-SEG-CROP",
-        "OCR-D-IMG-CROP"
+        "OCR-D-SEG-LINE",
+        "OCR-D-OCR"
       ],
       "steps": ["layout/analysis"],
       "parameters": {