Skip to content

Commit

Permalink
📦 0.1.9 (extract*: expose param feature_filter)
Browse files Browse the repository at this point in the history
  • Loading branch information
bertsky committed Feb 10, 2021
1 parent 5a7a9dc commit 03c38d9
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 17 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ Versioned according to [Semantic Versioning](http://semver.org/).

## Unreleased

## [0.1.9]

Changed:

* extract-regions/lines/words/glyphs: add `feature_filter` param

## [0.1.8]

Fixed:
Expand Down Expand Up @@ -89,6 +95,7 @@ Changed:
* further improve README

<!-- link-labels -->
[0.1.9]: ../../compare/v0.1.8...v0.1.9
[0.1.8]: ../../compare/v0.1.7...v0.1.8
[0.1.7]: ../../compare/v0.1.6...v0.1.7
[0.1.6]: ../../compare/v0.1.5...v0.1.6
Expand Down
8 changes: 8 additions & 0 deletions ocrd_segment/extract_glyphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ def process(self):
Extract an image for each glyph (which depending on the workflow
can already be deskewed, dewarped, binarized etc.), cropped to its
minimal bounding box, and masked by the coordinate polygon outline.
Apply ``feature_filter`` (a comma-separated list of forbidden image
features, cf. :py:func:`ocrd.workspace.Workspace.image_from_page`) so
that the derived images retrieved have none of these features.
If ``transparency`` is true, then also add an alpha channel which is
fully transparent outside of the mask.
Expand Down Expand Up @@ -75,6 +78,7 @@ def process(self):
page = pcgts.get_Page()
page_image, page_coords, page_image_info = self.workspace.image_from_page(
page, page_id,
feature_filter=self.parameter['feature_filter'],
transparency=self.parameter['transparency'])
if page_image_info.resolution != 1:
dpi = page_image_info.resolution
Expand All @@ -92,6 +96,7 @@ def process(self):
for region in regions:
region_image, region_coords = self.workspace.image_from_segment(
region, page_image, page_coords,
feature_filter=self.parameter['feature_filter'],
transparency=self.parameter['transparency'])
rtype = region.get_type()

Expand All @@ -101,20 +106,23 @@ def process(self):
for line in lines:
line_image, line_coords = self.workspace.image_from_segment(
line, region_image, region_coords,
feature_filter=self.parameter['feature_filter'],
transparency=self.parameter['transparency'])
words = line.get_Word()
if not words:
LOG.warning("Line '%s' contains no words", line.id)
for word in words:
word_image, word_coords = self.workspace.image_from_segment(
word, line_image, line_coords,
feature_filter=self.parameter['feature_filter'],
transparency=self.parameter['transparency'])
glyphs = word.get_Glyph()
if not glyphs:
LOG.warning("Word '%s' contains no glyphs", word.id)
for glyph in glyphs:
glyph_image, glyph_coords = self.workspace.image_from_segment(
glyph, word_image, word_coords,
feature_filter=self.parameter['feature_filter'],
transparency=self.parameter['transparency'])
lpolygon_rel = coordinates_of_segment(
glyph, glyph_image, glyph_coords).tolist()
Expand Down
11 changes: 7 additions & 4 deletions ocrd_segment/extract_lines.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import absolute_import

import json
import itertools

from ocrd_utils import (
getLogger,
Expand Down Expand Up @@ -34,6 +33,9 @@ def process(self):
Extract an image for each textline (which depending on the workflow
can already be deskewed, dewarped, binarized etc.), cropped to its
minimal bounding box, and masked by the coordinate polygon outline.
Apply ``feature_filter`` (a comma-separated list of forbidden image
features, cf. :py:func:`ocrd.workspace.Workspace.image_from_page`) so
that the derived images retrieved have none of these features.
If ``transparency`` is true, then also add an alpha channel which is
fully transparent outside of the mask.
Expand Down Expand Up @@ -75,6 +77,7 @@ def process(self):
page = pcgts.get_Page()
page_image, page_coords, page_image_info = self.workspace.image_from_page(
page, page_id,
feature_filter=self.parameter['feature_filter'],
transparency=self.parameter['transparency'])
if page_image_info.resolution != 1:
dpi = page_image_info.resolution
Expand All @@ -84,14 +87,13 @@ def process(self):
dpi = None
ptype = page.get_type()

regions = itertools.chain.from_iterable(
[page.get_TextRegion()] +
[subregion.get_TextRegion() for subregion in page.get_TableRegion()])
regions = page.get_AllRegions(classes=['Text'], order='reading-order')
if not regions:
LOG.warning("Page '%s' contains no text regions", page_id)
for region in regions:
region_image, region_coords = self.workspace.image_from_segment(
region, page_image, page_coords,
feature_filter=self.parameter['feature_filter'],
transparency=self.parameter['transparency'])
rtype = region.get_type()

Expand All @@ -101,6 +103,7 @@ def process(self):
for line in lines:
line_image, line_coords = self.workspace.image_from_segment(
line, region_image, region_coords,
feature_filter=self.parameter['feature_filter'],
transparency=self.parameter['transparency'])
lpolygon_rel = coordinates_of_segment(
line, line_image, line_coords).tolist()
Expand Down
6 changes: 5 additions & 1 deletion ocrd_segment/extract_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,11 @@ def process(self):
# pylint: disable=attribute-defined-outside-init
for n, input_file in enumerate(self.input_files):
page_id = input_file.pageId or input_file.ID
num_page_id = int(page_id.strip(page_id.strip("0123456789")))
try:
# separate non-numeric part of page ID to retain the numeric part
num_page_id = int(page_id.strip(page_id.strip("0123456789")))
except Exception:
num_page_id = n
LOG.info("INPUT FILE %i / %s", n, page_id)
pcgts = page_from_file(self.workspace.download_file(input_file))
self.add_metadata(pcgts)
Expand Down
7 changes: 7 additions & 0 deletions ocrd_segment/extract_words.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ def process(self):
Extract an image for each word (which depending on the workflow
can already be deskewed, dewarped, binarized etc.), cropped to its
minimal bounding box, and masked by the coordinate polygon outline.
Apply ``feature_filter`` (a comma-separated list of forbidden image
features, cf. :py:func:`ocrd.workspace.Workspace.image_from_page`) so
that the derived images retrieved have none of these features.
If ``transparency`` is true, then also add an alpha channel which is
fully transparent outside of the mask.
Expand Down Expand Up @@ -75,6 +78,7 @@ def process(self):
page = pcgts.get_Page()
page_image, page_coords, page_image_info = self.workspace.image_from_page(
page, page_id,
feature_filter=self.parameter['feature_filter'],
transparency=self.parameter['transparency'])
if page_image_info.resolution != 1:
dpi = page_image_info.resolution
Expand All @@ -92,6 +96,7 @@ def process(self):
for region in regions:
region_image, region_coords = self.workspace.image_from_segment(
region, page_image, page_coords,
feature_filter=self.parameter['feature_filter'],
transparency=self.parameter['transparency'])
rtype = region.get_type()

Expand All @@ -101,13 +106,15 @@ def process(self):
for line in lines:
line_image, line_coords = self.workspace.image_from_segment(
line, region_image, region_coords,
feature_filter=self.parameter['feature_filter'],
transparency=self.parameter['transparency'])
words = line.get_Word()
if not words:
LOG.warning("Line '%s' contains no words", line.id)
for word in words:
word_image, word_coords = self.workspace.image_from_segment(
word, line_image, line_coords,
feature_filter=self.parameter['feature_filter'],
transparency=self.parameter['transparency'])
lpolygon_rel = coordinates_of_segment(
word, word_image, word_coords).tolist()
Expand Down
43 changes: 31 additions & 12 deletions ocrd_segment/ocrd-tool.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"version": "0.1.8",
"version": "0.1.9",
"git_url": "https://github.com/OCR-D/ocrd_segment",
"tools": {
"ocrd-segment-repair": {
Expand Down Expand Up @@ -131,7 +131,7 @@
"OCR-D-GT-SEG-BLOCK"
],
"output_file_grp": [
"OCR-D-IMG-CROP"
"OCR-D-IMG-PAGE"
],
"steps": ["layout/analysis"],
"parameters": {
Expand Down Expand Up @@ -231,10 +231,15 @@
"OCR-D-GT-SEG-BLOCK"
],
"output_file_grp": [
"OCR-D-IMG-CROP"
"OCR-D-IMG-REGION"
],
"steps": ["layout/analysis"],
"parameters": {
"feature_filter": {
"type": "string",
"default": "",
"description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)."
},
"mimetype": {
"type": "string",
"enum": ["image/bmp", "application/postscript", "image/gif", "image/jpeg", "image/jp2", "image/png", "image/x-portable-pixmap", "image/tiff"],
Expand All @@ -257,10 +262,15 @@
"OCR-D-GT-SEG-LINE"
],
"output_file_grp": [
"OCR-D-IMG-CROP"
"OCR-D-IMG-LINE"
],
"steps": ["layout/analysis"],
"parameters": {
"feature_filter": {
"type": "string",
"default": "",
"description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)."
},
"mimetype": {
"type": "string",
"enum": ["image/bmp", "application/postscript", "image/gif", "image/jpeg", "image/jp2", "image/png", "image/x-portable-pixmap", "image/tiff"],
Expand All @@ -283,10 +293,15 @@
"OCR-D-GT-SEG-WORD"
],
"output_file_grp": [
"OCR-D-IMG-CROP"
"OCR-D-IMG-WORD"
],
"steps": ["layout/analysis"],
"parameters": {
"feature_filter": {
"type": "string",
"default": "",
"description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)."
},
"mimetype": {
"type": "string",
"enum": ["image/bmp", "application/postscript", "image/gif", "image/jpeg", "image/jp2", "image/png", "image/x-portable-pixmap", "image/tiff"],
Expand All @@ -309,10 +324,15 @@
"OCR-D-GT-SEG-GLYPH"
],
"output_file_grp": [
"OCR-D-IMG-CROP"
"OCR-D-IMG-GLYPH"
],
"steps": ["layout/analysis"],
"parameters": {
"feature_filter": {
"type": "string",
"default": "",
"description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)."
},
"mimetype": {
"type": "string",
"enum": ["image/bmp", "application/postscript", "image/gif", "image/jpeg", "image/jp2", "image/png", "image/x-portable-pixmap", "image/tiff"],
Expand All @@ -336,20 +356,19 @@
"OCR-D-OCR"
],
"output_file_grp": [
"OCR-D-SEG-CROP",
"OCR-D-IMG-CROP"
"OCR-D-SEG-CROP"
],
"steps": ["layout/analysis"],
"parameters": {
"feature_selector": {
"type": "string",
"default": "",
"description": "comma-separated list of required image features (e.g. binarized,despeckled)"
"description": "Comma-separated list of required image features (e.g. `binarized,despeckled`)"
},
"feature_filter": {
"type": "string",
"default": "",
"description": "comma-separated list of forbidden image features (e.g. binarized,despeckled)"
"description": "Comma-separated list of forbidden image features (e.g. `binarized,despeckled`)"
},
"transform_coordinates": {
"type": "boolean",
Expand All @@ -368,8 +387,8 @@
"OCR-D-OCR"
],
"output_file_grp": [
"OCR-D-SEG-CROP",
"OCR-D-IMG-CROP"
"OCR-D-SEG-LINE",
"OCR-D-OCR"
],
"steps": ["layout/analysis"],
"parameters": {
Expand Down

0 comments on commit 03c38d9

Please sign in to comment.