Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate ocrd v3 #216

Open
wants to merge 41 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
4f98e6d
adapt to ocrd v3 Processor init (automatic ocrd-tool.json loading)
bertsky Jul 6, 2024
a9168e0
tests: adapt to ocrd v3 init (setup only via run_processor)
bertsky Jul 6, 2024
eb661f4
adapt to ocrd v3 (process→process_page_pcgts)…
bertsky Jul 7, 2024
95d2837
require ocrd>=3.0
bertsky Jul 7, 2024
47dee36
ocrd-tool.json: add cardinality specs
bertsky Aug 13, 2024
e9d562b
require ocrd 3.0 prerelease
bertsky Aug 13, 2024
f6c5ea0
binarize: use final v3 API
bertsky Aug 15, 2024
3fd8265
crop: adapt to final v3 API
bertsky Aug 15, 2024
a66fbbe
deskew: adapt to final v3 API
bertsky Aug 15, 2024
ae10667
fontshape: adapt to final v3 API
bertsky Aug 15, 2024
4c22245
recognize: use final v3 API
bertsky Aug 15, 2024
491003f
segment: adapt to final v3 API
bertsky Aug 16, 2024
0adfdee
segment_line: adapt to final v3 API
bertsky Aug 16, 2024
1d7efa5
segment_region: adapt to final v3 API
bertsky Aug 16, 2024
aadd01b
segment_table: adapt to final v3 API
bertsky Aug 16, 2024
f5099c7
segment_word: adapt to final v3 API
bertsky Aug 16, 2024
013de28
deskew: no segment.id for suffix on page level
bertsky Aug 16, 2024
ff258a3
CI: ex py37, in py311
bertsky Aug 16, 2024
276735b
adapt to v3 b1, replace inheritance w/ proxy pattern
bertsky Aug 25, 2024
7ae25a3
tests: adapt to etree in v3 b1
bertsky Aug 25, 2024
ef09995
require ocrd>=3.0.0b1
bertsky Aug 26, 2024
972ac76
test_recognize: also test with METS Server and METS caching
bertsky Aug 29, 2024
a0d7ffa
limit max_workers=1 (libtesseract is not thread-safe)
bertsky Aug 29, 2024
a406400
conftest: simplify
bertsky Aug 30, 2024
81fe66f
require ocrd>=3.0.0b3
bertsky Aug 30, 2024
4e7fa70
test_cli: use subprocess CLI instead of monkeypatching env for TESSDA…
bertsky Aug 31, 2024
b76a4f5
test: all in pytest call
bertsky Aug 31, 2024
c9b8f3a
test: do not skip failured pages
bertsky Aug 31, 2024
6d26cf0
require ocrd>=3.0.0b4
bertsky Sep 2, 2024
6ca668e
require ocrd>=3.0.0b6 (mp), unlimit max_workers
bertsky Oct 29, 2024
2a8b23b
test: simplify, use all configs in all tests
bertsky Oct 29, 2024
8dc5a4f
Merge branch 'master' into migrate-ocrd-v3
bertsky Oct 30, 2024
23d7f7f
CI: add RAM, more verbose
bertsky Oct 30, 2024
1a157a5
require core >= 3
kba Jan 20, 2025
e0e5e4d
update tesser{act,ocr}
kba Jan 20, 2025
cb1d35b
require core >= v3.0.1
kba Feb 4, 2025
a77009b
CI: extend timeout without output
bertsky Feb 4, 2025
9b6004e
test: disableLogging at the end of tests (because pytest closes all f…
bertsky Feb 8, 2025
95609cd
update ocrd req
bertsky Feb 8, 2025
7b4c066
Merge remote-tracking branch 'refs/remotes/bertsky/migrate-ocrd-v3' i…
bertsky Feb 11, 2025
aaa8d05
extend tests: with instance_caching
bertsky Feb 13, 2025
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Previous commit
Next commit
test: simplify, use all configs in all tests
bertsky committed Oct 29, 2024
commit 2a8b23b86a3b11080968b6f1e3fbb0a89f362c0c
55 changes: 27 additions & 28 deletions test/conftest.py
Original file line number Diff line number Diff line change
@@ -7,14 +7,38 @@

from test.assets import assets as assets

@fixture
def workspace(tmpdir, pytestconfig):
CONFIGS = ['', 'pageparallel', 'metscache', 'pageparallel+metscache']

@fixture(params=CONFIGS)
def workspace(tmpdir, pytestconfig, request):
def _make_workspace(workspace_path):
initLogging()
if pytestconfig.getoption('verbose') > 0:
setOverrideLogLevel('DEBUG')
with pushd_popd(tmpdir):
yield Resolver().workspace_from_url(workspace_path, dst_dir=tmpdir, download=True)
if 'metscache' in request.param:
config.OCRD_METS_CACHING = True
print("enabled METS caching")
directory = str(tmpdir)
resolver = Resolver()
workspace = resolver.workspace_from_url(workspace_path, dst_dir=directory, download=True)
if 'pageparallel' in request.param:
config.OCRD_MAX_PARALLEL_PAGES = 2
print("enabled page-parallel processing")
def _start_mets_server(*args, **kwargs):
print("running with METS server")
server = OcrdMetsServer(*args, **kwargs)
server.startup()
process = Process(target=_start_mets_server,
kwargs={'workspace': workspace, 'url': 'mets.sock'})
process.start()
sleep(1)
workspace = Workspace(resolver, directory, mets_server_url='mets.sock')
yield workspace
process.terminate()
else:
yield workspace
config.reset_defaults()
return _make_workspace

@fixture
@@ -28,28 +52,3 @@ def workspace_herold_small(workspace):
@fixture
def workspace_gutachten(workspace):
yield from workspace(assets.url_of('gutachten/data/mets.xml'))

CONFIGS = ['', 'pageparallel', 'metscache', 'pageparallel+metscache']

@fixture(params=CONFIGS)
def configsettings(request):
if 'metscache' in request.param:
config.OCRD_METS_CACHING = True
print("enabled METS caching")
if 'pageparallel' in request.param:
config.OCRD_MAX_PARALLEL_PAGES = 4
print("enabled page-parallel processing")
def _start_mets_server(*args, **kwargs):
server = OcrdMetsServer(*args, **kwargs)
server.startup()
workspace = Workspace(Resolver(), '.')
process = Process(target=_start_mets_server,
kwargs={'workspace': workspace, 'url': 'mets.sock'})
process.start()
sleep(1)
workspace = Workspace(Resolver(), '.', mets_server_url='mets.sock')
yield 'mets.sock', workspace
process.terminate()
else:
yield ()
config.reset_defaults()
32 changes: 13 additions & 19 deletions test/test_recognize.py
Original file line number Diff line number Diff line change
@@ -11,37 +11,31 @@
from ocrd_tesserocr import TesserocrRecognize
from ocrd_tesserocr import TesserocrFontShape

def test_run_modular(workspace_kant_binarized, configsettings):
ws = workspace_kant_binarized
if len(configsettings):
print("running with METS server")
mets_server_url, ws = configsettings
kwargs = {'workspace': ws,
'mets_server_url': mets_server_url}
else:
kwargs = {'workspace': ws}
def test_run_modular(workspace_kant_binarized):
run_processor(TesserocrSegmentRegion,
workspace=workspace_kant_binarized,
input_file_grp="OCR-D-IMG",
output_file_grp="OCR-D-SEG-BLOCK",
**kwargs)
output_file_grp="OCR-D-SEG-BLOCK")
run_processor(TesserocrSegmentLine,
workspace=workspace_kant_binarized,
input_file_grp="OCR-D-SEG-BLOCK",
output_file_grp="OCR-D-SEG-LINE",
**kwargs)
output_file_grp="OCR-D-SEG-LINE")
run_processor(TesserocrRecognize,
workspace=workspace_kant_binarized,
input_file_grp="OCR-D-SEG-LINE",
output_file_grp="OCR-D-OCR-TESS",
parameter={'textequiv_level': 'line', 'model': 'Fraktur'},
**kwargs)
parameter={'textequiv_level': 'line', 'model': 'Fraktur'})
run_processor(TesserocrSegmentWord,
workspace=workspace_kant_binarized,
input_file_grp="OCR-D-SEG-LINE",
output_file_grp="OCR-D-SEG-WORD",
**kwargs)
output_file_grp="OCR-D-SEG-WORD")
run_processor(TesserocrRecognize,
workspace=workspace_kant_binarized,
input_file_grp="OCR-D-SEG-WORD",
output_file_grp="OCR-D-OCR-TESS-W2C",
parameter={'segmentation_level': 'glyph', 'textequiv_level': 'glyph', 'model': 'Fraktur'},
**kwargs)
parameter={'segmentation_level': 'glyph', 'textequiv_level': 'glyph',
'model': 'Fraktur'})
ws = workspace_kant_binarized
ws.save_mets()
assert os.path.isdir(os.path.join(ws.directory, 'OCR-D-OCR-TESS-W2C'))
results = ws.find_files(file_grp='OCR-D-OCR-TESS-W2C', mimetype=MIMETYPE_PAGE)