diff --git a/.circleci/config.yml b/.circleci/config.yml index 1eba0d70..2669a7bc 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,16 +2,17 @@ version: 2.1 jobs: build: docker: - - image: cimg/base:stable-18.04 + - image: cimg/base:stable-22.04 steps: - checkout - setup_remote_docker # https://circleci.com/docs/2.0/building-docker-images/ - run: name: build image - command: make docker-maximum-cuda + command: make docker-maximum-cuda GIT_DEPTH=--single-branch no_output_timeout: 30m - when: # takes too long for 1h1m CircleCI timeout overall + # also, storage is limited... condition: false steps: - run: @@ -26,7 +27,7 @@ jobs: destination: artifacts deploy: docker: - - image: cimg/base:stable-18.04 + - image: cimg/base:stable-22.04 environment: GIT_DEPTH: "--depth 1" parameters: @@ -38,7 +39,7 @@ jobs: - setup_remote_docker # https://circleci.com/docs/2.0/building-docker-images/ - run: name: Build Docker image - command: make docker-<< parameters.variant >>-git + command: make docker-<< parameters.variant >>-git GIT_DEPTH=--single-branch # fails due to pip races: DOCKER_PARALLEL=-j3 no_output_timeout: 30m - run: diff --git a/.github/workflows/makedocker.yml b/.github/workflows/makedocker.yml index c7397a55..09f4bc81 100644 --- a/.github/workflows/makedocker.yml +++ b/.github/workflows/makedocker.yml @@ -7,29 +7,10 @@ on: # Trigger workflow in GitHub web frontend or from API. 
workflow_dispatch: inputs: - os: - description: 'Operating system' - required: true - default: 'ubuntu-18.04' - type: choice - options: - - 'ubuntu-18.04' - - 'ubuntu-20.04' - python-version: - description: 'Python version' - required: true - default: '3.6' - type: choice - options: - - '3.6' - - '3.7' - - '3.8' - - '3.9' - - '3.10' docker-image: description: 'Docker image' required: true - default: 'docker-minimum' + default: 'minimum' type: choice options: - 'minimum' @@ -44,59 +25,46 @@ on: - 'medium-cuda-git' - 'maximum-git' - 'maximum-cuda-git' - upload-docker-image: - description: 'Upload Docker image' + upload-dockerhub: + description: 'Upload Docker image to Dockerhub' default: False type: boolean + upload-github: + description: 'Upload Docker image to GitHub Container Registry' + default: False + type: boolean + upterm-session: + description: 'Run SSH login server for debugging' + default: False + type: boolean + # not yet: + #push: + # branches: [ "master" ] jobs: make: - runs-on: ${{ github.event.inputs.os }} - - env: - PYTHON_VERSION: ${{ github.event.inputs.python-version }} + runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: ${{ env.PYTHON_VERSION }} - - name: Show Python3 version - run: python3 --version - - name: Show disk usage of Homebrew, Android and .NET - run: sudo du -mscx /home/linuxbrew /usr/local/lib/android /usr/share/dotnet 2>/dev/null || true - name: Remove Docker images run: | df -h docker images - docker rmi alpine:3.12 alpine:3.13 alpine:3.14 - docker rmi buildpack-deps:stretch buildpack-deps:buster buildpack-deps:bullseye - docker rmi debian:9 debian:10 debian:11 - docker rmi moby/buildkit:latest - docker rmi node:12-alpine node:14-alpine node:16-alpine - docker rmi node:12 node:14 node:16 - if false; then # don't remove Ubuntu images - docker rmi ubuntu:16.04 ubuntu:18.04 ubuntu:20.04 - fi - docker images + docker rmi $(docker images --filter=reference="alpine:*" -q) || true + 
docker rmi $(docker images --filter=reference="buildpack-deps:*" -q) || true + docker rmi $(docker images --filter=reference="debian:*" -q) || true + docker rmi $(docker images --filter=reference="node:*" -q) || true df -h / - name: Remove unneeded Debian packages run: | - if false; then # skip time consuming package uninstall sudo apt-get install -y deborphan - deborphan -a | sort - sudo apt-get purge -y $(deborphan -a|fgrep main/cli-mono|while read dummy package; do echo $package; done) - sudo apt-get purge -y $(deborphan -a|fgrep main/database|while read dummy package; do echo $package; done) - sudo apt-get purge -y $(deborphan -a|fgrep main/devel|while read dummy package; do echo $package; done) - sudo apt-get purge -y $(deborphan -a|fgrep main/httpd|while read dummy package; do echo $package; done) - sudo apt-get purge -y $(deborphan -a|fgrep main/php|while read dummy package; do echo $package; done) - sudo apt-get purge -y $(deborphan -a|fgrep main/vcs|while read dummy package; do echo $package; done) + sudo apt-get purge -y $(deborphan -a | fgrep -e main/cli-mono -e main/database -e main/devel -e main/httpd -e main/php -e main/vcs | while read _ pkg; do echo $pkg; done) deborphan | sort sudo du -mscx /* 2>/dev/null || true sudo du -mscx /opt/* 2>/dev/null || true sudo du -mscx /usr/* 2>/dev/null || true df -h / - fi - name: Remove Homebrew, Android and .NET run: | # https://github.com/actions/virtual-environments/issues/2606#issuecomment-772683150 @@ -104,20 +72,57 @@ jobs: sudo rm -rf /home/linuxbrew # will release Homebrew sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET + sudo rm -rf /opt/ghc + sudo rm -rf /usr/local/share/boost + sudo rm -rf "$AGENT_TOOLSDIRECTORY" sudo du -mscx /* 2>/dev/null || true df -h / + - name: Setup upterm session + # interactive SSH logins for debugging + if: github.event.inputs.upterm-session == 'true' + uses: 
lhotari/action-upterm@v1 - name: Make Docker image - run: make docker-${{ github.event.inputs.docker-image }} - - name: Show Docker images - run: docker images - - name: Login to Docker Hub and push new image(s) to Docker Hub + run: make docker-${{ github.event.inputs.docker-image }} GIT_DEPTH=--single-branch + - name: Generate ocrd-all-tool.json + # the Docker build will set OCRD_MODULES inside the image, which we can re-use + # regardless of whether we have /build, we can just use the Makefile from outside again + run: | + export OCRD_MODULES=$(docker run --rm ocrd/all:${{ github.event.inputs.docker-image }} bash -c 'echo $OCRD_MODULES') + make ocrd-all-tool.json + wc -l ocrd-all-tool.json + - name: Upload ocrd-all-tool.json + uses: actions/upload-artifact@v3 + with: + name: ${{ github.event.inputs.docker-image }}_ocrd-all-tool.json + path: ./ocrd-all-tool.json + # if-no-files-found: error + - name: Login to Docker Hub + if: github.event.inputs.upload-dockerhub == 'true' + run: echo '${{ secrets.DOCKERHUB_PASSWORD }}' | docker login --username '${{ secrets.DOCKERHUB_USERNAME }}' --password-stdin + - name: Push to Docker Hub + if: github.event.inputs.upload-dockerhub == 'true' run: | - if ${{ github.event.inputs.upload-docker-image }}; then - echo ${{ secrets.DOCKERHUB_PASSWORD }} | docker login --username ${{ secrets.DOCKERHUB_USERNAME }} --password-stdin - docker push ocrd/all:${{ github.event.inputs.docker-image }} - if test ${{ github.event.inputs.docker-image }} = maximum-git; then - # Alias Docker image. - docker tag ocrd/all:maximum-git ocrd/all:latest - docker push ocrd/all:latest - fi + docker push ocrd/all:${{ github.event.inputs.docker-image }} + if test ${{ github.event.inputs.docker-image }} = maximum-git; then + # Alias Docker image. 
+ docker tag ocrd/all:maximum-git ocrd/all:latest + docker push ocrd/all:latest fi + - name: Login to GitHub Container Registry + if: github.event.inputs.upload-github == 'true' + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Push to Github Container Registry + if: github.event.inputs.upload-github == 'true' + run: | + docker tag ocrd/all:${{ github.event.inputs.docker-image }} ghcr.io/ocr-d/all:${{ github.event.inputs.docker-image }} + docker push ghcr.io/ocr-d/all:${{ github.event.inputs.docker-image }} + if test ${{ github.event.inputs.docker-image }} = maximum-git; then + # Alias Docker image. + docker tag ocrd/all:maximum-git ghcr.io/ocr-d/all:latest + docker push ghcr.io/ocr-d/all:latest + fi + diff --git a/CHANGELOG.md b/CHANGELOG.md index b48174d4..bb217a19 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,200 @@ # Changelog +## [v2023-06-12](https://github.com/OCR-D/ocrd_all/releases/v2023-06-12) + +Changed: + + * All docker images now contain git checkouts and retain `/build`, i.e. behave like the `-git` variants + * No more git updates within docker build, but fix git module dependency outside + * Reduce docker image size (by reinstating all-in-one layer, removing cache, avoiding duplicate CUDA libraries...) + * Use `git submodule update --single-branch` on CI to reduce docker image size + +Added: + + * `make deps-cuda`: non-intrusively support CUDA system dependencies (in docker or native) + * `make ocrd-all-tool.json`: Generate and upload a combination of all processors' `ocrd-tool.json`, #362 + * `make test-workflow`: Run a workflow with most processors as a general smoke test + * `make test-cuda`: Test whether CUDA is properly set up and a GPU is available + * `make test-core`: Run OCR-D/core unit tests + +Fixed: + + * dependencies between modules, esp. 
with custom `OCRD_MODULES` selection + * editable mode (`pip install -e`) + * OpenCV build + * get `tesserocr` from PyPI if disabled as a module + * get `ocrd` from PyPI if core disabled as a module + * consistent interoperable module versions (esp. Numpy/OpenCV/Shapely/Protobuf/Torch/TF Python dependencies) + +### [cor-asv-ann](https://github.com/ASVLeipzig/cor-asv-ann) [006a70e](https://github.com/ASVLeipzig/cor-asv-ann/commits/006a70e)..[2c4b1ff](https://github.com/ASVLeipzig/cor-asv-ann/commits/2c4b1ff) + +> Release: [v0.1.14](https://github.com/ASVLeipzig/cor-asv-ann/releases/v0.1.14) + + > * CI: use ocrd/core-cuda as base image + > * CI: dummy venv + > * CI: use proper tab character + > * CI: clone first + > * CI: mkdir first + > * CI: chdir to tmp location + > * CI: use /tmp for aux clone of ocrd_all + > * try getting tensorflow-gpu from Nvidia + > * use proper URLs for submodules + > * Merge pull request #6 from kba/init-report-dict + > * evaluate: skip pages with no results + +### [core](https://github.com/OCR-D/core) [de08453](https://github.com/OCR-D/core/commits/de08453)..[6708624](https://github.com/OCR-D/core/commits/6708624) + +> Release: [v2.51.0](https://github.com/OCR-D/core/releases/v2.51.0) + + > * Merge pull request #1055 from bertsky/deps-cuda + > * ci: disable upterm for gh actions + > * readme: remove dockerhub/travis badge, add GH actions badge + > * debug gh actions + > * test bashlib: /usr/bin/env bash instead of /bin/bash + > * test_workspace_bagger: use ocr-d.de instead of google.com for testing + > * disable logging tests until properly fixed + > * docker-image: reuse local ghcr.io image instead of docker.io + > * :package: v2.51.0 + > * :memo: changelog + > * make help: improve description + > * Revert "Merge remote-tracking branch 'hnesk/no-more-pkg_resources' into release-2.36.0" + > * remove out-dated processor resources + > * docker-cuda: improve (reduce size) again… + > * docker-cuda: rewrite… + > * core-cuda: use same CUDA libs 
as needed for Torch anyway + > * Merge branch 'pr-1008' into reduce-cuda + > * Merge branch 'master' of https://github.com/OCR-D/core into reduce-cuda + > * make install on py36: revert to prefer-binary via install + > * make install on py36: fix prefer-binary syntax + > * make install on py36: prefer binary OpenCV/Numpy via pip config instead of preinstall + > * core-cuda: install more CUDA libs via pip and ld.so.conf, simplify Dockerfile for that + > * core-cuda: use CUDA 11.8, install cuDNN via pip and make available system-wide via ld.so.conf + > * reinstate workaround for shapely, but more robust + > * docker-cuda: change base image, no multi-CUDA runtimes + > * keep gcc, no autoremove + > * rehash after pip upgrade + > * give up workaround for shapely-CUDA issue + +### [dinglehopper](https://github.com/qurator-spk/dinglehopper) [0fd4ea1](https://github.com/qurator-spk/dinglehopper/commits/0fd4ea1)..[35be58c](https://github.com/qurator-spk/dinglehopper/commits/35be58c) + + > * Merge pull request #83 from INL/feat/batch-processing + > * Merge pull request #82 from CircleCI-config-suggestions-bot/StoreTestResults + > * 🧹 .gitignore .python-version (for pyenv) + > * 🧹 Remove qurator. namespace prefix + > * 🐛 Fix installing by calling find_namespace_packages in setup.py + > * 🕸Do not use deprecated ID, pageId options + > * 🔧 Remove explicit namespace_packages + > * ✔ CircleCI: Explicitly install binary opencv-python-headless (dep of OCR-D?) 
to avoid compilation + > * 🐛 Remove deprecated declare_namespace call + +### [eynollah](https://github.com/qurator-spk/eynollah) [ea792d1](https://github.com/qurator-spk/eynollah/commits/ea792d1)..[706433c](https://github.com/qurator-spk/eynollah/commits/706433c) + +> Release: [v0.2.0](https://github.com/qurator-spk/eynollah/releases/v0.2.0) + + > * Revert "Merge pull request #97 from qurator-spk/420-namespace-package" + > * Merge pull request #100 from bertsky/patch-2 + > * Merge pull request #97 from qurator-spk/420-namespace-package + +### [ocrd_cis](https://github.com/cisocrgroup/ocrd_cis) [c90b29f](https://github.com/cisocrgroup/ocrd_cis/commits/c90b29f)..[a0ea0a2](https://github.com/cisocrgroup/ocrd_cis/commits/a0ea0a2) + +> Release: [v0.1.5](https://github.com/cisocrgroup/ocrd_cis/releases/v0.1.5) + + > * Merge branch 'kba:typo' #91 into fix-alpha-shape + > * Merge branch 'kba:double-page-max-size' #96 into fix-alpha-shape + > * Merge branch 'kba:resolve-resources' #83 into fix-alpha-shape + > * segment: adapt to OpenCV changes + > * resegment (baseline/ccomps): improve handling of fg conflicts + > * resegment: add param baseline_only + > * check_page/region/line: skip assumptions on number of components + > * adapt to Shapely 2.0 deprecations + > * adapt to Numpy 1.24 dtypes + > * resegment: list instead of generator + > * re/segment: improve polygon simplification + > * re/segment: join_baselines: skip lines outside of polygon + > * re/segment: join_baselines: for complex subtypes, apply recursively + > * re/segment: join_polygons: connect touching neighbours, too + +### [ocrd_fileformat](https://github.com/OCR-D/ocrd_fileformat) [dacfa50](https://github.com/OCR-D/ocrd_fileformat/commits/dacfa50)..[4e7e0de](https://github.com/OCR-D/ocrd_fileformat/commits/4e7e0de) + +> Release: [v0.7.0](https://github.com/OCR-D/ocrd_fileformat/releases/v0.7.0) + + > * :package: v0.7.0 + > * update ocr-fileformat + +### [ocrd_kraken](https://github.com/OCR-D/ocrd_kraken) 
[802c6b0](https://github.com/OCR-D/ocrd_kraken/commits/802c6b0)..[b13dd8a](https://github.com/OCR-D/ocrd_kraken/commits/b13dd8a) + +> Release: [v0.3.0](https://github.com/OCR-D/ocrd_kraken/releases/v0.3.0) + + > * segment/recognize: default to device=cuda:0 (now backed by safe fall-back) + > * segment/recognize: fall back to CPU if no CUDA device + > * fix typo + > * update changelog + > * recognize: project text upwards in order by concatenation + > * recognize: ensure baseline/boundary are consistent + > * recognize: ignore invalid baselines + > * setup metadata: update/improve + > * deps-ubuntu: update + > * improve/update readme + > * Dockerfile: use CUDA base image, improve labels + > * update changelog + > * recognize: pass lines in baseline format if any baselines are annotated + > * update blla.model URL (master→main) + > * recognize: workaround for empty/failed line records + > * recognize: workaround for better quality box cuts + > * recognize: avoid invalid polygons on single-glyph words + > * Revert "recognize: avoid invalid polygons on single-glyph words" + > * segment: also show tags/type prediction + > * recognize: avoid invalid polygons on single-glyph words + > * recognize: use proper data structures of rpred + +### [ocrd_pagetopdf](https://github.com/UB-Mannheim/ocrd_pagetopdf) [6155605](https://github.com/UB-Mannheim/ocrd_pagetopdf/commits/6155605)..[4f4a330](https://github.com/UB-Mannheim/ocrd_pagetopdf/commits/4f4a330) + +> Release: [v1.0.0](https://github.com/UB-Mannheim/ocrd_pagetopdf/releases/v1.0.0) + + > * Merge pull request #22 from bertsky/fix-input-files + +### [ocrd_wrap](https://github.com/bertsky/ocrd_wrap) [63c04d5](https://github.com/bertsky/ocrd_wrap/commits/63c04d5)..[2cd800d](https://github.com/bertsky/ocrd_wrap/commits/2cd800d) + +> Release: [v0.1.8](https://github.com/bertsky/ocrd_wrap/releases/v0.1.8) + + > * :package: 0.1.8 + > * Merge pull request #10 from bertsky/update-numpy + +### 
[opencv-python](https://github.com/skvark/opencv-python) [6b73d90](https://github.com/skvark/opencv-python/commits/6b73d90)..[474a1cc](https://github.com/skvark/opencv-python/commits/474a1cc) + +> Release: [72](https://github.com/skvark/opencv-python/releases/72) + + > * Merge pull request #849 from asmorkalov/as/python3_for_build + > * Fix: numpy version for python 3.11 (#839) + > * Merge pull request #852 from asmorkalov:as/ci_check + > * Merge pull request #837 from bertsky/fix-py38-build + > * Merge pull request #838 from henryiii/patch-2 + +### [sbb_binarization](https://github.com/qurator-spk/sbb_binarization) [39ef3fd](https://github.com/qurator-spk/sbb_binarization/commits/39ef3fd)..[010ec99](https://github.com/qurator-spk/sbb_binarization/commits/010ec99) + +> Release: [v0.1.0](https://github.com/qurator-spk/sbb_binarization/releases/v0.1.0) + + > * :package: v0.1.0 + > * Update README.md + > * update CI badge + > * Merge pull request #59 from bertsky/change-model-url + > * Merge pull request #56 from bertsky/non-verbose + +### [workflow-configuration](https://github.com/bertsky/workflow-configuration) [cb923f7](https://github.com/bertsky/workflow-configuration/commits/cb923f7)..[5aff777](https://github.com/bertsky/workflow-configuration/commits/5aff777) + + > * ocrd-import: add option --regex (positive path selector) + > * ocrd-import: fix skipping in subshell + > * add METS transforms to TOC + > * generalise standalone CLI for both PAGE and METS XSL, update documentation + > * mets-copy-agents.xsl: make path for other-mets relative to input mets (not stylesheet file) + > * (ocrd-)page-transform: add pretty-printing option + > * add page-ensure-readingorder.xsl + > * add page-ensure-textequiv-index.xsl + > * ocrd-import: also replace comma in IDs + > * Merge remote-tracking branch 'origin/master' + > * page-textequiv-*: ensure target TextEquiv exists + + ## [v2023-03-24](https://github.com/OCR-D/ocrd_all/releases/v2023-03-24) Fixed: diff --git 
a/Dockerfile b/Dockerfile index e2e677e3..e40176c7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,6 +18,7 @@ # use OCR-D base container (from ubuntu:18.04) ARG BASE_IMAGE=ocrd/core FROM $BASE_IMAGE +ARG BASE_IMAGE ARG VCS_REF ARG BUILD_DATE LABEL \ @@ -83,33 +84,34 @@ RUN rm $VIRTUAL_ENV/bin/pip* && apt-get purge -y python3-pip && python3 -m venv # so we must rely on .dockerignore here) COPY . . -# deinit opencv-python (otherwise git submodule status can segfault) and remove copied junk -RUN git submodule deinit opencv-python && git submodule foreach --recursive git clean -fxd - # make apt system functional RUN apt-get -y update && apt-get install -y apt-utils -# get packages for build, try to fetch all modules system requirements, -# remove unneeded automatic deps and clear pkg cache -RUN apt-get -y install automake autoconf libtool pkg-config g++ && make deps-ubuntu && apt-get -y autoremove && apt-get clean - # start a shell script (so we can comment individual steps here) RUN echo "set -ex" > docker.sh +# get packages for build +RUN echo "apt-get -y install automake autoconf libtool pkg-config g++" >> docker.sh +# ensure no additional git actions happen after copying the checked out modules +RUN echo "export NO_UPDATE=1" >> docker.sh +# try to fetch all modules system requirements +RUN echo "make deps-ubuntu" >> docker.sh RUN echo "source $VIRTUAL_ENV/bin/activate" >> docker.sh RUN echo "pip install -U pip setuptools wheel" >> docker.sh +RUN echo "hash -r" >> docker.sh # build/install all tools of the requested modules: RUN echo "make $PARALLEL all" >> docker.sh -# check installation -RUN echo "make -j check CHECK_HELP=1" >> docker.sh -# remove source directories from image, unless using editable mode -# (in the latter case, the git repos are also the installation targets -# and must be kept; so merely clean-up some temporary files) -RUN echo "if [[ '${PIP_OPTIONS}' =~ -e|--editable ]]; then make -i clean-olena clean-tesseract; else rm -fr /.cache /build; fi" 
>> docker.sh +# remove unneeded automatic deps and clear pkg cache +RUN echo "apt-get -y remove automake autoconf libtool pkg-config g++ && apt-get -y clean" >> docker.sh +# clean-up some temporary files (git repos are also installation targets and must be kept) +RUN echo "make -i clean-olena clean-tesseract; rm -fr /.cache" >> docker.sh # run the script in one layer/step (to minimise image size) # (and export all variables) RUN set -a; bash docker.sh # update ld.so cache for new libs in /usr/local RUN ldconfig +# check installation +RUN make -j4 check CHECK_HELP=1 +RUN if echo $BASE_IMAGE | fgrep -q cuda; then make fix-cuda; fi # remove (dated) security workaround preventing use of # ImageMagick's convert on PDF/PS/EPS/XPS: diff --git a/Makefile b/Makefile index b4817a24..5586a52f 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,7 @@ GIT = sudo -u $(SUDO_USER) git endif endif GIT_RECURSIVE = # --recursive -GIT_DEPTH = # --depth 1 +GIT_DEPTH = # --depth 1 or --single-branch # Required and optional Tesseract models. ALL_TESSERACT_MODELS = eng equ osd $(TESSERACT_MODELS) @@ -110,20 +110,41 @@ cat <<"EOF" Rules to download and install all OCR-D module processors from their source repositories into a single virtualenv. 
-Targets: +Targets (general): help: show this message show: list the venv path and all executables (to be) installed - check: verify that all executables are runnable and the venv is consistent + +Targets (module management): modules: download all submodules to the managed revision + deinit: clean, then deinit and rmdir all submodules + tidy: clean, then deinit opencv-python and git-clean all submodules + (WARNING: potential data loss; if unsure, try with `make -n` and `git clean -n`) + +Targets (system dependencies, may need root privileges): + deps-ubuntu: install all system dependencies of all modules + deps-cuda: install CUDA toolkit and libraries (via micromamba and nvidia-pyindex) + +Targets (build and installation into venv): all: install all executables of all modules ocrd: only install the virtual environment and OCR-D/core packages install-tesseract: only build and install Tesseract (with TESSERACT_MODELS) - install-tesseract-training: build and install Tesseract training tools - install-models: download commonly used models to appropriate locations + install-tesseract-training: also build and install Tesseract training tools + fix-cuda: workaround for non-conflicting CUDA libs after installation clean: remove the virtual environment directory, and make clean-* clean-tesseract: remove the build directory for tesseract clean-olena: remove the build directory for ocrd_olena - deinit: clean, then deinit and rmdir all submodules + +Targets (testing): + check: verify that all executables are runnable and the venv is consistent + test-core: verify ocrd via core module regression tests + test-cuda: verify that CUDA is available for Tensorflow and Pytorch + test-workflow: verify that most executables work correctly via test runs on test data + +Targets (auxiliary data): + ocrd-all-tool.json: generate union of ocrd-tool.json for all executables of all modules + install-models: download commonly used models to appropriate locations + +Targets (build of container 
images): docker: (re)build a docker image including all executables dockers: (re)build docker images for some pre-selected subsets of modules @@ -137,6 +158,7 @@ Variables: TMPDIR: path to use for temporary storage instead of the system default PYTHON: name of the Python binary PIP_OPTIONS: extra options for the `pip install` command like `-q` or `-v` or `-e` + CHECK_HELP: set to `1` to also check each executable can generate help output TESSERACT_MODELS: list of additional models/languages to download for Tesseract. Default: "$(ALL_TESSERACT_MODELS)" TESSERACT_CONFIG: command line options for Tesseract `configure`. Default: "$(TESSERACT_CONFIG)" EOF @@ -152,6 +174,8 @@ help: ; @eval "$$HELP" # - then updates the time stamp of the module directory # so the directory can be used as a dependency # - synchronize via mutex to avoid race for git lock file +# - for minimal image sizes in Docker builds, avoid cloning all branches +# by using GIT_DEPTH="--depth 1" or GIT_DEPTH=--single-branch modules: $(OCRD_MODULES) # but bypass updates if we have no repo here (e.g. Docker build) ifneq (,$(wildcard .git)) @@ -164,12 +188,20 @@ $(OCRD_MODULES): always-update endif endif -deinit: clean .PHONY: deinit -deinit: +deinit: clean git submodule deinit --all # --force git submodule status | while read stat dir ver; do rmdir $$dir; done +.PHONY: tidy +tidy: clean + git submodule status opencv-python | grep -q ^- || git submodule deinit opencv-python + git submodule foreach --recursive git clean -fxd +# if you already have a clone with too many refs, consider the following recipe: +#git submodule foreach 'for ref in $(git for-each-ref --no-contains=HEAD --format="%(refname)" refs/remotes/ | sed s,^refs/remotes/,,); do git branch -d -r $ref; done' + git gc + + # Get Python modules. $(BIN)/pip: $(ACTIVATE_VENV) @@ -184,18 +216,23 @@ wheel: $(BIN)/wheel $(BIN)/wheel: | $(ACTIVATE_VENV) . 
$(ACTIVATE_VENV) && $(SEMPIP) pip install --force-reinstall $(PIP_OPTIONS_E) wheel -# avoid making this .PHONY so it does not have to be repeated -$(SHARE)/numpy: | $(ACTIVATE_VENV) $(SHARE) - . $(ACTIVATE_VENV) && $(SEMPIP) pip install $(PIP_OPTIONS_E) numpy - @touch $@ - # Install modules from source. .PHONY: ocrd ocrd: $(BIN)/ocrd +ifneq ($(filter core, $(OCRD_MODULES)),) deps-ubuntu-modules: core $(BIN)/ocrd: core . $(ACTIVATE_VENV) && $(MAKE) -C $< install PIP="$(SEMPIP) pip" PIP_INSTALL="$(SEMPIP) pip install $(PIP_OPTIONS)" && touch -c $@ +else +CUSTOM_DEPS += python3 imagemagick libgeos-dev +$(BIN)/ocrd: | $(ACTIVATE_VENV) + . $(ACTIVATE_VENV) && $(SEMPIP) pip install $(PIP_OPTIONS_E) ocrd ocrd_network +endif + +.PHONY: test-core +test-core: core $(BIN)/ocrd + . $(ACTIVATE_VENV) && $(MAKE) -C $< deps-test test # Convert the executable names (1) to a pattern rule, # so that the recipe will be used with single-recipe- @@ -203,7 +240,7 @@ $(BIN)/ocrd: core multirule = $(patsubst $(BIN)/%,\%/%,$(1)) -ifneq ($(findstring format-converters, $(OCRD_MODULES)),) +ifneq ($(filter format-converters, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(PAGE2IMG) PAGE2IMG := $(BIN)/page2img format-converters/page2img.py: format-converters @@ -213,20 +250,20 @@ $(PAGE2IMG): format-converters/page2img.py chmod +x $@ endif -ifneq ($(findstring opencv-python, $(OCRD_MODULES)),) +ifneq ($(filter opencv-python, $(OCRD_MODULES)),) CUSTOM_DEPS += cmake gcc g++ # libavcodec-dev libavformat-dev libswscale-dev libgstreamer-plugins-base1.0-dev libgstreamer1.0-dev # libpng-dev libjpeg-dev libopenexr-dev libtiff-dev libwebp-dev libjasper-dev opencv-python: GIT_RECURSIVE = --recursive opencv-python/setup.py: opencv-python -$(SHARE)/opencv-python: opencv-python/setup.py | $(ACTIVATE_VENV) $(SHARE) $(SHARE)/numpy - . $(ACTIVATE_VENV) && cd opencv-python && ENABLE_HEADLESS=1 $(PYTHON) setup.py bdist_wheel - . $(ACTIVATE_VENV) && $(SEMPIP) pip install $($(tool-jsons-file),$(tool-jsons-code)) + . 
$(ACTIVATE_VENV) && $(PYTHON) $(tool-jsons-file) $(wildcard $(OCRD_MODULES:%=%/ocrd-tool.json)) > $@ + $(RM) $(tool-jsons-file) + .PHONY: $(OCRD_EXECUTABLES:%=%-check) $(OCRD_EXECUTABLES:%=%-check): . $(ACTIVATE_VENV) \ @@ -707,7 +774,7 @@ $(OCRD_EXECUTABLES:%=%-check): .PHONY: $(OCRD_EXECUTABLES:$(BIN)/%=%) $(OCRD_EXECUTABLES:$(BIN)/%=%): %: $(BIN)/% -ifneq ($(findstring tesseract, $(OCRD_MODULES)),) +ifneq ($(filter tesseract, $(OCRD_MODULES)),) # Tesseract. # when not installing via PPA, we must cope without ocrd_tesserocr's deps-ubuntu-modules @@ -822,6 +889,30 @@ deps-ubuntu-modules: .PHONY: deps-ubuntu deps-ubuntu-modules +# For native (non-Docker) installations, install CUDA system dependencies +deps-cuda: core $(ACTIVATE_VENV) + . $(ACTIVATE_VENV) && $(MAKE) -C $< $@ + +# For standalone use ("just get me tensorflow-gpu<2.0 into the current venv") +tf1nvidia: $(ACTIVATE_VENV) + $(pip_install_tf1nvidia) + +# post-fix workaround for clash between cuDNN of Tensorflow 2.12 (→8.6) and Pytorch 1.13 (→8.5) +# the latter is explicit (but unnecessary), the former is implicit (and causes "DNN library not found" crashes at runtime) +# so we have three potential options: +# 1. revert to the version required by TF after pip overruled our choice via Torch dependency +# pip3 install nvidia-cudnn-cu11==8.6.0.* +# 2. downgrade TF so there is no overt conflict +# pip3 install "tensorflow<2.12" +# 3. upgrade Torch so there is no overt conflict +# pip install "torch>=2.0" +# Since ATM we don't know whether Torch 2.x will work everywhere, we opt for 2: +fix-cuda: $(ACTIVATE_VENV) + . $(ACTIVATE_VENV) && $(SEMPIP) pip install "tensorflow<2.12" + +.PHONY: deps-cuda tf1nvidia fix-cuda + + # Docker builds. 
DOCKER_TAG ?= ocrd/all @@ -830,31 +921,31 @@ DOCKER_TAG ?= ocrd/all # these variants won't share common layers / steps / data, # so build-time and bandwidth are n-fold) .PHONY: dockers -ifdef DOCKERS_WITHOUT_REPOS dockers: docker-minimum docker-minimum-cuda docker-medium docker-medium-cuda docker-maximum docker-maximum-cuda -else -dockers: docker-minimum-git docker-minimum-cuda-git docker-medium-git docker-medium-cuda-git docker-maximum-git docker-maximum-cuda-git -endif -# Selections which keep git repos and reference them for install +# keep git repos and reference them for install # (so components can be updated via git from the container alone) -docker-%-git: PIP_OPTIONS = -e +docker-%: PIP_OPTIONS = -e +# old non-git alias +docker-%um-git: docker-%um # Minimum-size selection: use Ocropy binarization, use Tesseract from PPA -docker-mini%: DOCKER_MODULES = core ocrd_cis ocrd_fileformat ocrd_im6convert ocrd_pagetopdf ocrd_repair_inconsistencies ocrd_tesserocr ocrd_wrap tesserocr workflow-configuration ocrd_olahd_client +docker-mini%: DOCKER_MODULES := core ocrd_cis ocrd_fileformat ocrd_im6convert ocrd_pagetopdf ocrd_repair_inconsistencies ocrd_tesserocr ocrd_wrap tesserocr workflow-configuration ocrd_olahd_client # Medium-size selection: add Olena binarization and Calamari, use Tesseract from git, add evaluation -docker-medi%: DOCKER_MODULES = core cor-asv-ann dinglehopper docstruct format-converters nmalign ocrd_calamari ocrd_cis ocrd_fileformat ocrd_im6convert ocrd_keraslm ocrd_neat ocrd_olahd_client ocrd_olena ocrd_pagetopdf ocrd_repair_inconsistencies ocrd_segment ocrd_tesserocr ocrd_wrap tesseract tesserocr workflow-configuration +docker-medi%: DOCKER_MODULES := core cor-asv-ann dinglehopper docstruct format-converters nmalign ocrd_calamari ocrd_cis ocrd_fileformat ocrd_im6convert ocrd_keraslm ocrd_neat ocrd_olahd_client ocrd_olena ocrd_pagetopdf ocrd_repair_inconsistencies ocrd_segment ocrd_tesserocr ocrd_wrap tesseract tesserocr workflow-configuration # 
Maximum-size selection: use all modules -docker-maxi%: DOCKER_MODULES = $(OCRD_MODULES) +docker-maxi%: DOCKER_MODULES := $(OCRD_MODULES) # DOCKER_BASE_IMAGE -docker%um docke%um-git: DOCKER_BASE_IMAGE = docker.io/ocrd/core +docker-%um: DOCKER_BASE_IMAGE = docker.io/ocrd/core # CUDA variants -docker%-cuda docker%-cuda-git: DOCKER_BASE_IMAGE = docker.io/ocrd/core-cuda +docker-%-cuda: DOCKER_BASE_IMAGE = docker.io/ocrd/core-cuda # Build rule for all selections -docker%: Dockerfile $(DOCKER_MODULES) +# FIXME: $(DOCKER_MODULES) ref does not work at phase 1; workaround: all modules +docker-%: Dockerfile modules docker build \ + --progress=plain \ --build-arg BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ @@ -863,7 +954,7 @@ docker%: Dockerfile $(DOCKER_MODULES) --build-arg PARALLEL="$(DOCKER_PARALLEL)" \ --build-arg PYTHON="$(PYTHON)" \ --network=host \ - -t $(DOCKER_TAG):$(or $(*:-%=%),latest) . + -t $(DOCKER_TAG):$* . docker: DOCKER_MODULES ?= $(OCRD_MODULES) docker: DOCKER_PARALLEL ?= -j1 diff --git a/README.md b/README.md index 8c3a0213..82fa85ee 100644 --- a/README.md +++ b/README.md @@ -8,22 +8,28 @@ This controls installation of all OCR-D modules from source (as git submodules). It includes a Makefile for their installation into a virtual environment (venv) or Docker container. -(A venv is a local user directory with shell scripts to load/unload itself +(A [venv](https://packaging.python.org/tutorials/installing-packages/#creating-virtual-environments) +is a local user directory with shell scripts to load/unload itself in the current shell environment via PATH and PYTHONHOME.) -(NOTE: If you are going to install ocrd_all, you may want to first reference the [OCR-D setup guide](https://ocr-d.de/en/setup) at the OCR-D website. If you are a non-IT user, it is especially recommended you utilize the guide.) 
+> **Note**: If you are going to install ocrd_all, you may want to first consult +> the [OCR-D setup guide](https://ocr-d.de/en/setup) on the [OCR-D website](https://ocr-d.de). +> If you are a non-IT user, it is especially recommended that you use the guide. -* [Preconditions](#preconditions) +* [Prerequisites](#prerequisites) * [Space](#space) * [Locale](#locale) * [System packages](#system-packages) + * [GPU support](#gpu-support) * [Usage](#usage) * [Targets](#targets) * [deps-ubuntu](#deps-ubuntu) + * [deps-cuda](#deps-cuda) * [modules](#modules) * [ocrd](#ocrd) * [all](#all) * [docker](#docker) + * [dockers](#dockers) * [clean](#clean) * [show](#show) * [help (default goal)](#help-default-goal) @@ -31,6 +37,7 @@ in the current shell environment via PATH and PYTHONHOME.) * [[any executable name]](#any-executable-name) * [Variables](#variables) * [OCRD_MODULES](#ocrd_modules) + * [NO_UPDATE](#no_update) * [PYTHON](#python) * [VIRTUAL_ENV](#virtual_env) * [TMPDIR](#tmpdir) @@ -48,24 +55,26 @@ in the current shell environment via PATH and PYTHONHOME.) * [System requirements](#system-requirements) * [Contributing](#contributing) -## Preconditions +## Prerequisites ### Space -Make sure that there is enough free disk space. 7 GiB or more is recommended for -the required submodules, build data, temporary data, installed virtual environment -and pip cache. +Make sure that there is enough free disk space. For a **full installation** including executables from all modules, +around **22 GiB** will be needed (mostly on the same filesystem as the ocrd_all checkout). The same goes for the +[`maximum-cuda`](#docker-hub) variant of the prebuilt Docker images (needed on the filesystem hosting Docker's data, typically +`/var/lib/docker`). 
-If the `/tmp` directory has less than 5 GiB of free space, you can override the location -of temporary files by setting the `TMPDIR` variable when calling make: +Also, during build, an additional 5 GiB may be needed for temporary files, typically in the `/tmp` directory. +To use a different location path with more free space, set the `TMPDIR` variable when calling `make`: + + TMPDIR=/path/to/my/tempdir make all -```sh -TMPDIR=/path/to/my/tempdir make all -``` ### Locale -Next, the (shell) environment must have a Unicode-based localization. (Otherwise Python code based on `click` will not work, i.e. most OCR-D CLIs.) This is true for most installations today, and can be verified by: +The (shell) environment must have a Unicode-based localization. +(Otherwise Python code based on `click` will not work, i.e. most OCR-D CLIs.) +This is true for most installations today, and can be verified by: locale | fgrep .UTF-8 @@ -80,46 +89,72 @@ This should show several `LC_*` variables. Otherwise, either select another loca ### System packages -Install GNU make, git and GNU parallel. +* Install git, GNU make and GNU parallel. - # on Debian / Ubuntu: - sudo apt install make git parallel + # on Debian / Ubuntu: + sudo apt install make git parallel -Install wget or curl if you want to download Tesseract models. +* Install wget or curl if you want to download Tesseract models. - # on Debian / Ubuntu: - sudo apt install wget + # on Debian / Ubuntu: + sudo apt install wget -Install the packages for Python3 development and for Python3 virtual environments +* Install the packages for Python3 development and Python3 virtual environments for your operating system / distribution. - # on Debian / Ubuntu: - sudo apt install python3-dev python3-venv + # on Debian / Ubuntu: + sudo apt install python3-dev python3-venv + +* Some modules require [Tesseract](https://github.com/tesseract-ocr/tesseract). 
+If your operating system / distribution already provides Tesseract 4.1 +or newer, then just install its development package: -Some modules use the Tesseract library. If your distribution provides Tesseract 4.1 -or newer, install the development package: + # on Debian / Ubuntu: + sudo apt install libtesseract-dev - # on Debian / Ubuntu: - sudo apt install libtesseract-dev + Otherwise, recent Tesseract packages for Ubuntu are available via PPA + [alex-p](https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr-devel). -Ubuntu packages for Tesseract 5.0.0 (alpha) are available at the PPA -https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr-devel. + Alternatively, the latest version of Tesseract can also be built as a module locally. -Otherwise or for the latest Tesseract code it can also be built locally. +* Other modules will have additional system dependencies. -Other modules will have additional system dependencies. +> **Note**: System dependencies **for all modules** on Ubuntu 20.04 (or similar) +> can also be installed **automatically** by running: +> +> # on Debian / Ubuntu: +> sudo apt install make +> make modules +> sudo make deps-ubuntu +> +> (And you can define the scope of _all modules_ by setting the `OCRD_MODULES` +[variable](#variables) as described below. If unsure, consider doing a dry-run +first, by using `make -n`.) -System dependencies **for all modules** on Ubuntu 18.04 (or similar) can also be installed **automatically** by running: +### GPU support - # on Debian / Ubuntu: - sudo apt install make - sudo make deps-ubuntu +Many executables can utilize an Nvidia GPU for much faster computation, _if available_ (i.e. optionally). -(And you can define the scope of _all modules_ by setting the `OCRD_MODULES` [variable](#Variables).) 
+For that, as a further prerequisite you need an installation of +[CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) and additional optimised +libraries like [cuDNN](https://developer.nvidia.com/cudnn) for your system. + +The CUDA version currently supported is 11.8 (but others may work as well). + +> **Note**: CUDA toolkit and libraries (in a development version with CUDA compiler) +> can also be installed **automatically** by running: +> +> make ocrd +> sudo make deps-cuda +> +> This will deploy [Micromamba](https://mamba.readthedocs.io/en/latest/index.html) +non-intrusively (without system packages or Conda environments), but also share some +of the CUDA libraries installed as Python packages system-wide via ld.so.conf rules. +If unsure, consider doing a dry-run first, by using `make -n`. ## Usage -Run `make` with optional parameters for _variables_ and _targets_ like so: +Run `make` with optional parameters for __variables__ and __targets__ like so: make [PYTHON=python3] [VIRTUAL_ENV=./venv] [OCRD_MODULES="..."] [TARGET...] @@ -129,9 +164,17 @@ Run `make` with optional parameters for _variables_ and _targets_ like so: Install system packages for all modules. (Depends on [_modules_](#modules).) +See [system package prerequisites](#system-packages) above. + +#### _deps-cuda_ + +Install CUDA toolkit and libraries. (Depends on [_ocrd_](#ocrd).) + +See (optional) [GPU support prerequisites](#gpu-support) above. + #### _modules_ -Download/update all modules, but do not install anything. +Check out/update all modules, but do not install anything. #### _all_ @@ -139,11 +182,23 @@ Install executables from all modules into the venv. (Depends on [_modules_](#mod #### _ocrd_ -Install only OCR-D/core and its CLI `ocrd` into the venv. +Install only the `core` module and its CLI `ocrd` into the venv. #### _docker_ -(Re-)build a docker image for all modules/executables. (Depends on [_modules_](#modules).) +(Re-)build a Docker image for all modules/executables. 
(Depends on [_modules_](#modules).) + +#### _dockers_ + +(Re-)build Docker images for some pre-selected subsets of modules/executables. (Depends on [_modules_](#modules).) + +(These are the very same variants published as [prebuilt images on Docker Hub](#docker-hub), +cf. [CI configuration](.circleci/config.yml#L27-L65).) + +> **Note**: The image will contain all refs and branches of all checked out modules, +> which may not actually be needed. If you are planning on building and distributing +> Docker images with minimal size, consider using `GIT_DEPTH=--single-branch` +> before `modules` or running `make tidy` later on. #### _clean_ @@ -151,7 +206,7 @@ Remove the venv and the modules' build directories. #### _show_ -Print the venv directory, the module directories, and the executable names. +Print the venv directory, the module directories, and the executable names – as configured by the current variables. #### _check_ @@ -183,6 +238,12 @@ Override the list of git submodules to include. Targets affected by this include - [docker](#docker) (reducing the list of executables and modules to install) - [show](#show) (reducing the list of `OCRD_MODULES` and of `OCRD_EXECUTABLES` to print) +#### _NO_UPDATE_ + +If set to `1`, then when installing executables, `make` does not attempt to `git submodule update` +any currently checked out modules. (Useful for development when testing different module versions +prior to a commit.) + #### _PYTHON_ Name of the Python binary to use (at least python3 required). @@ -191,7 +252,8 @@ Name of the Python binary to use (at least python3 required). Directory prefix to use for local installation. -(This is set automatically when activating a virtual environment on the shell. The build system will re-use the venv if one already exists here, or create one.) +(This is set automatically when activating a virtual environment on the shell. +The build system will re-use the venv if one already exists here, or create one otherwise.) 
#### _TMPDIR_ @@ -201,7 +263,8 @@ Override the default path (`/tmp` on Unix) where temporary files during build ar Add extra options to the `pip install` command like `-q` or `-v` or `-e`. -(The latter will install Python modules in _editable mode_, i.e. any update to the source will directly affect the executables.) +> **Note**: The latter option will install Python modules in __editable mode__, +> i.e. any update to the source would directly affect the executables. #### _GIT_RECURSIVE_ @@ -211,8 +274,8 @@ Set to `--recursive` to checkout/update all modules recursively. (This usually i Add more models to the minimum required list of languages (`eng equ osd`) to install along with Tesseract. -Note: this only affects `make install-tesseract` (or `all`), but is independent of the `install-models` step. -(The latter delegates to `ocrd resmgr download`, which fetches all registered resources.) +> **Note**: this only affects `make install-tesseract` (or `all`), but is independent of the `install-models` step. +> (The latter delegates to `ocrd resmgr download`, which fetches all registered resources.) #### _TESSERACT_CONFIG_ @@ -220,11 +283,10 @@ Set `configure` options for building Tesseract from source (`--disable-openmp -- ### Examples -The following examples assume a working development installation of Tesseract. To build the latest Tesseract locally, run this command first: # Get code, build and install Tesseract with the default English model. - make tesseract + make install-tesseract Optionally install additional Tesseract models. @@ -288,32 +350,39 @@ TESSERACT_MODELS = deu frk script/Fraktur script/Latin # install all of Tesseract's submodules to support unit tests and training tools, too tesseract: GIT_RECURSIVE = --recursive + +# avoid automatic submodule updates +NO_UPDATE = 1 ``` -Note: When `local.mk` exists, variables can still be overridden on the command line, -(i.e. 
`make all OCRD_MODULES=` will build all executables for all modules again), -but not from the shell environment -(i.e. `OCRD_MODULES= make all` will still use the value from local.mk). +> **Note**: When `local.mk` exists, variables can still be overridden on the command line, +> (i.e. `make all OCRD_MODULES=` will build all executables for all modules again), +> but not from the shell environment +> (i.e. `OCRD_MODULES= make all` will still use the value from local.mk). ### Docker Hub -The project is available as prebuilt Docker images from [Docker Hub as -`ocrd/all`](https://hub.docker.com/r/ocrd/all). You can choose from three tags, -`minimum`, `medium` and `maximum`. These differ in which modules are included, -with `maximum` being the equivalent of doing `make all` with the default (unset) value for `OCRD_MODULES`. To download the images -on the command line: - -```sh -docker pull ocrd/all:minimum -# or -docker pull ocrd/all:medium -# or -docker pull ocrd/all:maximum -``` +Besides native installation, `ocrd_all` is also available as prebuilt Docker images +from [Docker Hub as `ocrd/all`](https://hub.docker.com/r/ocrd/all). You can choose from three tags, +`minimum`, `medium` and `maximum`. These differ w.r.t. which modules are included, +with `maximum` being the equivalent of doing `make all` with the default (unset) value for `OCRD_MODULES`. + +To download the images on the command line: + + docker pull ocrd/all:minimum + # or + docker pull ocrd/all:medium + # or + docker pull ocrd/all:maximum -In addition to these base variants, there are `minimum-cuda`, `medium-cuda` and `maximum-cuda` with GPU support. (Also needs [nvidia-docker](https://github.com/NVIDIA/nvidia-docker), which adds the `docker --gpus` option.) +In addition to these base variants, there are `minimum-cuda`, `medium-cuda` and `maximum-cuda` with GPU support. +(These also need [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) runtime, which will add the +`docker --gpus` option.) 
-Usage is the same [as if you had built the image yourself](#results). +These tags will be overwritten with every new release of ocrd_all. However, the `maximum` variant of each release +will also be aliased to a permanent tag by ISO date, e.g. `2023-04-02`. + +Usage of the prebuilt Docker image is the same [as if you had built the image yourself](#results). This table lists which tag contains which module: | Module | `minimum` | `medium` | `maximum` | @@ -350,11 +419,12 @@ This table lists which tag contains which module: | ocrd_ocropy | - | - | - | | ocrd_pc_segmentation | - | - | - | -**Note**: The following modules have been disabled by default and can only be -enabled by explicitly setting `OCRD_MODULES` or `DISABLED_MODULES`: - -* cor-asv-fst (runtime issues) -* ocrd_ocropy (better implementation in ocrd_cis available) +> **Note**: The following modules have been disabled by default and can only be +> enabled by explicitly setting `OCRD_MODULES` or `DISABLED_MODULES`: +> +> * `cor-asv-fst` (runtime issues) +> * `ocrd_ocropy` (better implementation in ocrd_cis available) +> * `ocrd_pc_segmentation` (dependency and quality issues) ### Uninstall diff --git a/cor-asv-ann b/cor-asv-ann index 006a70ee..2c4b1ffc 160000 --- a/cor-asv-ann +++ b/cor-asv-ann @@ -1 +1 @@ -Subproject commit 006a70eefe3a2e9e0af3ea24d387d8234c1ccaa5 +Subproject commit 2c4b1ffc123e867cc5e5203970996bfb05075397 diff --git a/core b/core index de084535..67086249 160000 --- a/core +++ b/core @@ -1 +1 @@ -Subproject commit de084535e18a2f2fa9293085cda4431abfe2e191 +Subproject commit 670862493408008441963a739ef650c6d3fa122d diff --git a/dinglehopper b/dinglehopper index 0fd4ea19..35be58cb 160000 --- a/dinglehopper +++ b/dinglehopper @@ -1 +1 @@ -Subproject commit 0fd4ea19732b2956942bc0fee735cef90a7d36cc +Subproject commit 35be58cb9456b0893bc46640b234912148621fb6 diff --git a/eynollah b/eynollah index ea792d1e..706433c5 160000 --- a/eynollah +++ b/eynollah @@ -1 +1 @@ -Subproject commit 
ea792d1e4ac4a722770b82dc91e71f84d5beb212 +Subproject commit 706433c5049c63c6e16fee5f71d81a7e507abe06 diff --git a/ocrd_cis b/ocrd_cis index c90b29f4..a0ea0a2a 160000 --- a/ocrd_cis +++ b/ocrd_cis @@ -1 +1 @@ -Subproject commit c90b29f4c6f3369b5eecae1617903dada14a3553 +Subproject commit a0ea0a2a4aeea99414c08ae543585b994f9ab0d5 diff --git a/ocrd_fileformat b/ocrd_fileformat index dacfa509..4e7e0de6 160000 --- a/ocrd_fileformat +++ b/ocrd_fileformat @@ -1 +1 @@ -Subproject commit dacfa50957fda54596f78d1612c8b5c29363a9e9 +Subproject commit 4e7e0de68e2a0dcd9b238f64d1657beda0d74da7 diff --git a/ocrd_kraken b/ocrd_kraken index 802c6b0b..b13dd8a9 160000 --- a/ocrd_kraken +++ b/ocrd_kraken @@ -1 +1 @@ -Subproject commit 802c6b0b76a3e75070c680aa3b19d36142decf4e +Subproject commit b13dd8a932b7dfbfe5019698e87542f5f767e2bd diff --git a/ocrd_pagetopdf b/ocrd_pagetopdf index 6155605b..4f4a330c 160000 --- a/ocrd_pagetopdf +++ b/ocrd_pagetopdf @@ -1 +1 @@ -Subproject commit 6155605b44488da95dfe0280df71202f8f09897f +Subproject commit 4f4a330c97208635e7b304cfce4db9e937fefd2b diff --git a/ocrd_wrap b/ocrd_wrap index 63c04d5a..2cd800d9 160000 --- a/ocrd_wrap +++ b/ocrd_wrap @@ -1 +1 @@ -Subproject commit 63c04d5a6a377ead9989a5c1a6a1b1d9aa6f8b33 +Subproject commit 2cd800d9eccbc084751558a87972ac22ee60e87a diff --git a/opencv-python b/opencv-python index 6b73d90f..474a1cc0 160000 --- a/opencv-python +++ b/opencv-python @@ -1 +1 @@ -Subproject commit 6b73d90fc3e50ba6858926d299b49f0228e19d68 +Subproject commit 474a1cc0ebf2086c596b60c050a9e1af658ff380 diff --git a/sbb_binarization b/sbb_binarization index 39ef3fd7..010ec99d 160000 --- a/sbb_binarization +++ b/sbb_binarization @@ -1 +1 @@ -Subproject commit 39ef3fd7bbda76fc1d8531e9c40fac3b6650dbae +Subproject commit 010ec99d2a666c363efb7e50c1eb2423857ff092 diff --git a/test-workflow.sh b/test-workflow.sh new file mode 100644 index 00000000..b9caafd8 --- /dev/null +++ b/test-workflow.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +set -e + +ocrd 
resmgr download ocrd-sbb-binarize default-2021-03-09 +ocrd-sbb-binarize -I OCR-D-IMG -O OCR-D-BIN -P model default-2021-03-09 + +ocrd resmgr download ocrd-detectron2-segment Jambo-sudo_X101.pth +ocrd resmgr download ocrd-detectron2-segment Jambo-sudo_X101.yaml +ocrd-detectron2-segment -p $(python -c "import ocrd_detectron2; print(ocrd_detectron2.__path__[0])")/presets_Jambo-sudo_X101.json -I OCR-D-BIN -O OCR-D-SEG + +ocrd-typegroups-classifier -I OCR-D-IMG -O FONT + +ocrd resmgr download ocrd-eynollah-segment default +ocrd-eynollah-segment -P models default -I OCR-D-IMG -O OCR-D-SEG2 + +ocrd resmgr download ocrd-calamari-recognize qurator-gt4histocr-1.0 +ocrd-calamari-recognize -I OCR-D-SEG2 -O OCR-D-OCR -P checkpoint_dir qurator-gt4histocr-1.0 -P textequiv_level glyph + +ocrd resmgr download ocrd-kraken-segment blla.mlmodel +ocrd-kraken-segment -I OCR-D-BIN -O OCR-D-SEG3 +ocrd-cis-ocropy-resegment -I OCR-D-SEG3 -O OCR-D-SEG3X -P method baseline + +ocrd resmgr download ocrd-kraken-recognize reichsanzeiger.mlmodel +ocrd-kraken-recognize -I OCR-D-SEG3X -O OCR-D-OCR2 -P model reichsanzeiger.mlmodel + +wget "https://git.informatik.uni-leipzig.de/ocr-d/cor-asv-ann-models/-/raw/master/s2s.gt4histocr.s-%C5%BF.d2.w0512.adam.attention.stateless.variational-dropout.char.transfer-lm.h5" +ocrd resmgr download -n s2s.gt4histocr.s-ſ.d2.w0512.adam.attention.stateless.variational-dropout.char.transfer-lm.h5 ocrd-cor-asv-ann-process s2s.gt4histocr.s-ſ.d2.w0512.adam.attention.stateless.variational-dropout.char.transfer-lm.h5 +ocrd-cor-asv-ann-process -I OCR-D-OCR -O OCR-D-COR -P model_file s2s.gt4histocr.s-ſ.d2.w0512.adam.attention.stateless.variational-dropout.char.transfer-lm.h5 + +ocrd-anybaseocr-crop -I OCR-D-BIN -O OCR-D-CROP +ocrd-skimage-denoise -I OCR-D-CROP -O OCR-D-DEN +ocrd-cis-ocropy-segment -I OCR-D-DEN -O OCR-D-SEG4 -P level-of-operation page + +ocrd-segment-evaluate -I OCR-D-SEG,OCR-D-SEG2 -O OCR-D-SEGEVAL + +ocrd resmgr download ocrd-tesserocr-recognize 
frak2021.traineddata +ocrd-tesserocr-recognize -I OCR-D-SEG2 -O OCR-D-OCR3 -P model frak2021 + +ocrd resmgr download ocrd-cis-ocropy-recognize LatinHist.pyrnn.gz +ocrd-cis-ocropy-recognize -I OCR-D-SEG2 -O OCR-D-OCR4 -P model LatinHist.pyrnn.gz -P textequiv_level glyph + +ocrd-cor-asv-ann-align -I OCR-D-OCR,OCR-D-OCR3,OCR-D-OCR4 -O OCR-D-OCR5 + +ocrd-cor-asv-ann-evaluate -I OCR-D-OCR,OCR-D-OCR3,OCR-D-OCR4,OCR-D-OCR5 -O OCR-D-OCREVAL + +ocrd-page-transform -I OCR-D-OCR4 -O OCR-D-OCR4X -P xsl page-textequiv-lines-to-regions.xsl +ocrd-fileformat-transform -I OCR-D-OCR4X -O TXT -P from-to "page text" -P script-args level=region pb="$(echo -e \v)" +ocrd-fileformat-transform -I OCR-D-OCR4X -O FULLTEXT -P from-to "page alto" -P script-args "--no-check-border --dummy-word" + diff --git a/workflow-configuration b/workflow-configuration index cb923f7f..5aff777c 160000 --- a/workflow-configuration +++ b/workflow-configuration @@ -1 +1 @@ -Subproject commit cb923f7fade2de84e08c2d7a4f9f2b6178f696b0 +Subproject commit 5aff777c761cae1b6f9d954fb80f9b212e8fab92