From f8cfe2020c6b4b0b185208c82a23ca54c48f368b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 28 Mar 2023 20:42:46 +0200 Subject: [PATCH 01/63] add rule for ocrd-all-tool.json --- Makefile | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Makefile b/Makefile index c3cb5d25..f43eebc4 100644 --- a/Makefile +++ b/Makefile @@ -115,6 +115,7 @@ Targets: show: list the venv path and all executables (to be) installed check: verify that all executables are runnable and the venv is consistent modules: download all submodules to the managed revision + ocrd-all-tool.json: generate union of ocrd-tool.json for all executables of all modules all: install all executables of all modules ocrd: only install the virtual environment and OCR-D/core packages install-tesseract: only build and install Tesseract (with TESSERACT_MODELS) @@ -691,6 +692,20 @@ check: $(OCRD_EXECUTABLES:%=%-check) $(OCRD_MODULES:%=%-check) . $(ACTIVATE_VENV) && pip check %-check: ; +define tool-jsons-code = +import json +import sys +all = dict() +for path in sys.argv[1:]: + all.update(json.load(open(path))['tools']) +print(json.dumps(all, indent=2)) +endef +tool-jsons-file != mktemp -u +ocrd-all-tool.json: modules + $(file >$(tool-jsons-file),$(tool-jsons-code)) + . $(ACTIVATE_VENV) && $(PYTHON) $(tool-jsons-file) $(wildcard $(OCRD_MODULES:%=%/ocrd-tool.json)) > $@ + $(RM) $(tool-jsons-file) + .PHONY: $(OCRD_EXECUTABLES:%=%-check) $(OCRD_EXECUTABLES:%=%-check): . 
$(ACTIVATE_VENV) \ From 3bc8d6a766d78562b60dfcbc03773e774b03297e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Tue, 28 Mar 2023 23:04:12 +0200 Subject: [PATCH 02/63] Update makedocker.yml - remove unnecessary steps - simplify commands to free up space - add more locations to rm - use fixed base image ubuntu-latest (only Docker build anyway), remove respective input - remove setup-python (only Docker build anyway), remove respective input - remove input choices with `-git` (same as without) - add input boolean upload-github - log in and push to GHCR, too - use conditional syntax for Dockerhub/Github options - add command to generate ocrd-all-tool.json from Docker - add action to upload ocrd-all-tool.json as artifact --- .github/workflows/makedocker.yml | 126 ++++++++++++++----------------- 1 file changed, 58 insertions(+), 68 deletions(-) diff --git a/.github/workflows/makedocker.yml b/.github/workflows/makedocker.yml index c7397a55..a549cdff 100644 --- a/.github/workflows/makedocker.yml +++ b/.github/workflows/makedocker.yml @@ -7,25 +7,6 @@ on: # Trigger workflow in GitHub web frontend or from API. 
workflow_dispatch: inputs: - os: - description: 'Operating system' - required: true - default: 'ubuntu-18.04' - type: choice - options: - - 'ubuntu-18.04' - - 'ubuntu-20.04' - python-version: - description: 'Python version' - required: true - default: '3.6' - type: choice - options: - - '3.6' - - '3.7' - - '3.8' - - '3.9' - - '3.10' docker-image: description: 'Docker image' required: true @@ -38,65 +19,43 @@ on: - 'medium-cuda' - 'maximum' - 'maximum-cuda' - - 'minimum-git' - - 'minimum-cuda-git' - - 'medium-git' - - 'medium-cuda-git' - - 'maximum-git' - - 'maximum-cuda-git' - upload-docker-image: - description: 'Upload Docker image' + upload-dockerhub: + description: 'Upload Docker image to Dockerhub' + default: False + type: boolean + upload-github: + description: 'Upload Docker image Github Container Registry' default: False type: boolean + # not yet: + #push: + # branches: [ "master" ] jobs: make: - runs-on: ${{ github.event.inputs.os }} - - env: - PYTHON_VERSION: ${{ github.event.inputs.python-version }} + runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: ${{ env.PYTHON_VERSION }} - - name: Show Python3 version - run: python3 --version - - name: Show disk usage of Homebrew, Android and .NET - run: sudo du -mscx /home/linuxbrew /usr/local/lib/android /usr/share/dotnet 2>/dev/null || true - name: Remove Docker images run: | df -h docker images - docker rmi alpine:3.12 alpine:3.13 alpine:3.14 - docker rmi buildpack-deps:stretch buildpack-deps:buster buildpack-deps:bullseye - docker rmi debian:9 debian:10 debian:11 - docker rmi moby/buildkit:latest - docker rmi node:12-alpine node:14-alpine node:16-alpine - docker rmi node:12 node:14 node:16 - if false; then # don't remove Ubuntu images - docker rmi ubuntu:16.04 ubuntu:18.04 ubuntu:20.04 - fi - docker images + docker rmi $(docker images --filter=reference="alpine:*") + docker rmi $(docker images --filter=reference="buildpack-deps:*") + docker rmi 
$(docker images --filter=reference="debian:*") + docker rmi $(docker images --filter=reference="node:*") + docker rmi ubuntu:16.04 ubuntu:18.04 df -h / - name: Remove unneeded Debian packages run: | - if false; then # skip time consuming package uninstall sudo apt-get install -y deborphan - deborphan -a | sort - sudo apt-get purge -y $(deborphan -a|fgrep main/cli-mono|while read dummy package; do echo $package; done) - sudo apt-get purge -y $(deborphan -a|fgrep main/database|while read dummy package; do echo $package; done) - sudo apt-get purge -y $(deborphan -a|fgrep main/devel|while read dummy package; do echo $package; done) - sudo apt-get purge -y $(deborphan -a|fgrep main/httpd|while read dummy package; do echo $package; done) - sudo apt-get purge -y $(deborphan -a|fgrep main/php|while read dummy package; do echo $package; done) - sudo apt-get purge -y $(deborphan -a|fgrep main/vcs|while read dummy package; do echo $package; done) + sudo apt-get purge -y $(deborphan -a | fgrep -e main/cli-mono -e main/database -e main/devel -e main/httpd -e main/php -e main/vcs | while read _ pkg; do echo $package; done) deborphan | sort sudo du -mscx /* 2>/dev/null || true sudo du -mscx /opt/* 2>/dev/null || true sudo du -mscx /usr/* 2>/dev/null || true df -h / - fi - name: Remove Homebrew, Android and .NET run: | # https://github.com/actions/virtual-environments/issues/2606#issuecomment-772683150 @@ -104,20 +63,51 @@ jobs: sudo rm -rf /home/linuxbrew # will release Homebrew sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET + sudo rm -rf /opt/ghc + sudo rm -rf /usr/local/share/boost + sudo rm -rf "$AGENT_TOOLSDIRECTORY" sudo du -mscx /* 2>/dev/null || true df -h / - name: Make Docker image run: make docker-${{ github.event.inputs.docker-image }} - - name: Show Docker images - run: docker images - - name: Login to Docker Hub and push new image(s) to Docker Hub + 
- name: Generate ocrd-all-tool.json + run: | + mkdir -p data + echo "cd /build && make ocrd-all-tool.json && cp ocrd-all-tool.json /data" | docker run -v $PWD/data:/data --rm docker-${{ github.event.inputs.docker-image }} + wc -l data/ocrd-all-tool.json + - name: Upload ocrd-all-tool.json + uses: actions/upload-artifact@v3 + with: + name: ${{ github.event.inputs.docker-image }}_ocrd-all-tool.json + path: ./data/ocrd-all-tool.json + # if-no-files-found: error + - name: Login to Docker Hub + if: github.event.inputs.upload-dockerhub == true + run: echo ${{ secrets.DOCKERHUB_PASSWORD }} | docker login --username ${{ secrets.DOCKERHUB_USERNAME }} --password-stdin + - name: Push to Docker Hub + if: github.event.inputs.upload-dockerhub == true run: | - if ${{ github.event.inputs.upload-docker-image }}; then - echo ${{ secrets.DOCKERHUB_PASSWORD }} | docker login --username ${{ secrets.DOCKERHUB_USERNAME }} --password-stdin - docker push ocrd/all:${{ github.event.inputs.docker-image }} - if test ${{ github.event.inputs.docker-image }} = maximum-git; then - # Alias Docker image. - docker tag ocrd/all:maximum-git ocrd/all:latest - docker push ocrd/all:latest - fi + docker push ocrd/all:${{ github.event.inputs.docker-image }} + if test ${{ github.event.inputs.docker-image }} = maximum-git; then + # Alias Docker image. 
+ docker tag ocrd/all:maximum-git ocrd/all:latest + docker push ocrd/all:latest fi + - name: Login to GitHub Container Registry + if: github.event.inputs.upload-github == true + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Push to Github Container Registry + if: github.event.inputs.upload-github == true + run: | + docker tag ocrd/all:${{ github.event.inputs.docker-image }} ghcr.io/ocr-d/all:${{ github.event.inputs.docker-image }} + docker push ghcr.io/ocr-d/all:${{ github.event.inputs.docker-image }} + if test ${{ github.event.inputs.docker-image }} = maximum-git; then + # Alias Docker image. + docker tag ocrd/all:maximum-git ghcr.io/ocr-d/all:latest + docker push ghcr.io/ocr-d/all:latest + fi + From 2835c6cb6ff7c483bbecd1ac5e48638ac56266ea Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Tue, 28 Mar 2023 23:11:34 +0200 Subject: [PATCH 03/63] docker rmi: fix argument --- .github/workflows/makedocker.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/makedocker.yml b/.github/workflows/makedocker.yml index a549cdff..f8b9d4dc 100644 --- a/.github/workflows/makedocker.yml +++ b/.github/workflows/makedocker.yml @@ -41,10 +41,10 @@ jobs: run: | df -h docker images - docker rmi $(docker images --filter=reference="alpine:*") - docker rmi $(docker images --filter=reference="buildpack-deps:*") - docker rmi $(docker images --filter=reference="debian:*") - docker rmi $(docker images --filter=reference="node:*") + docker rmi $(docker images --filter=reference="alpine:*" -q) + docker rmi $(docker images --filter=reference="buildpack-deps:*" -q) + docker rmi $(docker images --filter=reference="debian:*" -q) + docker rmi $(docker images --filter=reference="node:*" -q) docker rmi ubuntu:16.04 ubuntu:18.04 df -h / - name: Remove unneeded Debian packages From 39955c9e5ab4833e609ed2d237ba263acec4224d 
Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Tue, 28 Mar 2023 23:15:59 +0200 Subject: [PATCH 04/63] docker rmi: avoid assuming which Ubuntu is installed --- .github/workflows/makedocker.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/makedocker.yml b/.github/workflows/makedocker.yml index f8b9d4dc..2fc648ae 100644 --- a/.github/workflows/makedocker.yml +++ b/.github/workflows/makedocker.yml @@ -45,7 +45,6 @@ jobs: docker rmi $(docker images --filter=reference="buildpack-deps:*" -q) docker rmi $(docker images --filter=reference="debian:*" -q) docker rmi $(docker images --filter=reference="node:*" -q) - docker rmi ubuntu:16.04 ubuntu:18.04 df -h / - name: Remove unneeded Debian packages run: | From 3e4f209acd47710fd97d0138a029fbb4b8292dd1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Tue, 28 Mar 2023 23:31:33 +0200 Subject: [PATCH 05/63] reinstate -git variants --- .github/workflows/makedocker.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/makedocker.yml b/.github/workflows/makedocker.yml index 2fc648ae..cd6d9507 100644 --- a/.github/workflows/makedocker.yml +++ b/.github/workflows/makedocker.yml @@ -10,7 +10,7 @@ on: docker-image: description: 'Docker image' required: true - default: 'docker-minimum' + default: 'minimum' type: choice options: - 'minimum' @@ -19,6 +19,12 @@ on: - 'medium-cuda' - 'maximum' - 'maximum-cuda' + - 'minimum-git' + - 'minimum-cuda-git' + - 'medium-git' + - 'medium-cuda-git' + - 'maximum-git' + - 'maximum-cuda-git' upload-dockerhub: description: 'Upload Docker image to Dockerhub' default: False From c50dfa310b9e5b1ece242982e96edb6400292ae7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Tue, 28 Mar 2023 23:32:09 +0200 Subject: [PATCH 06/63] generate ocrd-all-tool.json: fix image name --- .github/workflows/makedocker.yml 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/makedocker.yml b/.github/workflows/makedocker.yml index cd6d9507..4c2e95e2 100644 --- a/.github/workflows/makedocker.yml +++ b/.github/workflows/makedocker.yml @@ -78,7 +78,7 @@ jobs: - name: Generate ocrd-all-tool.json run: | mkdir -p data - echo "cd /build && make ocrd-all-tool.json && cp ocrd-all-tool.json /data" | docker run -v $PWD/data:/data --rm docker-${{ github.event.inputs.docker-image }} + echo "cd /build && make ocrd-all-tool.json && cp ocrd-all-tool.json /data" | docker run -v $PWD/data:/data --rm ocrd/all:${{ github.event.inputs.docker-image }} wc -l data/ocrd-all-tool.json - name: Upload ocrd-all-tool.json uses: actions/upload-artifact@v3 From a0dbe734242576ed6810614b675d0f9de332bd28 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Tue, 28 Mar 2023 23:45:35 +0200 Subject: [PATCH 07/63] fix docker run (needs -i) --- .github/workflows/makedocker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/makedocker.yml b/.github/workflows/makedocker.yml index 4c2e95e2..1aeda100 100644 --- a/.github/workflows/makedocker.yml +++ b/.github/workflows/makedocker.yml @@ -78,7 +78,7 @@ jobs: - name: Generate ocrd-all-tool.json run: | mkdir -p data - echo "cd /build && make ocrd-all-tool.json && cp ocrd-all-tool.json /data" | docker run -v $PWD/data:/data --rm ocrd/all:${{ github.event.inputs.docker-image }} + echo "cd /build && make ocrd-all-tool.json && cp ocrd-all-tool.json /data" | docker run -i -v $PWD/data:/data --rm ocrd/all:${{ github.event.inputs.docker-image }} wc -l data/ocrd-all-tool.json - name: Upload ocrd-all-tool.json uses: actions/upload-artifact@v3 From 672eac3605f15f5efb835acc8c593b01bd610737 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 29 Mar 2023 00:00:07 +0200 Subject: [PATCH 08/63] fix make ocrd-all-tool.json --- 
.github/workflows/makedocker.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/makedocker.yml b/.github/workflows/makedocker.yml index 1aeda100..c2c60681 100644 --- a/.github/workflows/makedocker.yml +++ b/.github/workflows/makedocker.yml @@ -76,15 +76,16 @@ jobs: - name: Make Docker image run: make docker-${{ github.event.inputs.docker-image }} - name: Generate ocrd-all-tool.json + # the Docker build will set OCRD_MODULES and VIRTUAL_ENV inside the image, which we can re-use + # regardless of whether we have /build, we can just use the Makefile from outside again run: | - mkdir -p data - echo "cd /build && make ocrd-all-tool.json && cp ocrd-all-tool.json /data" | docker run -i -v $PWD/data:/data --rm ocrd/all:${{ github.event.inputs.docker-image }} - wc -l data/ocrd-all-tool.json + echo "cd /data && make ocrd-all-tool.json" | docker run -i -v $PWD:/data --rm ocrd/all:${{ github.event.inputs.docker-image }} + wc -l ocrd-all-tool.json - name: Upload ocrd-all-tool.json uses: actions/upload-artifact@v3 with: name: ${{ github.event.inputs.docker-image }}_ocrd-all-tool.json - path: ./data/ocrd-all-tool.json + path: ./ocrd-all-tool.json # if-no-files-found: error - name: Login to Docker Hub if: github.event.inputs.upload-dockerhub == true From bc364d749b4dcd2d11905bcc282f32e8643052d0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 29 Mar 2023 00:11:58 +0200 Subject: [PATCH 09/63] make ocrd-all-tool.json: avoid git actions --- .github/workflows/makedocker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/makedocker.yml b/.github/workflows/makedocker.yml index c2c60681..4e8cebf8 100644 --- a/.github/workflows/makedocker.yml +++ b/.github/workflows/makedocker.yml @@ -79,7 +79,7 @@ jobs: # the Docker build will set OCRD_MODULES and VIRTUAL_ENV inside the image, which we can re-use # regardless of whether we have /build, we can just use 
the Makefile from outside again run: | - echo "cd /data && make ocrd-all-tool.json" | docker run -i -v $PWD:/data --rm ocrd/all:${{ github.event.inputs.docker-image }} + echo "cd /data && make ocrd-all-tool.json NO_UPDATE=1" | docker run -i -v $PWD:/data --rm ocrd/all:${{ github.event.inputs.docker-image }} wc -l ocrd-all-tool.json - name: Upload ocrd-all-tool.json uses: actions/upload-artifact@v3 From 8cd239c53a6e888296d8c435a07ffc1d3381b283 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 29 Mar 2023 00:39:29 +0200 Subject: [PATCH 10/63] add SSH session for debugging --- .github/workflows/makedocker.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/makedocker.yml b/.github/workflows/makedocker.yml index 4e8cebf8..df2789af 100644 --- a/.github/workflows/makedocker.yml +++ b/.github/workflows/makedocker.yml @@ -79,8 +79,10 @@ jobs: # the Docker build will set OCRD_MODULES and VIRTUAL_ENV inside the image, which we can re-use # regardless of whether we have /build, we can just use the Makefile from outside again run: | - echo "cd /data && make ocrd-all-tool.json NO_UPDATE=1" | docker run -i -v $PWD:/data --rm ocrd/all:${{ github.event.inputs.docker-image }} + echo "cd /data && make show ocrd-all-tool.json NO_UPDATE=1" | docker run -i -v $PWD:/data --rm ocrd/all:${{ github.event.inputs.docker-image }} wc -l ocrd-all-tool.json + - name: Setup upterm session + uses: lhotari/action-upterm@v1 - name: Upload ocrd-all-tool.json uses: actions/upload-artifact@v3 with: From a79979746b8c6682ca830ac985efa826a6f82910 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 29 Mar 2023 01:24:56 +0200 Subject: [PATCH 11/63] make ocrd-all-tool.json: try outside of Docker --- .github/workflows/makedocker.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/makedocker.yml 
b/.github/workflows/makedocker.yml index df2789af..8c5986b0 100644 --- a/.github/workflows/makedocker.yml +++ b/.github/workflows/makedocker.yml @@ -76,12 +76,14 @@ jobs: - name: Make Docker image run: make docker-${{ github.event.inputs.docker-image }} - name: Generate ocrd-all-tool.json - # the Docker build will set OCRD_MODULES and VIRTUAL_ENV inside the image, which we can re-use + # the Docker build will set OCRD_MODULES inside the image, which we can re-use # regardless of whether we have /build, we can just use the Makefile from outside again run: | - echo "cd /data && make show ocrd-all-tool.json NO_UPDATE=1" | docker run -i -v $PWD:/data --rm ocrd/all:${{ github.event.inputs.docker-image }} + export OCRD_MODULES=$(docker run --rm ocrd/all:${{ github.event.inputs.docker-image }} bash -c 'echo $OCRD_MODULES') + make ocrd-all-tool.json wc -l ocrd-all-tool.json - name: Setup upterm session + if: false # interactive SSH logins for debugging uses: lhotari/action-upterm@v1 - name: Upload ocrd-all-tool.json uses: actions/upload-artifact@v3 From 32e153812db9f77cd65a18ade5b7cc5db8479eb1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 29 Mar 2023 01:39:07 +0200 Subject: [PATCH 12/63] make ocrd-all-tool.json: add venv dependency --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f43eebc4..b5ec5ee5 100644 --- a/Makefile +++ b/Makefile @@ -701,7 +701,7 @@ for path in sys.argv[1:]: print(json.dumps(all, indent=2)) endef tool-jsons-file != mktemp -u -ocrd-all-tool.json: modules +ocrd-all-tool.json: modules $(ACTIVATE_VENV) $(file >$(tool-jsons-file),$(tool-jsons-code)) . 
$(ACTIVATE_VENV) && $(PYTHON) $(tool-jsons-file) $(wildcard $(OCRD_MODULES:%=%/ocrd-tool.json)) > $@ $(RM) $(tool-jsons-file) From e746a884b978d567ba54826cb8912b2c7271052b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 29 Mar 2023 08:15:09 +0200 Subject: [PATCH 13/63] TF1: exclude nvidia-tensorflow==1.15.5+nv23.3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b5ec5ee5..592a748e 100644 --- a/Makefile +++ b/Makefile @@ -600,7 +600,7 @@ define pip_install_tf1nvidia = . $(ACTIVATE_VENV) && if test $(PYTHON_VERSION) = 3.8 && ! pip show -q tensorflow-gpu; then \ $(SEMPIP) pip install nvidia-pyindex && \ pushd $$(mktemp -d) && \ - $(SEMPIP) pip download --no-deps nvidia-tensorflow && \ + $(SEMPIP) pip download --no-deps "nvidia-tensorflow!=1.15.5+nv23.3" && \ for name in nvidia_tensorflow-*.whl; do name=$${name%.whl}; done && \ $(PYTHON) -m wheel unpack $$name.whl && \ for name in nvidia_tensorflow-*/; do name=$${name%/}; done && \ From 72b37387c24e776cdba391099cf4cad5dce33f52 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 29 Mar 2023 11:33:39 +0200 Subject: [PATCH 14/63] downgrade protobuf --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 592a748e..8f856aaa 100644 --- a/Makefile +++ b/Makefile @@ -490,6 +490,8 @@ install-models-calamari: $(BIN)/ocrd OCRD_EXECUTABLES += $(OCRD_CALAMARI) OCRD_CALAMARI := $(BIN)/ocrd-calamari-recognize $(OCRD_CALAMARI): ocrd_calamari $(BIN)/ocrd + @# workaround for Calamari#337: + $(PIP) install "protobuf<4" $(pip_install) endif From 353fa446c4cc1cb4cbc27125665cac61f6a47aa7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 29 Mar 2023 13:45:07 +0200 Subject: [PATCH 15/63] hold Numpy for ocrd_cis --- Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/Makefile b/Makefile index 8f856aaa..694e3c8a 100644 --- a/Makefile +++ b/Makefile @@ -471,6 +471,8 @@ OCRD_CIS += $(BIN)/ocrd-cis-ocropy-segment #OCRD_CIS += $(BIN)/ocrd-cis-ocropy-train OCRD_CIS += $(BIN)/ocrd-cis-postcorrect $(call multirule,$(OCRD_CIS)): ocrd_cis $(BIN)/ocrd + @# workaround against breaking changes in Numpy + . $(ACTIVATE_VENV) && $(SEMPIP) pip install "numpy<1.24" $(pip_install) endif @@ -491,7 +493,7 @@ OCRD_EXECUTABLES += $(OCRD_CALAMARI) OCRD_CALAMARI := $(BIN)/ocrd-calamari-recognize $(OCRD_CALAMARI): ocrd_calamari $(BIN)/ocrd @# workaround for Calamari#337: - $(PIP) install "protobuf<4" + . $(ACTIVATE_VENV) && $(SEMPIP) pip install "protobuf<4" $(pip_install) endif From 5b1cb139a54f79e501c89edc4fa866af34ab4192 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 16 Apr 2023 00:21:56 +0200 Subject: [PATCH 16/63] update core --- core | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core b/core index de084535..c1178f90 160000 --- a/core +++ b/core @@ -1 +1 @@ -Subproject commit de084535e18a2f2fa9293085cda4431abfe2e191 +Subproject commit c1178f9076cbbcc8b45ce487bc39eb8739b82ed7 From 0e21c7f2a94ccc1f37f217f1df2158d419ab0e0b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 21 Apr 2023 01:41:02 +0200 Subject: [PATCH 17/63] also hold OpenCV for ocrd_cis --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 694e3c8a..14e3a141 100644 --- a/Makefile +++ b/Makefile @@ -471,8 +471,8 @@ OCRD_CIS += $(BIN)/ocrd-cis-ocropy-segment #OCRD_CIS += $(BIN)/ocrd-cis-ocropy-train OCRD_CIS += $(BIN)/ocrd-cis-postcorrect $(call multirule,$(OCRD_CIS)): ocrd_cis $(BIN)/ocrd - @# workaround against breaking changes in Numpy - . $(ACTIVATE_VENV) && $(SEMPIP) pip install "numpy<1.24" + @# workaround against breaking changes in Numpy and OpenCV + . 
$(ACTIVATE_VENV) && $(SEMPIP) pip install "numpy<1.24" "opencv-python-headless<4.5" $(pip_install) endif From 5a678849382a7b81b8260e76d8e3bc9330289788 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 21 Apr 2023 01:42:19 +0200 Subject: [PATCH 18/63] post-update Shapely after (ocrd_)kraken --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 14e3a141..2d68f7f0 100644 --- a/Makefile +++ b/Makefile @@ -239,6 +239,8 @@ OCRD_KRAKEN += $(BIN)/ocrd-kraken-segment OCRD_KRAKEN += $(BIN)/ocrd-kraken-recognize $(call multirule,$(OCRD_KRAKEN)): ocrd_kraken $(BIN)/ocrd $(pip_install) + @# workaround for kraken requiring broken shapely==1.8.5 + . $(ACTIVATE_VENV) && $(SEMPIP) pip install "shapely<2.0" endif ifneq ($(findstring ocrd_ocropy, $(OCRD_MODULES)),) From a24bf17675b3d7050d1f7083bf9164dffb65413d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 21 Apr 2023 01:43:34 +0200 Subject: [PATCH 19/63] move ocrd_detectron2 to top venv but order before typegroups_classifier --- Makefile | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 2d68f7f0..23325b2d 100644 --- a/Makefile +++ b/Makefile @@ -277,17 +277,10 @@ endif ifneq ($(findstring ocrd_detectron2, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(OCRD_DETECTRON2) OCRD_DETECTRON2 += $(BIN)/ocrd-detectron2-segment -$(call multirule,$(OCRD_DETECTRON2)): ocrd_detectron2 $(SUB_VENV_TF1)/bin/activate -ifeq (0,$(MAKELEVEL)) - $(MAKE) -B -o $< $(notdir $(OCRD_DETECTRON2)) VIRTUAL_ENV=$(SUB_VENV_TF1) - $(call delegate_venv,$(OCRD_DETECTRON2),$(SUB_VENV_TF1)) -ocrd_detectron2-check: - $(MAKE) check OCRD_MODULES=ocrd_detectron2 VIRTUAL_ENV=$(SUB_VENV_TF1) -else +$(call multirule,$(OCRD_DETECTRON2)): ocrd_detectron2 $(BIN)/ocrd . 
$(ACTIVATE_VENV) && $(MAKE) -C $< deps $(pip_install) endif -endif ifneq ($(findstring cor-asv-fst, $(OCRD_MODULES)),) deps-ubuntu-modules: cor-asv-fst @@ -531,7 +524,7 @@ ifneq ($(findstring ocrd_typegroups_classifier, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(OCRD_TYPECLASS) OCRD_TYPECLASS := $(BIN)/ocrd-typegroups-classifier OCRD_TYPECLASS += $(BIN)/typegroups-classifier -$(call multirule,$(OCRD_TYPECLASS)): ocrd_typegroups_classifier +$(call multirule,$(OCRD_TYPECLASS)): ocrd_typegroups_classifier | $(OCRD_DETECTRON2) $(pip_install) endif From 5b1ce3eb42d976d4f0a7b79ebd2f37f9aa1f4256 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 21 Apr 2023 01:44:44 +0200 Subject: [PATCH 20/63] hold TF1 via nvidia-tensorflow at nv22.12 still compatible with CUDA 11 required for TF2 --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 23325b2d..fea6862f 100644 --- a/Makefile +++ b/Makefile @@ -594,12 +594,15 @@ endef # Workaround for missing prebuilt versions of TF<2 for Python==3.8 # todo: find another solution for 3.9, 3.10 etc +# https://docs.nvidia.com/deeplearning/frameworks/tensorflow-wheel-release-notes/tf-wheel-rel.html # Nvidia has them, but under a different name, so let's rewrite that: +# (hold at nv22.12, because newer releases require CUDA 12, which is not supported by TF2, +# and therefore not in our ocrd/core-cuda base image yet) define pip_install_tf1nvidia = . $(ACTIVATE_VENV) && if test $(PYTHON_VERSION) = 3.8 && ! 
pip show -q tensorflow-gpu; then \ $(SEMPIP) pip install nvidia-pyindex && \ pushd $$(mktemp -d) && \ - $(SEMPIP) pip download --no-deps "nvidia-tensorflow!=1.15.5+nv23.3" && \ + $(SEMPIP) pip download --no-deps "nvidia-tensorflow==1.15.5+nv22.12" && \ for name in nvidia_tensorflow-*.whl; do name=$${name%.whl}; done && \ $(PYTHON) -m wheel unpack $$name.whl && \ for name in nvidia_tensorflow-*/; do name=$${name%/}; done && \ From 18a027b9399ec4be53db1c009404dc51c34f87b5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 21 Apr 2023 01:46:43 +0200 Subject: [PATCH 21/63] docker*: prefer plain progress meter on buildkit --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index fea6862f..593f0034 100644 --- a/Makefile +++ b/Makefile @@ -869,6 +869,7 @@ docker%-cuda docker%-cuda-git: DOCKER_BASE_IMAGE = docker.io/ocrd/core-cuda # Build rule for all selections docker%: Dockerfile $(DOCKER_MODULES) docker build \ + --progress=plain \ --build-arg BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ From 995eefb5c12751f67b03db48371a9e39165b11e3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 21 Apr 2023 01:47:35 +0200 Subject: [PATCH 22/63] venv/pip updates: rehash to ensure pip is dereferenced correctly --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 593f0034..4dc2cf8b 100644 --- a/Makefile +++ b/Makefile @@ -175,10 +175,12 @@ deinit: $(BIN)/pip: $(ACTIVATE_VENV) . $(ACTIVATE_VENV) && $(SEMPIP) pip install --upgrade pip setuptools + hash -r %/bin/activate: $(PYTHON) -m venv $(subst /bin/activate,,$@) . 
$@ && pip install --upgrade pip setuptools wheel + hash -r .PHONY: wheel wheel: $(BIN)/wheel From 15e8d77e4f190fcf1af7b76a6da4930e8d55d833 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 21 Apr 2023 01:48:22 +0200 Subject: [PATCH 23/63] tidy: new variant of clean with extra recursive git clean and git gc --- Makefile | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 4dc2cf8b..45e72bec 100644 --- a/Makefile +++ b/Makefile @@ -124,6 +124,7 @@ Targets: clean: remove the virtual environment directory, and make clean-* clean-tesseract: remove the build directory for tesseract clean-olena: remove the build directory for ocrd_olena + tidy: clean, then deinit opencv-python and git-clean all submodules deinit: clean, then deinit and rmdir all submodules docker: (re)build a docker image including all executables dockers: (re)build docker images for some pre-selected subsets of modules @@ -165,12 +166,20 @@ $(OCRD_MODULES): always-update endif endif -deinit: clean .PHONY: deinit -deinit: +deinit: clean git submodule deinit --all # --force git submodule status | while read stat dir ver; do rmdir $$dir; done +.PHONY: tidy +tidy: clean + git submodule status opencv-python | grep -q ^- || git submodule deinit opencv-python + git submodule foreach --recursive git clean -fxd +# if you already have a clone with too many refs, consider the following recipe: +#git submodule foreach 'for ref in $(git for-each-ref --no-contains=HEAD --format="%(refname)" refs/remotes/ | sed s,^refs/remotes/,,); do git branch -d -r $ref; done' + git gc + + # Get Python modules. 
$(BIN)/pip: $(ACTIVATE_VENV) From c06998dfdb3c3288337d763ab5f86da739dfaa38 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 21 Apr 2023 01:49:57 +0200 Subject: [PATCH 24/63] CI: build on newer base image to get 'git clone --single-branch' for 'make docker' to save space --- .circleci/config.yml | 9 +++++---- .github/workflows/makedocker.yml | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 1eba0d70..2669a7bc 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,16 +2,17 @@ version: 2.1 jobs: build: docker: - - image: cimg/base:stable-18.04 + - image: cimg/base:stable-22.04 steps: - checkout - setup_remote_docker # https://circleci.com/docs/2.0/building-docker-images/ - run: name: build image - command: make docker-maximum-cuda + command: make docker-maximum-cuda GIT_DEPTH=--single-branch no_output_timeout: 30m - when: # takes too long for 1h1m CircleCI timeout overall + # also, storage is limited... 
condition: false steps: - run: @@ -26,7 +27,7 @@ jobs: destination: artifacts deploy: docker: - - image: cimg/base:stable-18.04 + - image: cimg/base:stable-22.04 environment: GIT_DEPTH: "--depth 1" parameters: @@ -38,7 +39,7 @@ jobs: - setup_remote_docker # https://circleci.com/docs/2.0/building-docker-images/ - run: name: Build Docker image - command: make docker-<< parameters.variant >>-git + command: make docker-<< parameters.variant >>-git GIT_DEPTH=--single-branch # fails due to pip races: DOCKER_PARALLEL=-j3 no_output_timeout: 30m - run: diff --git a/.github/workflows/makedocker.yml b/.github/workflows/makedocker.yml index 8c5986b0..a478a4a6 100644 --- a/.github/workflows/makedocker.yml +++ b/.github/workflows/makedocker.yml @@ -74,7 +74,7 @@ jobs: sudo du -mscx /* 2>/dev/null || true df -h / - name: Make Docker image - run: make docker-${{ github.event.inputs.docker-image }} + run: make docker-${{ github.event.inputs.docker-image }} GIT_DEPTH=--single-branch - name: Generate ocrd-all-tool.json # the Docker build will set OCRD_MODULES inside the image, which we can re-use # regardless of whether we have /build, we can just use the Makefile from outside again From 1f6ac8f36af0da60f1058713ee18bd6d21c7dfc8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 21 Apr 2023 02:26:15 +0200 Subject: [PATCH 25/63] docker*: revert 7a5ff45 to have all intermediate deps in 1 layer again --- Dockerfile | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index e2e677e3..c36a8e53 100644 --- a/Dockerfile +++ b/Dockerfile @@ -89,18 +89,20 @@ RUN git submodule deinit opencv-python && git submodule foreach --recursive git # make apt system functional RUN apt-get -y update && apt-get install -y apt-utils -# get packages for build, try to fetch all modules system requirements, -# remove unneeded automatic deps and clear pkg cache -RUN apt-get -y install automake autoconf libtool pkg-config g++ && make deps-ubuntu && apt-get -y 
autoremove && apt-get clean - # start a shell script (so we can comment individual steps here) RUN echo "set -ex" > docker.sh +# get packages for build +RUN echo "apt-get -y install automake autoconf libtool pkg-config g++" >> docker.sh +# try to fetch all modules system requirements +RUN echo "make deps-ubuntu" >> docker.sh RUN echo "source $VIRTUAL_ENV/bin/activate" >> docker.sh RUN echo "pip install -U pip setuptools wheel" >> docker.sh # build/install all tools of the requested modules: RUN echo "make $PARALLEL all" >> docker.sh # check installation RUN echo "make -j check CHECK_HELP=1" >> docker.sh +# remove unneeded automatic deps and clear pkg cache +RUN echo "apt-get remove automake autoconf libtool pkg-config g++ && apt-get clean" >> docker.sh # remove source directories from image, unless using editable mode # (in the latter case, the git repos are also the installation targets # and must be kept; so merely clean-up some temporary files) From 2aa21a2d424cb01a1181fc28cf652df98528e9f2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 21 Apr 2023 02:27:07 +0200 Subject: [PATCH 26/63] docker*: rm /.cache regardless of editable/-git or not --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index c36a8e53..489df5cc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -106,7 +106,7 @@ RUN echo "apt-get remove automake autoconf libtool pkg-config g++ && apt-get cle # remove source directories from image, unless using editable mode # (in the latter case, the git repos are also the installation targets # and must be kept; so merely clean-up some temporary files) -RUN echo "if [[ '${PIP_OPTIONS}' =~ -e|--editable ]]; then make -i clean-olena clean-tesseract; else rm -fr /.cache /build; fi" >> docker.sh +RUN echo "if [[ '${PIP_OPTIONS}' =~ -e|--editable ]]; then make -i clean-olena clean-tesseract; else rm -fr /build; fi; rm -fr /.cache" >> docker.sh # run the script in one layer/step (to minimise image size) # 
(and export all variables) RUN set -a; bash docker.sh From 2d7b4c70c38258e41cf61959099f101cb048e05b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 21 Apr 2023 02:29:14 +0200 Subject: [PATCH 27/63] docker*: no more git update in Docker layers (only on build context via deps) --- Dockerfile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 489df5cc..e0307c29 100644 --- a/Dockerfile +++ b/Dockerfile @@ -83,9 +83,6 @@ RUN rm $VIRTUAL_ENV/bin/pip* && apt-get purge -y python3-pip && python3 -m venv # so we must rely on .dockerignore here) COPY . . -# deinit opencv-python (otherwise git submodule status can segfault) and remove copied junk -RUN git submodule deinit opencv-python && git submodule foreach --recursive git clean -fxd - # make apt system functional RUN apt-get -y update && apt-get install -y apt-utils @@ -93,6 +90,8 @@ RUN apt-get -y update && apt-get install -y apt-utils RUN echo "set -ex" > docker.sh # get packages for build RUN echo "apt-get -y install automake autoconf libtool pkg-config g++" >> docker.sh +# ensure no additional git actions happen after copying the checked out modules +RUN echo "export NO_UPDATE=1" >> docker.sh # try to fetch all modules system requirements RUN echo "make deps-ubuntu" >> docker.sh RUN echo "source $VIRTUAL_ENV/bin/activate" >> docker.sh From 3f0e9e3ed4be104562c6b19a159d0079c8af9dcc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 21 Apr 2023 10:11:37 +0200 Subject: [PATCH 28/63] docker*: move make check to separate step --- Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index e0307c29..648a0d6d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,6 +18,7 @@ # use OCR-D base container (from ubuntu:18.04) ARG BASE_IMAGE=ocrd/core FROM $BASE_IMAGE +ARG BASE_IMAGE ARG VCS_REF ARG BUILD_DATE LABEL \ @@ -98,8 +99,6 @@ RUN echo "source $VIRTUAL_ENV/bin/activate" >> docker.sh RUN echo "pip install -U pip 
setuptools wheel" >> docker.sh # build/install all tools of the requested modules: RUN echo "make $PARALLEL all" >> docker.sh -# check installation -RUN echo "make -j check CHECK_HELP=1" >> docker.sh # remove unneeded automatic deps and clear pkg cache RUN echo "apt-get remove automake autoconf libtool pkg-config g++ && apt-get clean" >> docker.sh # remove source directories from image, unless using editable mode @@ -111,6 +110,8 @@ RUN echo "if [[ '${PIP_OPTIONS}' =~ -e|--editable ]]; then make -i clean-olena c RUN set -a; bash docker.sh # update ld.so cache for new libs in /usr/local RUN ldconfig +# check installation +RUN make -j check CHECK_HELP=1 # remove (dated) security workaround preventing use of # ImageMagick's convert on PDF/PS/EPS/XPS: From f7abce112cb63252a0e43bc6f97381628d19dc93 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 21 Apr 2023 11:09:39 +0200 Subject: [PATCH 29/63] add make testcuda for diagnostics --- Makefile | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 45e72bec..209be7cb 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,7 @@ GIT = sudo -u $(SUDO_USER) git endif endif GIT_RECURSIVE = # --recursive -GIT_DEPTH = # --depth 1 +GIT_DEPTH = # --depth 1 or --single-branch # Required and optional Tesseract models. 
ALL_TESSERACT_MODELS = eng equ osd $(TESSERACT_MODELS) @@ -114,6 +114,7 @@ Targets: help: show this message show: list the venv path and all executables (to be) installed check: verify that all executables are runnable and the venv is consistent + testcuda: verify that CUDA is available for Tensorflow and Pytorch modules: download all submodules to the managed revision ocrd-all-tool.json: generate union of ocrd-tool.json for all executables of all modules all: install all executables of all modules @@ -154,6 +155,8 @@ help: ; @eval "$$HELP" # - then updates the time stamp of the module directory # so the directory can be used as a dependency # - synchronize via mutex to avoid race for git lock file +# - for minimal image sizes in Docker builds, avoid cloning all branches +# by using GIT_DEPTH="--depth 1" or GIT_DEPTH=--single-branch modules: $(OCRD_MODULES) # but bypass updates if we have no repo here (e.g. Docker build) ifneq (,$(wildcard .git)) @@ -705,6 +708,16 @@ check: $(OCRD_EXECUTABLES:%=%-check) $(OCRD_MODULES:%=%-check) . $(ACTIVATE_VENV) && pip check %-check: ; +# ensure shapely#1598 workaround works +# ensure CUDA works for Torch and TF +testcuda: + . $(ACTIVATE_VENV) && $(PYTHON) -c "from shapely.geometry import Polygon; import torch; torch.randn(10).cuda()" + . $(ACTIVATE_VENV) && $(PYTHON) -c "import torch, sys; sys.exit(0 if torch.cuda.is_available() else 1)" + . $(ACTIVATE_VENV) && $(PYTHON) -c "import tensorflow as tf, sys; sys.exit(0 if tf.test.is_gpu_available() else 1)" + . 
$(SUB_VENV_TF1)/bin/activate && $(PYTHON) -c "import tensorflow as tf, sys; sys.exit(0 if tf.test.is_gpu_available() else 1)" + @echo everything seems to be fine + + define tool-jsons-code = import json import sys From bad270fc244285536e4d00505e72011f338cb61d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 21 Apr 2023 11:14:07 +0200 Subject: [PATCH 30/63] kraken workaround not needed anymore --- Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Makefile b/Makefile index 209be7cb..2452989a 100644 --- a/Makefile +++ b/Makefile @@ -253,8 +253,6 @@ OCRD_KRAKEN += $(BIN)/ocrd-kraken-segment OCRD_KRAKEN += $(BIN)/ocrd-kraken-recognize $(call multirule,$(OCRD_KRAKEN)): ocrd_kraken $(BIN)/ocrd $(pip_install) - @# workaround for kraken requiring broken shapely==1.8.5 - . $(ACTIVATE_VENV) && $(SEMPIP) pip install "shapely<2.0" endif ifneq ($(findstring ocrd_ocropy, $(OCRD_MODULES)),) From 46c8722a4f3f498f9322dba70ef91d466a4f7b49 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 21 Apr 2023 11:14:09 +0200 Subject: [PATCH 31/63] update modules --- core | 2 +- eynollah | 2 +- ocrd_pagetopdf | 2 +- sbb_binarization | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/core b/core index c1178f90..357e7298 160000 --- a/core +++ b/core @@ -1 +1 @@ -Subproject commit c1178f9076cbbcc8b45ce487bc39eb8739b82ed7 +Subproject commit 357e72984e80d84b6bcfd0bfa85c9e78f9b08c7e diff --git a/eynollah b/eynollah index ea792d1e..52d2e0b0 160000 --- a/eynollah +++ b/eynollah @@ -1 +1 @@ -Subproject commit ea792d1e4ac4a722770b82dc91e71f84d5beb212 +Subproject commit 52d2e0b098f8defe3056a9d50c6cafd578480768 diff --git a/ocrd_pagetopdf b/ocrd_pagetopdf index 6155605b..4f4a330c 160000 --- a/ocrd_pagetopdf +++ b/ocrd_pagetopdf @@ -1 +1 @@ -Subproject commit 6155605b44488da95dfe0280df71202f8f09897f +Subproject commit 4f4a330c97208635e7b304cfce4db9e937fefd2b diff --git a/sbb_binarization b/sbb_binarization index 39ef3fd7..010ec99d 160000 --- a/sbb_binarization 
+++ b/sbb_binarization @@ -1 +1 @@ -Subproject commit 39ef3fd7bbda76fc1d8531e9c40fac3b6650dbae +Subproject commit 010ec99d2a666c363efb7e50c1eb2423857ff092 From 7852fcb932e93e89bc8d24ca45d4c6379371a962 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 24 Apr 2023 14:02:35 +0200 Subject: [PATCH 32/63] update core to v2.50 --- core | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core b/core index 357e7298..c0c153e9 160000 --- a/core +++ b/core @@ -1 +1 @@ -Subproject commit 357e72984e80d84b6bcfd0bfa85c9e78f9b08c7e +Subproject commit c0c153e97ddc6623219421f05372b44f243adf96 From 446b32ed5a4d08381ce670c83d51cc12224d34f8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 26 Apr 2023 18:35:18 +0200 Subject: [PATCH 33/63] opencv-python: adapt to pip wheel builds --- Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index b4817a24..ad3b776f 100644 --- a/Makefile +++ b/Makefile @@ -219,9 +219,9 @@ CUSTOM_DEPS += cmake gcc g++ # libpng-dev libjpeg-dev libopenexr-dev libtiff-dev libwebp-dev libjasper-dev opencv-python: GIT_RECURSIVE = --recursive opencv-python/setup.py: opencv-python -$(SHARE)/opencv-python: opencv-python/setup.py | $(ACTIVATE_VENV) $(SHARE) $(SHARE)/numpy - . $(ACTIVATE_VENV) && cd opencv-python && ENABLE_HEADLESS=1 $(PYTHON) setup.py bdist_wheel - . $(ACTIVATE_VENV) && $(SEMPIP) pip install $( Date: Wed, 26 Apr 2023 18:55:03 +0200 Subject: [PATCH 34/63] remove explicit dependency for numpy --- Makefile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Makefile b/Makefile index ad3b776f..1f2ecf9d 100644 --- a/Makefile +++ b/Makefile @@ -184,11 +184,6 @@ wheel: $(BIN)/wheel $(BIN)/wheel: | $(ACTIVATE_VENV) . $(ACTIVATE_VENV) && $(SEMPIP) pip install --force-reinstall $(PIP_OPTIONS_E) wheel -# avoid making this .PHONY so it does not have to be repeated -$(SHARE)/numpy: | $(ACTIVATE_VENV) $(SHARE) - . 
$(ACTIVATE_VENV) && $(SEMPIP) pip install $(PIP_OPTIONS_E) numpy - @touch $@ - # Install modules from source. .PHONY: ocrd From db16444a2f31710ced032d560cfbf70a39ea2f6e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 26 Apr 2023 18:56:09 +0200 Subject: [PATCH 35/63] replace findstring with filter (no substring matches) --- Makefile | 64 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/Makefile b/Makefile index 1f2ecf9d..a8ac77de 100644 --- a/Makefile +++ b/Makefile @@ -198,7 +198,7 @@ $(BIN)/ocrd: core multirule = $(patsubst $(BIN)/%,\%/%,$(1)) -ifneq ($(findstring format-converters, $(OCRD_MODULES)),) +ifneq ($(filter format-converters, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(PAGE2IMG) PAGE2IMG := $(BIN)/page2img format-converters/page2img.py: format-converters @@ -208,7 +208,7 @@ $(PAGE2IMG): format-converters/page2img.py chmod +x $@ endif -ifneq ($(findstring opencv-python, $(OCRD_MODULES)),) +ifneq ($(filter opencv-python, $(OCRD_MODULES)),) CUSTOM_DEPS += cmake gcc g++ # libavcodec-dev libavformat-dev libswscale-dev libgstreamer-plugins-base1.0-dev libgstreamer1.0-dev # libpng-dev libjpeg-dev libopenexr-dev libtiff-dev libwebp-dev libjasper-dev @@ -221,7 +221,7 @@ $(SHARE)/opencv-python: opencv-python/setup.py | $(ACTIVATE_VENV) $(SHARE) $(BIN)/ocrd: $(SHARE)/opencv-python endif -ifneq ($(findstring ocrd_kraken, $(OCRD_MODULES)),) +ifneq ($(filter ocrd_kraken, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(OCRD_KRAKEN) install-models: install-models-kraken .PHONY: install-models-kraken @@ -235,14 +235,14 @@ $(call multirule,$(OCRD_KRAKEN)): ocrd_kraken $(BIN)/ocrd $(pip_install) endif -ifneq ($(findstring ocrd_ocropy, $(OCRD_MODULES)),) +ifneq ($(filter ocrd_ocropy, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(OCRD_OCROPY) OCRD_OCROPY := $(BIN)/ocrd-ocropy-segment $(OCRD_OCROPY): ocrd_ocropy $(BIN)/ocrd $(pip_install) endif -ifneq ($(findstring cor-asv-ann, $(OCRD_MODULES)),) +ifneq 
($(filter cor-asv-ann, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(OCRD_COR_ASV_ANN) OCRD_COR_ASV_ANN := $(BIN)/ocrd-cor-asv-ann-evaluate OCRD_COR_ASV_ANN += $(BIN)/ocrd-cor-asv-ann-process @@ -266,7 +266,7 @@ else endif endif -ifneq ($(findstring ocrd_detectron2, $(OCRD_MODULES)),) +ifneq ($(filter ocrd_detectron2, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(OCRD_DETECTRON2) OCRD_DETECTRON2 += $(BIN)/ocrd-detectron2-segment $(call multirule,$(OCRD_DETECTRON2)): ocrd_detectron2 $(SUB_VENV_TF1)/bin/activate @@ -281,7 +281,7 @@ else endif endif -ifneq ($(findstring cor-asv-fst, $(OCRD_MODULES)),) +ifneq ($(filter cor-asv-fst, $(OCRD_MODULES)),) deps-ubuntu-modules: cor-asv-fst OCRD_EXECUTABLES += $(OCRD_COR_ASV_FST) OCRD_COR_ASV_FST := $(BIN)/ocrd-cor-asv-fst-process @@ -299,7 +299,7 @@ else endif endif -ifneq ($(findstring ocrd_keraslm, $(OCRD_MODULES)),) +ifneq ($(filter ocrd_keraslm, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(OCRD_KERASLM) OCRD_KERASLM := $(BIN)/ocrd-keraslm-rate OCRD_KERASLM += $(BIN)/keraslm-rate @@ -315,14 +315,14 @@ else endif endif -ifneq ($(findstring ocrd_im6convert, $(OCRD_MODULES)),) +ifneq ($(filter ocrd_im6convert, $(OCRD_MODULES)),) deps-ubuntu-modules: ocrd_im6convert OCRD_EXECUTABLES += $(BIN)/ocrd-im6convert $(BIN)/ocrd-im6convert: ocrd_im6convert $(BIN)/ocrd . 
$(ACTIVATE_VENV) && $(MAKE) -C $< install endif -ifneq ($(findstring ocrd_neat, $(OCRD_MODULES)),) +ifneq ($(filter ocrd_neat, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(OCRD_NEAT) OCRD_NEAT := $(BIN)/ocrd-neat-import OCRD_NEAT += $(BIN)/ocrd-neat-export @@ -336,7 +336,7 @@ $(call multirule,$(OCRD_NEAT)): ocrd_neat $(BIN)/ocrd endif -ifneq ($(findstring ocrd_wrap, $(OCRD_MODULES)),) +ifneq ($(filter ocrd_wrap, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(OCRD_WRAP) OCRD_WRAP := $(BIN)/ocrd-preprocess-image OCRD_WRAP += $(BIN)/ocrd-skimage-normalize @@ -347,14 +347,14 @@ $(call multirule,$(OCRD_WRAP)): ocrd_wrap $(BIN)/ocrd $(pip_install) endif -ifneq ($(findstring ocrd_fileformat, $(OCRD_MODULES)),) +ifneq ($(filter ocrd_fileformat, $(OCRD_MODULES)),) ocrd_fileformat: GIT_RECURSIVE = --recursive OCRD_EXECUTABLES += $(BIN)/ocrd-fileformat-transform $(BIN)/ocrd-fileformat-transform: ocrd_fileformat $(BIN)/ocrd . $(ACTIVATE_VENV) && $(MAKE) -C $< install-fileformat install endif -ifneq ($(findstring ocrd_olena, $(OCRD_MODULES)),) +ifneq ($(filter ocrd_olena, $(OCRD_MODULES)),) ocrd_olena: GIT_RECURSIVE = --recursive deps-ubuntu-modules: ocrd_olena OCRD_EXECUTABLES += $(BIN)/ocrd-olena-binarize @@ -367,25 +367,25 @@ clean-olena: test ! 
-f ocrd_olena/Makefile || \ $(MAKE) -C ocrd_olena clean-olena BUILD_DIR=$(VIRTUAL_ENV)/build/ocrd_olena -ifneq ($(findstring dinglehopper, $(OCRD_MODULES)),) +ifneq ($(filter dinglehopper, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(BIN)/ocrd-dinglehopper $(BIN)/ocrd-dinglehopper: dinglehopper $(BIN)/ocrd $(pip_install) endif -ifneq ($(findstring docstruct, $(OCRD_MODULES)),) +ifneq ($(filter docstruct, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(BIN)/ocrd-docstruct $(BIN)/ocrd-docstruct: docstruct $(BIN)/ocrd $(pip_install) endif -ifneq ($(findstring nmalign, $(OCRD_MODULES)),) +ifneq ($(filter nmalign, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(BIN)/ocrd-nmalign-merge $(BIN)/ocrd-nmalign-merge: nmalign $(BIN)/ocrd $(pip_install) endif -ifneq ($(findstring ocrd_segment, $(OCRD_MODULES)),) +ifneq ($(filter ocrd_segment, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(OCRD_SEGMENT) OCRD_SEGMENT := $(BIN)/ocrd-segment-evaluate OCRD_SEGMENT += $(BIN)/ocrd-segment-from-masks @@ -411,7 +411,7 @@ else endif endif -ifneq ($(findstring ocrd_tesserocr, $(OCRD_MODULES)),) +ifneq ($(filter ocrd_tesserocr, $(OCRD_MODULES)),) install-models: install-models-tesseract .PHONY: install-models-tesseract install-models-tesseract: @@ -419,7 +419,7 @@ install-models-tesseract: OCRD_EXECUTABLES += $(OCRD_TESSEROCR) # only add custom PPA when not building tesseract from source -ifeq ($(findstring tesseract, $(OCRD_MODULES)),) +ifeq ($(filter tesseract, $(OCRD_MODULES)),) deps-ubuntu-modules: ocrd_tesserocr # convert Tesseract model names into Ubuntu/Debian pkg names # (does not work with names under script/ though) @@ -445,7 +445,7 @@ endif endif -ifneq ($(findstring ocrd_cis, $(OCRD_MODULES)),) +ifneq ($(filter ocrd_cis, $(OCRD_MODULES)),) install-models: install-models-ocropus .PHONY: install-models-ocropus install-models-ocropus: @@ -468,7 +468,7 @@ $(call multirule,$(OCRD_CIS)): ocrd_cis $(BIN)/ocrd $(pip_install) endif -ifneq ($(findstring ocrd_pagetopdf, $(OCRD_MODULES)),) +ifneq ($(filter 
ocrd_pagetopdf, $(OCRD_MODULES)),) deps-ubuntu-modules: ocrd_pagetopdf OCRD_EXECUTABLES += $(OCRD_PAGETOPDF) OCRD_PAGETOPDF := $(BIN)/ocrd-pagetopdf @@ -476,7 +476,7 @@ $(OCRD_PAGETOPDF): ocrd_pagetopdf $(BIN)/ocrd . $(ACTIVATE_VENV) && $(MAKE) -C $< install endif -ifneq ($(findstring ocrd_calamari, $(OCRD_MODULES)),) +ifneq ($(filter ocrd_calamari, $(OCRD_MODULES)),) install-models: install-models-calamari .PHONY: install-models-calamari install-models-calamari: $(BIN)/ocrd @@ -487,7 +487,7 @@ $(OCRD_CALAMARI): ocrd_calamari $(BIN)/ocrd $(pip_install) endif -ifneq ($(findstring ocrd_pc_segmentation, $(OCRD_MODULES)),) +ifneq ($(filter ocrd_pc_segmentation, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(OCRD_PC_SEGMENTATION) OCRD_PC_SEGMENTATION := $(BIN)/ocrd-pc-segmentation $(OCRD_PC_SEGMENTATION): ocrd_pc_segmentation @@ -495,7 +495,7 @@ $(OCRD_PC_SEGMENTATION): ocrd_pc_segmentation $(pip_install) endif -ifneq ($(findstring ocrd_anybaseocr, $(OCRD_MODULES)),) +ifneq ($(filter ocrd_anybaseocr, $(OCRD_MODULES)),) install-models: install-models-anybaseocr .PHONY: install-models-anybaseocr install-models-anybaseocr: @@ -515,7 +515,7 @@ $(call multirule,$(OCRD_ANYBASEOCR)): ocrd_anybaseocr $(BIN)/ocrd $(pip_install) endif -ifneq ($(findstring ocrd_typegroups_classifier, $(OCRD_MODULES)),) +ifneq ($(filter ocrd_typegroups_classifier, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(OCRD_TYPECLASS) OCRD_TYPECLASS := $(BIN)/ocrd-typegroups-classifier OCRD_TYPECLASS += $(BIN)/typegroups-classifier @@ -523,14 +523,14 @@ $(call multirule,$(OCRD_TYPECLASS)): ocrd_typegroups_classifier $(pip_install) endif -ifneq ($(findstring ocrd_doxa, $(OCRD_MODULES)),) +ifneq ($(filter ocrd_doxa, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(OCRD_DOXA) OCRD_DOXA := $(BIN)/ocrd-doxa-binarize $(OCRD_DOXA): ocrd_doxa $(BIN)/ocrd $(pip_install) endif -ifneq ($(findstring sbb_binarization, $(OCRD_MODULES)),) +ifneq ($(filter sbb_binarization, $(OCRD_MODULES)),) install-models: install-models-sbb-binarization 
.PHONY: install-models-sbb-binarization install-models-sbb-binarization: @@ -543,7 +543,7 @@ $(call multirule,$(SBB_BINARIZATION)): sbb_binarization $(BIN)/ocrd $(pip_install) endif -ifneq ($(findstring eynollah, $(OCRD_MODULES)),) +ifneq ($(filter eynollah, $(OCRD_MODULES)),) install-models: install-models-eynollah .PHONY: install-models-eynollah install-models-eynollah: @@ -554,21 +554,21 @@ $(EYNOLLAH_SEGMENT): eynollah $(BIN)/ocrd $(pip_install) endif -ifneq ($(findstring ocrd_repair_inconsistencies, $(OCRD_MODULES)),) +ifneq ($(filter ocrd_repair_inconsistencies, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(OCRD_REPAIR_INCONSISTENCIES) OCRD_REPAIR_INCONSISTENCIES := $(BIN)/ocrd-repair-inconsistencies $(OCRD_REPAIR_INCONSISTENCIES): ocrd_repair_inconsistencies $(BIN)/ocrd $(pip_install) endif -ifneq ($(findstring ocrd_olahd_client, $(OCRD_MODULES)),) +ifneq ($(filter ocrd_olahd_client, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(OCRD_OLAHD_CLIENT) OCRD_OLAHD_CLIENT := $(BIN)/ocrd-olahd-client $(OCRD_OLAHD_CLIENT): ocrd_olahd_client $(BIN)/ocrd $(pip_install) endif -ifneq ($(findstring workflow-configuration, $(OCRD_MODULES)),) +ifneq ($(filter workflow-configuration, $(OCRD_MODULES)),) deps-ubuntu-modules: workflow-configuration OCRD_EXECUTABLES += $(WORKFLOW_CONFIGURATION) WORKFLOW_CONFIGURATION := $(BIN)/ocrd-make @@ -702,7 +702,7 @@ $(OCRD_EXECUTABLES:%=%-check): .PHONY: $(OCRD_EXECUTABLES:$(BIN)/%=%) $(OCRD_EXECUTABLES:$(BIN)/%=%): %: $(BIN)/% -ifneq ($(findstring tesseract, $(OCRD_MODULES)),) +ifneq ($(filter tesseract, $(OCRD_MODULES)),) # Tesseract. 
# when not installing via PPA, we must cope without ocrd_tesserocr's deps-ubuntu-modules From 55849c0e0e08686379f5f495a77c19641cddf60f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 26 Apr 2023 18:59:16 +0200 Subject: [PATCH 36/63] honour PIP_OPTIONS=-e again --- Makefile | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Makefile b/Makefile index a8ac77de..91cecc5f 100644 --- a/Makefile +++ b/Makefile @@ -579,12 +579,8 @@ $(call multirule,$(WORKFLOW_CONFIGURATION)): workflow-configuration $(BIN)/ocrd $(MAKE) -C $< install endif -# Build by entering subdir (first dependent), then -# install gracefully with dependencies, and finally -# install again forcefully without depds (to ensure -# the binary itself updates): define pip_install -. $(ACTIVATE_VENV) && cd $< && $(SEMPIP) pip install $(PIP_OPTIONS_E) . && touch -c $@ +. $(ACTIVATE_VENV) && cd $< && $(SEMPIP) pip install $(PIP_OPTIONS) . && touch -c $@ endef # Workaround for missing prebuilt versions of TF<2 for Python==3.8 From 47d837fa5041c5166782938c263ccc55435bdfcf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 26 Apr 2023 19:00:08 +0200 Subject: [PATCH 37/63] get tesserocr from PyPI if not enabled --- Makefile | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 91cecc5f..4a503e6c 100644 --- a/Makefile +++ b/Makefile @@ -425,6 +425,9 @@ deps-ubuntu-modules: ocrd_tesserocr # (does not work with names under script/ though) CUSTOM_DEPS += $(filter-out tesseract-ocr-equ,$(subst _,-,$(ALL_TESSERACT_MODELS:%=tesseract-ocr-%))) CUSTOM_DEPS += libarchive-dev +else +# tesserocr must wait for tesseract in parallel builds. 
+$(SHARE)/tesserocr: $(BIN)/tesseract endif OCRD_TESSEROCR := $(BIN)/ocrd-tesserocr-binarize @@ -438,11 +441,15 @@ OCRD_TESSEROCR += $(BIN)/ocrd-tesserocr-segment-word $(call multirule,$(OCRD_TESSEROCR)): ocrd_tesserocr $(SHARE)/tesserocr $(BIN)/ocrd $(pip_install) -# tesserocr must wait for tesseract in parallel builds. -ifneq ($(findstring tesseract, $(OCRD_MODULES)),) -$(SHARE)/tesserocr: $(BIN)/tesseract endif +ifneq ($(filter tesserocr, $(OCRD_MODULES)),) +$(SHARE)/tesserocr: tesserocr | $(ACTIVATE_VENV) $(SHARE) + $(pip_install) +else +$(SHARE)/tesserocr: | $(ACTIVATE_VENV) $(SHARE) + . $(ACTIVATE_VENV) && $(SEMPIP) pip install $(PIP_OPTIONS_E) tesserocr + @touch $@ endif ifneq ($(filter ocrd_cis, $(OCRD_MODULES)),) @@ -656,12 +663,6 @@ chmod +x $(1) endef endif -# avoid making these .PHONY so they do not have to be repeated: -# tesserocr -$(SHARE)/%: % | $(ACTIVATE_VENV) $(SHARE) - . $(ACTIVATE_VENV) && cd $< && $(SEMPIP) pip install $(PIP_OPTIONS) . - @touch $@ - $(SHARE): @mkdir -p "$@" From 884aae5d321e4774b792c9353202502bee3160a4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 26 Apr 2023 19:07:16 +0200 Subject: [PATCH 38/63] get ocrd from PyPI if core not enabled --- Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile b/Makefile index 4a503e6c..da79a7ed 100644 --- a/Makefile +++ b/Makefile @@ -188,9 +188,15 @@ $(BIN)/wheel: | $(ACTIVATE_VENV) .PHONY: ocrd ocrd: $(BIN)/ocrd +ifneq ($(filter core, $(OCRD_MODULES)),) deps-ubuntu-modules: core $(BIN)/ocrd: core . $(ACTIVATE_VENV) && $(MAKE) -C $< install PIP="$(SEMPIP) pip" PIP_INSTALL="$(SEMPIP) pip install $(PIP_OPTIONS)" && touch -c $@ +else +CUSTOM_DEPS += python3 imagemagick libgeos-dev +$(BIN)/ocrd: | $(ACTIVATE_VENV) + . 
$(ACTIVATE_VENV) && $(SEMPIP) pip install $(PIP_OPTIONS_E) ocrd ocrd_network +endif # Convert the executable names (1) to a pattern rule, # so that the recipe will be used with single-recipe- From 4a08f40c34564525171ae13912029606c08c391e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 1 Jun 2023 20:17:25 +0200 Subject: [PATCH 39/63] install ocrd_detectron2 before ocrd_kraken (better Pytorch installer) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2452989a..befd1077 100644 --- a/Makefile +++ b/Makefile @@ -251,7 +251,7 @@ install-models-kraken: OCRD_KRAKEN := $(BIN)/ocrd-kraken-binarize OCRD_KRAKEN += $(BIN)/ocrd-kraken-segment OCRD_KRAKEN += $(BIN)/ocrd-kraken-recognize -$(call multirule,$(OCRD_KRAKEN)): ocrd_kraken $(BIN)/ocrd +$(call multirule,$(OCRD_KRAKEN)): ocrd_kraken $(BIN)/ocrd | $(OCRD_DETECTRON2) $(pip_install) endif From 2cb44207b3e6ca6db2911acd98235df233c711e7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 1 Jun 2023 20:24:47 +0200 Subject: [PATCH 40/63] update opencv-python (with fixes for py38) --- opencv-python | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opencv-python b/opencv-python index 6b73d90f..474a1cc0 160000 --- a/opencv-python +++ b/opencv-python @@ -1 +1 @@ -Subproject commit 6b73d90fc3e50ba6858926d299b49f0228e19d68 +Subproject commit 474a1cc0ebf2086c596b60c050a9e1af658ff380 From f972fb1ff597dd3394fb3b92b6071c524c12f890 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 2 Jun 2023 04:17:53 +0200 Subject: [PATCH 41/63] update modules --- core | 2 +- ocrd_cis | 2 +- ocrd_kraken | 2 +- ocrd_wrap | 2 +- workflow-configuration | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/core b/core index c0c153e9..12e781c6 160000 --- a/core +++ b/core @@ -1 +1 @@ -Subproject commit c0c153e97ddc6623219421f05372b44f243adf96 +Subproject commit 12e781c67812e7cda97659786d5b79ec2b3aa31a diff --git a/ocrd_cis b/ocrd_cis index 
c90b29f4..1abc3b7b 160000 --- a/ocrd_cis +++ b/ocrd_cis @@ -1 +1 @@ -Subproject commit c90b29f4c6f3369b5eecae1617903dada14a3553 +Subproject commit 1abc3b7b617b1c342908e6b69f6e706a14fc666f diff --git a/ocrd_kraken b/ocrd_kraken index 802c6b0b..567b74a0 160000 --- a/ocrd_kraken +++ b/ocrd_kraken @@ -1 +1 @@ -Subproject commit 802c6b0b76a3e75070c680aa3b19d36142decf4e +Subproject commit 567b74a0255236e5b38c8cdb71e4910ee18a50be diff --git a/ocrd_wrap b/ocrd_wrap index 63c04d5a..2cd800d9 160000 --- a/ocrd_wrap +++ b/ocrd_wrap @@ -1 +1 @@ -Subproject commit 63c04d5a6a377ead9989a5c1a6a1b1d9aa6f8b33 +Subproject commit 2cd800d9eccbc084751558a87972ac22ee60e87a diff --git a/workflow-configuration b/workflow-configuration index cb923f7f..818f0131 160000 --- a/workflow-configuration +++ b/workflow-configuration @@ -1 +1 @@ -Subproject commit cb923f7fade2de84e08c2d7a4f9f2b6178f696b0 +Subproject commit 818f0131d273ba7983d2b499cd07384a875d0017 From fc35815243c3820722600fd83ed8ada73ad2ffc3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 2 Jun 2023 11:23:35 +0200 Subject: [PATCH 42/63] docker-*-cuda: workaround for conflicting cuDNN version (TF/Torch) --- Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Dockerfile b/Dockerfile index 648a0d6d..dd72c126 100644 --- a/Dockerfile +++ b/Dockerfile @@ -112,6 +112,11 @@ RUN set -a; bash docker.sh RUN ldconfig # check installation RUN make -j check CHECK_HELP=1 +# workaround for clash between cuDNN of Tensorflow (→8.6) and Pytorch (→8.5) +# the latter is explicit (but unnecessary), the former is implicit (and causes "DNN library not found" crashes at runtime) +# so all we can do here is revert to the version required by TF after pip overruled our choice: +RUN if echo $BASE_IMAGE | fgrep -q cuda; then \ + pip3 install nvidia-cudnn-cu11==8.6.0.*; fi # remove (dated) security workaround preventing use of # ImageMagick's convert on PDF/PS/EPS/XPS: From b2adc3d9560b2082c9096031e54cf5b8ad94f527 Mon Sep 17 00:00:00 2001 
From: Robert Sachunsky Date: Fri, 2 Jun 2023 19:08:12 +0200 Subject: [PATCH 43/63] apply suggestions from review --- Dockerfile | 1 + Makefile | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index dd72c126..8ac766ff 100644 --- a/Dockerfile +++ b/Dockerfile @@ -97,6 +97,7 @@ RUN echo "export NO_UPDATE=1" >> docker.sh RUN echo "make deps-ubuntu" >> docker.sh RUN echo "source $VIRTUAL_ENV/bin/activate" >> docker.sh RUN echo "pip install -U pip setuptools wheel" >> docker.sh +RUN echo "hash -r" >> docker.sh # build/install all tools of the requested modules: RUN echo "make $PARALLEL all" >> docker.sh # remove unneeded automatic deps and clear pkg cache diff --git a/Makefile b/Makefile index befd1077..8f5ddd67 100644 --- a/Makefile +++ b/Makefile @@ -126,6 +126,7 @@ Targets: clean-tesseract: remove the build directory for tesseract clean-olena: remove the build directory for ocrd_olena tidy: clean, then deinit opencv-python and git-clean all submodules + (WARNING: potential data loss; if unsure, try with `make -n` and `git clean -n`) deinit: clean, then deinit and rmdir all submodules docker: (re)build a docker image including all executables dockers: (re)build docker images for some pre-selected subsets of modules @@ -187,12 +188,10 @@ tidy: clean $(BIN)/pip: $(ACTIVATE_VENV) . $(ACTIVATE_VENV) && $(SEMPIP) pip install --upgrade pip setuptools - hash -r %/bin/activate: $(PYTHON) -m venv $(subst /bin/activate,,$@) . 
$@ && pip install --upgrade pip setuptools wheel - hash -r .PHONY: wheel wheel: $(BIN)/wheel From e5fb2407d6c64c8abcc6fe4d0ad2963518a1a99c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 6 Jun 2023 01:12:47 +0200 Subject: [PATCH 44/63] docker-*-cuda: workaround for conflicting cuDNN version (TF/Torch) --- Dockerfile | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8ac766ff..68e29292 100644 --- a/Dockerfile +++ b/Dockerfile @@ -113,11 +113,18 @@ RUN set -a; bash docker.sh RUN ldconfig # check installation RUN make -j check CHECK_HELP=1 -# workaround for clash between cuDNN of Tensorflow (→8.6) and Pytorch (→8.5) +# workaround for clash between cuDNN of Tensorflow 2.12 (→8.6) and Pytorch 1.13 (→8.5) # the latter is explicit (but unnecessary), the former is implicit (and causes "DNN library not found" crashes at runtime) -# so all we can do here is revert to the version required by TF after pip overruled our choice: +# so we have three potential options: +# 1. revert to the version required by TF after pip overruled our choice via Torch dependency +# pip3 install nvidia-cudnn-cu11==8.6.0.* +# 2. downgrade TF so there is no overt conflict +# pip3 install "tensorflow<2.12" +# 3. 
upgrade Torch so there is no overt conflict +# pip install "torch>=2.0" +# Since ATM we don't know whether Torch 2.x will work everywhere, we opt for 2: RUN if echo $BASE_IMAGE | fgrep -q cuda; then \ - pip3 install nvidia-cudnn-cu11==8.6.0.*; fi + pip3 install "tensorflow<2.12"; fi # remove (dated) security workaround preventing use of # ImageMagick's convert on PDF/PS/EPS/XPS: From d6fd7fba4fdb747bc8de75ead86b9c735b1a84ee Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 6 Jun 2023 16:14:54 +0200 Subject: [PATCH 45/63] update ocrd_fileformat and ocrd_kraken --- ocrd_fileformat | 2 +- ocrd_kraken | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_fileformat b/ocrd_fileformat index dacfa509..4e7e0de6 160000 --- a/ocrd_fileformat +++ b/ocrd_fileformat @@ -1 +1 @@ -Subproject commit dacfa50957fda54596f78d1612c8b5c29363a9e9 +Subproject commit 4e7e0de68e2a0dcd9b238f64d1657beda0d74da7 diff --git a/ocrd_kraken b/ocrd_kraken index 567b74a0..a05a0694 160000 --- a/ocrd_kraken +++ b/ocrd_kraken @@ -1 +1 @@ -Subproject commit 567b74a0255236e5b38c8cdb71e4910ee18a50be +Subproject commit a05a0694a53650c3e57497966ade27a172928ecc From 421baea4c733b6baecbf3f087774fceffb6dc163 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 7 Jun 2023 13:54:40 +0200 Subject: [PATCH 46/63] docker*: always editable, *-git only as alias, never rm /build --- Dockerfile | 6 ++---- Makefile | 14 ++++++-------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/Dockerfile b/Dockerfile index 68e29292..b8222a54 100644 --- a/Dockerfile +++ b/Dockerfile @@ -102,10 +102,8 @@ RUN echo "hash -r" >> docker.sh RUN echo "make $PARALLEL all" >> docker.sh # remove unneeded automatic deps and clear pkg cache RUN echo "apt-get remove automake autoconf libtool pkg-config g++ && apt-get clean" >> docker.sh -# remove source directories from image, unless using editable mode -# (in the latter case, the git repos are also the installation targets -# and must be kept; so 
merely clean-up some temporary files) -RUN echo "if [[ '${PIP_OPTIONS}' =~ -e|--editable ]]; then make -i clean-olena clean-tesseract; else rm -fr /build; fi; rm -fr /.cache" >> docker.sh +# clean-up some temporary files (git repos are also installation targets and must be kept) +RUN echo "make -i clean-olena clean-tesseract; rm -fr /.cache" >> docker.sh # run the script in one layer/step (to minimise image size) # (and export all variables) RUN set -a; bash docker.sh diff --git a/Makefile b/Makefile index 8f5ddd67..b7c2c0ce 100644 --- a/Makefile +++ b/Makefile @@ -865,15 +865,13 @@ DOCKER_TAG ?= ocrd/all # these variants won't share common layers / steps / data, # so build-time and bandwidth are n-fold) .PHONY: dockers -ifdef DOCKERS_WITHOUT_REPOS dockers: docker-minimum docker-minimum-cuda docker-medium docker-medium-cuda docker-maximum docker-maximum-cuda -else -dockers: docker-minimum-git docker-minimum-cuda-git docker-medium-git docker-medium-cuda-git docker-maximum-git docker-maximum-cuda-git -endif -# Selections which keep git repos and reference them for install +# keep git repos and reference them for install # (so components can be updated via git from the container alone) -docker-%-git: PIP_OPTIONS = -e +docker-%: PIP_OPTIONS = -e +# old non-git alias +docke%um-git: docke%um # Minimum-size selection: use Ocropy binarization, use Tesseract from PPA docker-mini%: DOCKER_MODULES = core ocrd_cis ocrd_fileformat ocrd_im6convert ocrd_pagetopdf ocrd_repair_inconsistencies ocrd_tesserocr ocrd_wrap tesserocr workflow-configuration ocrd_olahd_client @@ -883,9 +881,9 @@ docker-medi%: DOCKER_MODULES = core cor-asv-ann dinglehopper docstruct format-co docker-maxi%: DOCKER_MODULES = $(OCRD_MODULES) # DOCKER_BASE_IMAGE -docker%um docke%um-git: DOCKER_BASE_IMAGE = docker.io/ocrd/core +docker%um: DOCKER_BASE_IMAGE = docker.io/ocrd/core # CUDA variants -docker%-cuda docker%-cuda-git: DOCKER_BASE_IMAGE = docker.io/ocrd/core-cuda +docker%-cuda: DOCKER_BASE_IMAGE = 
docker.io/ocrd/core-cuda # Build rule for all selections docker%: Dockerfile $(DOCKER_MODULES) From b5b36024255bb37275278016a0da90a6ecd227c1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 9 Jun 2023 14:49:14 +0200 Subject: [PATCH 47/63] update submodules --- cor-asv-ann | 2 +- ocrd_cis | 2 +- ocrd_kraken | 2 +- workflow-configuration | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cor-asv-ann b/cor-asv-ann index 006a70ee..2c4b1ffc 160000 --- a/cor-asv-ann +++ b/cor-asv-ann @@ -1 +1 @@ -Subproject commit 006a70eefe3a2e9e0af3ea24d387d8234c1ccaa5 +Subproject commit 2c4b1ffc123e867cc5e5203970996bfb05075397 diff --git a/ocrd_cis b/ocrd_cis index 1abc3b7b..a0ea0a2a 160000 --- a/ocrd_cis +++ b/ocrd_cis @@ -1 +1 @@ -Subproject commit 1abc3b7b617b1c342908e6b69f6e706a14fc666f +Subproject commit a0ea0a2a4aeea99414c08ae543585b994f9ab0d5 diff --git a/ocrd_kraken b/ocrd_kraken index a05a0694..89a9face 160000 --- a/ocrd_kraken +++ b/ocrd_kraken @@ -1 +1 @@ -Subproject commit a05a0694a53650c3e57497966ade27a172928ecc +Subproject commit 89a9facedecb51438a8550a61eb86bc3726ba815 diff --git a/workflow-configuration b/workflow-configuration index 818f0131..5aff777c 160000 --- a/workflow-configuration +++ b/workflow-configuration @@ -1 +1 @@ -Subproject commit 818f0131d273ba7983d2b499cd07384a875d0017 +Subproject commit 5aff777c761cae1b6f9d954fb80f9b212e8fab92 From d68fd0ca06e0595cbc372dc740bd1329f5367dd2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 9 Jun 2023 14:52:10 +0200 Subject: [PATCH 48/63] docker*cuda: move fix-cuda to makefile, add deps-cuda from core --- Dockerfile | 13 +------------ Makefile | 28 ++++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/Dockerfile b/Dockerfile index b8222a54..7369787d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -111,18 +111,7 @@ RUN set -a; bash docker.sh RUN ldconfig # check installation RUN make -j check CHECK_HELP=1 -# workaround for clash between cuDNN of 
Tensorflow 2.12 (→8.6) and Pytorch 1.13 (→8.5) -# the latter is explicit (but unnecessary), the former is implicit (and causes "DNN library not found" crashes at runtime) -# so we have three potential options: -# 1. revert to the version required by TF after pip overruled our choice via Torch dependency -# pip3 install nvidia-cudnn-cu11==8.6.0.* -# 2. downgrade TF so there is no overt conflict -# pip3 install "tensorflow<2.12" -# 3. upgrade Torch so there is no overt conflict -# pip install "torch>=2.0" -# Since ATM we don't know whether Torch 2.x will work everywhere, we opt for 2: -RUN if echo $BASE_IMAGE | fgrep -q cuda; then \ - pip3 install "tensorflow<2.12"; fi +RUN if echo $BASE_IMAGE | fgrep -q cuda; then make fix-cuda; fi # remove (dated) security workaround preventing use of # ImageMagick's convert on PDF/PS/EPS/XPS: diff --git a/Makefile b/Makefile index b7c2c0ce..93e80825 100644 --- a/Makefile +++ b/Makefile @@ -707,13 +707,13 @@ check: $(OCRD_EXECUTABLES:%=%-check) $(OCRD_MODULES:%=%-check) # ensure shapely#1598 workaround works # ensure CUDA works for Torch and TF -testcuda: +testcuda test-cuda: $(ACTIVATE_VENV) . $(ACTIVATE_VENV) && $(PYTHON) -c "from shapely.geometry import Polygon; import torch; torch.randn(10).cuda()" . $(ACTIVATE_VENV) && $(PYTHON) -c "import torch, sys; sys.exit(0 if torch.cuda.is_available() else 1)" . $(ACTIVATE_VENV) && $(PYTHON) -c "import tensorflow as tf, sys; sys.exit(0 if tf.test.is_gpu_available() else 1)" . 
$(SUB_VENV_TF1)/bin/activate && $(PYTHON) -c "import tensorflow as tf, sys; sys.exit(0 if tf.test.is_gpu_available() else 1)" @echo everything seems to be fine - +.PHONY: testcuda test-cuda define tool-jsons-code = import json @@ -857,6 +857,30 @@ deps-ubuntu-modules: .PHONY: deps-ubuntu deps-ubuntu-modules +# For native (non-Docker) installations, install CUDA system dependencies +deps-cuda: core + $(MAKE) -C $< $@ + +# For standalone use ("just get me tensorflow-gpu<2.0") +tf1nvidia: $(ACTIVATE_VENV) + $(pip_install_tf1nvidia) + +# post-fix workaround for clash between cuDNN of Tensorflow 2.12 (→8.6) and Pytorch 1.13 (→8.5) +# the latter is explicit (but unnecessary), the former is implicit (and causes "DNN library not found" crashes at runtime) +# so we have three potential options: +# 1. revert to the version required by TF after pip overruled our choice via Torch dependency +# pip3 install nvidia-cudnn-cu11==8.6.0.* +# 2. downgrade TF so there is no overt conflict +# pip3 install "tensorflow<2.12" +# 3. upgrade Torch so there is no overt conflict +# pip install "torch>=2.0" +# Since ATM we don't know whether Torch 2.x will work everywhere, we opt for 2: +fix-cuda: $(ACTIVATE_VENV) + . $(ACTIVATE_VENV) && $(SEMPIP) pip install "tensorflow<2.12" + +.PHONY: deps-cuda tf1nvidia fix-cuda + + # Docker builds. 
DOCKER_TAG ?= ocrd/all From 08f39d8d7a147a89ac71e0a96c4f1868f660c064 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 9 Jun 2023 17:06:03 +0200 Subject: [PATCH 49/63] update submodules --- core | 2 +- dinglehopper | 2 +- eynollah | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/core b/core index 12e781c6..d76409ed 160000 --- a/core +++ b/core @@ -1 +1 @@ -Subproject commit 12e781c67812e7cda97659786d5b79ec2b3aa31a +Subproject commit d76409edfca5955d60a10714d2cce1e405be2124 diff --git a/dinglehopper b/dinglehopper index 0fd4ea19..35be58cb 160000 --- a/dinglehopper +++ b/dinglehopper @@ -1 +1 @@ -Subproject commit 0fd4ea19732b2956942bc0fee735cef90a7d36cc +Subproject commit 35be58cb9456b0893bc46640b234912148621fb6 diff --git a/eynollah b/eynollah index 52d2e0b0..68923e0a 160000 --- a/eynollah +++ b/eynollah @@ -1 +1 @@ -Subproject commit 52d2e0b098f8defe3056a9d50c6cafd578480768 +Subproject commit 68923e0a5d7d2cf2f43205309148d090aa1b2ce0 From c37f9936b2b827f72afbb368b26aac905773d757 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 10 Jun 2023 21:13:02 +0200 Subject: [PATCH 50/63] downgrade eynollah --- eynollah | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eynollah b/eynollah index 68923e0a..706433c5 160000 --- a/eynollah +++ b/eynollah @@ -1 +1 @@ -Subproject commit 68923e0a5d7d2cf2f43205309148d090aa1b2ce0 +Subproject commit 706433c5049c63c6e16fee5f71d81a7e507abe06 From 1757708d5ba93a0d0b5dfab8650e1167ac5ecc09 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 11 Jun 2023 11:47:18 +0200 Subject: [PATCH 51/63] update core (deps-cuda) --- core | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core b/core index d76409ed..67086249 160000 --- a/core +++ b/core @@ -1 +1 @@ -Subproject commit d76409edfca5955d60a10714d2cce1e405be2124 +Subproject commit 670862493408008441963a739ef650c6d3fa122d From 80447566f1573b4817632b060e9598706b9df668 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: 
Sun, 11 Jun 2023 22:05:54 +0200 Subject: [PATCH 52/63] add 'test-core' and 'test-workflow', improve 'help' --- Makefile | 58 +++++++++++++++++++++++++++++++++++++----------- test-workflow.sh | 49 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 13 deletions(-) create mode 100644 test-workflow.sh diff --git a/Makefile b/Makefile index 179559dc..71a61d89 100644 --- a/Makefile +++ b/Makefile @@ -110,24 +110,41 @@ cat <<"EOF" Rules to download and install all OCR-D module processors from their source repositories into a single virtualenv. -Targets: +Targets (general): help: show this message show: list the venv path and all executables (to be) installed - check: verify that all executables are runnable and the venv is consistent - testcuda: verify that CUDA is available for Tensorflow and Pytorch + +Targets (module management): modules: download all submodules to the managed revision - ocrd-all-tool.json: generate union of ocrd-tool.json for all executables of all modules + deinit: clean, then deinit and rmdir all submodules + tidy: clean, then deinit opencv-python and git-clean all submodules + (WARNING: potential data loss; if unsure, try with `make -n` and `git clean -n`) + +Targets (system dependencies, may need root privileges): + deps-ubuntu: install all system dependencies of all modules + deps-cuda: install CUDA toolkit and libraries (via micromamba and nvidia-pyindex) + +Targets (build and installation into venv): all: install all executables of all modules ocrd: only install the virtual environment and OCR-D/core packages install-tesseract: only build and install Tesseract (with TESSERACT_MODELS) - install-tesseract-training: build and install Tesseract training tools - install-models: download commonly used models to appropriate locations + install-tesseract-training: also build and install Tesseract training tools + fix-cuda: workaround for non-conflicting CUDA libs after installation clean: remove the virtual environment 
directory, and make clean-* clean-tesseract: remove the build directory for tesseract clean-olena: remove the build directory for ocrd_olena - tidy: clean, then deinit opencv-python and git-clean all submodules - (WARNING: potential data loss; if unsure, try with `make -n` and `git clean -n`) - deinit: clean, then deinit and rmdir all submodules + +Targets (testing): + check: verify that all executables are runnable and the venv is consistent + test-core: verify ocrd via core module regression tests + test-cuda: verify that CUDA is available for Tensorflow and Pytorch + test-workflow: verify that most executables work correctly via test runs on test data + +Targets (auxiliary data): + ocrd-all-tool.json: generate union of ocrd-tool.json for all executables of all modules + install-models: download commonly used models to appropriate locations + +Targets (build of container images): docker: (re)build a docker image including all executables dockers: (re)build docker images for some pre-selected subsets of modules @@ -141,6 +158,7 @@ Variables: TMPDIR: path to use for temporary storage instead of the system default PYTHON: name of the Python binary PIP_OPTIONS: extra options for the `pip install` command like `-q` or `-v` or `-e` + CHECK_HELP: set to `1` to also check each executable can generate help output TESSERACT_MODELS: list of additional models/languages to download for Tesseract. Default: "$(ALL_TESSERACT_MODELS)" TESSERACT_CONFIG: command line options for Tesseract `configure`. Default: "$(TESSERACT_CONFIG)" EOF @@ -212,6 +230,10 @@ $(BIN)/ocrd: | $(ACTIVATE_VENV) . $(ACTIVATE_VENV) && $(SEMPIP) pip install $(PIP_OPTIONS_E) ocrd ocrd_network endif +.PHONY: test-core +test-core: core $(BIN)/ocrd + . 
$(ACTIVATE_VENV) && $(MAKE) -C $< deps-test test + # Convert the executable names (1) to a pattern rule, # so that the recipe will be used with single-recipe- # multiple-output semantics: @@ -706,6 +728,7 @@ ifeq (0,$(MAKELEVEL)) endif %-check: ; +.PHONY: testcuda test-cuda test-assets test-workflow # ensure shapely#1598 workaround works # ensure CUDA works for Torch and TF testcuda test-cuda: $(ACTIVATE_VENV) @@ -714,7 +737,16 @@ testcuda test-cuda: $(ACTIVATE_VENV) . $(ACTIVATE_VENV) && $(PYTHON) -c "import tensorflow as tf, sys; sys.exit(0 if tf.test.is_gpu_available() else 1)" . $(SUB_VENV_TF1)/bin/activate && $(PYTHON) -c "import tensorflow as tf, sys; sys.exit(0 if tf.test.is_gpu_available() else 1)" @echo everything seems to be fine -.PHONY: testcuda test-cuda + +# download models and run some processors (not for result quality, only coverage) +test-workflow: export CUDA_DEVICE ?= cpu # cuda:0 +test-workflow: test-assets core $(BIN)/ocrd $(ACTIVATE_VENV) + . $(ACTIVATE_VENV) && cd core/tests/assets/SBB0000F29300010000/data/ && bash -x $(CURDIR)/test-workflow.sh + +test-assets: + $(MAKE) -C core assets + +clean: define tool-jsons-code = import json @@ -859,10 +891,10 @@ deps-ubuntu-modules: .PHONY: deps-ubuntu deps-ubuntu-modules # For native (non-Docker) installations, install CUDA system dependencies -deps-cuda: core - $(MAKE) -C $< $@ +deps-cuda: core $(ACTIVATE_VENV) + . 
$(ACTIVATE_VENV) && $(MAKE) -C $< $@ -# For standalone use ("just get me tensorflow-gpu<2.0") +# For standalone use ("just get me tensorflow-gpu<2.0 into the current venv") tf1nvidia: $(ACTIVATE_VENV) $(pip_install_tf1nvidia) diff --git a/test-workflow.sh b/test-workflow.sh new file mode 100644 index 00000000..060a9a2b --- /dev/null +++ b/test-workflow.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +set -e + +ocrd resmgr download ocrd-sbb-binarize default-2021-03-09 +ocrd-sbb-binarize -I OCR-D-IMG -O OCR-D-BIN -P model default-2021-03-09 + +ocrd resmgr download ocrd-detectron2-segment Jambo-sudo_X101.pth +ocrd resmgr download ocrd-detectron2-segment Jambo-sudo_X101.yaml +ocrd-detectron2-segment -p $(python -c "import ocrd_detectron2; print(ocrd_detectron2.__path__[0])")/presets_Jambo-sudo_X101.json -I OCR-D-BIN -O OCR-D-SEG -P device ${CUDA_DEVICE:-cpu} + +ocrd-typegroups-classifier -I OCR-D-IMG -O FONT + +ocrd resmgr download ocrd-eynollah-segment default +ocrd-eynollah-segment -P models default -I OCR-D-IMG -O OCR-D-SEG2 + +ocrd resmgr download ocrd-calamari-recognize qurator-gt4histocr-1.0 +ocrd-calamari-recognize -I OCR-D-SEG2 -O OCR-D-OCR -P checkpoint_dir qurator-gt4histocr-1.0 -P textequiv_level glyph + +ocrd resmgr download ocrd-kraken-segment blla.mlmodel +ocrd-kraken-segment -I OCR-D-BIN -O OCR-D-SEG3 -P device ${CUDA_DEVICE:-cpu} +ocrd-cis-ocropy-resegment -I OCR-D-SEG3 -O OCR-D-SEG3X -P method baseline + +ocrd resmgr download ocrd-kraken-recognize reichsanzeiger.mlmodel +ocrd-kraken-recognize -I OCR-D-SEG3X -O OCR-D-OCR2 -P model reichsanzeiger.mlmodel -P device ${CUDA_DEVICE:-cpu} + +wget "https://git.informatik.uni-leipzig.de/ocr-d/cor-asv-ann-models/-/raw/master/s2s.gt4histocr.s-%C5%BF.d2.w0512.adam.attention.stateless.variational-dropout.char.transfer-lm.h5" +ocrd resmgr download -n s2s.gt4histocr.s-ſ.d2.w0512.adam.attention.stateless.variational-dropout.char.transfer-lm.h5 ocrd-cor-asv-ann-process 
s2s.gt4histocr.s-ſ.d2.w0512.adam.attention.stateless.variational-dropout.char.transfer-lm.h5 +ocrd-cor-asv-ann-process -I OCR-D-OCR -O OCR-D-COR -P model_file s2s.gt4histocr.s-ſ.d2.w0512.adam.attention.stateless.variational-dropout.char.transfer-lm.h5 + +ocrd-anybaseocr-crop -I OCR-D-BIN -O OCR-D-CROP +ocrd-skimage-denoise -I OCR-D-CROP -O OCR-D-DEN +ocrd-cis-ocropy-segment -I OCR-D-DEN -O OCR-D-SEG4 -P level-of-operation page + +ocrd-segment-evaluate -I OCR-D-SEG,OCR-D-SEG2 -O OCR-D-SEGEVAL + +ocrd resmgr download ocrd-tesserocr-recognize frak2021.traineddata +ocrd-tesserocr-recognize -I OCR-D-SEG2 -O OCR-D-OCR3 -P model frak2021 + +ocrd resmgr download ocrd-cis-ocropy-recognize LatinHist.pyrnn.gz +ocrd-cis-ocropy-recognize -I OCR-D-SEG2 -O OCR-D-OCR4 -P model LatinHist.pyrnn.gz -P textequiv_level glyph + +ocrd-cor-asv-ann-align -I OCR-D-OCR,OCR-D-OCR3,OCR-D-OCR4 -O OCR-D-OCR5 + +ocrd-cor-asv-ann-evaluate -I OCR-D-OCR,OCR-D-OCR3,OCR-D-OCR4,OCR-D-OCR5 -O OCR-D-OCREVAL + +ocrd-page-transform -I OCR-D-OCR4 -O OCR-D-OCR4X -P xsl page-textequiv-lines-to-regions.xsl +ocrd-fileformat-transform -I OCR-D-OCR4X -O TXT -P from-to "page text" -P script-args level=region pb="$(echo -e \v)" +ocrd-fileformat-transform -I OCR-D-OCR4X -O FULLTEXT -P from-to "page alto" -P script-args "--no-check-border --dummy-word" + From 482f36408896c36c74a61b422e6e2f59e9216670 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Jun 2023 00:38:27 +0200 Subject: [PATCH 53/63] update ocrd_kraken (default to device=cuda:0), adapt test-workflow --- Makefile | 1 - ocrd_kraken | 2 +- test-workflow.sh | 6 +++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 71a61d89..96390085 100644 --- a/Makefile +++ b/Makefile @@ -739,7 +739,6 @@ testcuda test-cuda: $(ACTIVATE_VENV) @echo everything seems to be fine # download models and run some processors (not for result quality, only coverage) -test-workflow: export CUDA_DEVICE ?= cpu # cuda:0 test-workflow: 
test-assets core $(BIN)/ocrd $(ACTIVATE_VENV) . $(ACTIVATE_VENV) && cd core/tests/assets/SBB0000F29300010000/data/ && bash -x $(CURDIR)/test-workflow.sh diff --git a/ocrd_kraken b/ocrd_kraken index 89a9face..b13dd8a9 160000 --- a/ocrd_kraken +++ b/ocrd_kraken @@ -1 +1 @@ -Subproject commit 89a9facedecb51438a8550a61eb86bc3726ba815 +Subproject commit b13dd8a932b7dfbfe5019698e87542f5f767e2bd diff --git a/test-workflow.sh b/test-workflow.sh index 060a9a2b..b9caafd8 100644 --- a/test-workflow.sh +++ b/test-workflow.sh @@ -6,7 +6,7 @@ ocrd-sbb-binarize -I OCR-D-IMG -O OCR-D-BIN -P model default-2021-03-09 ocrd resmgr download ocrd-detectron2-segment Jambo-sudo_X101.pth ocrd resmgr download ocrd-detectron2-segment Jambo-sudo_X101.yaml -ocrd-detectron2-segment -p $(python -c "import ocrd_detectron2; print(ocrd_detectron2.__path__[0])")/presets_Jambo-sudo_X101.json -I OCR-D-BIN -O OCR-D-SEG -P device ${CUDA_DEVICE:-cpu} +ocrd-detectron2-segment -p $(python -c "import ocrd_detectron2; print(ocrd_detectron2.__path__[0])")/presets_Jambo-sudo_X101.json -I OCR-D-BIN -O OCR-D-SEG ocrd-typegroups-classifier -I OCR-D-IMG -O FONT @@ -17,11 +17,11 @@ ocrd resmgr download ocrd-calamari-recognize qurator-gt4histocr-1.0 ocrd-calamari-recognize -I OCR-D-SEG2 -O OCR-D-OCR -P checkpoint_dir qurator-gt4histocr-1.0 -P textequiv_level glyph ocrd resmgr download ocrd-kraken-segment blla.mlmodel -ocrd-kraken-segment -I OCR-D-BIN -O OCR-D-SEG3 -P device ${CUDA_DEVICE:-cpu} +ocrd-kraken-segment -I OCR-D-BIN -O OCR-D-SEG3 ocrd-cis-ocropy-resegment -I OCR-D-SEG3 -O OCR-D-SEG3X -P method baseline ocrd resmgr download ocrd-kraken-recognize reichsanzeiger.mlmodel -ocrd-kraken-recognize -I OCR-D-SEG3X -O OCR-D-OCR2 -P model reichsanzeiger.mlmodel -P device ${CUDA_DEVICE:-cpu} +ocrd-kraken-recognize -I OCR-D-SEG3X -O OCR-D-OCR2 -P model reichsanzeiger.mlmodel wget 
"https://git.informatik.uni-leipzig.de/ocr-d/cor-asv-ann-models/-/raw/master/s2s.gt4histocr.s-%C5%BF.d2.w0512.adam.attention.stateless.variational-dropout.char.transfer-lm.h5" ocrd resmgr download -n s2s.gt4histocr.s-ſ.d2.w0512.adam.attention.stateless.variational-dropout.char.transfer-lm.h5 ocrd-cor-asv-ann-process s2s.gt4histocr.s-ſ.d2.w0512.adam.attention.stateless.variational-dropout.char.transfer-lm.h5 From c7c170bddfb54d300ea1451463e7de32866d07b8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Jun 2023 10:49:08 +0200 Subject: [PATCH 54/63] update/improve readme --- README.md | 186 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 125 insertions(+), 61 deletions(-) diff --git a/README.md b/README.md index 8c3a0213..f9555c75 100644 --- a/README.md +++ b/README.md @@ -8,22 +8,28 @@ This controls installation of all OCR-D modules from source (as git submodules). It includes a Makefile for their installation into a virtual environment (venv) or Docker container. -(A venv is a local user directory with shell scripts to load/unload itself +(A [venv](https://packaging.python.org/tutorials/installing-packages/#creating-virtual-environments) +is a local user directory with shell scripts to load/unload itself in the current shell environment via PATH and PYTHONHOME.) -(NOTE: If you are going to install ocrd_all, you may want to first reference the [OCR-D setup guide](https://ocr-d.de/en/setup) at the OCR-D website. If you are a non-IT user, it is especially recommended you utilize the guide.) +> **Note**: If you are going to install ocrd_all, you may want to first consult +the [OCR-D setup guide](https://ocr-d.de/en/setup) on the [OCR-D website](https://ocr-d.de). +If you are a non-IT user, it is especially recommended you utilize the guide. 
-* [Preconditions](#preconditions) +* [Prerequisites](#prerequisites) * [Space](#space) * [Locale](#locale) * [System packages](#system-packages) + * [GPU support](#gpu-support) * [Usage](#usage) * [Targets](#targets) * [deps-ubuntu](#deps-ubuntu) + * [deps-cuda](#deps-cuda) * [modules](#modules) * [ocrd](#ocrd) * [all](#all) * [docker](#docker) + * [dockers](#dockers) * [clean](#clean) * [show](#show) * [help (default goal)](#help-default-goal) @@ -31,6 +37,7 @@ in the current shell environment via PATH and PYTHONHOME.) * [[any executable name]](#any-executable-name) * [Variables](#variables) * [OCRD_MODULES](#ocrd_modules) + * [NO_UPDATE](#no_update) * [PYTHON](#python) * [VIRTUAL_ENV](#virtual_env) * [TMPDIR](#tmpdir) @@ -48,24 +55,26 @@ in the current shell environment via PATH and PYTHONHOME.) * [System requirements](#system-requirements) * [Contributing](#contributing) -## Preconditions +## Prerequisites ### Space -Make sure that there is enough free disk space. 7 GiB or more is recommended for -the required submodules, build data, temporary data, installed virtual environment -and pip cache. +Make sure that there is enough free disk space. For a **full installation** including executables from all modules, +around **22 GiB** will be needed (mostly on the same filesystem as the ocrd_all checkout). The same goes for the +[`maximum-cuda`](#docker-hub) variant of the prebuilt Docker images (due on the filesystem harboring Docker, typically +`/var/lib/docker`). -If the `/tmp` directory has less than 5 GiB of free space, you can override the location -of temporary files by setting the `TMPDIR` variable when calling make: +Also, during build, an additional 5 GiB may be needed for temporary files, typically in the `/tmp` directory. 
+To use a different location path with more free space, set the `TMPDIR` variable when calling `make`: + + TMPDIR=/path/to/my/tempdir make all -```sh -TMPDIR=/path/to/my/tempdir make all -``` ### Locale -Next, the (shell) environment must have a Unicode-based localization. (Otherwise Python code based on `click` will not work, i.e. most OCR-D CLIs.) This is true for most installations today, and can be verified by: +The (shell) environment must have a Unicode-based localization. +(Otherwise Python code based on `click` will not work, i.e. most OCR-D CLIs.) +This is true for most installations today, and can be verified by: locale | fgrep .UTF-8 @@ -80,46 +89,72 @@ This should show several `LC_*` variables. Otherwise, either select another loca ### System packages -Install GNU make, git and GNU parallel. +* Install git, GNU make and GNU parallel. - # on Debian / Ubuntu: - sudo apt install make git parallel + # on Debian / Ubuntu: + sudo apt install make git parallel -Install wget or curl if you want to download Tesseract models. +* Install wget or curl if you want to download Tesseract models. - # on Debian / Ubuntu: - sudo apt install wget + # on Debian / Ubuntu: + sudo apt install wget -Install the packages for Python3 development and for Python3 virtual environments +* Install the packages for Python3 development and Python3 virtual environments for your operating system / distribution. - # on Debian / Ubuntu: - sudo apt install python3-dev python3-venv + # on Debian / Ubuntu: + sudo apt install python3-dev python3-venv + +* Some modules require [Tesseract](https://github.com/tesseract-ocr/tesseract). +If your operating system / distribution already provides Tesseract 4.1 +or newer, then just install its development package: + + # on Debian / Ubuntu: + sudo apt install libtesseract-dev + + Otherwise, recent Tesseract packages for Ubuntu are available via PPA + [alex-p](https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr-devel). 
+ + Alternatively, the latest version of Tesseract can also be built as a module locally. -Some modules use the Tesseract library. If your distribution provides Tesseract 4.1 -or newer, install the development package: +* Other modules will have additional system dependencies. - # on Debian / Ubuntu: - sudo apt install libtesseract-dev +> **Note**: System dependencies **for all modules** on Ubuntu 20.04 (or similar) +can also be installed **automatically** by running: -Ubuntu packages for Tesseract 5.0.0 (alpha) are available at the PPA -https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr-devel. + # on Debian / Ubuntu: + make modules + sudo apt install make + sudo make deps-ubuntu -Otherwise or for the latest Tesseract code it can also be built locally. +> (And you can define the scope of _all modules_ by setting the `OCRD_MODULES` +[variable](#Variables) as described below. If unsure, consider doing a dry-run +first, by using `make -n`.) -Other modules will have additional system dependencies. +### GPU support -System dependencies **for all modules** on Ubuntu 18.04 (or similar) can also be installed **automatically** by running: +Many executables can utilize Nvidia GPU for much faster computation, _if available_ (i.e. optionally). - # on Debian / Ubuntu: - sudo apt install make - sudo make deps-ubuntu +For that, as a further prerequisite you need an installation of +[CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) and additional optimised +libraries like [cuDNN](https://developer.nvidia.com/cudnn) for your system. -(And you can define the scope of _all modules_ by setting the `OCRD_MODULES` [variable](#Variables).) +The CUDA version currently supported is 11.8 (but other's may work as well). 
+ +> **Note**: CUDA toolkit and libraries (in a development version with CUDA compiler) +can also be installed **automatically** by running: + + make ocrd + sudo make deps-cuda + +> This will deploy [Micromamba](https://mamba.readthedocs.io/en/latest/index.html) +non-intrusively (without system packages or Conda environments), but also share some +of the CUDA libraries installed as Python packages system-wide via ld.so.conf rules. +If unsure, consider doing a dry-run first, by using `make -n`.) ## Usage -Run `make` with optional parameters for _variables_ and _targets_ like so: +Run `make` with optional parameters for __variables__ and __targets__ like so: make [PYTHON=python3] [VIRTUAL_ENV=./venv] [OCRD_MODULES="..."] [TARGET...] @@ -129,9 +164,17 @@ Run `make` with optional parameters for _variables_ and _targets_ like so: Install system packages for all modules. (Depends on [_modules_](#modules).) +See [system package prerequisites](#system-packages) above. + +#### _deps-cuda_ + +Install CUDA toolkit and libraries. (Depends on [_ocrd_](#ocrd).) + +See (optional) [GPU support prerequisites](#gpu-support) above. + #### _modules_ -Download/update all modules, but do not install anything. +Checkout/update all modules, but do not install anything. #### _all_ @@ -139,11 +182,17 @@ Install executables from all modules into the venv. (Depends on [_modules_](#mod #### _ocrd_ -Install only OCR-D/core and its CLI `ocrd` into the venv. +Install only the `core` module and its CLI `ocrd` into the venv. #### _docker_ -(Re-)build a docker image for all modules/executables. (Depends on [_modules_](#modules).) +(Re-)build a Docker image for all modules/executables. (Depends on [_modules_](#modules).) + +#### _dockers_ + +(Re-)build Docker images for some pre-selected subsets of modules/executables. (Depends on [_modules_](#modules).) + +(These are the very same variants published as [prebuilt images on Docker Hub](#docker-hub).) 
#### _clean_ @@ -151,7 +200,7 @@ Remove the venv and the modules' build directories. #### _show_ -Print the venv directory, the module directories, and the executable names. +Print the venv directory, the module directories, and the executable names – as configured by the current variables. #### _check_ @@ -183,6 +232,12 @@ Override the list of git submodules to include. Targets affected by this include - [docker](#docker) (reducing the list of executables and modules to install) - [show](#show) (reducing the list of `OCRD_MODULES` and of `OCRD_EXECUTABLES` to print) +#### _NO_UPDATE_ + +If set to `1`, then when installing executables, does not attempt to `git submodule update` +any currently checked out modules. (Useful for development when testing different module version +prior to a commit.) + #### _PYTHON_ Name of the Python binary to use (at least python3 required). @@ -191,7 +246,8 @@ Name of the Python binary to use (at least python3 required). Directory prefix to use for local installation. -(This is set automatically when activating a virtual environment on the shell. The build system will re-use the venv if one already exists here, or create one.) +(This is set automatically when activating a virtual environment on the shell. +The build system will re-use the venv if one already exists here, or create one otherwise.) #### _TMPDIR_ @@ -201,7 +257,8 @@ Override the default path (`/tmp` on Unix) where temporary files during build ar Add extra options to the `pip install` command like `-q` or `-v` or `-e`. -(The latter will install Python modules in _editable mode_, i.e. any update to the source will directly affect the executables.) +> **Note**: The latter option will install Python modules in __editable mode__, +i.e. any update to the source would directly affect the executables. 
#### _GIT_RECURSIVE_ @@ -220,11 +277,10 @@ Set `configure` options for building Tesseract from source (`--disable-openmp -- ### Examples -The following examples assume a working development installation of Tesseract. To build the latest Tesseract locally, run this command first: # Get code, build and install Tesseract with the default English model. - make tesseract + make install-tesseract Optionally install additional Tesseract models. @@ -288,6 +344,9 @@ TESSERACT_MODELS = deu frk script/Fraktur script/Latin # install all of Tesseract's submodules to support unit tests and training tools, too tesseract: GIT_RECURSIVE = --recursive + +# avoid automatic submodule updates +NO_UPDATE = 1 ``` Note: When `local.mk` exists, variables can still be overridden on the command line, @@ -297,23 +356,27 @@ but not from the shell environment ### Docker Hub -The project is available as prebuilt Docker images from [Docker Hub as -`ocrd/all`](https://hub.docker.com/r/ocrd/all). You can choose from three tags, -`minimum`, `medium` and `maximum`. These differ in which modules are included, -with `maximum` being the equivalent of doing `make all` with the default (unset) value for `OCRD_MODULES`. To download the images -on the command line: - -```sh -docker pull ocrd/all:minimum -# or -docker pull ocrd/all:medium -# or -docker pull ocrd/all:maximum -``` +Besides native installation, `ocrd_all` is also available as prebuilt Docker images +from [Docker Hub as `ocrd/all`](https://hub.docker.com/r/ocrd/all). You can choose from three tags, +`minimum`, `medium` and `maximum`. These differ w.r.t. which modules are included, +with `maximum` being the equivalent of doing `make all` with the default (unset) value for `OCRD_MODULES`. 
+ +To download the images on the command line: + + docker pull ocrd/all:minimum + # or + docker pull ocrd/all:medium + # or + docker pull ocrd/all:maximum + +In addition to these base variants, there are `minimum-cuda`, `medium-cuda` and `maximum-cuda` with GPU support. +(These also need [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) runtime, which will add the +`docker --gpus` option.) -In addition to these base variants, there are `minimum-cuda`, `medium-cuda` and `maximum-cuda` with GPU support. (Also needs [nvidia-docker](https://github.com/NVIDIA/nvidia-docker), which adds the `docker --gpus` option.) +These tags will be overwritten with every new release of ocrd_all. However, the `maximum` variant of each release +will also be aliased to a permanent tag by ISO date, e.g. `2023-04-02`. -Usage is the same [as if you had built the image yourself](#results). +Usage of the prebuilt Docker image is the same [as if you had built the image yourself](#results). This table lists which tag contains which module: | Module | `minimum` | `medium` | `maximum` | @@ -353,8 +416,9 @@ This table lists which tag contains which module: **Note**: The following modules have been disabled by default and can only be enabled by explicitly setting `OCRD_MODULES` or `DISABLED_MODULES`: -* cor-asv-fst (runtime issues) -* ocrd_ocropy (better implementation in ocrd_cis available) +* `cor-asv-fst` (runtime issues) +* `ocrd_ocropy` (better implementation in ocrd_cis available) +* `ocrd_pc_segmentation` (dependency and quality issues) ### Uninstall From 5f6a27b04416bf12dc642736c29b88effe295eb7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Jun 2023 11:04:42 +0200 Subject: [PATCH 55/63] improve readme markup --- README.md | 62 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index f9555c75..82fa85ee 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,8 @@ is a local user 
directory with shell scripts to load/unload itself in the current shell environment via PATH and PYTHONHOME.) > **Note**: If you are going to install ocrd_all, you may want to first consult -the [OCR-D setup guide](https://ocr-d.de/en/setup) on the [OCR-D website](https://ocr-d.de). -If you are a non-IT user, it is especially recommended you utilize the guide. +> the [OCR-D setup guide](https://ocr-d.de/en/setup) on the [OCR-D website](https://ocr-d.de). +> If you are a non-IT user, it is especially recommended you utilize the guide. * [Prerequisites](#prerequisites) * [Space](#space) @@ -120,13 +120,13 @@ or newer, then just install its development package: * Other modules will have additional system dependencies. > **Note**: System dependencies **for all modules** on Ubuntu 20.04 (or similar) -can also be installed **automatically** by running: - - # on Debian / Ubuntu: - make modules - sudo apt install make - sudo make deps-ubuntu - +> can also be installed **automatically** by running: +> +> # on Debian / Ubuntu: +> make modules +> sudo apt install make +> sudo make deps-ubuntu +> > (And you can define the scope of _all modules_ by setting the `OCRD_MODULES` [variable](#Variables) as described below. If unsure, consider doing a dry-run first, by using `make -n`.) @@ -142,11 +142,11 @@ libraries like [cuDNN](https://developer.nvidia.com/cudnn) for your system. The CUDA version currently supported is 11.8 (but other's may work as well). > **Note**: CUDA toolkit and libraries (in a development version with CUDA compiler) -can also be installed **automatically** by running: - - make ocrd - sudo make deps-cuda - +> can also be installed **automatically** by running: +> +> make ocrd +> sudo make deps-cuda +> > This will deploy [Micromamba](https://mamba.readthedocs.io/en/latest/index.html) non-intrusively (without system packages or Conda environments), but also share some of the CUDA libraries installed as Python packages system-wide via ld.so.conf rules. 
@@ -192,7 +192,13 @@ Install only the `core` module and its CLI `ocrd` into the venv. (Re-)build Docker images for some pre-selected subsets of modules/executables. (Depends on [_modules_](#modules).) -(These are the very same variants published as [prebuilt images on Docker Hub](#docker-hub).) +(These are the very same variants published as [prebuilt images on Docker Hub](#docker-hub), +cf. [CI configuration](.circleci/config.yml#L27-L65).) + +> **Note**: The image will contain all refs and branches of all checked out modules, +> which may not be actually needed. If you are planning on building and distributing +> Docker images with minimal size, consider using `GIT_DEPTH=--single-branch` +> before `modules` or running `make tidy` later-on. #### _clean_ @@ -258,7 +264,7 @@ Override the default path (`/tmp` on Unix) where temporary files during build ar Add extra options to the `pip install` command like `-q` or `-v` or `-e`. > **Note**: The latter option will install Python modules in __editable mode__, -i.e. any update to the source would directly affect the executables. +> i.e. any update to the source would directly affect the executables. #### _GIT_RECURSIVE_ @@ -268,8 +274,8 @@ Set to `--recursive` to checkout/update all modules recursively. (This usually i Add more models to the minimum required list of languages (`eng equ osd`) to install along with Tesseract. -Note: this only affects `make install-tesseract` (or `all`), but is independent of the `install-models` step. -(The latter delegates to `ocrd resmgr download`, which fetches all registered resources.) +> **Note**: this only affects `make install-tesseract` (or `all`), but is independent of the `install-models` step. +> (The latter delegates to `ocrd resmgr download`, which fetches all registered resources.) #### _TESSERACT_CONFIG_ @@ -349,10 +355,10 @@ tesseract: GIT_RECURSIVE = --recursive NO_UPDATE = 1 ``` -Note: When `local.mk` exists, variables can still be overridden on the command line, -(i.e. 
`make all OCRD_MODULES=` will build all executables for all modules again), -but not from the shell environment -(i.e. `OCRD_MODULES= make all` will still use the value from local.mk). +> **Note**: When `local.mk` exists, variables can still be overridden on the command line, +> (i.e. `make all OCRD_MODULES=` will build all executables for all modules again), +> but not from the shell environment +> (i.e. `OCRD_MODULES= make all` will still use the value from local.mk). ### Docker Hub @@ -413,12 +419,12 @@ This table lists which tag contains which module: | ocrd_ocropy | - | - | - | | ocrd_pc_segmentation | - | - | - | -**Note**: The following modules have been disabled by default and can only be -enabled by explicitly setting `OCRD_MODULES` or `DISABLED_MODULES`: - -* `cor-asv-fst` (runtime issues) -* `ocrd_ocropy` (better implementation in ocrd_cis available) -* `ocrd_pc_segmentation` (dependency and quality issues) +> **Note**: The following modules have been disabled by default and can only be +> enabled by explicitly setting `OCRD_MODULES` or `DISABLED_MODULES`: +> +> * `cor-asv-fst` (runtime issues) +> * `ocrd_ocropy` (better implementation in ocrd_cis available) +> * `ocrd_pc_segmentation` (dependency and quality issues) ### Uninstall From 8c625077ef5bdba47116bf3c755944f82106513f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 12 Jun 2023 18:01:53 +0200 Subject: [PATCH 56/63] improve/fix docker rules --- Makefile | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index 96390085..0b09bd5a 100644 --- a/Makefile +++ b/Makefile @@ -263,6 +263,14 @@ $(SHARE)/opencv-python: opencv-python/setup.py | $(ACTIVATE_VENV) $(SHARE) $(BIN)/ocrd: $(SHARE)/opencv-python endif +ifneq ($(filter ocrd_detectron2, $(OCRD_MODULES)),) +OCRD_EXECUTABLES += $(OCRD_DETECTRON2) +OCRD_DETECTRON2 := $(BIN)/ocrd-detectron2-segment +$(call multirule,$(OCRD_DETECTRON2)): ocrd_detectron2 $(BIN)/ocrd + . 
$(ACTIVATE_VENV) && $(MAKE) -C $< deps + $(pip_install) +endif + ifneq ($(filter ocrd_kraken, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(OCRD_KRAKEN) install-models: install-models-kraken @@ -308,14 +316,6 @@ else endif endif -ifneq ($(filter ocrd_detectron2, $(OCRD_MODULES)),) -OCRD_EXECUTABLES += $(OCRD_DETECTRON2) -OCRD_DETECTRON2 += $(BIN)/ocrd-detectron2-segment -$(call multirule,$(OCRD_DETECTRON2)): ocrd_detectron2 $(BIN)/ocrd - . $(ACTIVATE_VENV) && $(MAKE) -C $< deps - $(pip_install) -endif - ifneq ($(filter cor-asv-fst, $(OCRD_MODULES)),) deps-ubuntu-modules: cor-asv-fst OCRD_EXECUTABLES += $(OCRD_COR_ASV_FST) @@ -927,22 +927,23 @@ dockers: docker-minimum docker-minimum-cuda docker-medium docker-medium-cuda doc # (so components can be updated via git from the container alone) docker-%: PIP_OPTIONS = -e # old non-git alias -docke%um-git: docke%um +docker-%um-git: docker-%um # Minimum-size selection: use Ocropy binarization, use Tesseract from PPA -docker-mini%: DOCKER_MODULES = core ocrd_cis ocrd_fileformat ocrd_im6convert ocrd_pagetopdf ocrd_repair_inconsistencies ocrd_tesserocr ocrd_wrap tesserocr workflow-configuration ocrd_olahd_client +docker-mini%: DOCKER_MODULES := core ocrd_cis ocrd_fileformat ocrd_im6convert ocrd_pagetopdf ocrd_repair_inconsistencies ocrd_tesserocr ocrd_wrap tesserocr workflow-configuration ocrd_olahd_client # Medium-size selection: add Olena binarization and Calamari, use Tesseract from git, add evaluation -docker-medi%: DOCKER_MODULES = core cor-asv-ann dinglehopper docstruct format-converters nmalign ocrd_calamari ocrd_cis ocrd_fileformat ocrd_im6convert ocrd_keraslm ocrd_neat ocrd_olahd_client ocrd_olena ocrd_pagetopdf ocrd_repair_inconsistencies ocrd_segment ocrd_tesserocr ocrd_wrap tesseract tesserocr workflow-configuration +docker-medi%: DOCKER_MODULES := core cor-asv-ann dinglehopper docstruct format-converters nmalign ocrd_calamari ocrd_cis ocrd_fileformat ocrd_im6convert ocrd_keraslm ocrd_neat ocrd_olahd_client ocrd_olena 
ocrd_pagetopdf ocrd_repair_inconsistencies ocrd_segment ocrd_tesserocr ocrd_wrap tesseract tesserocr workflow-configuration # Maximum-size selection: use all modules -docker-maxi%: DOCKER_MODULES = $(OCRD_MODULES) +docker-maxi%: DOCKER_MODULES := $(OCRD_MODULES) # DOCKER_BASE_IMAGE -docker%um: DOCKER_BASE_IMAGE = docker.io/ocrd/core +docker-%um: DOCKER_BASE_IMAGE = docker.io/ocrd/core # CUDA variants -docker%-cuda: DOCKER_BASE_IMAGE = docker.io/ocrd/core-cuda +docker-%-cuda: DOCKER_BASE_IMAGE = docker.io/ocrd/core-cuda # Build rule for all selections -docker%: Dockerfile $(DOCKER_MODULES) +# FIXME: $(DOCKER_MODULES) ref does not work at phase 1; workaround: all modules +docker-%: Dockerfile modules docker build \ --progress=plain \ --build-arg BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ @@ -953,7 +954,7 @@ docker%: Dockerfile $(DOCKER_MODULES) --build-arg PARALLEL="$(DOCKER_PARALLEL)" \ --build-arg PYTHON="$(PYTHON)" \ --network=host \ - -t $(DOCKER_TAG):$(or $(*:-%=%),latest) . + -t $(DOCKER_TAG):$* . docker: DOCKER_MODULES ?= $(OCRD_MODULES) docker: DOCKER_PARALLEL ?= -j1 From 93b445f212f616c7d75a95da6926a90974f558b7 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 12 Jun 2023 18:37:31 +0200 Subject: [PATCH 57/63] :memo: changelog --- CHANGELOG.md | 182 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 182 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b48174d4..bdb85f46 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,187 @@ # Changelog +## [v2023-06-12](https://github.com/OCR-D/ocrd_all/releases/v2023-06-12) + +Changed: + + * All docker images now contain git checkouts and retain `/build`, i.e. 
behave like the `-git` variants + +Added: + + * `make ocrd-all-tool.json`: Generate and upload a combination of all processors' `ocrd-tool.json`, #362 + * `make test-workflow`: Run a workflow with most processors as a general smoke test + * `make test-cuda`: to test whether CUDA properly set up and has GPU available + * `make test-core`: Run OCR-D/core unit tests + +### [cor-asv-ann](https://github.com/ASVLeipzig/cor-asv-ann) [006a70e](https://github.com/ASVLeipzig/cor-asv-ann/commits/006a70e)..[2c4b1ff](https://github.com/ASVLeipzig/cor-asv-ann/commits/2c4b1ff) + +> Release: [v0.1.14](https://github.com/ASVLeipzig/cor-asv-ann/releases/v0.1.14) + + > * CI: use ocrd/core-cuda as base image + > * CI: dummy venv + > * CI: use proper tab character + > * CI: clone first + > * CI: mkdir first + > * CI: chdir to tmp location + > * CI: use /tmp for aux clone of ocrd_all + > * try getting tensorflow-gpu from Nvidia + > * use proper URLs for submodules + > * Merge pull request #6 from kba/init-report-dict + > * evaluate: skip pages with no results + +### [core](https://github.com/OCR-D/core) [de08453](https://github.com/OCR-D/core/commits/de08453)..[6708624](https://github.com/OCR-D/core/commits/6708624) + +> Release: [v2.51.0](https://github.com/OCR-D/core/releases/v2.51.0) + + > * Merge pull request #1055 from bertsky/deps-cuda + > * ci: disable upterm for gh actions + > * readme: remove dockerhub/travis badge, add GH actions badge + > * debug gh actions + > * test bashlib: /usr/bin/env bash instead of /bin/bash + > * test_workspace_bagger: use ocr-d.de instead of google.com for testing + > * disable logging tests until properly fixed + > * docker-image: reuse local ghcr.io image instead of docker.io + > * :package: v2.51.0 + > * :memo: changelog + > * make help: improve description + > * Revert "Merge remote-tracking branch 'hnesk/no-more-pkg_resources' into release-2.36.0" + > * remove out-dated processor resources + > * docker-cuda: improve (reduce size) again… + > * 
docker-cuda: rewrite… + > * core-cuda: use same CUDA libs as needed for Torch anyway + > * Merge branch 'pr-1008' into reduce-cuda + > * Merge branch 'master' of https://github.com/OCR-D/core into reduce-cuda + > * make install on py36: revert to prefer-binary via install + > * make install on py36: fix prefer-binary syntax + > * make install on py36: prefer binary OpenCV/Numpy via pip config instead of preinstall + > * core-cuda: install more CUDA libs via pip and ld.so.conf, simplify Dockerfile for that + > * core-cuda: use CUDA 11.8, install cuDNN via pip and make available system-wide via ld.so.conf + > * reinstate workaround for shapely, but more robust + > * docker-cuda: change base image, no multi-CUDA runtimes + > * keep gcc, no autoremove + > * rehash after pip upgrade + > * give up workaround for shapely-CUDA issue + +### [dinglehopper](https://github.com/qurator-spk/dinglehopper) [0fd4ea1](https://github.com/qurator-spk/dinglehopper/commits/0fd4ea1)..[35be58c](https://github.com/qurator-spk/dinglehopper/commits/35be58c) + + > * Merge pull request #83 from INL/feat/batch-processing + > * Merge pull request #82 from CircleCI-config-suggestions-bot/StoreTestResults + > * 🧹 .gitignore .python-version (for pyenv) + > * 🧹 Remove qurator. namespace prefix + > * 🐛 Fix installing by calling find_namespace_packages in setup.py + > * 🕸Do not use deprecated ID, pageId options + > * 🔧 Remove explicit namespace_packages + > * ✔ CircleCI: Explicitly install binary opencv-python-headless (dep of OCR-D?) 
to avoid compilation + > * 🐛 Remove deprecated declare_namespace call + +### [eynollah](https://github.com/qurator-spk/eynollah) [ea792d1](https://github.com/qurator-spk/eynollah/commits/ea792d1)..[706433c](https://github.com/qurator-spk/eynollah/commits/706433c) + +> Release: [v0.2.0](https://github.com/qurator-spk/eynollah/releases/v0.2.0) + + > * Revert "Merge pull request #97 from qurator-spk/420-namespace-package" + > * Merge pull request #100 from bertsky/patch-2 + > * Merge pull request #97 from qurator-spk/420-namespace-package + +### [ocrd_cis](https://github.com/cisocrgroup/ocrd_cis) [c90b29f](https://github.com/cisocrgroup/ocrd_cis/commits/c90b29f)..[a0ea0a2](https://github.com/cisocrgroup/ocrd_cis/commits/a0ea0a2) + +> Release: [v0.1.5](https://github.com/cisocrgroup/ocrd_cis/releases/v0.1.5) + + > * Merge branch 'kba:typo' #91 into fix-alpha-shape + > * Merge branch 'kba:double-page-max-size' #96 into fix-alpha-shape + > * Merge branch 'kba:resolve-resources' #83 into fix-alpha-shape + > * segment: adapt to OpenCV changes + > * resegment (baseline/ccomps): improve handling of fg conflicts + > * resegment: add param baseline_only + > * check_page/region/line: skip assumptions on number of components + > * adapt to Shapely 2.0 deprecations + > * adapt to Numpy 1.24 dtypes + > * resegment: list instead of generator + > * re/segment: improve polygon simplification + > * re/segment: join_baselines: skip lines outside of polygon + > * re/segment: join_baselines: for complex subtypes, apply recursively + > * re/segment: join_polygons: connect touching neighbours, too + +### [ocrd_fileformat](https://github.com/OCR-D/ocrd_fileformat) [dacfa50](https://github.com/OCR-D/ocrd_fileformat/commits/dacfa50)..[4e7e0de](https://github.com/OCR-D/ocrd_fileformat/commits/4e7e0de) + +> Release: [v0.7.0](https://github.com/OCR-D/ocrd_fileformat/releases/v0.7.0) + + > * :package: v0.7.0 + > * update ocr-fileformat + +### [ocrd_kraken](https://github.com/OCR-D/ocrd_kraken) 
[802c6b0](https://github.com/OCR-D/ocrd_kraken/commits/802c6b0)..[b13dd8a](https://github.com/OCR-D/ocrd_kraken/commits/b13dd8a) + +> Release: [v0.3.0](https://github.com/OCR-D/ocrd_kraken/releases/v0.3.0) + + > * segment/recognize: default to device=cuda:0 (now backed by safe fall-back) + > * segment/recognize: fall back to CPU if no CUDA device + > * fix typo + > * update changelog + > * recognize: project text upwards in order by concatenation + > * recognize: ensure baseline/boundary are consistent + > * recognize: ignore invalid baselines + > * setup metadata: update/improve + > * deps-ubuntu: update + > * improve/update readme + > * Dockerfile: use CUDA base image, improve labels + > * update changelog + > * recognize: pass lines in baseline format if any baselines are annotated + > * update blla.model URL (master→main) + > * recognize: workaround for empty/failed line records + > * recognize: workaround for better quality box cuts + > * recognize: avoid invalid polygons on single-glyph words + > * Revert "recognize: avoid invalid polygons on single-glyph words" + > * segment: also show tags/type prediction + > * recognize: avoid invalid polygons on single-glyph words + > * recognize: use proper data structures of rpred + +### [ocrd_pagetopdf](https://github.com/UB-Mannheim/ocrd_pagetopdf) [6155605](https://github.com/UB-Mannheim/ocrd_pagetopdf/commits/6155605)..[4f4a330](https://github.com/UB-Mannheim/ocrd_pagetopdf/commits/4f4a330) + +> Release: [v1.0.0](https://github.com/UB-Mannheim/ocrd_pagetopdf/releases/v1.0.0) + + > * Merge pull request #22 from bertsky/fix-input-files + +### [ocrd_wrap](https://github.com/bertsky/ocrd_wrap) [63c04d5](https://github.com/bertsky/ocrd_wrap/commits/63c04d5)..[2cd800d](https://github.com/bertsky/ocrd_wrap/commits/2cd800d) + +> Release: [v0.1.8](https://github.com/bertsky/ocrd_wrap/releases/v0.1.8) + + > * :package: 0.1.8 + > * Merge pull request #10 from bertsky/update-numpy + +### 
[opencv-python](https://github.com/skvark/opencv-python) [6b73d90](https://github.com/skvark/opencv-python/commits/6b73d90)..[474a1cc](https://github.com/skvark/opencv-python/commits/474a1cc) + +> Release: [72](https://github.com/skvark/opencv-python/releases/72) + + > * Merge pull request #849 from asmorkalov/as/python3_for_build + > * Fix: numpy version for python 3.11 (#839) + > * Merge pull request #852 from asmorkalov:as/ci_check + > * Merge pull request #837 from bertsky/fix-py38-build + > * Merge pull request #838 from henryiii/patch-2 + +### [sbb_binarization](https://github.com/qurator-spk/sbb_binarization) [39ef3fd](https://github.com/qurator-spk/sbb_binarization/commits/39ef3fd)..[010ec99](https://github.com/qurator-spk/sbb_binarization/commits/010ec99) + +> Release: [v0.1.0](https://github.com/qurator-spk/sbb_binarization/releases/v0.1.0) + + > * :package: v0.1.0 + > * Update README.md + > * update CI badge + > * Merge pull request #59 from bertsky/change-model-url + > * Merge pull request #56 from bertsky/non-verbose + +### [workflow-configuration](https://github.com/bertsky/workflow-configuration) [cb923f7](https://github.com/bertsky/workflow-configuration/commits/cb923f7)..[5aff777](https://github.com/bertsky/workflow-configuration/commits/5aff777) + + > * ocrd-import: add option --regex (positive path selector) + > * ocrd-import: fix skipping in subshell + > * add METS transforms to TOC + > * generalise standalone CLI for both PAGE and METS XSL, update documentation + > * mets-copy-agents.xsl: make path for other-mets relative to input mets (not stylesheet file) + > * (ocrd-)page-transform: add pretty-printing option + > * add page-ensure-readingorder.xsl + > * add page-ensure-textequiv-index.xsl + > * ocrd-import: also replace comma in IDs + > * Merge remote-tracking branch 'origin/master' + > * page-textequiv-*: ensure target TextEquiv exists + + ## [v2023-03-24](https://github.com/OCR-D/ocrd_all/releases/v2023-03-24) Fixed: From 
edb8f23bf025832185d421e88220e901655b5df4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Jun 2023 16:24:11 +0200 Subject: [PATCH 58/63] switch detectron2/kraken dependency --- Makefile | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 0b09bd5a..5586a52f 100644 --- a/Makefile +++ b/Makefile @@ -263,14 +263,6 @@ $(SHARE)/opencv-python: opencv-python/setup.py | $(ACTIVATE_VENV) $(SHARE) $(BIN)/ocrd: $(SHARE)/opencv-python endif -ifneq ($(filter ocrd_detectron2, $(OCRD_MODULES)),) -OCRD_EXECUTABLES += $(OCRD_DETECTRON2) -OCRD_DETECTRON2 := $(BIN)/ocrd-detectron2-segment -$(call multirule,$(OCRD_DETECTRON2)): ocrd_detectron2 $(BIN)/ocrd - . $(ACTIVATE_VENV) && $(MAKE) -C $< deps - $(pip_install) -endif - ifneq ($(filter ocrd_kraken, $(OCRD_MODULES)),) OCRD_EXECUTABLES += $(OCRD_KRAKEN) install-models: install-models-kraken @@ -281,7 +273,15 @@ install-models-kraken: OCRD_KRAKEN := $(BIN)/ocrd-kraken-binarize OCRD_KRAKEN += $(BIN)/ocrd-kraken-segment OCRD_KRAKEN += $(BIN)/ocrd-kraken-recognize -$(call multirule,$(OCRD_KRAKEN)): ocrd_kraken $(BIN)/ocrd | $(OCRD_DETECTRON2) +$(call multirule,$(OCRD_KRAKEN)): ocrd_kraken $(BIN)/ocrd + $(pip_install) +endif + +ifneq ($(filter ocrd_detectron2, $(OCRD_MODULES)),) +OCRD_EXECUTABLES += $(OCRD_DETECTRON2) +OCRD_DETECTRON2 := $(BIN)/ocrd-detectron2-segment +$(call multirule,$(OCRD_DETECTRON2)): ocrd_detectron2 $(BIN)/ocrd | $(OCRD_KRAKEN) + . $(ACTIVATE_VENV) && $(MAKE) -C $< deps $(pip_install) endif From e4fe65dcab2c376cf2052aa2b52237c82e191eba Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Jun 2023 17:05:32 +0200 Subject: [PATCH 59/63] update changelog again --- CHANGELOG.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bdb85f46..bb217a19 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,14 +5,27 @@ Changed: * All docker images now contain git checkouts and retain `/build`, i.e. 
behave like the `-git` variants + * No more git updates within docker build, but fix git module dependency outside + * Reduce docker image size (by reinstating all-in-one layer, removing cache, avoiding duplicate CUDA libraries...) + * Use `git submodule update --single-branch` on CI to reduce docker image size Added: + * `make deps-cuda`: non-intrusively support CUDA system dependencies (in docker or native) * `make ocrd-all-tool.json`: Generate and upload a combination of all processors' `ocrd-tool.json`, #362 * `make test-workflow`: Run a workflow with most processors as a general smoke test * `make test-cuda`: to test whether CUDA properly set up and has GPU available * `make test-core`: Run OCR-D/core unit tests +Fixed: + + * dependencies between modules, esp. with custom `OCRD_MODULES` selection + * editable mode (`pip install -e`) + * OpenCV build + * get `tesserocr` from PyPI if disabled as a module + * get `ocrd` from PyPI if core disabled as a module + * consistent interoperable module versions (esp. 
Numpy/OpenCV/Shapely/Protobuf/Torch/TF Python dependencies) + ### [cor-asv-ann](https://github.com/ASVLeipzig/cor-asv-ann) [006a70e](https://github.com/ASVLeipzig/cor-asv-ann/commits/006a70e)..[2c4b1ff](https://github.com/ASVLeipzig/cor-asv-ann/commits/2c4b1ff) > Release: [v0.1.14](https://github.com/ASVLeipzig/cor-asv-ann/releases/v0.1.14) From 626110a1bfc1135ef7fec794457be9ba31b7419c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Jun 2023 22:28:44 +0200 Subject: [PATCH 60/63] GHA makedocker: add input switch for upterm console --- .github/workflows/makedocker.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/makedocker.yml b/.github/workflows/makedocker.yml index a478a4a6..9e7a7f7a 100644 --- a/.github/workflows/makedocker.yml +++ b/.github/workflows/makedocker.yml @@ -33,6 +33,10 @@ on: description: 'Upload Docker image Github Container Registry' default: False type: boolean + upterm-session: + description: 'Run SSH login server for debugging' + default: False + type: boolean # not yet: #push: # branches: [ "master" ] @@ -83,7 +87,8 @@ jobs: make ocrd-all-tool.json wc -l ocrd-all-tool.json - name: Setup upterm session - if: false # interactive SSH logins for debugging + # interactive SSH logins for debugging + if: github.event.inputs.upterm-session == true uses: lhotari/action-upterm@v1 - name: Upload ocrd-all-tool.json uses: actions/upload-artifact@v3 From 6131c4684858bcad029f207ae824201d76f0d8e8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Jun 2023 23:31:33 +0200 Subject: [PATCH 61/63] GHA makedocker: move upterm console step before build --- .github/workflows/makedocker.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/makedocker.yml b/.github/workflows/makedocker.yml index 9e7a7f7a..b5e00dc5 100644 --- a/.github/workflows/makedocker.yml +++ b/.github/workflows/makedocker.yml @@ -77,6 +77,10 @@ jobs: sudo rm -rf "$AGENT_TOOLSDIRECTORY" 
sudo du -mscx /* 2>/dev/null || true df -h / + - name: Setup upterm session + # interactive SSH logins for debugging + if: github.event.inputs.upterm-session == true + uses: lhotari/action-upterm@v1 - name: Make Docker image run: make docker-${{ github.event.inputs.docker-image }} GIT_DEPTH=--single-branch - name: Generate ocrd-all-tool.json @@ -86,10 +90,6 @@ jobs: export OCRD_MODULES=$(docker run --rm ocrd/all:${{ github.event.inputs.docker-image }} bash -c 'echo $OCRD_MODULES') make ocrd-all-tool.json wc -l ocrd-all-tool.json - - name: Setup upterm session - # interactive SSH logins for debugging - if: github.event.inputs.upterm-session == true - uses: lhotari/action-upterm@v1 - name: Upload ocrd-all-tool.json uses: actions/upload-artifact@v3 with: From 1a5a49aa5e836b884b9a41ed7c517a4c3c321111 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 14 Jun 2023 02:15:02 +0200 Subject: [PATCH 62/63] GHA makedocker: workaround for input boolean vs string mixup --- .github/workflows/makedocker.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/makedocker.yml b/.github/workflows/makedocker.yml index b5e00dc5..09f4bc81 100644 --- a/.github/workflows/makedocker.yml +++ b/.github/workflows/makedocker.yml @@ -79,7 +79,7 @@ jobs: df -h / - name: Setup upterm session # interactive SSH logins for debugging - if: github.event.inputs.upterm-session == true + if: github.event.inputs.upterm-session == 'true' uses: lhotari/action-upterm@v1 - name: Make Docker image run: make docker-${{ github.event.inputs.docker-image }} GIT_DEPTH=--single-branch @@ -97,10 +97,10 @@ jobs: path: ./ocrd-all-tool.json # if-no-files-found: error - name: Login to Docker Hub - if: github.event.inputs.upload-dockerhub == true + if: github.event.inputs.upload-dockerhub == 'true' run: echo ${{ secrets.DOCKERHUB_PASSWORD }} | docker login --username ${{ secrets.DOCKERHUB_USERNAME }} --password-stdin - name: Push to Docker Hub - if: 
github.event.inputs.upload-dockerhub == true + if: github.event.inputs.upload-dockerhub == 'true' run: | docker push ocrd/all:${{ github.event.inputs.docker-image }} if test ${{ github.event.inputs.docker-image }} = maximum-git; then @@ -109,14 +109,14 @@ jobs: docker push ocrd/all:latest fi - name: Login to GitHub Container Registry - if: github.event.inputs.upload-github == true + if: github.event.inputs.upload-github == 'true' uses: docker/login-action@v2 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Push to Github Container Registry - if: github.event.inputs.upload-github == true + if: github.event.inputs.upload-github == 'true' run: | docker tag ocrd/all:${{ github.event.inputs.docker-image }} ghcr.io/ocr-d/all:${{ github.event.inputs.docker-image }} docker push ghcr.io/ocr-d/all:${{ github.event.inputs.docker-image }} From 4ecde605ec169efaaf5c542d5fa916b907613b7b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 14 Jun 2023 03:21:17 +0200 Subject: [PATCH 63/63] docker*: avoid unconstrained parallelism (which leads to deadlock) --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7369787d..e40176c7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -101,7 +101,7 @@ RUN echo "hash -r" >> docker.sh # build/install all tools of the requested modules: RUN echo "make $PARALLEL all" >> docker.sh # remove unneeded automatic deps and clear pkg cache -RUN echo "apt-get remove automake autoconf libtool pkg-config g++ && apt-get clean" >> docker.sh +RUN echo "apt-get -y remove automake autoconf libtool pkg-config g++ && apt-get -y clean" >> docker.sh # clean-up some temporary files (git repos are also installation targets and must be kept) RUN echo "make -i clean-olena clean-tesseract; rm -fr /.cache" >> docker.sh # run the script in one layer/step (to minimise image size) @@ -110,7 +110,7 @@ RUN set -a; bash docker.sh # update ld.so cache for new libs in 
/usr/local RUN ldconfig # check installation -RUN make -j check CHECK_HELP=1 +RUN make -j4 check CHECK_HELP=1 RUN if echo $BASE_IMAGE | fgrep -q cuda; then make fix-cuda; fi # remove (dated) security workaround preventing use of