diff --git a/.gitmodules b/.gitmodules
index e63bf021..aecbc18a 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -30,6 +30,9 @@
 [submodule "cor-asv-ann"]
 	path = cor-asv-ann
 	url = https://github.com/ASVLeipzig/cor-asv-ann.git
+[submodule "tesseract"]
+	path = tesseract
+	url = https://github.com/tesseract-ocr/tesseract.git
 [submodule "format-converters"]
 	path = format-converters
 	url = https://github.com/OCR-D/format-converters.git
diff --git a/Makefile b/Makefile
index a935a330..0ccf360b 100644
--- a/Makefile
+++ b/Makefile
@@ -747,6 +747,41 @@ testcuda test-cuda: $(ACTIVATE_VENV)
 test-workflow: test-assets core $(BIN)/ocrd $(ACTIVATE_VENV)
 	. $(ACTIVATE_VENV) && cd core/tests/assets/SBB0000F29300010000/data/ && bash -x $(CURDIR)/test-workflow.sh
 
+DOCKER_COMPOSE = docker compose
+INTEGRATION_TEST_IN_DOCKER = docker exec core_test
+# Compose and pytest invocations shared by both network integration test targets.
+NETWORK_TEST_COMPOSE = $(DOCKER_COMPOSE) --file tests/network/docker-compose.yml
+NETWORK_TEST_PYTEST = $(INTEGRATION_TEST_IN_DOCKER) pytest 'tests/network/test_ocrd_all_workflow.py' -v
+
+.PHONY: network-integration-test network-integration-test-cicd
+
+# Local run: start the compose stack, copy the test workspace into the processing
+# server, run pytest inside core_test and always tear the stack down afterwards.
+# The pytest exit status is kept, so a failing test fails the target.
+network-integration-test: test-assets
+	$(NETWORK_TEST_COMPOSE) up -d
+	docker cp core/tests/assets/kant_aufklaerung_1784/data/. ocrd_network_processing_server:/data
+	# Queues must exist and it takes time until they are created by the workers. We might need
+	# a mechanism to test if all queues are there but that is not available yet. So sleeping
+	# here for now
+	sleep 10
+	status=0; $(NETWORK_TEST_PYTEST) || status=$$?; \
+	$(NETWORK_TEST_COMPOSE) down -v --remove-orphans; \
+	exit $$status
+
+# CI run: same steps, but without the test-assets prerequisite (core/tests/assets
+# must already be populated) and without error handling: the first failing step
+# aborts the recipe, so the stack stays up when pytest fails.
+network-integration-test-cicd:
+	$(NETWORK_TEST_COMPOSE) up -d
+	docker cp core/tests/assets/kant_aufklaerung_1784/data/. ocrd_network_processing_server:/data
+	# Queues must exist and it takes time until they are created by the workers. We might need
+	# a mechanism to test if all queues are there but that is not available yet. So sleeping
+	# here for now
+	sleep 10
+	$(NETWORK_TEST_PYTEST)
+	$(NETWORK_TEST_COMPOSE) down -v --remove-orphans
+
 test-assets:
 	$(MAKE) -C core assets
 
diff --git a/core b/core
index c5b5580b..2a7ef7b4 160000
--- a/core
+++ b/core
@@ -1 +1 @@
-Subproject commit c5b5580ba5c517e6ee151a5b3a1d8fe1b3ba0d88
+Subproject commit 2a7ef7b49b8c94efc8e13df77f047e95f62a515d
diff --git a/tests/network/.env b/tests/network/.env
new file mode 100644
index 00000000..3b732822
--- /dev/null
+++ b/tests/network/.env
@@ -0,0 +1,24 @@
+DOCKER_OCRD_NETWORK_NAME=ocrd_network_test
+DOCKER_OCRD_NETWORK_MTU=1450
+
+OCRD_NETWORK_LOGS_ROOT=/tmp/ocrd_network_logs
+OCRD_NETWORK_SOCKETS_ROOT=/tmp/ocrd_network_sockets
+OCRD_NETWORK_DATA=ocrd-test-workspace
+
+OCRD_PS_HOST=ps-docker-host
+OCRD_PS_PORT=8000
+OCRD_PS_URL=http://${OCRD_PS_HOST}.${DOCKER_OCRD_NETWORK_NAME}:${OCRD_PS_PORT}
+
+MONGODB_NAME=ocrd_network_test
+MONGODB_USER=network_test
+MONGODB_PASS=network_test
+MONGODB_HOST=mongodb-docker-host
+MONGODB_PORT=27017
+MONGODB_URL=mongodb://${MONGODB_USER}:${MONGODB_PASS}@${MONGODB_HOST}.${DOCKER_OCRD_NETWORK_NAME}:${MONGODB_PORT}
+
+RABBITMQ_FEATURE_FLAGS=quorum_queue,implicit_default_bindings,classic_mirrored_queue_version
+RABBITMQ_USER=network_test
+RABBITMQ_PASS=network_test
+RABBITMQ_HOST=rabbitmq-docker-host
+RABBITMQ_PORT=5672
+RABBITMQ_URL=amqp://${RABBITMQ_USER}:${RABBITMQ_PASS}@${RABBITMQ_HOST}.${DOCKER_OCRD_NETWORK_NAME}:${RABBITMQ_PORT}
diff --git a/tests/network/docker-compose.yml b/tests/network/docker-compose.yml
new file mode 100644
index 00000000..3e507d54
--- /dev/null
+++ b/tests/network/docker-compose.yml
@@ -0,0 +1,261 @@
+networks:
+  ocrd_network_test:
+    name: ${DOCKER_OCRD_NETWORK_NAME}
+    driver: bridge
+    driver_opts:
+      com.docker.network.driver.mtu: ${DOCKER_OCRD_NETWORK_MTU}
+
+volumes:
+  ocrd-test-workspace: {}
+
+services:
+  ocrd_network_processing_server:
+    image: "ocrd/core"
+    hostname: ${OCRD_PS_HOST}
+    container_name: ocrd_network_processing_server
+    depends_on:
+      ocrd_network_mongo_db:
+        condition: service_healthy
+      ocrd_network_rabbit_mq:
+        condition: service_healthy
+    networks:
+      - ${DOCKER_OCRD_NETWORK_NAME}
+    ports:
+      - ${OCRD_PS_PORT}:8000
+    environment:
+      DB_NAME: ${MONGODB_NAME}
+      DB_URL: ${MONGODB_URL}
+      RABBITMQ_URL: ${RABBITMQ_URL}
+      OCRD_NETWORK_LOGS_ROOT_DIR: /ocrd-data/ocrd_network_logs
+      OCRD_NETWORK_SOCKETS_ROOT_DIR: /ocrd-data/ocrd_network_sockets
+    healthcheck:
+      test: curl -f ${OCRD_PS_URL}/
+      interval: 1s
+      timeout: 3s
+      retries: 30
+    volumes:
+      - ${OCRD_NETWORK_LOGS_ROOT}:/ocrd-data/ocrd_network_logs
+      - ${OCRD_NETWORK_SOCKETS_ROOT}:/ocrd-data/ocrd_network_sockets
+      - "./ocrd_logging.conf:/etc/ocrd_logging.conf"
+      - "./ps_config.yml:/ocrd-data/ps_config.yml"
+      - "${OCRD_NETWORK_DATA}:/data"
+    command: ocrd network processing-server -a 0.0.0.0:8000 /ocrd-data/ps_config.yml
+
+  ocrd_network_mongo_db:
+    image: "mongo"
+    hostname: ${MONGODB_HOST}
+    container_name: ocrd_network_mongo_db
+    networks:
+      - ${DOCKER_OCRD_NETWORK_NAME}
+    ports:
+      - ${MONGODB_PORT}:27017
+    environment:
+      - MONGO_INITDB_ROOT_USERNAME=${MONGODB_USER}
+      - MONGO_INITDB_ROOT_PASSWORD=${MONGODB_PASS}
+    healthcheck:
+      test: echo 'db.runCommand("ping").ok' | mongosh localhost:27017/test --quiet
+      interval: 1s
+      timeout: 3s
+      retries: 30
+
+  ocrd_network_rabbit_mq:
+    image: "rabbitmq:3.12-management"
+    hostname: ${RABBITMQ_HOST}
+    container_name: ocrd_network_rabbit_mq
+    networks:
+      - ${DOCKER_OCRD_NETWORK_NAME}
+    ports:
+      - ${RABBITMQ_PORT}:5672
+      - 15672:15672
+      - 25672:25672
+    environment:
+      - RABBITMQ_DEFAULT_USER=${RABBITMQ_USER}
+      - RABBITMQ_DEFAULT_PASS=${RABBITMQ_PASS}
+      - RABBITMQ_FEATURE_FLAGS=${RABBITMQ_FEATURE_FLAGS}
+    healthcheck:
+      test: rabbitmq-diagnostics check_port_connectivity
+      interval: 1s
+      timeout: 3s
+      retries: 30
+
+  ocrd_network_core_test:
+    build:
+      context: ../../core
+      args:
+        BASE_IMAGE: 'ubuntu:22.04'
+        SKIP_ASSETS: 1
+      target: ocrd_core_test
+    container_name: core_test
+    depends_on:
+      ocrd_network_processing_server:
+        condition: service_healthy
+    networks:
+      - ${DOCKER_OCRD_NETWORK_NAME}
+    environment:
+      DB_NAME: ${MONGODB_NAME}
+      DB_URL: ${MONGODB_URL}
+      PROCESSING_SERVER_URL: ${OCRD_PS_URL}
+      RABBITMQ_URL: ${RABBITMQ_URL}
+      OCRD_NETWORK_LOGS_ROOT_DIR: /ocrd-data/ocrd_network_logs
+      OCRD_NETWORK_SOCKETS_ROOT_DIR: /ocrd-data/ocrd_network_sockets
+    volumes:
+      - ${OCRD_NETWORK_LOGS_ROOT}:/ocrd-data/ocrd_network_logs
+      - ${OCRD_NETWORK_SOCKETS_ROOT}:/ocrd-data/ocrd_network_sockets
+      - "./ocrd_all-test-workflow.txt:/ocrd-data/assets/ocrd_all-test-workflow.txt"
+      - "./ocrd_logging.conf:/etc/ocrd_logging.conf"
+
+  ocrd-olena-binarize:
+    image: ocrd/olena
+    container_name: ocrd-olena-binarize
+    depends_on:
+      ocrd_network_processing_server:
+        condition: service_healthy
+    networks:
+      - ${DOCKER_OCRD_NETWORK_NAME}
+    environment:
+      - OCRD_NETWORK_LOGS_ROOT_DIR=/ocrd-data/ocrd_network_logs
+      - OCRD_LOGGING_DEBUG=true
+    volumes:
+      - ${OCRD_NETWORK_LOGS_ROOT}:/ocrd-data/ocrd_network_logs
+      - ${OCRD_NETWORK_SOCKETS_ROOT}:/ocrd-data/ocrd_network_sockets
+      - ${OCRD_NETWORK_DATA}:/data
+    command: ocrd-olena-binarize worker --database ${MONGODB_URL} --queue ${RABBITMQ_URL}
+
+  ocrd-anybaseocr-crop:
+    image: ocrd/anybaseocr
+    container_name: ocrd-anybaseocr-crop
+    depends_on:
+      ocrd_network_processing_server:
+        condition: service_healthy
+    networks:
+      - ${DOCKER_OCRD_NETWORK_NAME}
+    environment:
+      - OCRD_NETWORK_LOGS_ROOT_DIR=/ocrd-data/ocrd_network_logs
+      - OCRD_LOGGING_DEBUG=true
+    volumes:
+      - ${OCRD_NETWORK_LOGS_ROOT}:/ocrd-data/ocrd_network_logs
+      - ${OCRD_NETWORK_SOCKETS_ROOT}:/ocrd-data/ocrd_network_sockets
+      - ${OCRD_NETWORK_DATA}:/data
+    command: ocrd-anybaseocr-crop worker --database ${MONGODB_URL} --queue ${RABBITMQ_URL}
+
+  ocrd-cis-ocropy-denoise:
+    image: ocrd/cis
+    container_name: ocrd-cis-ocropy-denoise
+    depends_on:
+      ocrd_network_processing_server:
+        condition: service_healthy
+    networks:
+      - ${DOCKER_OCRD_NETWORK_NAME}
+    environment:
+      - OCRD_NETWORK_LOGS_ROOT_DIR=/ocrd-data/ocrd_network_logs
+      - OCRD_LOGGING_DEBUG=true
+    volumes:
+      - ${OCRD_NETWORK_LOGS_ROOT}:/ocrd-data/ocrd_network_logs
+      - ${OCRD_NETWORK_SOCKETS_ROOT}:/ocrd-data/ocrd_network_sockets
+      - ${OCRD_NETWORK_DATA}:/data
+    command: ocrd-cis-ocropy-denoise worker --database ${MONGODB_URL} --queue ${RABBITMQ_URL}
+
+  ocrd-cis-ocropy-clip:
+    image: ocrd/cis
+    container_name: ocrd-cis-ocropy-clip
+    depends_on:
+      ocrd_network_processing_server:
+        condition: service_healthy
+    networks:
+      - ${DOCKER_OCRD_NETWORK_NAME}
+    environment:
+      - OCRD_NETWORK_LOGS_ROOT_DIR=/ocrd-data/ocrd_network_logs
+      - OCRD_LOGGING_DEBUG=true
+    volumes:
+      - ${OCRD_NETWORK_LOGS_ROOT}:/ocrd-data/ocrd_network_logs
+      - ${OCRD_NETWORK_SOCKETS_ROOT}:/ocrd-data/ocrd_network_sockets
+      - ${OCRD_NETWORK_DATA}:/data
+    command: ocrd-cis-ocropy-clip worker --database ${MONGODB_URL} --queue ${RABBITMQ_URL}
+
+  ocrd-cis-ocropy-segment:
+    image: ocrd/cis
+    container_name: ocrd-cis-ocropy-segment
+    depends_on:
+      ocrd_network_processing_server:
+        condition: service_healthy
+    networks:
+      - ${DOCKER_OCRD_NETWORK_NAME}
+    environment:
+      - OCRD_NETWORK_LOGS_ROOT_DIR=/ocrd-data/ocrd_network_logs
+      - OCRD_LOGGING_DEBUG=true
+    volumes:
+      - ${OCRD_NETWORK_LOGS_ROOT}:/ocrd-data/ocrd_network_logs
+      - ${OCRD_NETWORK_SOCKETS_ROOT}:/ocrd-data/ocrd_network_sockets
+      - ${OCRD_NETWORK_DATA}:/data
+    command: ocrd-cis-ocropy-segment worker --database ${MONGODB_URL} --queue ${RABBITMQ_URL}
+
+  ocrd-cis-ocropy-dewarp:
+    image: ocrd/cis
+    container_name: ocrd-cis-ocropy-dewarp
+    depends_on:
+      ocrd_network_processing_server:
+        condition: service_healthy
+    networks:
+      - ${DOCKER_OCRD_NETWORK_NAME}
+    environment:
+      - OCRD_NETWORK_LOGS_ROOT_DIR=/ocrd-data/ocrd_network_logs
+      - OCRD_LOGGING_DEBUG=true
+    volumes:
+      - ${OCRD_NETWORK_LOGS_ROOT}:/ocrd-data/ocrd_network_logs
+      - ${OCRD_NETWORK_SOCKETS_ROOT}:/ocrd-data/ocrd_network_sockets
+      - ${OCRD_NETWORK_DATA}:/data
+    command: ocrd-cis-ocropy-dewarp worker --database ${MONGODB_URL} --queue ${RABBITMQ_URL}
+
+  ocrd-tesserocr-segment-region:
+    image: ocrd/tesserocr
+    container_name: ocrd-tesserocr-segment-region
+    depends_on:
+      ocrd_network_processing_server:
+        condition: service_healthy
+    networks:
+      - ${DOCKER_OCRD_NETWORK_NAME}
+    environment:
+      - TESSDATA_PREFIX=/usr/local/share/tessdata
+      - OCRD_NETWORK_LOGS_ROOT_DIR=/ocrd-data/ocrd_network_logs
+      - OCRD_LOGGING_DEBUG=true
+    volumes:
+      - ${OCRD_NETWORK_LOGS_ROOT}:/ocrd-data/ocrd_network_logs
+      - ${OCRD_NETWORK_SOCKETS_ROOT}:/ocrd-data/ocrd_network_sockets
+      - ${OCRD_NETWORK_DATA}:/data
+    command: ocrd-tesserocr-segment-region worker --database ${MONGODB_URL} --queue ${RABBITMQ_URL}
+
+  ocrd-tesserocr-recognize:
+    image: ocrd/tesserocr
+    container_name: ocrd-tesserocr-recognize
+    depends_on:
+      ocrd_network_processing_server:
+        condition: service_healthy
+    networks:
+      - ${DOCKER_OCRD_NETWORK_NAME}
+    environment:
+      - TESSDATA_PREFIX=/usr/local/share/tessdata
+      - OCRD_NETWORK_LOGS_ROOT_DIR=/ocrd-data/ocrd_network_logs
+      - OCRD_LOGGING_DEBUG=true
+    volumes:
+      - ${OCRD_NETWORK_LOGS_ROOT}:/ocrd-data/ocrd_network_logs
+      - ${OCRD_NETWORK_SOCKETS_ROOT}:/ocrd-data/ocrd_network_sockets
+      - ${OCRD_NETWORK_DATA}:/data
+    command: ocrd-tesserocr-recognize worker --database ${MONGODB_URL} --queue ${RABBITMQ_URL}
+
+  ocrd-segment-repair:
+    image: ocrd/segment
+    container_name: ocrd-segment-repair
+    depends_on:
+      ocrd_network_processing_server:
+        condition: service_healthy
+    networks:
+      - ${DOCKER_OCRD_NETWORK_NAME}
+    environment:
+      - OCRD_NETWORK_LOGS_ROOT_DIR=/ocrd-data/ocrd_network_logs
+      - OCRD_LOGGING_DEBUG=true
+    volumes:
+      - ${OCRD_NETWORK_LOGS_ROOT}:/ocrd-data/ocrd_network_logs
+      - ${OCRD_NETWORK_SOCKETS_ROOT}:/ocrd-data/ocrd_network_sockets
+      - ${OCRD_NETWORK_DATA}:/data
+    command: ocrd-segment-repair worker --database ${MONGODB_URL} --queue ${RABBITMQ_URL}
+
diff --git a/tests/network/ocrd_all-test-workflow.txt b/tests/network/ocrd_all-test-workflow.txt
new file mode 100644
index 00000000..2e4485da
--- /dev/null
+++ b/tests/network/ocrd_all-test-workflow.txt
@@ -0,0 +1,9 @@
+olena-binarize -I OCR-D-IMG -O O-1
+anybaseocr-crop -I O-1 -O O-2
+cis-ocropy-denoise -I O-2 -O O-3 -P dpi 300 -P level-of-operation page -P noise_maxsize 3.0
+tesserocr-segment-region -I O-3 -O O-4 -P padding 5 -P find_tables false -P dpi 300
+segment-repair -I O-4 -O O-5 -P plausibilize true -P plausibilize_merge_min_overlap 0.7
+cis-ocropy-clip -I O-5 -O O-6
+cis-ocropy-segment -I O-6 -O O-7 -P spread 2.4 -P dpi 300
+cis-ocropy-dewarp -I O-7 -O O-8
+tesserocr-recognize -I O-8 -O PAGE -P textequiv_level word -P model Fraktur
diff --git a/tests/network/ocrd_logging.conf b/tests/network/ocrd_logging.conf
new file mode 100644
index 00000000..ee081b7b
--- /dev/null
+++ b/tests/network/ocrd_logging.conf
@@ -0,0 +1,150 @@
+# This is a template configuration file which allows customizing
+# format and destination of log messages with OCR-D.
+# It is meant as an example, and should be customized.
+# To get into effect, you must put a copy (under the same name)
+# into your CWD, HOME or /etc. These directories are searched
+# in said order, and the first find wins. When no config file
+# is found, the default logging configuration applies (cf. ocrd.logging.py).
+#
+# mandatory loggers section
+# configure loggers with corresponding keys "root", ""
+# each logger requires a corresponding configuration section below
+#
+[loggers]
+keys=root,ocrd,ocrd_network,ocrd_tensorflow,ocrd_shapely_geos,ocrd_PIL,uvicorn,uvicorn_access,uvicorn_error,multipart
+
+#
+# mandatory handlers section
+# handle output for each logging "channel"
+# i.e. console, file, smtp, syslog, http, ...
+# each handler requires a corresponding configuration section below
+#
+[handlers]
+keys=consoleHandler,fileHandler,processingServerHandler
+
+#
+# optional custom formatters section
+# format message fields, to be used differently by logging handlers
+# each formatter requires a corresponding formatter section below
+#
+[formatters]
+keys=defaultFormatter,detailedFormatter
+
+#
+# default logger "root" using consoleHandler
+#
+[logger_root]
+level=DEBUG
+handlers=consoleHandler,fileHandler
+
+
+#
+# additional logger configurations can be added
+# as separate configuration sections like below
+#
+# example logger "ocrd_workspace" uses fileHandler and overrides
+# default log level "INFO" with custom level "DEBUG"
+# "qualname" must match the logger label used in the corresponding
+# ocrd module
+# see in the module-of-interest (moi)
+#
+#[logger_ocrd_workspace]
+#level=DEBUG
+#handlers=fileHandler
+#qualname=ocrd.workspace
+
+# ocrd loggers
+[logger_ocrd]
+level=DEBUG
+handlers=consoleHandler,fileHandler
+qualname=ocrd
+propagate=0
+
+[logger_ocrd_network]
+level=DEBUG
+handlers=consoleHandler,processingServerHandler
+qualname=ocrd_network
+propagate=0
+
+#
+# logger tensorflow
+#
+[logger_ocrd_tensorflow]
+level=DEBUG
+handlers=consoleHandler
+qualname=tensorflow
+
+#
+# logger shapely.geos
+#
+[logger_ocrd_shapely_geos]
+level=DEBUG
+handlers=consoleHandler
+qualname=shapely.geos
+
+
+#
+# logger PIL
+#
+[logger_ocrd_PIL]
+level=DEBUG
+handlers=consoleHandler
+qualname=PIL
+
+#
+# uvicorn loggers
+#
+[logger_uvicorn]
+level=INFO
+handlers=consoleHandler
+qualname=uvicorn
+[logger_uvicorn_access]
+level=DEBUG
+handlers=consoleHandler
+qualname=uvicorn.access
+[logger_uvicorn_error]
+level=DEBUG
+handlers=consoleHandler
+qualname=uvicorn.error
+[logger_multipart]
+level=DEBUG
+handlers=consoleHandler
+qualname=multipart
+
+
+
+#
+# handle stderr output
+#
+[handler_consoleHandler]
+class=StreamHandler
+formatter=defaultFormatter
+args=(sys.stderr,)
+
+#
+# example logfile handler
+# handle output with logfile
+#
+[handler_fileHandler]
+class=FileHandler
+formatter=defaultFormatter
+args=('ocrd.log','a+')
+
+[handler_processingServerHandler]
+class=FileHandler
+formatter=defaultFormatter
+args=('/ocrd-data/ocrd_processing_server_conf.log','a+')
+
+#
+# default log format conforming to OCR-D (https://ocr-d.de/en/spec/cli#logging)
+#
+[formatter_defaultFormatter]
+format=%(asctime)s.%(msecs)03d %(levelname)s %(name)s - %(message)s
+datefmt=%H:%M:%S
+
+#
+# store more logging context information
+#
+[formatter_detailedFormatter]
+format=%(asctime)s.%(msecs)03d %(levelname)-8s (%(name)s)[%(filename)s:%(lineno)d] - %(message)s
+datefmt=%H:%M:%S
diff --git a/tests/network/ps_config.yml b/tests/network/ps_config.yml
new file mode 100644
index 00000000..655a847b
--- /dev/null
+++ b/tests/network/ps_config.yml
@@ -0,0 +1,17 @@
+# the content of this config file is based on the .env
+internal_callback_url: http://ps-docker-host.ocrd_network_test:8000
+process_queue:
+  address: rabbitmq-docker-host.ocrd_network_test
+  port: 5672
+  skip_deployment: true
+  credentials:
+    username: network_test
+    password: network_test
+database:
+  address: mongodb-docker-host.ocrd_network_test
+  port: 27017
+  skip_deployment: true
+  credentials:
+    username: network_test
+    password: network_test
+hosts: []