From 7ca03b633cbe4cd3142c513e458047f0177e0bea Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Mon, 11 Jan 2021 15:13:53 +0200 Subject: [PATCH 01/22] Health checking if master is reachable from worker and convertion before starting exareme. --- .../files/root/exareme/bootstrap.sh | 69 +++++++++++++++---- 1 file changed, 54 insertions(+), 15 deletions(-) diff --git a/Exareme-Docker/files/root/exareme/bootstrap.sh b/Exareme-Docker/files/root/exareme/bootstrap.sh index 1a82f5e5c..47d3003a5 100755 --- a/Exareme-Docker/files/root/exareme/bootstrap.sh +++ b/Exareme-Docker/files/root/exareme/bootstrap.sh @@ -13,6 +13,7 @@ CONSUL_CONNECTION_MAX_ATTEMPTS=20 CONSUL_WAIT_FOR_MASTER_IP_MAX_ATTEMPTS=20 EXAREME_NODE_STARTUP_HEALTH_CHECK_MAX_ATTEMPTS=10 EXAREME_NODE_HEALTH_CHECK_TIMEOUT=30 +MASTER_NODE_REACHABLE_TIMEOUT=5 PERIODIC_EXAREME_NODES_HEALTH_CHECK_MAX_RETRIES=10 PERIODIC_EXAREME_NODES_HEALTH_CHECK_INTERVAL=120 PERIODIC_TEMP_FILES_REMOVAL=300 @@ -87,7 +88,7 @@ convertCSVsToDB() { # Skip convertion if flag is false if [[ ${CONVERT_CSVS} == "FALSE" ]]; then echo "$(timestamp) CSV convertion turned off. " - return 0 + return 0 fi # Removing all previous .db files from the DOCKER_DATA_FOLDER @@ -141,6 +142,23 @@ exaremeNodesHealthCheck() { return 0 } +# Health check that the MASTER is reachable (ping) +exaremeNodesReachableMasterHealthCheck() { + if [[ ${ENVIRONMENT_TYPE} != "PROD" ]]; then + return 0 + fi + + echo "$(timestamp) HEALTH CHECK from node with IP ${NODE_IP} and name ${NODE_NAME} to MASTER node with IP ${MASTER_IP} ." + + check=$(curl --head ${MASTER_IP}:9090 --max-time ${MASTER_NODE_REACHABLE_TIMEOUT}) + + if [[ -z ${check} ]]; then + return 1 + fi + + return 0 +} + # Exareme health check on startup startupExaremeNodesHealthCheck() { # If health check fails then try again until it succeeds or close the container. @@ -163,15 +181,15 @@ startupExaremeNodesHealthCheck() { # Periodic check for exareme's health. # If it fails shutdown the container periodicExaremeNodesHealthCheck() { + # If consul doesn't have master node's IP it means that it restarted. The nodes should restart. + if ! getMasterIPFromConsul; then + pkill -f 1 # Closing main bootstrap.sh process to stop the container. + fi + # Make a health check every 5 minutes. while true; do sleep $PERIODIC_EXAREME_NODES_HEALTH_CHECK_INTERVAL - # If consul doesn't have master node's IP it means that it restarted. The nodes should restart. - if ! getMasterIPFromConsul; then - pkill -f 1 # Closing main bootstrap.sh process to stop the container. - fi - # If health check fails then try again until it succeeds or close the container. if ! exaremeNodesHealthCheck; then attempts=0 @@ -189,6 +207,27 @@ periodicExaremeNodesHealthCheck() { done } +# Periodic check that the master node is reachable +# If it fails shutdown the container +periodicReachableMasterNodeCheck() { + # If consul doesn't have master node's IP it means that it restarted. The nodes should restart. + if ! getMasterIPFromConsul; then + pkill -f 1 # Closing main bootstrap.sh process to stop the container. + fi + + # Make a health check every 5 seconds. + while true; do + sleep 2 + + # If master node isn't reachable, close the container + if ! exaremeNodesReachableMasterHealthCheck; then + echo -e "\n$(timestamp) HEALTH CHECK FAILED. MASTER NODE IS NOT REACHABLE. Closing the container." + pkill -f 1 # Closing main bootstrap.sh process to stop the container. + fi + echo "$(timestamp) HEALTH CHECK successful on NODE_IP: $NODE_IP , MASTER NODE IS REACHABLE" + done +} + # Periodic deletion of temp files startTempFilesDeletionTask() { while true; do @@ -218,6 +257,14 @@ echo "Madis Server started." waitForConsulToStart +# Prepare datasets from CSVs to SQLite db files +convertCSVsToDB + + +# Updating consul with node's datasets. +echo "$(timestamp) Updating consul with node's datasets." +./set-local-datasets.sh + # Running bootstrap on a master node if [[ "${FEDERATION_ROLE}" == "master" ]]; then @@ -236,9 +283,6 @@ if [[ "${FEDERATION_ROLE}" == "master" ]]; then periodicExaremeNodesHealthCheck & - # Prepare datasets from CSVs to SQLite db files - convertCSVsToDB - else ##### Running bootstrap on a worker node ##### if ! getMasterIPFromConsul; then @@ -260,15 +304,10 @@ else ##### Running bootstrap on a worker node ##### periodicExaremeNodesHealthCheck & - # Prepare datasets from CSVs to SQLite db files - convertCSVsToDB + periodicReachableMasterNodeCheck & fi -# Updating consul with node's datasets. -echo "$(timestamp) Updating consul with node's datasets." -./set-local-datasets.sh - startTempFilesDeletionTask & # Creating the python log file From 38f642eb396e98242479b25c6318ae157117b492 Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Mon, 11 Jan 2021 16:03:02 +0200 Subject: [PATCH 02/22] Reduced max retries. --- Exareme-Docker/files/root/exareme/bootstrap.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Exareme-Docker/files/root/exareme/bootstrap.sh b/Exareme-Docker/files/root/exareme/bootstrap.sh index 47d3003a5..ac459072a 100755 --- a/Exareme-Docker/files/root/exareme/bootstrap.sh +++ b/Exareme-Docker/files/root/exareme/bootstrap.sh @@ -14,7 +14,7 @@ CONSUL_WAIT_FOR_MASTER_IP_MAX_ATTEMPTS=20 EXAREME_NODE_STARTUP_HEALTH_CHECK_MAX_ATTEMPTS=10 EXAREME_NODE_HEALTH_CHECK_TIMEOUT=30 MASTER_NODE_REACHABLE_TIMEOUT=5 -PERIODIC_EXAREME_NODES_HEALTH_CHECK_MAX_RETRIES=10 +PERIODIC_EXAREME_NODES_HEALTH_CHECK_MAX_RETRIES=3 PERIODIC_EXAREME_NODES_HEALTH_CHECK_INTERVAL=120 PERIODIC_TEMP_FILES_REMOVAL=300 @@ -260,7 +260,6 @@ waitForConsulToStart # Prepare datasets from CSVs to SQLite db files convertCSVsToDB - # Updating consul with node's datasets. echo "$(timestamp) Updating consul with node's datasets." ./set-local-datasets.sh From 5366006c736635c1b922b83d066f25215b433447 Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Mon, 11 Jan 2021 16:20:12 +0200 Subject: [PATCH 03/22] 2 retries if master node is unreachable. --- .../files/root/exareme/bootstrap.sh | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/Exareme-Docker/files/root/exareme/bootstrap.sh b/Exareme-Docker/files/root/exareme/bootstrap.sh index ac459072a..40883e357 100755 --- a/Exareme-Docker/files/root/exareme/bootstrap.sh +++ b/Exareme-Docker/files/root/exareme/bootstrap.sh @@ -148,9 +148,7 @@ exaremeNodesReachableMasterHealthCheck() { return 0 fi - echo "$(timestamp) HEALTH CHECK from node with IP ${NODE_IP} and name ${NODE_NAME} to MASTER node with IP ${MASTER_IP} ." - - check=$(curl --head ${MASTER_IP}:9090 --max-time ${MASTER_NODE_REACHABLE_TIMEOUT}) + check=$(curl -s --head ${MASTER_IP}:9090 --max-time ${MASTER_NODE_REACHABLE_TIMEOUT}) if [[ -z ${check} ]]; then return 1 @@ -215,16 +213,24 @@ periodicReachableMasterNodeCheck() { pkill -f 1 # Closing main bootstrap.sh process to stop the container. fi - # Make a health check every 5 seconds. + # Check that master is reachable every 2 seconds. while true; do sleep 2 - + # If master node isn't reachable, close the container if ! exaremeNodesReachableMasterHealthCheck; then - echo -e "\n$(timestamp) HEALTH CHECK FAILED. MASTER NODE IS NOT REACHABLE. Closing the container." - pkill -f 1 # Closing main bootstrap.sh process to stop the container. + attempts=0 + while ! exaremeNodesReachableMasterHealthCheck; do + if [[ $attempts -ge 2 ]]; then + echo -e "\n$(timestamp) HEALTH CHECK FAILED. MASTER NODE IS UNREACHABLE. Closing the container." + pkill -f 1 # Closing main bootstrap.sh process to stop the container. + fi + echo -e "\n$(timestamp) HEALTH CHECK FAILED. MASTER NODE IS UNREACHABLE. Trying again..." + attempts=$(($attempts + 1)) + sleep 2 + done fi - echo "$(timestamp) HEALTH CHECK successful on NODE_IP: $NODE_IP , MASTER NODE IS REACHABLE" + echo "$(timestamp) HEALTH CHECK successful on NODE_IP: $NODE_IP" done } From b9678eee154a5215a6be6591be2f11880a0c4a18 Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Mon, 11 Jan 2021 16:26:58 +0200 Subject: [PATCH 04/22] Faster recovery if master is unreachable. --- Exareme-Docker/files/root/exareme/bootstrap.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Exareme-Docker/files/root/exareme/bootstrap.sh b/Exareme-Docker/files/root/exareme/bootstrap.sh index 40883e357..013f2994e 100755 --- a/Exareme-Docker/files/root/exareme/bootstrap.sh +++ b/Exareme-Docker/files/root/exareme/bootstrap.sh @@ -221,7 +221,7 @@ periodicReachableMasterNodeCheck() { if ! exaremeNodesReachableMasterHealthCheck; then attempts=0 while ! exaremeNodesReachableMasterHealthCheck; do - if [[ $attempts -ge 2 ]]; then + if [[ $attempts -ge 1 ]]; then echo -e "\n$(timestamp) HEALTH CHECK FAILED. MASTER NODE IS UNREACHABLE. Closing the container." pkill -f 1 # Closing main bootstrap.sh process to stop the container. fi @@ -230,7 +230,7 @@ periodicReachableMasterNodeCheck() { sleep 2 done fi - echo "$(timestamp) HEALTH CHECK successful on NODE_IP: $NODE_IP" + #echo "$(timestamp) HEALTH CHECK successful from NODE_IP: $NODE_IP, MASTER NODE IS REACHABLE" done } From bcef67607dce6c131331d67d8f843901d6aaf266 Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Mon, 11 Jan 2021 16:46:03 +0200 Subject: [PATCH 05/22] Added node name parameter on health check for logging. --- Exareme-Docker/files/root/exareme/bootstrap.sh | 2 +- .../control/handler/HttpAsyncCheckWorker.java | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/Exareme-Docker/files/root/exareme/bootstrap.sh b/Exareme-Docker/files/root/exareme/bootstrap.sh index 013f2994e..49485db23 100755 --- a/Exareme-Docker/files/root/exareme/bootstrap.sh +++ b/Exareme-Docker/files/root/exareme/bootstrap.sh @@ -119,7 +119,7 @@ exaremeNodesHealthCheck() { if [[ "${FEDERATION_ROLE}" == "master" ]]; then check=$(curl -s -X POST --max-time ${EXAREME_NODE_HEALTH_CHECK_TIMEOUT} ${NODE_IP}:9090/mining/query/HEALTH_CHECK) else - check=$(curl -s --max-time ${EXAREME_NODE_HEALTH_CHECK_TIMEOUT} ${MASTER_IP}:9092/check/worker?NODE_IP=${NODE_IP}) + check=$(curl -s --max-time ${EXAREME_NODE_HEALTH_CHECK_TIMEOUT} ${MASTER_IP}:9092/check/worker?NODE_IP=${NODE_IP}&NODE_NAME=${NODE_NAME}) fi if [[ -z ${check} ]]; then diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/control/handler/HttpAsyncCheckWorker.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/control/handler/HttpAsyncCheckWorker.java index d320049f9..0c9568b6d 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/control/handler/HttpAsyncCheckWorker.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/control/handler/HttpAsyncCheckWorker.java @@ -72,16 +72,26 @@ private void handleInternal( } AdpDBClientQueryStatus queryStatus; String NODE_IP = null; + String NODE_NAME = null; DataSerialization ds = DataSerialization.summary; - String[] getIP = request.getRequestLine().getUri().split("\\?"); + String[] url = request.getRequestLine().getUri().split("\\?"); + String[] urlParameters = url[1].split("&"); - if (getIP[1].split("=")[0].equals("NODE_IP")) - NODE_IP = getIP[1].split("=")[1]; + if (urlParameters[0].split("=")[0].equals("NODE_IP")) + NODE_IP = urlParameters[0].split("=")[1]; + + if (urlParameters[0].split("=")[0].equals("NODE_NAME")) + NODE_NAME = urlParameters[0].split("=")[1]; // Execute HEALTH_CHECK algorithm for health checks in bootstrap.sh via "curl -s ${MASTER_IP}:9092/check/worker?NODE_IP=${NODE_IP}" // Retrieve json result and check of the NODE_NAME of the node exist in the result. String algorithmKey = algorithmName + "_" + System.currentTimeMillis(); + log.info("Executing algorithm: " + algorithmName + " with key: " + algorithmKey); + + log.info("Algorithm Nodes: "); + log.info(" IP: " + NODE_IP + " , NAME: " + NODE_NAME); + String dfl; HashMap inputContent = new HashMap<>(); AlgorithmProperties algorithmProperties = Algorithms.getInstance().getAlgorithmProperties(algorithmName); From d9fd9bd837f23e0780e0e9bed8153fd9b4b459df Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Mon, 11 Jan 2021 16:56:20 +0200 Subject: [PATCH 06/22] Fix for NODE_NAME on Health check. --- .../master/gateway/control/handler/HttpAsyncCheckWorker.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/control/handler/HttpAsyncCheckWorker.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/control/handler/HttpAsyncCheckWorker.java index 0c9568b6d..bff9059b8 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/control/handler/HttpAsyncCheckWorker.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/control/handler/HttpAsyncCheckWorker.java @@ -81,8 +81,8 @@ private void handleInternal( if (urlParameters[0].split("=")[0].equals("NODE_IP")) NODE_IP = urlParameters[0].split("=")[1]; - if (urlParameters[0].split("=")[0].equals("NODE_NAME")) - NODE_NAME = urlParameters[0].split("=")[1]; + if (urlParameters[1].split("=")[0].equals("NODE_NAME")) + NODE_NAME = urlParameters[1].split("=")[1]; // Execute HEALTH_CHECK algorithm for health checks in bootstrap.sh via "curl -s ${MASTER_IP}:9092/check/worker?NODE_IP=${NODE_IP}" // Retrieve json result and check of the NODE_NAME of the node exist in the result. From 112ed0b7c4a31468d95eda9d766a4fea9221b301 Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Mon, 11 Jan 2021 17:10:48 +0200 Subject: [PATCH 07/22] Fix for curl with & on bootstrap. --- Exareme-Docker/files/root/exareme/bootstrap.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Exareme-Docker/files/root/exareme/bootstrap.sh b/Exareme-Docker/files/root/exareme/bootstrap.sh index 49485db23..c79d4a9e2 100755 --- a/Exareme-Docker/files/root/exareme/bootstrap.sh +++ b/Exareme-Docker/files/root/exareme/bootstrap.sh @@ -119,7 +119,7 @@ exaremeNodesHealthCheck() { if [[ "${FEDERATION_ROLE}" == "master" ]]; then check=$(curl -s -X POST --max-time ${EXAREME_NODE_HEALTH_CHECK_TIMEOUT} ${NODE_IP}:9090/mining/query/HEALTH_CHECK) else - check=$(curl -s --max-time ${EXAREME_NODE_HEALTH_CHECK_TIMEOUT} ${MASTER_IP}:9092/check/worker?NODE_IP=${NODE_IP}&NODE_NAME=${NODE_NAME}) + check=$(curl -s --max-time ${EXAREME_NODE_HEALTH_CHECK_TIMEOUT} "${MASTER_IP}:9092/check/worker?NODE_IP=${NODE_IP}&NODE_NAME=${NODE_NAME}") fi if [[ -z ${check} ]]; then From b55c08ea6c7e9ad5c62a054ec5a16e6a180612d1 Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Mon, 11 Jan 2021 18:00:22 +0200 Subject: [PATCH 08/22] Removing File API. --- .../exareme/master/gateway/async/HttpAsyncExaremeGateway.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/HttpAsyncExaremeGateway.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/HttpAsyncExaremeGateway.java index abff0fa84..0a81473a8 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/HttpAsyncExaremeGateway.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/HttpAsyncExaremeGateway.java @@ -58,7 +58,7 @@ public HttpAsyncExaremeGateway(AdpDBManager manager) throws Exception { ); final UriHttpAsyncRequestHandlerMapper registry = new UriHttpAsyncRequestHandlerMapper(); - registry.register(ExaremeGatewayUtils.GW_API_FILE, new HttpAsyncFileHandler()); + // registry.register(ExaremeGatewayUtils.GW_API_FILE, new HttpAsyncFileHandler()); registry.register(ExaremeGatewayUtils.GW_API_QUERY, new HttpAsyncQueryHandler()); registry.register(ExaremeGatewayUtils.GW_API_TABLE, new HttpAsyncTableHandler()); registry.register(ExaremeGatewayUtils.GW_API_MINING_ALGORITHMS, new HttpAsyncMiningAlgorithmsHandler()); From 4a786610068e4cf560f691df06bf4754df23e5a5 Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Mon, 11 Jan 2021 23:58:52 +0200 Subject: [PATCH 09/22] Fix for semaphores on Memory Resource Storage. --- .../executor/remote/AdpDBArtJobMonitor.java | 33 ++++++++++++++++--- .../src/main/resources/db.properties | 2 +- .../PeriodicContainersStatusCheck.java | 1 - .../MemoryResourceStorage.java | 22 ++++++++----- 4 files changed, 44 insertions(+), 14 deletions(-) diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java index d20558435..dbdee1ff2 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java @@ -18,6 +18,7 @@ import java.rmi.RemoteException; import java.rmi.UnmarshalException; import java.util.ArrayList; +import java.util.concurrent.TimeoutException; /** * @author herald @@ -50,6 +51,7 @@ public AdpDBArtJobMonitor(ExecutionEngineSessionPlan sessionPlan, AdpDBStatus st @Override public void run() { + //int tries_remaining = 300; // Restart after 5 minutes try { sessionManager = sessionPlan.getPlanSessionStatusManagerProxy(); statsManager = sessionPlan.getPlanSessionStatisticsManagerProxy(); @@ -59,13 +61,24 @@ public void run() { statusManager.getStatistics(status.getId()).setTotalOperators(stats.getTotalProc()); statusManager.getStatistics(status.getId()).setTotalDataTransfers(stats.getTotalData()); + log.info("Monitor: " + status.getId() + " Line 62 status: " + status.getId()); while (sessionManager.hasFinished() == false && sessionManager.hasError() == false) { - Thread.sleep(100 * statsUpdateSecs); + Thread.sleep(1000 * statsUpdateSecs); + //tries_remaining--; + //if (tries_remaining == 0) { + // throw new TimeoutException("Session stuck and stopped after 5 minutes."); + //} boolean updateProgressStatistics = updateProgressStatistics(); + //log.info("Monitor: " + status.getId() + " Line 68 update: " + updateProgressStatistics); + //log.info("Monitor: " + status.getId() + " Line 69 update: " + sessionPlan); sessionManager = sessionPlan.getPlanSessionStatusManagerProxy(); + //log.info("Monitor: " + status.getId() + " Line 69.5 update: " + sessionManager); + //log.info("Monitor: " + status.getId() + " Line 70 update: " + sessionManager.hasFinished() + " " + sessionManager.hasError()); statsManager = sessionPlan.getPlanSessionStatisticsManagerProxy(); + //log.info("Monitor: " + status.getId() + " Line 71.5 update: " + statsManager); + //log.info("Monitor: " + status.getId() + " Line 72 update: " + statsManager); if (sessionManager == null || statsManager == null) { log.info("--+ error"); } @@ -75,26 +88,38 @@ public void run() { synchronized (listeners) { for (AdpDBQueryListener l : listeners) { log.debug(status.toString()); + + //log.info("Monitor: " + status.getId() + " Line 81 statusToBeChanged"); l.statusChanged(queryID, status); + + //log.info("Monitor: " + status.getId() + " Line 84 statusChanged"); } } } - - } + + //log.info("Monitor: " + status.getId() + " Line 90 update"); updateProgressStatistics(); + //log.info("Monitor: " + status.getId() + " Line 92 update"); statusManager.getStatistics(status.getId()) .setAdpEngineStatistics(statsManager.getStatistics()); + //log.info("Monitor: " + status.getId() + " Line 96 update"); if (sessionManager != null && sessionManager.hasError() == false) { statusManager.setFinished(status.getId()); + + // log.info("Monitor: " + status.getId() + " Line 100 update"); } else { + + // log.info("Monitor: " + status.getId() + " Line 103 update"); statusManager.setError(status.getId(), sessionManager.getErrorList().get(0)); } + //log.info("Monitor: " + status.getId() + " Line 106 update"); sessionPlan.close(); } catch (Exception e) { statusManager.setError(status.getId(), e); - log.error("Cannot monitor job!", e); + log.error("Cannot monitor job, queryId: " + queryID.getLongId(), e); + log.error("Cannot monitor job, status: " + status.getId(), e); } finally { log.debug("Terminate listeners ( " + listeners.size() + ")..."); synchronized (listeners) { diff --git a/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/db.properties b/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/db.properties index b29003bbb..55d641d1e 100644 --- a/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/db.properties +++ b/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/db.properties @@ -59,7 +59,7 @@ db.optimizer.scheduler.whatif.algs=[] # ---------------------------------------------------------------------------- # db.client.statisticsBuckets=10 # The frequency of the update (in seconds). -db.client.statisticsUpdate_sec=2 +db.client.statisticsUpdate_sec=1 # Force exit after the specified number of seconds. db.client.forceExit_sec=3 # off diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/containerStatusMgr/PeriodicContainersStatusCheck.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/containerStatusMgr/PeriodicContainersStatusCheck.java index dded44882..939804acc 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/containerStatusMgr/PeriodicContainersStatusCheck.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/containerStatusMgr/PeriodicContainersStatusCheck.java @@ -52,7 +52,6 @@ public void run() { log.error("Container connection error: " + e); faultyContainers.add(containerName); } - } if (!faultyContainers.isEmpty()) { if (planEventScheduler != null) { diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/registry/resourceStorage/MemoryResourceStorage.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/registry/resourceStorage/MemoryResourceStorage.java index ad52fe8a0..853909a02 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/registry/resourceStorage/MemoryResourceStorage.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/registry/resourceStorage/MemoryResourceStorage.java @@ -7,6 +7,7 @@ import madgik.exareme.worker.art.registry.Registerable; import madgik.exareme.worker.art.registry.Registerable.Type; import madgik.exareme.worker.art.registry.RegistryResourceStorage; +import org.apache.log4j.Logger; import java.rmi.NoSuchObjectException; import java.rmi.RemoteException; @@ -21,6 +22,7 @@ * @since 1.0 */ public class MemoryResourceStorage implements RegistryResourceStorage { + private static Logger log = Logger.getLogger(MemoryResourceStorage.class); private Semaphore semaphore = null; private RegistryResourceStorageStatus registryResourceStorageStatus; @@ -52,12 +54,12 @@ public void store(Registerable r) throws RemoteException { } l.add(r); - semaphore.release(); } catch (Exception e) { - semaphore.release(); throw new RemoteException( "Cannot store object: '" + r.getEntityName().getName() + "' at " + r.getEntityName() .getIP() + ":" + r.getEntityName().getPort(), e); + } finally { + semaphore.release(); } } @@ -66,7 +68,7 @@ public Registerable retrieve(EntityName epr) throws RemoteException { try { semaphore.acquire(); Registerable r = objectMap.get(epr.getName()); - semaphore.release(); + if (r == null) { throw new NoSuchObjectException( "Object was not found: '" + epr.getName() + "' at " + epr.getIP() + ":" + epr @@ -74,8 +76,9 @@ public Registerable retrieve(EntityName epr) throws RemoteException { } return r; } catch (Exception e) { - semaphore.release(); throw new RemoteException("Cannot retrieve object.", e); + } finally { + semaphore.release(); } } @@ -93,10 +96,11 @@ public void delete(EntityName epr) throws RemoteException { registryResourceStorageStatus.decreaseStoredObjects(); List l = typeMap.get(r.getType()); l.remove(r); - semaphore.release(); + } catch (Exception e) { - semaphore.release(); throw new RemoteException("Cannot delete object.", e); + } finally { + semaphore.release(); } } @@ -106,9 +110,10 @@ public Collection retrieveAll(Type type) throws RemoteException { try { semaphore.acquire(); col = typeMap.get(type); - semaphore.release(); } catch (InterruptedException ex) { throw new ServerException("Cannot retrieve all objects of type: " + type, ex); + } finally { + semaphore.release(); } return col; } @@ -133,9 +138,10 @@ public Collection retrieveAll() throws RemoteException { while (it.hasNext()) { col.addAll(it.next()); } - semaphore.release(); } catch (InterruptedException ex) { throw new ServerException("Cannot retrieve all objects", ex); + } finally { + semaphore.release(); } return col; } From c520183e5d3055c257a277d9579bd4e3147d59f9 Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Tue, 12 Jan 2021 10:01:28 +0200 Subject: [PATCH 10/22] Bug fixes on the ArtJobMonitor. --- .../executor/remote/AdpDBArtJobMonitor.java | 37 ++++--------------- .../DynamicStatusManager.java | 3 ++ .../EventSchedulerManipulator.java | 7 ++-- .../rmi/RmiPlanSessionReportManagerProxy.java | 8 ++-- .../worker/art/remote/RmiObjectProxy.java | 8 +--- 5 files changed, 20 insertions(+), 43 deletions(-) diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java index dbdee1ff2..cb4ea244a 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java @@ -51,7 +51,6 @@ public AdpDBArtJobMonitor(ExecutionEngineSessionPlan sessionPlan, AdpDBStatus st @Override public void run() { - //int tries_remaining = 300; // Restart after 5 minutes try { sessionManager = sessionPlan.getPlanSessionStatusManagerProxy(); statsManager = sessionPlan.getPlanSessionStatisticsManagerProxy(); @@ -61,60 +60,40 @@ public void run() { statusManager.getStatistics(status.getId()).setTotalOperators(stats.getTotalProc()); statusManager.getStatistics(status.getId()).setTotalDataTransfers(stats.getTotalData()); - log.info("Monitor: " + status.getId() + " Line 62 status: " + status.getId()); - while (sessionManager.hasFinished() == false && sessionManager.hasError() == false) { Thread.sleep(1000 * statsUpdateSecs); - //tries_remaining--; - //if (tries_remaining == 0) { - // throw new TimeoutException("Session stuck and stopped after 5 minutes."); - //} - boolean updateProgressStatistics = updateProgressStatistics(); - //log.info("Monitor: " + status.getId() + " Line 68 update: " + updateProgressStatistics); - //log.info("Monitor: " + status.getId() + " Line 69 update: " + sessionPlan); - sessionManager = sessionPlan.getPlanSessionStatusManagerProxy(); - //log.info("Monitor: " + status.getId() + " Line 69.5 update: " + sessionManager); - //log.info("Monitor: " + status.getId() + " Line 70 update: " + sessionManager.hasFinished() + " " + sessionManager.hasError()); - statsManager = sessionPlan.getPlanSessionStatisticsManagerProxy(); - //log.info("Monitor: " + status.getId() + " Line 71.5 update: " + statsManager); - //log.info("Monitor: " + status.getId() + " Line 72 update: " + statsManager); + + // Was causing crashes, is this needed? + // sessionManager = sessionPlan.getPlanSessionStatusManagerProxy(); + // statsManager = sessionPlan.getPlanSessionStatisticsManagerProxy(); + if (sessionManager == null || statsManager == null) { - log.info("--+ error"); + log.error("Session Manager or stats Manager null! " + sessionManager + ", " + statsManager); } + + boolean updateProgressStatistics = updateProgressStatistics(); if (updateProgressStatistics) { log.info("Session is running..."); log.debug("Update listeners ..."); synchronized (listeners) { for (AdpDBQueryListener l : listeners) { log.debug(status.toString()); - - //log.info("Monitor: " + status.getId() + " Line 81 statusToBeChanged"); l.statusChanged(queryID, status); - - //log.info("Monitor: " + status.getId() + " Line 84 statusChanged"); } } } } - //log.info("Monitor: " + status.getId() + " Line 90 update"); updateProgressStatistics(); - //log.info("Monitor: " + status.getId() + " Line 92 update"); statusManager.getStatistics(status.getId()) .setAdpEngineStatistics(statsManager.getStatistics()); - //log.info("Monitor: " + status.getId() + " Line 96 update"); if (sessionManager != null && sessionManager.hasError() == false) { statusManager.setFinished(status.getId()); - - // log.info("Monitor: " + status.getId() + " Line 100 update"); } else { - - // log.info("Monitor: " + status.getId() + " Line 103 update"); statusManager.setError(status.getId(), sessionManager.getErrorList().get(0)); } - //log.info("Monitor: " + status.getId() + " Line 106 update"); sessionPlan.close(); } catch (Exception e) { statusManager.setError(status.getId(), e); diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/DynamicStatusManager.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/DynamicStatusManager.java index 9ed7157bf..0d3020e57 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/DynamicStatusManager.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/DynamicStatusManager.java @@ -7,6 +7,8 @@ import madgik.exareme.worker.art.executionEngine.session.ActiveExecutionPlan; import madgik.exareme.worker.art.executionEngine.session.ConcreteOperatorStatus; import madgik.exareme.worker.art.executionEngine.statusMgr.PlanSessionStatusManagerInterface; +import madgik.exareme.worker.art.remote.RmiObjectProxy; +import org.apache.log4j.Logger; import java.rmi.RemoteException; import java.util.List; @@ -17,6 +19,7 @@ */ public class DynamicStatusManager extends EventSchedulerManipulator implements PlanSessionStatusManagerInterface { + private static final Logger log = Logger.getLogger(DynamicStatusManager.class); public DynamicStatusManager() { } diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/EventSchedulerManipulator.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/EventSchedulerManipulator.java index c9c35b31d..1e7dde84d 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/EventSchedulerManipulator.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/EventSchedulerManipulator.java @@ -5,16 +5,15 @@ import madgik.exareme.common.art.PlanSessionID; import madgik.exareme.worker.art.executionEngine.session.PlanSessionReportID; +import org.apache.log4j.Logger; -import java.util.Collection; -import java.util.Collections; -import java.util.LinkedHashMap; -import java.util.Map; +import java.util.*; /** * @author heraldkllapi */ public class EventSchedulerManipulator { + private static final Logger log = Logger.getLogger(EventSchedulerManipulator.class); private PlanEventScheduler globalScheduler = null; private Map activeSchedulers = null; private Map activeSchedulersReportIdMap = null; diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiPlanSessionReportManagerProxy.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiPlanSessionReportManagerProxy.java index 4dbb7b873..e4200dfbc 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiPlanSessionReportManagerProxy.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiPlanSessionReportManagerProxy.java @@ -14,6 +14,7 @@ import java.io.Serializable; import java.rmi.RemoteException; +import java.util.Arrays; import java.util.Date; /** @@ -50,13 +51,14 @@ public void operatorSuccess(ConcreteOperatorID operatorID, int exidCode, throws RemoteException { // System.out.println("OperatorSuccess: " + operatorID.operatorName + " " + " " + exidCode); // System.out.println("RPSMP before get"); - log.debug("Reporting operatorSuccess: " + operatorID.operatorName); + log.info("Reporting operatorSuccess: " + operatorID.operatorName); boolean success = false; - int max_retries = 1000; + int max_retries = 100; // System.out.println("RPSMP after get"); while (!success && max_retries > 0) { try { max_retries--; + PlanSessionReportManager rmo = super.getRemoteObject(); rmo.operatorSuccess(operatorID, exidCode, exitMessage, time, containerID, internalSessionID, terminateGroup); @@ -65,7 +67,7 @@ public void operatorSuccess(ConcreteOperatorID operatorID, int exidCode, // System.out.println("RPSMP after success"); // System.out.println("DoneOperatorSuccess: " + operatorID.operatorName + " " + " " + exidCode); } catch (Exception e) { - System.out.println("RPSMP ERROR: " + e); + log.error("RPSMP ERROR: ", e); //this.operatorSuccess(operatorID, exidCode, exitMessage, time, containerID); } } diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/remote/RmiObjectProxy.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/remote/RmiObjectProxy.java index d8665e62c..415b7508f 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/remote/RmiObjectProxy.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/remote/RmiObjectProxy.java @@ -36,7 +36,7 @@ public synchronized T connect() throws RemoteException { int tries = 0; while (true) { try { - log.trace("Connecting to (" + tries + ") " + + log.debug("Connecting to (" + tries + ") " + regEntityName.getIP() + ":" + regEntityName.getPort() + " ..."); tries++; Registry registry = RmiRegistryCache.getRegistry(regEntityName); @@ -62,18 +62,12 @@ public synchronized T connect() throws RemoteException { @Override public T getRemoteObject() throws RemoteException { - Semaphore semaphore = new Semaphore(1); if (!isConnected) { try { connect(); // try to connect to remote object. If the connection is failing, maybe java is not running } catch (RemoteException exception) { throw new RemoteException("There was an error with worker " + "[" + regEntityName.getIP() + "]."); - } finally { - boolean acquired = semaphore.tryAcquire(); - if (!acquired) { - semaphore.release(); - } } } return remoteObject; From 2f938e60375feda53e9c3bc3654dc06bc14c43ec Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Tue, 12 Jan 2021 18:21:16 +0200 Subject: [PATCH 11/22] Removing log message. --- .../master/engine/executor/remote/AdpDBArtJobMonitor.java | 1 - .../executionEngine/rmi/RmiPlanSessionReportManagerProxy.java | 3 +-- .../java/madgik/exareme/worker/art/remote/RmiObjectProxy.java | 1 - 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java index cb4ea244a..aa5b6d6bc 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java @@ -18,7 +18,6 @@ import java.rmi.RemoteException; import java.rmi.UnmarshalException; import java.util.ArrayList; -import java.util.concurrent.TimeoutException; /** * @author herald diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiPlanSessionReportManagerProxy.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiPlanSessionReportManagerProxy.java index e4200dfbc..d87c822f9 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiPlanSessionReportManagerProxy.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiPlanSessionReportManagerProxy.java @@ -14,7 +14,6 @@ import java.io.Serializable; import java.rmi.RemoteException; -import java.util.Arrays; import java.util.Date; /** @@ -51,7 +50,7 @@ public void operatorSuccess(ConcreteOperatorID operatorID, int exidCode, throws RemoteException { // System.out.println("OperatorSuccess: " + operatorID.operatorName + " " + " " + exidCode); // System.out.println("RPSMP before get"); - log.info("Reporting operatorSuccess: " + operatorID.operatorName); + log.debug("Reporting operatorSuccess: " + operatorID.operatorName); boolean success = false; int max_retries = 100; // System.out.println("RPSMP after get"); diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/remote/RmiObjectProxy.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/remote/RmiObjectProxy.java index 415b7508f..a0b156039 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/remote/RmiObjectProxy.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/remote/RmiObjectProxy.java @@ -9,7 +9,6 @@ import java.rmi.AccessException; import java.rmi.RemoteException; import java.rmi.registry.Registry; -import java.util.concurrent.Semaphore; /** * University of Athens / From 30f19a39b11405544a3c7e6131e85a3478779802 Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Wed, 13 Jan 2021 13:06:12 +0200 Subject: [PATCH 12/22] Added no expiration policy on rmi proxies. --- .../exareme-utils/src/main/resources/arm.properties | 2 +- .../exareme-utils/src/main/resources/art.properties | 4 ++-- .../exareme/worker/arm/compute/rmi/RmiArmCompute.java | 9 +++++---- .../worker/arm/compute/rmi/RmiArmComputeProxy.java | 6 +++++- .../exareme/worker/art/container/rmi/RmiContainer.java | 9 +++++---- .../worker/art/container/rmi/RmiContainerProxy.java | 6 +++++- .../art/executionEngine/rmi/RmiExecutionEngine.java | 10 ++++++---- .../executionEngine/rmi/RmiExecutionEngineProxy.java | 6 +++++- 8 files changed, 34 insertions(+), 18 deletions(-) diff --git a/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/arm.properties b/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/arm.properties index c6f19b45c..03bb832d9 100644 --- a/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/arm.properties +++ b/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/arm.properties @@ -5,7 +5,7 @@ # -------------------------------------------------------------------------------- # arm.rmi.RmiResourceMediatorProxy.lifetime=30000 arm.rmi.RmiResourceMediator.lifetime=30000 -arm.compute.rmi.RmiArmCompute.lifetime=30000 +arm.compute.rmi.RmiArmCompute.lifetime=0 arm.cloud.OpenNebulaResourceMediatorInterface.ip= arm.cloud.OpenNebulaResourceMediatorInterface.port= # Storage diff --git a/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/art.properties b/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/art.properties index c1ddcb712..22bd9ac71 100644 --- a/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/art.properties +++ b/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/art.properties @@ -16,7 +16,7 @@ art.registry.rmi.RmiArtRegistryProxy.cacheSize=10 # -------------------------------------------------------------------------------- # art.container.rmi.RmiContainer.defaultID=0 art.container.thread.OperatorMonitorThread.IsAlivePeriod=1 -art.container.rmi.RmiContainer.lifetime=30000 +art.container.rmi.RmiContainer.lifetime=0 art.container.maxStatusReportPeriod=1000 # Adaptor implementation [rmi, socket] (socket is very efficient with big data). art.container.adaptor.impl=socket @@ -33,7 +33,7 @@ art.container.zipBufferSize_kb=16 art.container.ioBufferSize_kb=512 # Execution engine # -------------------------------------------------------------------------------- # -art.executionEngine.rmi.RmiExecutionEngine.lifetime=30000 +art.executionEngine.rmi.RmiExecutionEngine.lifetime=0 art.executionEngine.forceSessionStopAfter_sec=10 art.executionEngine.dataOperators.memory=2 # centralized = Use a single node diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/arm/compute/rmi/RmiArmCompute.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/arm/compute/rmi/RmiArmCompute.java index a504ebe44..e38f90758 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/arm/compute/rmi/RmiArmCompute.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/arm/compute/rmi/RmiArmCompute.java @@ -63,10 +63,11 @@ public RmiArmCompute(ContainerManagerInterface containerManagerInterface, long lifeTime = AdpProperties.getArmProps().getLong("arm.compute.rmi.RmiArmCompute.lifetime"); - - registryUpdateDeamon = - RegistryUpdateDeamonFactory.createDeamon(this.createProxy(), (long) (0.75 * lifeTime)); - registryUpdateDeamon.startDeamon(); + if(lifeTime == 0) { + registryUpdateDeamon = + RegistryUpdateDeamonFactory.createDeamon(this.createProxy(), (long) (0.75 * lifeTime)); + registryUpdateDeamon.startDeamon(); + } } @Override diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/arm/compute/rmi/RmiArmComputeProxy.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/arm/compute/rmi/RmiArmComputeProxy.java index 90cff2dc0..17a316e72 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/arm/compute/rmi/RmiArmComputeProxy.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/arm/compute/rmi/RmiArmComputeProxy.java @@ -35,7 +35,11 @@ public RmiArmComputeProxy(String regEntryName, EntityName regEntityName) { long lifeTime = AdpProperties.getArmProps().getLong("arm.compute.rmi.RmiArmCompute.lifetime"); - this.registerPolicy = PolicyFactory.generateTimeExpirationDeletePolicy(lifeTime); + if(lifeTime == 0) { + this.registerPolicy = PolicyFactory.generateNoExpirationPolicy(); + }else { + this.registerPolicy = PolicyFactory.generateTimeExpirationDeletePolicy(lifeTime); + } } @Override diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/rmi/RmiContainer.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/rmi/RmiContainer.java index a18876c21..45a6c8b7f 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/rmi/RmiContainer.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/rmi/RmiContainer.java @@ -138,10 +138,11 @@ public RmiContainer(String containerName, ConcreteOperatorManagerInterface cOpMn log.debug("Create update deamon ..."); long lifeTime = AdpProperties.getArtProps().getLong("art.container.rmi.RmiContainer.lifetime"); - - registryUpdateDeamon = - RegistryUpdateDeamonFactory.createDeamon(this.createProxy(), (long) (0.75 * lifeTime)); - registryUpdateDeamon.startDeamon(); + if(lifeTime == 0) { + registryUpdateDeamon = + RegistryUpdateDeamonFactory.createDeamon(this.createProxy(), (long) (0.75 * lifeTime)); + registryUpdateDeamon.startDeamon(); + } //TODO(DSH): check executor diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/rmi/RmiContainerProxy.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/rmi/RmiContainerProxy.java index 6f5d5e5b3..9a0ce146b 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/rmi/RmiContainerProxy.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/rmi/RmiContainerProxy.java @@ -44,7 +44,11 @@ public RmiContainerProxy(String ip, String regEntryName, EntityName regEntityNam regEntityName.getDataTransferPort()); long lifeTime = AdpProperties.getArtProps().getLong("art.container.rmi.RmiContainer.lifetime"); - this.registerPolicy = PolicyFactory.generateTimeExpirationDeletePolicy(lifeTime); + if(lifeTime == 0) { + this.registerPolicy = PolicyFactory.generateNoExpirationPolicy(); + }else{ + this.registerPolicy = PolicyFactory.generateTimeExpirationDeletePolicy(lifeTime); + } } @Override diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiExecutionEngine.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiExecutionEngine.java index 6999192eb..7d9c248a8 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiExecutionEngine.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiExecutionEngine.java @@ -30,6 +30,7 @@ import madgik.exareme.worker.art.managementBean.ManagementUtil; import madgik.exareme.worker.art.registry.ArtRegistryLocator; import madgik.exareme.worker.art.registry.ArtRegistryProxy; +import madgik.exareme.worker.art.registry.PolicyFactory; import madgik.exareme.worker.art.registry.updateDeamon.RegistryUpdateDeamon; import madgik.exareme.worker.art.registry.updateDeamon.RegistryUpdateDeamonFactory; import madgik.exareme.worker.art.remote.RmiRemoteObject; @@ -93,11 +94,12 @@ public RmiExecutionEngine(PlanSessionManagerInterface sessionManagerInterface, long lifeTime = AdpProperties.getArtProps() .getLong("art.executionEngine.rmi.RmiExecutionEngine.lifetime"); + if(lifeTime == 0) { + registryUpdateDeamon = + RegistryUpdateDeamonFactory.createDeamon(this.createProxy(), (long) (0.75 * lifeTime)); - registryUpdateDeamon = - RegistryUpdateDeamonFactory.createDeamon(this.createProxy(), (long) (0.75 * lifeTime)); - - registryUpdateDeamon.startDeamon(); + registryUpdateDeamon.startDeamon(); + } } @Override diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiExecutionEngineProxy.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiExecutionEngineProxy.java index ab3b0510a..61c387fda 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiExecutionEngineProxy.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiExecutionEngineProxy.java @@ -43,7 +43,11 @@ public RmiExecutionEngineProxy(String ip, String regEntryName, EntityName regEnt this.entityName = new EntityName(regEntryName, ip); long lifeTime = AdpProperties.getArtProps() .getLong("art.executionEngine.rmi.RmiExecutionEngine.lifetime"); - this.registerPolicy = PolicyFactory.generateTimeExpirationDeletePolicy(lifeTime); + if(lifeTime == 0) { + this.registerPolicy = PolicyFactory.generateNoExpirationPolicy(); + }else{ + this.registerPolicy = PolicyFactory.generateTimeExpirationDeletePolicy(lifeTime); + } } @Override From 6015f402d03fac3897c602bf2a463cfa6e4b84b9 Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Wed, 13 Jan 2021 13:33:47 +0200 Subject: [PATCH 13/22] Periodic container status check exits after reporting failure. --- .../master/engine/executor/remote/AdpDBArtJobMonitor.java | 1 - .../containerStatusMgr/PeriodicContainersStatusCheck.java | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java index aa5b6d6bc..d02c2de65 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java @@ -96,7 +96,6 @@ public void run() { sessionPlan.close(); } catch (Exception e) { statusManager.setError(status.getId(), e); - log.error("Cannot monitor job, queryId: " + queryID.getLongId(), e); log.error("Cannot monitor job, status: " + status.getId(), e); } finally { log.debug("Terminate listeners ( " + listeners.size() + ")..."); diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/containerStatusMgr/PeriodicContainersStatusCheck.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/containerStatusMgr/PeriodicContainersStatusCheck.java index 939804acc..90f10e831 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/containerStatusMgr/PeriodicContainersStatusCheck.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/containerStatusMgr/PeriodicContainersStatusCheck.java @@ -38,9 +38,7 @@ public void addConainerToCheck(EntityName container) { private class PeriodicCheck extends Thread { public void run() { - int i = 10; while (!planEventScheduler.getState().isTerminated()) { - i--; Set faultyContainers = new HashSet<>(); for (EntityName containerName : containersToCheck) { log.debug("Checking container: " + containerName); @@ -56,6 +54,8 @@ public void run() { if (!faultyContainers.isEmpty()) { if (planEventScheduler != null) { planEventScheduler.containersError(faultyContainers); + log.error("Reported container error and exiting!"); + return; } else { log.error("PlanEventScheduler should not be null!"); } From 3e4ce163e1450a5fcd3476357aa7607355d2df13 Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Wed, 13 Jan 2021 13:56:34 +0200 Subject: [PATCH 14/22] Exaview added back. --- .../exareme/master/gateway/async/HttpAsyncExaremeGateway.java | 2 +- .../master/gateway/async/handler/HttpAsyncFileHandler.java | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/HttpAsyncExaremeGateway.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/HttpAsyncExaremeGateway.java index 0a81473a8..abff0fa84 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/HttpAsyncExaremeGateway.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/HttpAsyncExaremeGateway.java @@ -58,7 +58,7 @@ public HttpAsyncExaremeGateway(AdpDBManager manager) throws Exception { ); final UriHttpAsyncRequestHandlerMapper registry = new UriHttpAsyncRequestHandlerMapper(); - // registry.register(ExaremeGatewayUtils.GW_API_FILE, new HttpAsyncFileHandler()); + registry.register(ExaremeGatewayUtils.GW_API_FILE, new HttpAsyncFileHandler()); registry.register(ExaremeGatewayUtils.GW_API_QUERY, new HttpAsyncQueryHandler()); registry.register(ExaremeGatewayUtils.GW_API_TABLE, new HttpAsyncTableHandler()); registry.register(ExaremeGatewayUtils.GW_API_MINING_ALGORITHMS, new HttpAsyncMiningAlgorithmsHandler()); diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HttpAsyncFileHandler.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HttpAsyncFileHandler.java index 8251415c5..2fa7ca139 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HttpAsyncFileHandler.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HttpAsyncFileHandler.java @@ -63,7 +63,6 @@ private void handleInternal( String target = request.getRequestLine().getUri(); final File file = new File(this.docRoot, URLDecoder.decode(target, "UTF-8")); if (!file.exists()) { - response.setStatusCode(HttpStatus.SC_NOT_FOUND); NStringEntity entity = new NStringEntity( "

File" + @@ -71,7 +70,6 @@ private void handleInternal( ContentType.create("text/html", "UTF-8")); response.setEntity(entity); System.out.println("File " + file.getPath() + " not found"); - } else if (!file.canRead() || file.isDirectory() || !file.getCanonicalPath().startsWith(this.docRoot.getCanonicalPath())) { response.setStatusCode(HttpStatus.SC_FORBIDDEN); From 87912faa8238ef9a20db9eea1ea41cf108631bbf Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Wed, 13 Jan 2021 14:06:13 +0200 Subject: [PATCH 15/22] Multiple fixes on Exareme for stability. 1) Bootstrap pinging instead of curl to check that master is reachable. 2) Result Entities have a separate close session. The close() function was being called by the framework as well. 3) Registry update daemons used only if lifetime > 0. --- Exareme-Docker/Dockerfile | 2 +- .../files/root/exareme/bootstrap.sh | 86 +++++++------------ .../client/rmi/RmiAdpDBClientQueryStatus.java | 2 +- .../executor/remote/AdpDBArtJobMonitor.java | 34 +++++--- .../NIterativeAlgorithmResultEntity.java | 11 ++- .../engine/statusMgr/AdpDBJobSession.java | 1 + .../async/handler/HBP/HBPQueryHandler.java | 2 + .../handler/HttpAsyncDecomposerHandler.java | 1 + .../async/handler/HttpAsyncFileHandler.java | 2 +- .../handler/entity/NQueryResultEntity.java | 26 +++--- .../handler/entity/NQueryStatusEntity.java | 1 - .../control/handler/HttpAsyncCheckWorker.java | 7 +- .../src/main/resources/art.properties | 2 +- .../worker/arm/compute/rmi/RmiArmCompute.java | 6 +- .../art/container/rmi/RmiContainer.java | 6 +- .../DynamicPlanManager.java | 12 ++- .../DynamicStatusManager.java | 3 + .../PlanTerminationEventHandler.java | 6 +- ...atorElasticTreeTerminatedEventHandler.java | 1 + .../OperatorGroupTerminatedEventHandler.java | 3 +- .../rmi/RmiExecutionEngine.java | 7 +- .../rmi/RmiPlanSessionStatusManager.java | 1 - .../session/ExecutionEngineSessionPlan.java | 2 +- .../worker/art/remote/RmiObjectProxy.java | 2 + 24 files changed, 114 insertions(+), 112 deletions(-) diff --git a/Exareme-Docker/Dockerfile b/Exareme-Docker/Dockerfile index f36cd5db3..fd7db2fcc 100644 --- a/Exareme-Docker/Dockerfile +++ b/Exareme-Docker/Dockerfile @@ -14,7 +14,7 @@ RUN apt install -y --no-install-recommends python2 RUN ln -s /usr/bin/python2 /usr/bin/python # Installing Exareme requirements -RUN apt install -y openjdk-8-jdk curl jq +RUN apt install -y openjdk-8-jdk curl jq iputils-ping # Installing pip RUN curl -O https://raw.githubusercontent.com/pypa/get-pip/master/get-pip.py diff --git a/Exareme-Docker/files/root/exareme/bootstrap.sh b/Exareme-Docker/files/root/exareme/bootstrap.sh index c79d4a9e2..e4449628b 100755 --- a/Exareme-Docker/files/root/exareme/bootstrap.sh +++ b/Exareme-Docker/files/root/exareme/bootstrap.sh @@ -12,10 +12,11 @@ export CONSUL_ACTIVE_WORKERS_PATH="active_workers" CONSUL_CONNECTION_MAX_ATTEMPTS=20 CONSUL_WAIT_FOR_MASTER_IP_MAX_ATTEMPTS=20 EXAREME_NODE_STARTUP_HEALTH_CHECK_MAX_ATTEMPTS=10 -EXAREME_NODE_HEALTH_CHECK_TIMEOUT=30 +EXAREME_NODE_HEALTH_CHECK_TIMEOUT=60 MASTER_NODE_REACHABLE_TIMEOUT=5 PERIODIC_EXAREME_NODES_HEALTH_CHECK_MAX_RETRIES=3 PERIODIC_EXAREME_NODES_HEALTH_CHECK_INTERVAL=120 +EXAREME_HEALTH_CHECK_AWAIT_TIME=20 PERIODIC_TEMP_FILES_REMOVAL=300 if [[ -z ${CONSULURL} ]]; then @@ -117,7 +118,7 @@ exaremeNodesHealthCheck() { echo "$(timestamp) HEALTH CHECK for node with IP ${NODE_IP} and name ${NODE_NAME} ." if [[ "${FEDERATION_ROLE}" == "master" ]]; then - check=$(curl -s -X POST --max-time ${EXAREME_NODE_HEALTH_CHECK_TIMEOUT} ${NODE_IP}:9090/mining/query/HEALTH_CHECK) + check=$(curl -s --max-time ${EXAREME_NODE_HEALTH_CHECK_TIMEOUT} "${NODE_IP}:9092/check/worker?NODE_IP=${NODE_IP}&NODE_NAME=${NODE_NAME}") else check=$(curl -s --max-time ${EXAREME_NODE_HEALTH_CHECK_TIMEOUT} "${MASTER_IP}:9092/check/worker?NODE_IP=${NODE_IP}&NODE_NAME=${NODE_NAME}") fi @@ -142,36 +143,19 @@ exaremeNodesHealthCheck() { return 0 } -# Health check that the MASTER is reachable (ping) -exaremeNodesReachableMasterHealthCheck() { - if [[ ${ENVIRONMENT_TYPE} != "PROD" ]]; then - return 0 - fi - - check=$(curl -s --head ${MASTER_IP}:9090 --max-time ${MASTER_NODE_REACHABLE_TIMEOUT}) - - if [[ -z ${check} ]]; then - return 1 - fi - - return 0 -} - # Exareme health check on startup startupExaremeNodesHealthCheck() { # If health check fails then try again until it succeeds or close the container. - if ! exaremeNodesHealthCheck; then - attempts=0 - while ! exaremeNodesHealthCheck; do - if [[ $attempts -ge $EXAREME_NODE_STARTUP_HEALTH_CHECK_MAX_ATTEMPTS ]]; then - echo -e "\n$(timestamp) HEALTH CHECK FAILED. Closing the container." - return 1 # Exiting - fi - echo "$(timestamp) HEALTH CHECK failed. Trying again..." - attempts=$(($attempts + 1)) - sleep 5 - done - fi + attempts=0 + while ! exaremeNodesHealthCheck; do + if [[ $attempts -ge $EXAREME_NODE_STARTUP_HEALTH_CHECK_MAX_ATTEMPTS ]]; then + echo -e "\n$(timestamp) HEALTH CHECK FAILED. Closing the container." + return 1 # Exiting + fi + echo "$(timestamp) HEALTH CHECK failed. Trying again..." + attempts=$(($attempts + 1)) + sleep $EXAREME_HEALTH_CHECK_AWAIT_TIME + done echo "$(timestamp) HEALTH CHECK successful on NODE_IP: $NODE_IP" return 0 } @@ -189,18 +173,16 @@ periodicExaremeNodesHealthCheck() { sleep $PERIODIC_EXAREME_NODES_HEALTH_CHECK_INTERVAL # If health check fails then try again until it succeeds or close the container. - if ! exaremeNodesHealthCheck; then - attempts=0 - while ! exaremeNodesHealthCheck; do - if [[ $attempts -ge $PERIODIC_EXAREME_NODES_HEALTH_CHECK_MAX_RETRIES ]]; then - echo -e "\n$(timestamp) HEALTH CHECK FAILED. Closing the container." - pkill -f 1 # Closing main bootstrap.sh process to stop the container. - fi - echo "$(timestamp) HEALTH CHECK failed. Trying again..." - attempts=$(($attempts + 1)) - sleep 5 - done - fi + attempts=0 + while ! exaremeNodesHealthCheck; do + if [[ $attempts -ge $PERIODIC_EXAREME_NODES_HEALTH_CHECK_MAX_RETRIES ]]; then + echo -e "\n$(timestamp) HEALTH CHECK FAILED. Closing the container." + pkill -f 1 # Closing main bootstrap.sh process to stop the container. + fi + echo "$(timestamp) HEALTH CHECK failed. Trying again..." + attempts=$(($attempts + 1)) + sleep $EXAREME_HEALTH_CHECK_AWAIT_TIME + done echo "$(timestamp) HEALTH CHECK successful on NODE_IP: $NODE_IP" done } @@ -213,23 +195,15 @@ periodicReachableMasterNodeCheck() { pkill -f 1 # Closing main bootstrap.sh process to stop the container. fi - # Check that master is reachable every 2 seconds. + # Check that master is reachable every 5 seconds. while true; do - sleep 2 + sleep 5 + + if ! ping -c 5 -W 2 $MASTER_IP &>/dev/null ; then + echo -e "\n$(timestamp) MASTER NODE IS UNREACHABLE. Closing the container." + pkill -f 1 # Closing main bootstrap.sh process to stop the container. + fi - # If master node isn't reachable, close the container - if ! exaremeNodesReachableMasterHealthCheck; then - attempts=0 - while ! exaremeNodesReachableMasterHealthCheck; do - if [[ $attempts -ge 1 ]]; then - echo -e "\n$(timestamp) HEALTH CHECK FAILED. MASTER NODE IS UNREACHABLE. Closing the container." - pkill -f 1 # Closing main bootstrap.sh process to stop the container. - fi - echo -e "\n$(timestamp) HEALTH CHECK FAILED. MASTER NODE IS UNREACHABLE. Trying again..." - attempts=$(($attempts + 1)) - sleep 2 - done - fi #echo "$(timestamp) HEALTH CHECK successful from NODE_IP: $NODE_IP, MASTER NODE IS REACHABLE" done } diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/client/rmi/RmiAdpDBClientQueryStatus.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/client/rmi/RmiAdpDBClientQueryStatus.java index c6193488d..a692c7a8d 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/client/rmi/RmiAdpDBClientQueryStatus.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/client/rmi/RmiAdpDBClientQueryStatus.java @@ -63,7 +63,7 @@ public boolean hasFinished() throws RemoteException { try { String algorithmResult = IOUtils.toString(getResult(DataSerialization.summary), StandardCharsets.UTF_8); - log.info("Algorithm with queryId" + getQueryID() + " terminated. Result: \n " + algorithmResult); + log.info("Algorithm with queryId " + getQueryID().getQueryID() + " terminated. Result: \n " + algorithmResult); } catch (IOException e) { log.error("Could not read the algorithm result table." + getQueryID()); } diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java index d02c2de65..e0027ad6e 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java @@ -59,21 +59,12 @@ public void run() { statusManager.getStatistics(status.getId()).setTotalOperators(stats.getTotalProc()); statusManager.getStatistics(status.getId()).setTotalDataTransfers(stats.getTotalData()); - while (sessionManager.hasFinished() == false && sessionManager.hasError() == false) { - - Thread.sleep(1000 * statsUpdateSecs); - - // Was causing crashes, is this needed? - // sessionManager = sessionPlan.getPlanSessionStatusManagerProxy(); - // statsManager = sessionPlan.getPlanSessionStatisticsManagerProxy(); - - if (sessionManager == null || statsManager == null) { - log.error("Session Manager or stats Manager null! " + sessionManager + ", " + statsManager); - } + while (!sessionManager.hasFinished() && !sessionManager.hasError()) { boolean updateProgressStatistics = updateProgressStatistics(); if (updateProgressStatistics) { - log.info("Session is running..."); + log.info("Session is running... ID: " + sessionPlan.getSessionID().getLongId() + + " , QueryID: " + queryID.getQueryID()); log.debug("Update listeners ..."); synchronized (listeners) { for (AdpDBQueryListener l : listeners) { @@ -82,21 +73,36 @@ public void run() { } } } + + Thread.sleep(100 * statsUpdateSecs); + + // Reload the managers + sessionManager = sessionPlan.getPlanSessionStatusManagerProxy(); + statsManager = sessionPlan.getPlanSessionStatisticsManagerProxy(); + if (sessionManager == null || statsManager == null) { + log.error("Session Manager or stats Manager null! " + sessionManager + ", " + statsManager); + } } updateProgressStatistics(); statusManager.getStatistics(status.getId()) .setAdpEngineStatistics(statsManager.getStatistics()); - if (sessionManager != null && sessionManager.hasError() == false) { + if (sessionManager != null && !sessionManager.hasError()) { + log.info("Session finished, closing! ID: " + sessionPlan.getSessionID().getLongId() + + " , QueryID: " + queryID.getQueryID()); statusManager.setFinished(status.getId()); } else { statusManager.setError(status.getId(), sessionManager.getErrorList().get(0)); } + log.info("Session closing! ID: "+ sessionPlan.getSessionID().getLongId() + + " , QueryID: " + queryID.getQueryID()); sessionPlan.close(); + } catch (Exception e) { statusManager.setError(status.getId(), e); - log.error("Cannot monitor job, status: " + status.getId(), e); + log.error("Cannot monitor job, sessionID: " + sessionPlan.getSessionID().getLongId()); + log.error("Cannot monitor job, queryID: " + status.getQueryID().getQueryID(), e); } finally { log.debug("Terminate listeners ( " + listeners.size() + ")..."); synchronized (listeners) { diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/iterations/handler/NIterativeAlgorithmResultEntity.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/iterations/handler/NIterativeAlgorithmResultEntity.java index 38d4aef95..0d4c136c6 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/iterations/handler/NIterativeAlgorithmResultEntity.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/iterations/handler/NIterativeAlgorithmResultEntity.java @@ -168,11 +168,13 @@ public void produceContent(ContentEncoder encoder, IOControl ioctrl) throws IOEx } finally { if (iterativeAlgorithmState != null) iterativeAlgorithmState.releaseLock(); + + closeQuery(); + close(); } } - @Override - public void close() throws IOException { + public void closeQuery() throws IOException { if (finalizeQueryStatus != null) { // Case in which algorithm execution failed finalizeQueryStatus.close(); @@ -181,6 +183,11 @@ public void close() throws IOException { iterativeAlgorithmState = null; } + @Override + public void close() throws IOException { + + } + @Override public boolean isRepeatable() { return false; diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/statusMgr/AdpDBJobSession.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/statusMgr/AdpDBJobSession.java index c5de13bf1..990e4a499 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/statusMgr/AdpDBJobSession.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/statusMgr/AdpDBJobSession.java @@ -11,6 +11,7 @@ import org.apache.log4j.Logger; import java.rmi.RemoteException; +import java.util.Arrays; import java.util.Map; /** diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HBP/HBPQueryHandler.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HBP/HBPQueryHandler.java index bbc7c6b95..5cf1a1f70 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HBP/HBPQueryHandler.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HBP/HBPQueryHandler.java @@ -159,6 +159,8 @@ private void handleHBPAlgorithmExecution(HttpRequest request, HttpResponse respo AdpDBClientFactory.createDBClient(manager, clientProperties); queryStatus = dbClient.query(algorithmKey, dfl); + log.info("Executing algorithm " + algorithmKey + + " started with queryId " + queryStatus.getQueryID().getQueryID()); log.debug("Algorithm " + algorithmKey + " with queryID " + queryStatus.getQueryID() + " execution started. DFL Script: \n " + dfl); diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HttpAsyncDecomposerHandler.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HttpAsyncDecomposerHandler.java index 17ca63a0f..8096aa352 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HttpAsyncDecomposerHandler.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HttpAsyncDecomposerHandler.java @@ -465,6 +465,7 @@ public void run() { if (timeoutMs > 0) { long timePassed = System.currentTimeMillis() - start; if (timePassed > timeoutMs) { + log.info("Closing session!!!!"); status.close(); log.warn("Time out:" + timeoutMs + " ms passed"); throw new RuntimeException("Time out:" + timeoutMs + " ms passed"); diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HttpAsyncFileHandler.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HttpAsyncFileHandler.java index 2fa7ca139..182198318 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HttpAsyncFileHandler.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HttpAsyncFileHandler.java @@ -77,7 +77,7 @@ private void handleInternal( "

Access denied

", ContentType.create("text/html", "UTF-8")); response.setEntity(entity); - System.out.println("Cannot read file " + file.getPath()); + log.debug("Cannot read file " + file.getPath()); } else if (file.getPath().endsWith(".ser")) { HttpCoreContext coreContext = HttpCoreContext.adapt(context); diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/entity/NQueryResultEntity.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/entity/NQueryResultEntity.java index 0fa7a03fc..9af069e61 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/entity/NQueryResultEntity.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/entity/NQueryResultEntity.java @@ -40,7 +40,6 @@ public NQueryResultEntity(AdpDBClientQueryStatus status, DataSerialization ds, format = ds; } - @Override public void produceContent(ContentEncoder encoder, IOControl iocontrol) throws IOException { @@ -65,9 +64,7 @@ public void produceContent(ContentEncoder encoder, IOControl iocontrol) this.buffer.compact(); if (i < 1 && !buffering) { encoder.complete(); - close(); } - } else { log.trace("|" + queryStatus.getError() + "|"); if (queryStatus.getError().contains("ExaremeError:")) { @@ -77,7 +74,6 @@ public void produceContent(ContentEncoder encoder, IOControl iocontrol) logErrorMessage(result); encoder.write(ByteBuffer.wrap(result.getBytes())); encoder.complete(); - close(); } else if (queryStatus.getError().contains("PrivacyError")) { String data = "The Experiment could not run with the input provided because there are insufficient data."; //type could be error, user_error, warning regarding the error occurred along the process @@ -85,7 +81,6 @@ public void produceContent(ContentEncoder encoder, IOControl iocontrol) logErrorMessage(result); encoder.write(ByteBuffer.wrap(result.getBytes())); encoder.complete(); - close(); } else if (queryStatus.getError().contains("java.rmi.RemoteException")) { String data = "One or more containers are not responding. Please inform the system administrator."; //type could be error, user_error, warning regarding the error occurred along the process @@ -93,7 +88,6 @@ public void produceContent(ContentEncoder encoder, IOControl iocontrol) logErrorMessage(result); encoder.write(ByteBuffer.wrap(result.getBytes())); encoder.complete(); - close(); } else { log.info("Exception from madis: " + queryStatus.getError()); String data = "Something went wrong. Please inform the system administrator."; @@ -102,22 +96,28 @@ public void produceContent(ContentEncoder encoder, IOControl iocontrol) logErrorMessage(result); encoder.write(ByteBuffer.wrap(result.getBytes())); encoder.complete(); - close(); } } + closeQuery(); + close(); } - private void logErrorMessage(String error) { - log.info("Algorithm exited with error and returned:\n " + error); + @Override + public boolean isRepeatable() { + return false; } - @Override - public void close() throws IOException { + public void closeQuery() throws IOException { + log.info("Closing from Query Result : " + queryStatus.getQueryID().getQueryID()); queryStatus.close(); } @Override - public boolean isRepeatable() { - return false; + public void close() throws IOException { + + } + + private void logErrorMessage(String error) { + log.info("Algorithm exited with error and returned:\n " + error); } } diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/entity/NQueryStatusEntity.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/entity/NQueryStatusEntity.java index e595c7689..2986eccd6 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/entity/NQueryStatusEntity.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/entity/NQueryStatusEntity.java @@ -84,7 +84,6 @@ public QueryStatusListener(IOControl ioctrl) { @Override public void statusChanged(AdpDBQueryID queryID, AdpDBStatus status) { ioctl.requestOutput(); - } @Override diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/control/handler/HttpAsyncCheckWorker.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/control/handler/HttpAsyncCheckWorker.java index bff9059b8..c25aad68c 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/control/handler/HttpAsyncCheckWorker.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/control/handler/HttpAsyncCheckWorker.java @@ -70,7 +70,6 @@ private void handleInternal( if (!method.equals("GET") && !method.equals("HEAD") && !method.equals("POST")) { throw new MethodNotSupportedException(method + " method not supported"); } - AdpDBClientQueryStatus queryStatus; String NODE_IP = null; String NODE_NAME = null; DataSerialization ds = DataSerialization.summary; @@ -135,7 +134,11 @@ private void handleInternal( clientProperties.setContainerProxies(usedContainerProxies); AdpDBClient dbClient = AdpDBClientFactory.createDBClient(manager, clientProperties); - queryStatus = dbClient.query(algorithmKey, dfl); + + AdpDBClientQueryStatus queryStatus = dbClient.query(algorithmKey, dfl); + log.info("Executing algorithm " + algorithmKey + + " started with queryId " + queryStatus.getQueryID().getQueryID()); + BasicHttpEntity entity = new NQueryResultEntity(queryStatus, ds, ExaremeGatewayUtils.RESPONSE_BUFFER_SIZE); response.setStatusCode(HttpStatus.SC_OK); diff --git a/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/art.properties b/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/art.properties index 22bd9ac71..1638ffb09 100644 --- a/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/art.properties +++ b/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/art.properties @@ -34,7 +34,7 @@ art.container.ioBufferSize_kb=512 # Execution engine # -------------------------------------------------------------------------------- # art.executionEngine.rmi.RmiExecutionEngine.lifetime=0 -art.executionEngine.forceSessionStopAfter_sec=10 +art.executionEngine.forceSessionStopAfter_sec=30 art.executionEngine.dataOperators.memory=2 # centralized = Use a single node # distributed = Use workers given the schedule diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/arm/compute/rmi/RmiArmCompute.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/arm/compute/rmi/RmiArmCompute.java index e38f90758..33d05976c 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/arm/compute/rmi/RmiArmCompute.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/arm/compute/rmi/RmiArmCompute.java @@ -63,9 +63,9 @@ public RmiArmCompute(ContainerManagerInterface containerManagerInterface, long lifeTime = AdpProperties.getArmProps().getLong("arm.compute.rmi.RmiArmCompute.lifetime"); - if(lifeTime == 0) { - registryUpdateDeamon = - RegistryUpdateDeamonFactory.createDeamon(this.createProxy(), (long) (0.75 * lifeTime)); + registryUpdateDeamon = + RegistryUpdateDeamonFactory.createDeamon(this.createProxy(), (long) (0.75 * lifeTime)); + if(lifeTime != 0) { registryUpdateDeamon.startDeamon(); } } diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/rmi/RmiContainer.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/rmi/RmiContainer.java index 45a6c8b7f..408111b7d 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/rmi/RmiContainer.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/rmi/RmiContainer.java @@ -138,9 +138,9 @@ public RmiContainer(String containerName, ConcreteOperatorManagerInterface cOpMn log.debug("Create update deamon ..."); long lifeTime = AdpProperties.getArtProps().getLong("art.container.rmi.RmiContainer.lifetime"); - if(lifeTime == 0) { - registryUpdateDeamon = - RegistryUpdateDeamonFactory.createDeamon(this.createProxy(), (long) (0.75 * lifeTime)); + registryUpdateDeamon = + RegistryUpdateDeamonFactory.createDeamon(this.createProxy(), (long) (0.75 * lifeTime)); + if(lifeTime != 0) { registryUpdateDeamon.startDeamon(); } diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/DynamicPlanManager.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/DynamicPlanManager.java index 73d1071c6..2660d4181 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/DynamicPlanManager.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/DynamicPlanManager.java @@ -19,6 +19,7 @@ import java.rmi.RemoteException; import java.rmi.ServerException; +import java.util.Arrays; import java.util.HashMap; import java.util.LinkedList; import java.util.Map; @@ -85,6 +86,7 @@ public void createGlobalScheduler() throws RemoteException { @Override public PlanSessionID createNewSession() throws RemoteException { + // TODO Is this thread safe? PlanSessionID sessionID = new PlanSessionID(sessionCount); PlanSessionReportID reportID = new PlanSessionReportID(sessionCount); reportID.reportManagerProxy = executionEngine.getPlanSessionReportManagerProxy(reportID); @@ -118,23 +120,27 @@ public ContainerSessionID createContainerSession(PlanSessionID planSessionID) @Override public void destroySession(PlanSessionID sessionID) throws RemoteException { + log.info("Inside Destroying session with ID: " + sessionID.getLongId()); try { PlanEventScheduler eventScheduler = schedulerMap.get(sessionID); IndependentEvents jobs = new IndependentEvents(eventScheduler.getState()); eventScheduler.closeSession(jobs); eventScheduler.queueIndependentEvents(jobs); Semaphore sem = new Semaphore(0); - if (eventScheduler.getState().isTerminated() == false) { + if (!eventScheduler.getState().isTerminated()) { + log.info("State not yet terminated. " + sessionID.getLongId()); eventScheduler.getState() .registerTerminationListener(new SemaphoreTerminationListener(sem)); log.debug( "Waiting '" + forceSessionStopAfter_sec + "' seconds for session to stop ..."); boolean stopped = sem.tryAcquire(forceSessionStopAfter_sec, TimeUnit.SECONDS); - if (stopped == false) { - log.warn("Force stop!"); + if (!stopped) { + log.error("Force stop! SessionID: " + sessionID.getLongId() + "\n" + Arrays.toString(Thread.currentThread().getStackTrace()).concat("\n")); } } + + log.info("Destroying session with ID: " + sessionID.getLongId()); PlanSessionReportID reportID = eventScheduler.getState().getPlanSessionReportID(); schedulerMap.remove(sessionID); containerSessionMap.remove(sessionID); diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/DynamicStatusManager.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/DynamicStatusManager.java index 0d3020e57..89a6a5efa 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/DynamicStatusManager.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/DynamicStatusManager.java @@ -27,6 +27,9 @@ public DynamicStatusManager() { @Override public boolean hasFinished(PlanSessionID sessionID) throws RemoteException { PlanEventScheduler eventScheduler = getSchedulerWithId(sessionID); + if(eventScheduler == null){ + log.error("Scheduler does not exist with SessionID: " + sessionID.getLongId() ); + } return eventScheduler.getState().getPlanSession().getPlanSessionStatus().hasFinished(); } diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/planTermination/PlanTerminationEventHandler.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/planTermination/PlanTerminationEventHandler.java index 9f90c891c..364e011fb 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/planTermination/PlanTerminationEventHandler.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/planTermination/PlanTerminationEventHandler.java @@ -28,13 +28,11 @@ public PlanTerminationEventHandler() { @Override public void preProcess(PlanTerminationEvent event, PlanEventSchedulerState state) throws RemoteException { - if (state.isTerminated() == false) { + if (!state.isTerminated()) { for (ContainerProxy proxy : state.getContainerProxies()) { try { - log.debug("closing session of container : " + proxy.getEntityName().getName()); proxy.destroySessions(state.getPlanSessionID()); - } catch (RemoteException e) { // state.addException(e); // throw new ServerException("Cannot close all sessions", e); @@ -54,7 +52,7 @@ public void preProcess(PlanTerminationEvent event, PlanEventSchedulerState state state.terminationListeners.clear(); log.debug("Triggered " + listenerCount + " listeners!"); } - if (state.isTerminated() == false) { + if (!state.isTerminated()) { state.setTerminated(true); state.getPlanSession().getPlanSessionStatus().setFinished(new Date()); } diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/terminated/OperatorElasticTreeTerminatedEventHandler.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/terminated/OperatorElasticTreeTerminatedEventHandler.java index 5e9a29492..6a0ed92ab 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/terminated/OperatorElasticTreeTerminatedEventHandler.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/terminated/OperatorElasticTreeTerminatedEventHandler.java @@ -100,6 +100,7 @@ public void preProcess(OperatorTerminatedEvent event, PlanEventSchedulerState st state.eventScheduler.queueIndependentEvents(termJobs); IndependentEvents closeJobs = new IndependentEvents(state); + log.info("Operators finished (2), Closing session!: " + state.getPlanSessionID().getLongId()); state.eventScheduler.closeSession(closeJobs); state.eventScheduler.queueIndependentEvents(closeJobs); diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/terminated/OperatorGroupTerminatedEventHandler.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/terminated/OperatorGroupTerminatedEventHandler.java index e78773588..0252b0041 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/terminated/OperatorGroupTerminatedEventHandler.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/terminated/OperatorGroupTerminatedEventHandler.java @@ -71,7 +71,7 @@ public void preProcess(OperatorTerminatedEvent event, PlanEventSchedulerState st activeOperator.exitMessage = event.exitMessage; activeOperator.exitDate = new Date(); // Check if the group has terminated - if ((activeGroup.hasError == false) && group.hasTerminated) { + if (!activeGroup.hasError && group.hasTerminated) { log.trace("Operator Group Terminated: " + group.toString()); state.groupDependencySolver().setTerminated(group); // Close the container sessions @@ -90,6 +90,7 @@ public void preProcess(OperatorTerminatedEvent event, PlanEventSchedulerState st state.eventScheduler.queueIndependentEvents(termJobs); IndependentEvents closeJobs = new IndependentEvents(state); + log.info("Operators finished (1), Closing session! ID: " + state.getPlanSessionID().getLongId()); state.eventScheduler.closeSession(closeJobs); state.eventScheduler.queueIndependentEvents(closeJobs); } diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiExecutionEngine.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiExecutionEngine.java index 7d9c248a8..e902941d2 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiExecutionEngine.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiExecutionEngine.java @@ -94,10 +94,9 @@ public RmiExecutionEngine(PlanSessionManagerInterface sessionManagerInterface, long lifeTime = AdpProperties.getArtProps() .getLong("art.executionEngine.rmi.RmiExecutionEngine.lifetime"); - if(lifeTime == 0) { - registryUpdateDeamon = - RegistryUpdateDeamonFactory.createDeamon(this.createProxy(), (long) (0.75 * lifeTime)); - + registryUpdateDeamon = + RegistryUpdateDeamonFactory.createDeamon(this.createProxy(), (long) (0.75 * lifeTime)); + if(lifeTime != 0) { registryUpdateDeamon.startDeamon(); } } diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiPlanSessionStatusManager.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiPlanSessionStatusManager.java index e1c600e94..82eb38b12 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiPlanSessionStatusManager.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/rmi/RmiPlanSessionStatusManager.java @@ -48,7 +48,6 @@ public boolean hasError(PlanSessionID sessionID) throws RemoteException { } public boolean hasFinished(PlanSessionID sessionID) throws RemoteException { - return statusManagerInterface.hasFinished(sessionID); } diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/session/ExecutionEngineSessionPlan.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/session/ExecutionEngineSessionPlan.java index f3b595766..b5a99e1af 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/session/ExecutionEngineSessionPlan.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/session/ExecutionEngineSessionPlan.java @@ -94,7 +94,7 @@ public PlanSessionStatisticsManagerProxy getPlanSessionStatisticsManagerProxy() public void close() throws RemoteException { lock.lock(); try { - if (isClosed == false) { + if (!isClosed) { engine.destroySession(sessionID); isClosed = true; } diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/remote/RmiObjectProxy.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/remote/RmiObjectProxy.java index a0b156039..fe3ba54a9 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/remote/RmiObjectProxy.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/remote/RmiObjectProxy.java @@ -41,6 +41,8 @@ public synchronized T connect() throws RemoteException { Registry registry = RmiRegistryCache.getRegistry(regEntityName); remoteObject = (T) registry.lookup(regEntryName); isConnected = true; + log.debug("Connected to " + + regEntityName.getIP() + ":" + regEntityName.getPort() + " ..."); return remoteObject; } catch (Exception e) { log.error("Cannot connect to " + From 8fedf3ac330546700413bbb69c503ccaf98b825f Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Thu, 14 Jan 2021 12:45:44 +0200 Subject: [PATCH 16/22] Proxy renewal reenabled. --- .../exareme/exareme-utils/src/main/resources/arm.properties | 2 +- .../exareme/exareme-utils/src/main/resources/art.properties | 4 ++-- .../CloseContainerSessionEventHandler.java | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/arm.properties b/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/arm.properties index 03bb832d9..c6f19b45c 100644 --- a/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/arm.properties +++ b/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/arm.properties @@ -5,7 +5,7 @@ # -------------------------------------------------------------------------------- # arm.rmi.RmiResourceMediatorProxy.lifetime=30000 arm.rmi.RmiResourceMediator.lifetime=30000 -arm.compute.rmi.RmiArmCompute.lifetime=0 +arm.compute.rmi.RmiArmCompute.lifetime=30000 arm.cloud.OpenNebulaResourceMediatorInterface.ip= arm.cloud.OpenNebulaResourceMediatorInterface.port= # Storage diff --git a/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/art.properties b/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/art.properties index 1638ffb09..5e2e1bfc8 100644 --- a/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/art.properties +++ b/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/art.properties @@ -16,7 +16,7 @@ art.registry.rmi.RmiArtRegistryProxy.cacheSize=10 # -------------------------------------------------------------------------------- # art.container.rmi.RmiContainer.defaultID=0 art.container.thread.OperatorMonitorThread.IsAlivePeriod=1 -art.container.rmi.RmiContainer.lifetime=0 +art.container.rmi.RmiContainer.lifetime=30000 art.container.maxStatusReportPeriod=1000 # Adaptor implementation [rmi, socket] (socket is very efficient with big data). art.container.adaptor.impl=socket @@ -33,7 +33,7 @@ art.container.zipBufferSize_kb=16 art.container.ioBufferSize_kb=512 # Execution engine # -------------------------------------------------------------------------------- # -art.executionEngine.rmi.RmiExecutionEngine.lifetime=0 +art.executionEngine.rmi.RmiExecutionEngine.lifetime=30000 art.executionEngine.forceSessionStopAfter_sec=30 art.executionEngine.dataOperators.memory=2 # centralized = Use a single node diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/closeContainerSession/CloseContainerSessionEventHandler.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/closeContainerSession/CloseContainerSessionEventHandler.java index f091b7c7f..19e9cda93 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/closeContainerSession/CloseContainerSessionEventHandler.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/closeContainerSession/CloseContainerSessionEventHandler.java @@ -85,7 +85,7 @@ public GetStatsAndCloseSession(ContainerSession session) { @Override public void run() { try { - log.trace("Closing session: " + session.getSessionID().getLongId()); + log.info("Closing session: " + session.getSessionID().getLongId()); ContainerJobs jobs = new ContainerJobs(); jobs.addJob(GetStatisticsJob.instance); results = session.execJobs(jobs); From 28f66652b744c10f735e578b5004b3d2404150eb Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Thu, 14 Jan 2021 14:16:24 +0200 Subject: [PATCH 17/22] Max threads on pools reduced to a much lowed number. --- .../async/handler/entity/NQueryResultEntity.java | 6 ++++-- .../exareme-utils/src/main/resources/arm.properties | 2 +- .../exareme-utils/src/main/resources/art.properties | 4 ++-- .../worker/art/container/ContainerSession.java | 12 +++++++++++- .../dataTransfer/rest/RestDataTransferGateway.java | 2 +- .../worker/art/container/rmi/RmiContainer.java | 1 + .../art/executionEngine/ExecEngineConstants.java | 2 +- .../CloseContainerSessionEventHandler.java | 11 ++++++----- .../containerJobs/ContainerJobsEventListener.java | 2 +- .../event/independent/IndependentEventsListener.java | 2 +- 10 files changed, 29 insertions(+), 15 deletions(-) diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/entity/NQueryResultEntity.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/entity/NQueryResultEntity.java index 9af069e61..649d6a632 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/entity/NQueryResultEntity.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/entity/NQueryResultEntity.java @@ -64,6 +64,8 @@ public void produceContent(ContentEncoder encoder, IOControl iocontrol) this.buffer.compact(); if (i < 1 && !buffering) { encoder.complete(); + closeQuery(); + close(); } } else { log.trace("|" + queryStatus.getError() + "|"); @@ -97,9 +99,9 @@ public void produceContent(ContentEncoder encoder, IOControl iocontrol) encoder.write(ByteBuffer.wrap(result.getBytes())); encoder.complete(); } + closeQuery(); + close(); } - closeQuery(); - close(); } @Override diff --git a/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/arm.properties b/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/arm.properties index c6f19b45c..3bb9d8f32 100644 --- a/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/arm.properties +++ b/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/arm.properties @@ -5,7 +5,7 @@ # -------------------------------------------------------------------------------- # arm.rmi.RmiResourceMediatorProxy.lifetime=30000 arm.rmi.RmiResourceMediator.lifetime=30000 -arm.compute.rmi.RmiArmCompute.lifetime=30000 +arm.compute.rmi.RmiArmCompute.lifetime=100000 arm.cloud.OpenNebulaResourceMediatorInterface.ip= arm.cloud.OpenNebulaResourceMediatorInterface.port= # Storage diff --git a/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/art.properties b/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/art.properties index 5e2e1bfc8..6335f0fcd 100644 --- a/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/art.properties +++ b/Exareme-Docker/src/exareme/exareme-utils/src/main/resources/art.properties @@ -16,7 +16,7 @@ art.registry.rmi.RmiArtRegistryProxy.cacheSize=10 # -------------------------------------------------------------------------------- # art.container.rmi.RmiContainer.defaultID=0 art.container.thread.OperatorMonitorThread.IsAlivePeriod=1 -art.container.rmi.RmiContainer.lifetime=30000 +art.container.rmi.RmiContainer.lifetime=100000 art.container.maxStatusReportPeriod=1000 # Adaptor implementation [rmi, socket] (socket is very efficient with big data). art.container.adaptor.impl=socket @@ -33,7 +33,7 @@ art.container.zipBufferSize_kb=16 art.container.ioBufferSize_kb=512 # Execution engine # -------------------------------------------------------------------------------- # -art.executionEngine.rmi.RmiExecutionEngine.lifetime=30000 +art.executionEngine.rmi.RmiExecutionEngine.lifetime=100000 art.executionEngine.forceSessionStopAfter_sec=30 art.executionEngine.dataOperators.memory=2 # centralized = Use a single node diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/ContainerSession.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/ContainerSession.java index 3fdd537c3..2124c3739 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/ContainerSession.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/ContainerSession.java @@ -5,6 +5,8 @@ import madgik.exareme.common.art.ContainerSessionID; import madgik.exareme.common.art.PlanSessionID; +import madgik.exareme.worker.art.container.rmi.RmiContainer; +import org.apache.log4j.Logger; import java.io.Serializable; import java.rmi.RemoteException; @@ -19,6 +21,7 @@ * @since 1.0 */ public class ContainerSession implements Serializable { + private static final Logger log = Logger.getLogger(ContainerSession.class); private static final long serialVersionUID = 1L; private ContainerSessionID containerSessionID = null; @@ -42,7 +45,14 @@ public ContainerSessionID getContainerSessionID() { public ContainerJobResults execJobs(ContainerJobs jobs) throws RemoteException { jobs.setSession(containerSessionID, sessionID); - return containerProxy.getRemoteObject().execJobs(jobs); + log.info("Executing jobs for session ID: " + sessionID.getLongId()); + log.info("Executing " + jobs.getJobs().size() + " Jobs!"); + for (ContainerJob job : jobs.getJobs()) { + log.info("Job: " + job.getType().name() + " " + job.toString()); + } + ContainerJobResults results = containerProxy.getRemoteObject().execJobs(jobs); + log.info("Returning results for jobs from sessionID: " + sessionID.getLongId()); + return results; } public void closeSession() throws RemoteException { diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/dataTransfer/rest/RestDataTransferGateway.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/dataTransfer/rest/RestDataTransferGateway.java index a57d78b97..4dbbde1ba 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/dataTransfer/rest/RestDataTransferGateway.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/dataTransfer/rest/RestDataTransferGateway.java @@ -25,7 +25,7 @@ public class RestDataTransferGateway implements DataTransferGateway { private static final Logger log = Logger.getLogger(RestDataTransferGateway.class); // TODO(herald): this looks like a magic number! - private final int threads = 1000; + private final int threads = 20; private final int secondsToWait = 30; private String artRegistry = null; private ArtManager artManager = null; diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/rmi/RmiContainer.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/rmi/RmiContainer.java index 408111b7d..0fa5a297b 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/rmi/RmiContainer.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/rmi/RmiContainer.java @@ -170,6 +170,7 @@ public final ContainerProxy createProxy() throws RemoteException { public ContainerJobResults execJobs(ContainerJobs jobs) throws RemoteException { ContainerJobResults results = new ContainerJobResults(); ContainerJobResult result = null; + log.debug("Executing " + jobs.getJobs().size() + " Jobs!"); for (ContainerJob job : jobs.getJobs()) { log.debug("Executing Job: " + job.getType().name() + " " + job.toString()); if (job.getType() diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/ExecEngineConstants.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/ExecEngineConstants.java index 7d02e3179..0f001db2d 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/ExecEngineConstants.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/ExecEngineConstants.java @@ -19,7 +19,7 @@ public class ExecEngineConstants { public static final String PRAGMA_INTER_CONTAINER_DATA_TRANSFER = "inter_container_data_transfer"; - public static final int THREADS_PER_INDEPENDENT_TASKS = 1024; + public static final int THREADS_PER_INDEPENDENT_TASKS = 20; public static double DATA_TRANSFER_MEM = 0.0; private ExecEngineConstants() { diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/closeContainerSession/CloseContainerSessionEventHandler.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/closeContainerSession/CloseContainerSessionEventHandler.java index 19e9cda93..0136ceadf 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/closeContainerSession/CloseContainerSessionEventHandler.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/closeContainerSession/CloseContainerSessionEventHandler.java @@ -38,17 +38,16 @@ public CloseContainerSessionEventHandler() { public void preProcess(CloseContainerSessionEvent event, PlanEventSchedulerState state) throws RemoteException { try { - ExecutorService service = - Executors.newFixedThreadPool(ExecEngineConstants.THREADS_PER_INDEPENDENT_TASKS); ArrayList workers = new ArrayList(); List sessions = state.getContSessions(event.containerSessionID); + ExecutorService service = Executors.newFixedThreadPool(sessions.size()); for (ContainerSession session : sessions) { GetStatsAndCloseSession w = new GetStatsAndCloseSession(session); workers.add(w); service.submit(w); } service.shutdown(); - service.awaitTermination(1, TimeUnit.DAYS); + service.awaitTermination(2, TimeUnit.MINUTES); for (GetStatsAndCloseSession w : workers) { state.getStatistics().containerStats.add(w.stats.getStats()); } @@ -85,7 +84,7 @@ public GetStatsAndCloseSession(ContainerSession session) { @Override public void run() { try { - log.info("Closing session: " + session.getSessionID().getLongId()); + log.info("Closing session: " + session.getSessionID().getLongId() + " , " + this.toString()); ContainerJobs jobs = new ContainerJobs(); jobs.addJob(GetStatisticsJob.instance); results = session.execJobs(jobs); @@ -93,7 +92,9 @@ public void run() { session.closeSession(); } catch (RemoteException e) { exception = e; - log.error("Cannot close session", e); + log.error("Cannot close session " + session.getSessionID().getLongId(), e); + }finally{ + log.info("Closed session: " + session.getSessionID().getLongId() + " , " + this.toString()); } } } diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/containerJobs/ContainerJobsEventListener.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/containerJobs/ContainerJobsEventListener.java index 2b2da22dc..112dc65e5 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/containerJobs/ContainerJobsEventListener.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/containerJobs/ContainerJobsEventListener.java @@ -20,7 +20,7 @@ public class ContainerJobsEventListener implements EventListener Date: Fri, 15 Jan 2021 14:20:02 +0200 Subject: [PATCH 18/22] MadisServer error message. --- Exareme-Docker/src/madisServer/MadisServer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Exareme-Docker/src/madisServer/MadisServer.py b/Exareme-Docker/src/madisServer/MadisServer.py index 0ee043fe0..599888d57 100644 --- a/Exareme-Docker/src/madisServer/MadisServer.py +++ b/Exareme-Docker/src/madisServer/MadisServer.py @@ -70,7 +70,7 @@ def post(self): str_result=self.execQuery(dbFilename,query) except QueryExecutionException as e: #raise tornado.web.HTTPError(status_code=500,log_message="...the log message??") - self.logger.debug("(MadisServer::post) QueryExecutionException: {}".format(str(e))) + self.logger.error("(MadisServer::post) QueryExecutionException: {}".format(str(e))) #print "QueryExecutionException ->{}".format(str(e)) self.set_status(500) self.write(str(e)) From 9c9b58e6fe7ee8977df0de606cd503d68a9912a4 Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Fri, 15 Jan 2021 14:28:59 +0200 Subject: [PATCH 19/22] Added error when CloseContainerSession threads did not close properly. --- .../CloseContainerSessionEventHandler.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/closeContainerSession/CloseContainerSessionEventHandler.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/closeContainerSession/CloseContainerSessionEventHandler.java index 0136ceadf..20c416f89 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/closeContainerSession/CloseContainerSessionEventHandler.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/closeContainerSession/CloseContainerSessionEventHandler.java @@ -16,6 +16,7 @@ import java.rmi.RemoteException; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -30,6 +31,7 @@ public class CloseContainerSessionEventHandler public static final CloseContainerSessionEventHandler instance = new CloseContainerSessionEventHandler(); private static final long serialVersionUID = 1L; + private static final Logger log = Logger.getLogger(CloseContainerSessionEventHandler.class); public CloseContainerSessionEventHandler() { } @@ -47,7 +49,10 @@ public void preProcess(CloseContainerSessionEvent event, PlanEventSchedulerState service.submit(w); } service.shutdown(); - service.awaitTermination(2, TimeUnit.MINUTES); + if(!service.awaitTermination(2, TimeUnit.MINUTES)){ + log.error("Timeout when trying to fetch stats."); + throw new RemoteException("Timeout when trying to fetch stats." + Arrays.toString(Thread.currentThread().getStackTrace())); + } for (GetStatsAndCloseSession w : workers) { state.getStatistics().containerStats.add(w.stats.getStats()); } From 2db53239bcd1cb3702ba74410ce7a70a8cfb7530 Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Fri, 15 Jan 2021 20:51:23 +0200 Subject: [PATCH 20/22] Changed info to debug messages. --- .../master/engine/executor/remote/AdpDBArtJobMonitor.java | 4 ++-- .../master/gateway/async/handler/HBP/HBPQueryHandler.java | 7 +++---- .../gateway/async/handler/HttpAsyncDecomposerHandler.java | 1 - .../exareme/worker/art/container/ContainerSession.java | 7 +++---- .../dynamicExecutionEngine/DynamicPlanManager.java | 5 +---- .../CloseContainerSessionEventHandler.java | 4 ++-- 6 files changed, 11 insertions(+), 17 deletions(-) diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java index e0027ad6e..f08d2f57f 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/executor/remote/AdpDBArtJobMonitor.java @@ -89,13 +89,13 @@ public void run() { .setAdpEngineStatistics(statsManager.getStatistics()); if (sessionManager != null && !sessionManager.hasError()) { - log.info("Session finished, closing! ID: " + sessionPlan.getSessionID().getLongId() + log.debug("Session finished, closing! ID: " + sessionPlan.getSessionID().getLongId() + " , QueryID: " + queryID.getQueryID()); statusManager.setFinished(status.getId()); } else { statusManager.setError(status.getId(), sessionManager.getErrorList().get(0)); } - log.info("Session closing! ID: "+ sessionPlan.getSessionID().getLongId() + log.debug("Session closing! ID: "+ sessionPlan.getSessionID().getLongId() + " , QueryID: " + queryID.getQueryID()); sessionPlan.close(); diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HBP/HBPQueryHandler.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HBP/HBPQueryHandler.java index 2c14d8b79..cb5281154 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HBP/HBPQueryHandler.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HBP/HBPQueryHandler.java @@ -159,10 +159,9 @@ private void handleHBPAlgorithmExecution(HttpRequest request, HttpResponse respo AdpDBClientFactory.createDBClient(manager, clientProperties); queryStatus = dbClient.query(algorithmKey, dfl); - log.info("Executing algorithm " + algorithmKey + - " started with queryId " + queryStatus.getQueryID().getQueryID()); - log.debug("Algorithm " + algorithmKey + " with queryID " - + queryStatus.getQueryID() + " execution started. DFL Script: \n " + dfl); + log.info("Algorithm " + algorithmKey + " with queryID " + + queryStatus.getQueryID() + " execution started."); + log.debug("DFL Script: \n " + dfl); BasicHttpEntity entity = new NQueryResultEntity(queryStatus, ds, ExaremeGatewayUtils.RESPONSE_BUFFER_SIZE); diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HttpAsyncDecomposerHandler.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HttpAsyncDecomposerHandler.java index 8096aa352..17ca63a0f 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HttpAsyncDecomposerHandler.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/gateway/async/handler/HttpAsyncDecomposerHandler.java @@ -465,7 +465,6 @@ public void run() { if (timeoutMs > 0) { long timePassed = System.currentTimeMillis() - start; if (timePassed > timeoutMs) { - log.info("Closing session!!!!"); status.close(); log.warn("Time out:" + timeoutMs + " ms passed"); throw new RuntimeException("Time out:" + timeoutMs + " ms passed"); diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/ContainerSession.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/ContainerSession.java index 2124c3739..d7883a37e 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/ContainerSession.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/container/ContainerSession.java @@ -45,13 +45,12 @@ public ContainerSessionID getContainerSessionID() { public ContainerJobResults execJobs(ContainerJobs jobs) throws RemoteException { jobs.setSession(containerSessionID, sessionID); - log.info("Executing jobs for session ID: " + sessionID.getLongId()); - log.info("Executing " + jobs.getJobs().size() + " Jobs!"); + log.debug("Executing " + jobs.getJobs().size() + " Jobs!"); for (ContainerJob job : jobs.getJobs()) { - log.info("Job: " + job.getType().name() + " " + job.toString()); + log.debug("Job: " + job.getType().name() + " " + job.toString()); } ContainerJobResults results = containerProxy.getRemoteObject().execJobs(jobs); - log.info("Returning results for jobs from sessionID: " + sessionID.getLongId()); + log.debug("Returning results for jobs from sessionID: " + sessionID.getLongId()); return results; } diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/DynamicPlanManager.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/DynamicPlanManager.java index 2660d4181..aa73a8e3c 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/DynamicPlanManager.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/DynamicPlanManager.java @@ -120,7 +120,6 @@ public ContainerSessionID createContainerSession(PlanSessionID planSessionID) @Override public void destroySession(PlanSessionID sessionID) throws RemoteException { - log.info("Inside Destroying session with ID: " + sessionID.getLongId()); try { PlanEventScheduler eventScheduler = schedulerMap.get(sessionID); IndependentEvents jobs = new IndependentEvents(eventScheduler.getState()); @@ -128,10 +127,8 @@ public void destroySession(PlanSessionID sessionID) throws RemoteException { eventScheduler.queueIndependentEvents(jobs); Semaphore sem = new Semaphore(0); if (!eventScheduler.getState().isTerminated()) { - log.info("State not yet terminated. " + sessionID.getLongId()); eventScheduler.getState() .registerTerminationListener(new SemaphoreTerminationListener(sem)); - log.debug( "Waiting '" + forceSessionStopAfter_sec + "' seconds for session to stop ..."); boolean stopped = sem.tryAcquire(forceSessionStopAfter_sec, TimeUnit.SECONDS); @@ -140,7 +137,7 @@ public void destroySession(PlanSessionID sessionID) throws RemoteException { } } - log.info("Destroying session with ID: " + sessionID.getLongId()); + log.debug("Destroying session with ID: " + sessionID.getLongId()); PlanSessionReportID reportID = eventScheduler.getState().getPlanSessionReportID(); schedulerMap.remove(sessionID); containerSessionMap.remove(sessionID); diff --git a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/closeContainerSession/CloseContainerSessionEventHandler.java b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/closeContainerSession/CloseContainerSessionEventHandler.java index 20c416f89..aa4f95ead 100644 --- a/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/closeContainerSession/CloseContainerSessionEventHandler.java +++ b/Exareme-Docker/src/exareme/exareme-worker/src/main/java/madgik/exareme/worker/art/executionEngine/dynamicExecutionEngine/event/closeContainerSession/CloseContainerSessionEventHandler.java @@ -89,7 +89,7 @@ public GetStatsAndCloseSession(ContainerSession session) { @Override public void run() { try { - log.info("Closing session: " + session.getSessionID().getLongId() + " , " + this.toString()); + log.debug("Closing session: " + session.getSessionID().getLongId() + " , " + this.toString()); ContainerJobs jobs = new ContainerJobs(); jobs.addJob(GetStatisticsJob.instance); results = session.execJobs(jobs); @@ -99,7 +99,7 @@ public void run() { exception = e; log.error("Cannot close session " + session.getSessionID().getLongId(), e); }finally{ - log.info("Closed session: " + session.getSessionID().getLongId() + " , " + this.toString()); + log.debug("Closed session: " + session.getSessionID().getLongId() + " , " + this.toString()); } } } From 9fd60d5982501fd8f25d74eb461813656e2f4469 Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Sun, 17 Jan 2021 19:43:43 +0200 Subject: [PATCH 21/22] Converting CSVs turned off by default on the Federated installation. --- Federated-Deployment/Compose-Files/docker-compose-master.yml | 2 +- Federated-Deployment/Compose-Files/docker-compose-worker.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Federated-Deployment/Compose-Files/docker-compose-master.yml b/Federated-Deployment/Compose-Files/docker-compose-master.yml index 1b38b05b9..eb0a74ad2 100644 --- a/Federated-Deployment/Compose-Files/docker-compose-master.yml +++ b/Federated-Deployment/Compose-Files/docker-compose-master.yml @@ -51,7 +51,7 @@ services: - NODE_COMMUNICATION_TIMEOUT=30000 # (MILIS) NODE COMMUNICATION WILL DROP IF TIMEOUT IS PASSED - ENVIRONMENT_TYPE=PROD # TEST / DEV / PROD - LOG_LEVEL=INFO # INFO / DEBUG - - CONVERT_CSVS=TRUE # TRUE / FALSE + - CONVERT_CSVS=FALSE # TRUE / FALSE depends_on: - exareme-keystore deploy: diff --git a/Federated-Deployment/Compose-Files/docker-compose-worker.yml b/Federated-Deployment/Compose-Files/docker-compose-worker.yml index 9ba97043f..e83d66e88 100644 --- a/Federated-Deployment/Compose-Files/docker-compose-worker.yml +++ b/Federated-Deployment/Compose-Files/docker-compose-worker.yml @@ -34,7 +34,7 @@ services: - NODE_COMMUNICATION_TIMEOUT=30000 # (MILIS) NODE COMMUNICATION WILL DROP IF TIMEOUT IS PASSED - ENVIRONMENT_TYPE=PROD # TEST / DEV / PROD - LOG_LEVEL=INFO # INFO / DEBUG - - CONVERT_CSVS=TRUE # TRUE / FALSE + - CONVERT_CSVS=FALSE # TRUE / FALSE deploy: placement: constraints: From 27b72fc40f3932907613f533aab67216af0fb703 Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Mon, 18 Jan 2021 13:01:17 +0200 Subject: [PATCH 22/22] Fix for query closing on iterative algorithms. --- .../handler/NIterativeAlgorithmResultEntity.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/iterations/handler/NIterativeAlgorithmResultEntity.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/iterations/handler/NIterativeAlgorithmResultEntity.java index 0d4c136c6..dde8e356e 100644 --- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/iterations/handler/NIterativeAlgorithmResultEntity.java +++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/engine/iterations/handler/NIterativeAlgorithmResultEntity.java @@ -99,11 +99,15 @@ public void produceContent(ContentEncoder encoder, IOControl ioctrl) throws IOEx this.buffer.compact(); if (i < 1 && !buffering) { encoder.complete(); + closeQuery(); + close(); } } else { encoder.write(ByteBuffer.wrap( finalizeQueryStatus.getError().getBytes())); encoder.complete(); + closeQuery(); + close(); } } else { // Algorithm execution failed, notify the client. @@ -164,13 +168,12 @@ public void produceContent(ContentEncoder encoder, IOControl ioctrl) throws IOEx encoder.write(buffer); this.buffer.compact(); encoder.complete(); + closeQuery(); + close(); } } finally { if (iterativeAlgorithmState != null) iterativeAlgorithmState.releaseLock(); - - closeQuery(); - close(); } }