From 4beb79a699b7c74b28c86c5cb4ae1cd9511c752e Mon Sep 17 00:00:00 2001 From: Marek Horst Date: Wed, 11 Oct 2023 17:37:19 +0200 Subject: [PATCH] Closes #1426: Run IIS experiments by relying on spark 3.4 version WIP. Introducing required workflow.xml fixes for various workflows relying on spark3 to let their integration tests succeed: * setting `spark.extraListeners` and `spark.sql.queryExecutionListeners` explicitly to empty values in order to avoid relying on incompatible, spark2-compliant, cloudera listeners * setting `spark.shuffle.useOldFetchProtocol=true` in order to address the `2.4 to 3.0 migration guide` requirement regarding backward compatibility of the protocol for fetching shuffle blocks (and avoiding `IllegalArgumentException: Unexpected message type: ` kind of errors) The following modules were covered with workflow.xml-related changes which resulted in successful integration test execution: * `iis-wf-affmatching` * `iis-wf-citationmatching-direct` * `iis-wf-documentsclassification` This was introduced to avoid the following exception: java.lang.NoClassDefFoundError: org/apache/spark/internal/Logging$class Adding `hadoop-mapreduce-client-core` and `hadoop-common` dependencies in `iis-wf-affmatching` and `iis-wf-citationmatching-direct` modules to reflect dependencies set from `iis-wf-export-actionmanager` and to avoid exception: IncompatibleClassChangeError: Class org.apache.hadoop.fs.AvroFSInput does not implement the requested interface org.apache.avro.file.SeekableInput --- iis-wf/iis-wf-affmatching/pom.xml | 9 +++++++++ .../wf/affmatching/dedup/oozie_app/workflow.xml | 15 +++------------ .../wf/affmatching/main/oozie_app/workflow.xml | 15 +++------------ .../projectbased/oozie_app/workflow.xml | 15 +++------------ iis-wf/iis-wf-citationmatching-direct/pom.xml | 14 ++++++++++++++ .../direct/oozie_app/workflow.xml | 15 +++------------ .../oozie_app/workflow.xml | 15 +++------------ 7 files changed, 38 insertions(+), 60 deletions(-) diff --git 
a/iis-wf/iis-wf-affmatching/pom.xml b/iis-wf/iis-wf-affmatching/pom.xml index 8ea3863e4..508410851 100644 --- a/iis-wf/iis-wf-affmatching/pom.xml +++ b/iis-wf/iis-wf-affmatching/pom.xml @@ -33,6 +33,15 @@ test + + org.apache.hadoop + hadoop-mapreduce-client-core + + + org.apache.hadoop + hadoop-common + + org.apache.spark spark-core_2.12 diff --git a/iis-wf/iis-wf-affmatching/src/main/resources/eu/dnetlib/iis/wf/affmatching/dedup/oozie_app/workflow.xml b/iis-wf/iis-wf-affmatching/src/main/resources/eu/dnetlib/iis/wf/affmatching/dedup/oozie_app/workflow.xml index 340432d74..a1f4c9557 100644 --- a/iis-wf/iis-wf-affmatching/src/main/resources/eu/dnetlib/iis/wf/affmatching/dedup/oozie_app/workflow.xml +++ b/iis-wf/iis-wf-affmatching/src/main/resources/eu/dnetlib/iis/wf/affmatching/dedup/oozie_app/workflow.xml @@ -41,16 +41,6 @@ oozieActionShareLibForSpark2 oozie action sharelib for spark 2.* - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - spark 2.* extra listeners classname - - - spark2SqlQueryExecutionListeners - com.cloudera.spark.lineage.NavigatorQueryListener - spark 2.* sql query execution listeners classname - spark2YarnHistoryServerAddress spark 2.* yarn history server address @@ -94,8 +84,9 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.extraListeners= + --conf spark.sql.queryExecutionListeners= + --conf spark.shuffle.useOldFetchProtocol=true --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} diff --git a/iis-wf/iis-wf-affmatching/src/main/resources/eu/dnetlib/iis/wf/affmatching/main/oozie_app/workflow.xml b/iis-wf/iis-wf-affmatching/src/main/resources/eu/dnetlib/iis/wf/affmatching/main/oozie_app/workflow.xml index 063379c11..add69f544 
100644 --- a/iis-wf/iis-wf-affmatching/src/main/resources/eu/dnetlib/iis/wf/affmatching/main/oozie_app/workflow.xml +++ b/iis-wf/iis-wf-affmatching/src/main/resources/eu/dnetlib/iis/wf/affmatching/main/oozie_app/workflow.xml @@ -73,16 +73,6 @@ oozieActionShareLibForSpark2 oozie action sharelib for spark 2.* - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - spark 2.* extra listeners classname - - - spark2SqlQueryExecutionListeners - com.cloudera.spark.lineage.NavigatorQueryListener - spark 2.* sql query execution listeners classname - spark2YarnHistoryServerAddress spark 2.* yarn history server address @@ -135,8 +125,9 @@ --conf spark.yarn.driver.memoryOverhead=${sparkDriverOverhead} --conf spark.network.timeout=${sparkNetworkTimeout} --conf spark.executor.heartbeatInterval=${sparkExecutorHeartbeatInterval} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.extraListeners= + --conf spark.sql.queryExecutionListeners= + --conf spark.shuffle.useOldFetchProtocol=true --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} diff --git a/iis-wf/iis-wf-affmatching/src/main/resources/eu/dnetlib/iis/wf/affmatching/projectbased/oozie_app/workflow.xml b/iis-wf/iis-wf-affmatching/src/main/resources/eu/dnetlib/iis/wf/affmatching/projectbased/oozie_app/workflow.xml index 713686743..0f3af1990 100644 --- a/iis-wf/iis-wf-affmatching/src/main/resources/eu/dnetlib/iis/wf/affmatching/projectbased/oozie_app/workflow.xml +++ b/iis-wf/iis-wf-affmatching/src/main/resources/eu/dnetlib/iis/wf/affmatching/projectbased/oozie_app/workflow.xml @@ -61,16 +61,6 @@ oozieActionShareLibForSpark2 oozie action sharelib for spark 2.* - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - spark 2.* extra listeners classname - - - spark2SqlQueryExecutionListeners - 
com.cloudera.spark.lineage.NavigatorQueryListener - spark 2.* sql query execution listeners classname - spark2YarnHistoryServerAddress spark 2.* yarn history server address @@ -115,8 +105,9 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.extraListeners= + --conf spark.sql.queryExecutionListeners= + --conf spark.shuffle.useOldFetchProtocol=true --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} diff --git a/iis-wf/iis-wf-citationmatching-direct/pom.xml b/iis-wf/iis-wf-citationmatching-direct/pom.xml index 85e856684..febc02494 100644 --- a/iis-wf/iis-wf-citationmatching-direct/pom.xml +++ b/iis-wf/iis-wf-citationmatching-direct/pom.xml @@ -34,6 +34,15 @@ ${project.version} + + org.apache.hadoop + hadoop-mapreduce-client-core + + + org.apache.hadoop + hadoop-common + + org.apache.spark spark-core_2.12 @@ -44,6 +53,11 @@ spark-sql_2.12 + + org.apache.spark + spark-avro_2.12 + + pl.edu.icm.spark-utils spark-utils_2.12 diff --git a/iis-wf/iis-wf-citationmatching-direct/src/main/resources/eu/dnetlib/iis/wf/citationmatching/direct/oozie_app/workflow.xml b/iis-wf/iis-wf-citationmatching-direct/src/main/resources/eu/dnetlib/iis/wf/citationmatching/direct/oozie_app/workflow.xml index 2713ff4e4..e07c300c3 100644 --- a/iis-wf/iis-wf-citationmatching-direct/src/main/resources/eu/dnetlib/iis/wf/citationmatching/direct/oozie_app/workflow.xml +++ b/iis-wf/iis-wf-citationmatching-direct/src/main/resources/eu/dnetlib/iis/wf/citationmatching/direct/oozie_app/workflow.xml @@ -43,16 +43,6 @@ oozieActionShareLibForSpark2 oozie action sharelib for spark 2.* - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - spark 2.* extra listeners classname - - - 
spark2SqlQueryExecutionListeners - com.cloudera.spark.lineage.NavigatorQueryListener - spark 2.* sql query execution listeners classname - spark2YarnHistoryServerAddress spark 2.* yarn history server address @@ -98,8 +88,9 @@ --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.yarn.executor.memoryOverhead=${sparkExecutorOverhead} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.extraListeners= + --conf spark.sql.queryExecutionListeners= + --conf spark.shuffle.useOldFetchProtocol=true --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} diff --git a/iis-wf/iis-wf-documentsclassification/src/main/resources/eu/dnetlib/iis/wf/documentsclassification/oozie_app/workflow.xml b/iis-wf/iis-wf-documentsclassification/src/main/resources/eu/dnetlib/iis/wf/documentsclassification/oozie_app/workflow.xml index 47657e3d9..a68babae8 100644 --- a/iis-wf/iis-wf-documentsclassification/src/main/resources/eu/dnetlib/iis/wf/documentsclassification/oozie_app/workflow.xml +++ b/iis-wf/iis-wf-documentsclassification/src/main/resources/eu/dnetlib/iis/wf/documentsclassification/oozie_app/workflow.xml @@ -35,16 +35,6 @@ oozieActionShareLibForSpark2 oozie action sharelib for spark 2.* - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - spark 2.* extra listeners classname - - - spark2SqlQueryExecutionListeners - com.cloudera.spark.lineage.NavigatorQueryListener - spark 2.* sql query execution listeners classname - spark2YarnHistoryServerAddress spark 2.* yarn history server address @@ -99,8 +89,9 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf 
spark.extraListeners= + --conf spark.sql.queryExecutionListeners= + --conf spark.shuffle.useOldFetchProtocol=true --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}