From 5241cc90972cdf7fbb5f11153469f5f006443382 Mon Sep 17 00:00:00 2001 From: Chia-Ping Tsai Date: Fri, 15 Mar 2024 06:33:01 +0800 Subject: [PATCH] [ETL] remove all etl-related code (#1848) --- .gitignore | 2 - README.md | 1 - build.gradle | 5 - config/spark2kafka.properties | 40 --- docker/start_etl.sh | 146 -------- docker/start_spark.sh | 225 ------------- docs/build_project.md | 1 - docs/etl/README.md | 51 --- docs/etl/experiments/etl_1.md | 138 -------- docs/run_spark.md | 14 - etl/build.gradle | 76 ----- .../scala/org/astraea/etl/DataColumn.scala | 19 -- .../org/astraea/etl/DataFrameProcessor.scala | 181 ---------- .../main/scala/org/astraea/etl/DataType.scala | 103 ------ .../main/scala/org/astraea/etl/Metadata.scala | 143 -------- .../scala/org/astraea/etl/Spark2Kafka.scala | 62 ---- .../org/astraea/etl/SparkStreamWriter.scala | 102 ------ .../main/scala/org/astraea/etl/Utils.scala | 50 --- .../astraea/etl/DataFrameProcessorTest.scala | 313 ------------------ .../scala/org/astraea/etl/FileCreator.scala | 86 ----- .../scala/org/astraea/etl/MetadataTest.scala | 80 ----- .../org/astraea/etl/Spark2KafkaTest.scala | 157 --------- .../scala/org/astraea/etl/UtilsTest.scala | 30 -- gradle/dependencies.gradle | 5 - settings.gradle | 2 +- 25 files changed, 1 insertion(+), 2031 deletions(-) delete mode 100644 config/spark2kafka.properties delete mode 100755 docker/start_etl.sh delete mode 100755 docker/start_spark.sh delete mode 100644 docs/etl/README.md delete mode 100644 docs/etl/experiments/etl_1.md delete mode 100644 docs/run_spark.md delete mode 100644 etl/build.gradle delete mode 100644 etl/src/main/scala/org/astraea/etl/DataColumn.scala delete mode 100644 etl/src/main/scala/org/astraea/etl/DataFrameProcessor.scala delete mode 100644 etl/src/main/scala/org/astraea/etl/DataType.scala delete mode 100644 etl/src/main/scala/org/astraea/etl/Metadata.scala delete mode 100644 etl/src/main/scala/org/astraea/etl/Spark2Kafka.scala delete mode 100644 etl/src/main/scala/org/astraea/etl/SparkStreamWriter.scala delete mode 100644 etl/src/main/scala/org/astraea/etl/Utils.scala delete mode 100644 etl/src/test/scala/org/astraea/etl/DataFrameProcessorTest.scala delete mode 100644 etl/src/test/scala/org/astraea/etl/FileCreator.scala delete mode 100644 etl/src/test/scala/org/astraea/etl/MetadataTest.scala delete mode 100644 etl/src/test/scala/org/astraea/etl/Spark2KafkaTest.scala delete mode 100644 etl/src/test/scala/org/astraea/etl/UtilsTest.scala diff --git a/.gitignore b/.gitignore index 93b05a6acc..b6876a3284 100644 --- a/.gitignore +++ b/.gitignore @@ -15,10 +15,8 @@ zookeeper.dockerfile controller.dockerfile broker.dockerfile app.dockerfile -spark.dockerfile deps.dockerfile worker.dockerfile -etl.dockerfile hadoop.dockerfile # we don't put binary file to git repo gradle-wrapper.jar diff --git a/README.md b/README.md index 523b2a7c48..cbe1fa3c2c 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,6 @@ 11. [GUI](docs/gui/README.md): 簡單好用的叢集資訊圖形化工具 12. [Connector](./docs/connector/README.md): 提供基於 `kafka connector` 實作的高效平行化工具,包含效能測試和資料遷移等工具 13. [Build](docs/build_project.md): 說明如何建構與測試本專案各模組 -14. [etl](./docs/etl/README.md): 構建 spark-kafka 的資料傳輸通道 # Kafka Q&A diff --git a/build.gradle b/build.gradle index f6be5ea3f5..a3800de960 100644 --- a/build.gradle +++ b/build.gradle @@ -36,9 +36,4 @@ spotless { } } } - scala { - licenseHeaderFile(file("$rootDir/checkstyle/apache.header"), "package ") - target '**/scala/**/*.scala' - scalafmt() - } } \ No newline at end of file diff --git a/config/spark2kafka.properties b/config/spark2kafka.properties deleted file mode 100644 index 7e6d195533..0000000000 --- a/config/spark2kafka.properties +++ /dev/null @@ -1,40 +0,0 @@ -#Parameters you must configure -#============================================================== -#The data source path should be a directory. -source.path = - -#The CSV Column Name.For example:sA=string,sB=integer,sC=boolean... -column.names = - -#Primary keys.For example:sA=string,sB=integer,sC=boolean... -primary.keys = - -#The Kafka bootstrap servers -kafka.bootstrap.servers = - -#Set your topic name. -topic.name = - -#Spark checkpoint path -checkpoint = - -#Parameters that can be selected for configuration -#============================================================== - -#Set the number of topic partitions, if it is empty that will set it to 15. -topic.partitions = - -#Set the number of topic replicas, if it is empty that will set it to 1. -topic.replicas = - -#The rest of the topic can be configured parameters.For example: keyA=valueA,keyB=valueB,keyC=valueC... -topic.config = - -#Option to clean up completed files after processing.Available options are "archive", "delete", "off". Default:delete -clean.source = - -#You must config it when clean.source set archive. -archive.path = - -#recursive.file is used to recursively load files. Default:true -recursive.file = diff --git a/docker/start_etl.sh b/docker/start_etl.sh deleted file mode 100755 index 3c829ff388..0000000000 --- a/docker/start_etl.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/bin/bash -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Configure the path of Spark2Kafka.properties to run. For example: ./docker/start_etl.sh PropertiesPath -declare -r DOCKER_FOLDER=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) -source "$DOCKER_FOLDER"/docker_build_common.sh - -# ===============================[version control]================================= -declare -r SPARK_VERSION=${SPARK_VERSION:-3.3.1} -# ===============================[global variables]================================ -declare -r LOCAL_PATH=$(cd -- "$(dirname -- "${DOCKER_FOLDER}")" &>/dev/null && pwd) -# ===============================[properties keys]================================= -declare -r SOURCE_KEY="source.path" -declare -r CHECKPOINT_KEY="checkpoint" -# ===============================[spark driver/executor resource]================== -declare -r RESOURCES_CONFIGS="${RESOURCES_CONFIGS:-"--conf spark.driver.memory=4g --conf spark.executor.memory=4g"}" -# ===================================[functions]=================================== - -function showHelp() { - echo "Usage: [ENV] start_etl.sh properties_path" - echo "required Arguments: " - echo " master where to submit spark job" - echo " property.file The path of Spark2Kafka.properties." - echo "ENV: " - echo " ACCOUNT=skiptests set the account to clone from" - echo " VERSION=main set branch of astraea" - echo " RESOURCES_CONFIGS=\"--conf spark.driver.memory=4g\" set spark memory" -} - -function runContainer() { - local master=$1 - local propertiesPath=$2 - - if [[ ! -f "$propertiesPath" ]]; then - echo "$propertiesPath is not a property file" - exit 1 - fi - - source_path=$(cat $propertiesPath | grep $SOURCE_KEY | cut -d "=" -f2) - checkpoint_path=$(cat $propertiesPath | grep $CHECKPOINT_KEY | cut -d "=" -f2) - source_name=$(echo "${source_path}" | tr '/' '-') - - docker run --rm -v "$LOCAL_PATH":/tmp/astraea \ - ghcr.io/skiptests/astraea/deps \ - /bin/bash \ - -c "cd /tmp/astraea && ./gradlew clean shadowJar --no-daemon" - - jar_path=$(find "$LOCAL_PATH"/etl/build/libs -type f -name "*all.jar") - - if [[ ! -f "$jar_path" ]]; then - echo "$jar_path is not a uber jar" - exit 1 - fi - - # the driver is running on client mode, so the source path must be readable from following container. - # hence, we will mount source path to container directly - mkdir -p "$source_path" - if [ $? -ne 0 ]; then - echo "failed to create folder on $source_path" - exit 1 - fi - - mkdir -p "$checkpoint_path" - if [ $? -ne 0 ]; then - echo "failed to create folder on $checkpoint_path" - exit 1 - fi - - ui_port=$(($(($RANDOM % 10000)) + 10000)) - if [[ "$master" == "local"* ]]; then - network_config="-p ${ui_port}:${ui_port}" - else - # expose the driver network - network_config="--network host" - fi - - if [[ "$master" == "spark:"* ]] || [[ "$master" == "local"* ]]; then - docker run -d --init \ - --name "csv-kafka-${source_name}" \ - $network_config \ - -v "$propertiesPath":"$propertiesPath":ro \ - -v "$jar_path":/tmp/astraea-etl.jar:ro \ - -v "${source_path}":"${source_path}" \ - -v "${checkpoint_path}":"${checkpoint_path}" \ - -e JAVA_OPTS="$HEAP_OPTS" \ - ghcr.io/skiptests/astraea/spark:$SPARK_VERSION \ - ./bin/spark-submit \ - --conf "spark.ui.port=$ui_port" \ - $RESOURCES_CONFIGS \ - --packages org.apache.spark:spark-sql-kafka-0-10_2.13:"$SPARK_VERSION" \ - --class org.astraea.etl.Spark2Kafka \ - --jars file:///tmp/astraea-etl.jar \ - --master $master \ - /tmp/astraea-etl.jar \ - "$propertiesPath" - - echo "application web: http:$ADDRESS:$ui_port" - else - echo "$master is unsupported" - exit 1 - fi - -} - -# ===================================[main]=================================== -checkDocker - -master="local[*]" -property_file_path="" - -while [[ $# -gt 0 ]]; do - if [[ "$1" == "help" ]]; then - showHelp - exit 0 - fi - - if [[ "$1" == "master"* ]]; then - master=$(echo "$1" | cut -d "=" -f 2) - fi - - if [[ "$1" == "property.file"* ]]; then - property_file_path=$(echo "$1" | cut -d "=" -f 2) - fi - - shift -done - -if [[ "$property_file_path" == "" ]]; then - showHelp - exit 0 -fi - -runContainer "$master" "$property_file_path" diff --git a/docker/start_spark.sh b/docker/start_spark.sh deleted file mode 100755 index c628aa53dc..0000000000 --- a/docker/start_spark.sh +++ /dev/null @@ -1,225 +0,0 @@ -#!/bin/bash -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -declare -r DOCKER_FOLDER=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) -source $DOCKER_FOLDER/docker_build_common.sh - -# ===============================[global variables]=============================== -declare -r VERSION=${REVISION:-${VERSION:-3.3.2}} -declare -r HADOOP_VERSION=${HADOOP_REVERSION:-${HADOOP_REVERSION:-3}} -declare -r REPO=${REPO:-ghcr.io/skiptests/astraea/spark} -declare -r IMAGE_NAME="$REPO:$VERSION" -declare -r SPARK_PORT=${SPARK_PORT:-"$(getRandomPort)"} -declare -r SPARK_UI_PORT=${SPARK_UI_PORT:-"$(getRandomPort)"} -declare -r DOCKERFILE=$DOCKER_FOLDER/spark.dockerfile -declare -r MASTER_NAME="spark-master" -declare -r WORKER_NAME="spark-worker" - -# ===================================[functions]=================================== - -function showHelp() { - echo "Usage: [ENV] start_spark.sh" - echo "Optional Arguments: " - echo " master=spark://node00:1111 start a spark worker. Or start a spark master if master-url is not defined" - echo " folder=/tmp/aa:/tmp/bb mount the host folder /tmp/aa to spark container /tmp/bb" - echo "ENV: " - echo " VERSION=3.3.1 set version of spark distribution" - echo " BUILD=false set true if you want to build image locally" - echo " RUN=false set false if you want to build/pull image only" - echo " PYTHON_DEPS=delta-spark=1.0.0 set the python dependencies which are pre-installed in the docker image" -} - -function checkOs() { - if [[ "$OSTYPE" == "darwin"* ]]; then - echo "This script requires to run container with \"--network host\", but the feature is unsupported by Mac OS" - exit 2 - fi -} - -# Spark needs to manage the hardware resource for this node, so we don't run multiples workers/masters in same node. -function checkConflictContainer() { - local name=$1 - local role=$2 - local container_names=$(docker ps --format "{{.Names}}") - if [[ $(echo "${container_names}" | grep "$name") != "" ]]; then - echo "It is disallowed to run multiples spark $role in same node" - exit 2 - fi -} - -function generateDockerfileByVersion() { - echo "# this dockerfile is generated dynamically -FROM ubuntu:20.04 AS build - -# install tools -RUN apt-get update && apt-get install -y wget unzip - -# download spark -WORKDIR /tmp -RUN wget https://archive.apache.org/dist/spark/spark-${VERSION}/spark-${VERSION}-bin-hadoop${HADOOP_VERSION}-scala2.13.tgz -RUN mkdir /opt/spark -RUN tar -zxvf spark-${VERSION}-bin-hadoop${HADOOP_VERSION}-scala2.13.tgz -C /opt/spark --strip-components=1 - -# the python3 in ubuntu 22.04 is 3.10 by default, and it has a known issue (https://github.com/vmprof/vmprof-python/issues/240) -# The issue obstructs us from installing 3-third python libraries, so we downgrade the ubuntu to 20.04 -FROM ubuntu:20.04 - -# Do not ask for confirmations when running apt-get, etc. -ENV DEBIAN_FRONTEND noninteractive - -# install tools -RUN apt-get update && apt-get install -y openjdk-11-jre python3 python3-pip - -# copy spark -COPY --from=build /opt/spark /opt/spark - -# add user -RUN groupadd $USER && useradd -ms /bin/bash -g $USER $USER - -# change user -RUN chown -R $USER:$USER /opt/spark -USER $USER - -# export ENV -ENV SPARK_HOME /opt/spark -WORKDIR /opt/spark -" >"$DOCKERFILE" -} - -function generateDockerfileBySource() { - echo "# this dockerfile is generated dynamically -FROM ubuntu:23.10 AS build - -# Do not ask for confirmations when running apt-get, etc. -ENV DEBIAN_FRONTEND noninteractive - -# install tools -RUN apt-get update && apt-get install -y openjdk-11-jdk python3 python3-pip git curl - -# build spark from source code -RUN git clone https://github.com/apache/spark /tmp/spark -WORKDIR /tmp/spark -RUN git checkout $VERSION -ENV MAVEN_OPTS=\"-Xmx3g\" -RUN ./dev/make-distribution.sh --pip --tgz -RUN mkdir /opt/spark -RUN tar -zxvf \$(find ./ -maxdepth 1 -type f -name spark-*SNAPSHOT*.tgz) -C /opt/spark --strip-components=1 -RUN ./build/mvn install -DskipTests - -# the python3 in ubuntu 22.04 is 3.10 by default, and it has a known issue (https://github.com/vmprof/vmprof-python/issues/240) -# The issue obstructs us from installing 3-third python libraries, so we downgrade the ubuntu to 20.04 -FROM ubuntu:20.04 - -# Do not ask for confirmations when running apt-get, etc. -ENV DEBIAN_FRONTEND noninteractive - -# install tools -RUN apt-get update && apt-get install -y openjdk-11-jre python3 python3-pip - -# copy spark -COPY --from=build /opt/spark /opt/spark - -# add user -RUN groupadd $USER && useradd -ms /bin/bash -g $USER $USER - -# change user -RUN chown -R $USER:$USER /opt/spark -USER $USER - -# export ENV -ENV SPARK_HOME /opt/spark -WORKDIR /opt/spark -" >"$DOCKERFILE" -} - -function generateDockerfile() { - if [[ -n "$REVISION" ]]; then - generateDockerfileBySource - else - generateDockerfileByVersion - fi -} - -# ===================================[main]=================================== - -raw_folder_mapping="" -master_url="" -mount_command="" - -while [[ $# -gt 0 ]]; do - if [[ "$1" == "help" ]]; then - showHelp - exit 0 - fi - - if [[ "$1" == "master"* ]]; then - master_url=$(echo "$1" | cut -d "=" -f 2) - fi - - if [[ "$1" == "folder"* ]]; then - raw_folder_mapping=$1 - folder_mapping=$(echo "$1" | cut -d "=" -f 2) - host_folder=$(echo "$folder_mapping" | cut -d ":" -f 1) - container_folder=$(echo "$folder_mapping" | cut -d ":" -f 2) - mount_command="-v $host_folder:$container_folder" - fi - shift -done - -checkDocker -generateDockerfile -buildImageIfNeed "$IMAGE_NAME" - -if [[ "$RUN" != "true" ]]; then - echo "docker image: $IMAGE_NAME is created" - exit 0 -fi - -checkNetwork -checkOs - -if [[ "$master_url" != "" ]]; then - checkConflictContainer $WORKER_NAME "worker" - docker run -d --init \ - -e SPARK_WORKER_WEBUI_PORT=$SPARK_UI_PORT \ - -e SPARK_WORKER_PORT=$SPARK_PORT \ - -e SPARK_NO_DAEMONIZE=true \ - $mount_command \ - --name "$WORKER_NAME" \ - --network host \ - "$IMAGE_NAME" ./sbin/start-worker.sh "$master_url" - - echo "=================================================" - echo "Starting Spark worker $ADDRESS:$SPARK_PORT" - echo "Bound WorkerWebUI started at http://${ADDRESS}:${SPARK_UI_PORT}" - echo "=================================================" -else - checkConflictContainer $MASTER_NAME "master" - docker run -d --init \ - -e SPARK_MASTER_WEBUI_PORT=$SPARK_UI_PORT \ - -e SPARK_MASTER_PORT=$SPARK_PORT \ - -e SPARK_NO_DAEMONIZE=true \ - $mount_command \ - --name "$MASTER_NAME" \ - --network host \ - "$IMAGE_NAME" ./sbin/start-master.sh - - echo "=================================================" - echo "Starting Spark master at spark://$ADDRESS:$SPARK_PORT" - echo "Bound MasterWebUI started at http://${ADDRESS}:${SPARK_UI_PORT}" - echo "execute $DOCKER_FOLDER/start_spark.sh master=spark://$ADDRESS:$SPARK_PORT $raw_folder_mapping to add worker" - echo "=================================================" -fi diff --git a/docs/build_project.md b/docs/build_project.md index 6d5d4003e5..3c5c160651 100644 --- a/docs/build_project.md +++ b/docs/build_project.md @@ -4,7 +4,6 @@ - app: 處理與web server,成本分析與負載平衡等功能之模組 - common: 有關叢集,客戶端等分析,管理與平衡等功能之模組 -- etl: 轉換 csv 檔,再匯入 delta 並使串接部份有更好的平行化處理之模組 - gui: 與圖形化界面功能相關的模組 - connector: 基於 Kafka connector 實作的各式分散式工具 - it: 針對專案測試所提供的叢集環境 diff --git a/docs/etl/README.md b/docs/etl/README.md deleted file mode 100644 index a10eef0793..0000000000 --- a/docs/etl/README.md +++ /dev/null @@ -1,51 +0,0 @@ -Astraea etl 中文文件 -=== - -Astraea etl 的目標是構建 kafka-spark-delta 的資料傳輸通道。目前支持的功能爲讀取csv檔案透過 spark streaming 轉入 kafka topic。 - -### 系統架構 - -spark -> kafka 將csv檔案透過 spark streaming 轉入 kafka topic,包括讀取資料,清洗資料與寫入資料。該模組分爲三個部分。 -1. 讀取資料: spark streaming 會讀取source path 下所有csv檔案,並將其轉化為dataframe。 -2. 清洗資料: dataframe 以行為單位,通過將json convertor 封裝進 udf 的方式,將資料變為 json 格式字串的集合。 -3. 寫入資料: 將轉化為 json 格式的資料以一行對應一條 record 的方式,發送到kafka中指定的topic。record的key為table的primary key,value 為轉換為json 字串的csv row。 - -![framework diagram](../pictures/etl_README_1.png) - -### Astraea etl 使用 - -Astraea etl 通過讀取[property file](../../config/spark2kafka.properties) 來獲取系統運行時可能需要的資訊。 -以下是property中參數簡介 - -| 參數名稱 | 說明 | 預設值 | -|:-----------------:|:----------------------------------------------------------------------------|:-----:| -| source.path | (必填) 資料來源路徑 | 無 | -| column.names | (必填) csv table的欄位名稱及該欄位對應的屬性 For example:sA=string,sB=integer,sC=boolean... | 無 | -| primary.keys | (必填) csv table中的primary key. For example:sA=string,sB=integer,sC=boolean... | 無 | -| bootstrap.servers | (必填) 欲連接的Kafka server address | 無 | -| topic.name | (必填) 欲發往的topic name | 無 | -| checkpoint | (必填) spark checkpoint 存放路徑 | 無 | -| topic.partitions | (選填) 目標topic的partition數量 | 15 | -| topic.replicas | (選填) 目標topic的replica數量 | 1 | -| topic.configs | (選填) 配置kafka的其他參數 For example: compression.type\=lz4 | 無 | - -#### 使用範例 - -專案內的工具都有整合到`container`中,使用者利用docker運行,可方便管理,使用前請注意兩件事情: - -1. 確認自己的Kafka server ip與spark master ip,並且Kafka與spark 有正常運作,關於啟動Kafka 可參考 [run_kafka_broker](run_kafka_broker.md)。 -2. 可使用 [Astraea GUI](../gui/README.md) 來建構測試用途的 `topics`。 - -使用`docker`執行`start_etl`,以下爲範例 - -```bash -# Run Spark-submit -./docker/start_etl.sh master=spark://192.168.103.189:8080 \ -property.file=/home/kafka/spark2kafkaTest/spark2kafka.properties -``` - -### Astraea etl 實驗 - -experiments 資料夾中收錄不同版本的實驗紀錄,主要使用 [performance tool](../performance_benchmark.md) 測試並紀錄數據。 - -* [2022 Dec26](experiments/etl_1.md), 測試 spark->kafka 模組功能及替換[Strict Cost Partitioner](../partitioner/strict_cost_partitioner.md) 的影響 (Astraea revision: [be8c3ffdf35ab0651dfc1a33b5552fd7e3381069](https://github.com/skiptests/astraea/tree/be8c3ffdf35ab0651dfc1a33b5552fd7e3381069)) \ No newline at end of file diff --git a/docs/etl/experiments/etl_1.md b/docs/etl/experiments/etl_1.md deleted file mode 100644 index 8290d49a18..0000000000 --- a/docs/etl/experiments/etl_1.md +++ /dev/null @@ -1,138 +0,0 @@ -# etl 測試 - -此次實驗目的主要是測試 Astraea etl 的三個部分,包括測試 -1. 10GB 資料需要多少時間跑完 -2. 檢查 input output 資料的一致性,例如資料筆數,抽檢資料是否一致 -3. 替換 spark 中的 kafka partitioner 再次測試效能,看有沒有變好 - -## 測試環境 - -### 硬體規格 - -實驗使用6台實體機器,以下皆以代號表示,分別是 B1, B2, B3, B4, M1, C1 ,六台實體機器規格均相同 - -| 硬體 | 品名 | -| ---------- | ------------------------------------------------------------ | -| CPU | Intel i9-12900K 3.2G(5.2G)/30M/UHD770/125W | -| 主機板 | 微星 Z690 CARBON WIFI(ATX/1H1P/Intel 2.5G+Wi-Fi 6E) | -| 記憶體 | 十銓 T-Force Vulcan 32G(16G*2) DDR5-5200 (CL40) | -| 硬碟 | 威剛XPG SX8200Pro 2TB/M.2 2280/讀:3500M/寫:3000M/TLC/SMI控 * 2 | -| 散熱器 | NZXT Kraken Z53 24cm水冷排/2.4吋液晶冷頭/6年/厚:5.6cm | -| 電源供應器 | 海韻 FOCUS GX-850(850W) 雙8/金牌/全模組 | -| 網卡 | Marvell AQtion 10Gbit Network Adapter | - -### 網路拓樸 - -``` - switch(10G) -┌─────┬─────┬─────┬─────┬─────┐ -B1 B2 B3 S1 S2 C1 -``` - -### 軟體版本 - -| 軟體 | 版本(/image ID) | -| ---------------------- |------------------------------------------| -| 作業系統 | ubuntu-20.04.3-live-server-amd64 | -| Astraea revision | 75bcc3faa39864d5ec5f5ed530346184e79fc0c9 | -| Zookeeper version | 3.8.0 | -| Apache Kafka version | 3.3.1 | -| Java version | OpenJDK 11 | -| Docker version | 20.10.17, build 100c701 | -| grafana image ID | b6ea013786be | -| prometheus version | v2.32.1 | -| node-exporter image ID | 1dbe0e931976 | - -實驗執行軟體 - -| 執行軟體 | B1 | B2 | B3 | S1 | S2 | C1 | -| ------------------------ | :--: | :--: | :--: | :--: | :--: | :--: | -| Spark master | | | | V | | | -| Spark worker | | | | V | V | | -| Zookeeper | V | | | | | | -| Kafka Broker | V | V | V | | | | -| Node Exporter | V | V | V | | | | -| Prometheus | | | V | | | | -| Grafana | | | V | | | | -| Astraea Performance tool | | | | | | V | - -## 測試情境 - -整個實驗分爲兩種情景,在普通情景下測試與在kafka叢集中一臺broker,網路延遲較高、且網路頻寬較低的不平衡情境下進行測試。 - -測試流程:兩種情景都需要首先啓動spark cluster。可以參考[start_spark](../../run_spark.md)啓動spark cluster. - - -### 普通情景 - -在普通情景下,只會用到上述的五臺機器{B1, B2, B3, S1, S2}。本情景將測試10GB 資料需要多少時間跑完與檢查 input output 資料的一致性。 - -#### 測試10GB資料需要多少時間跑完 - -```bash -# Run Spark-submit -./docker/start_etl.sh master=spark://192.168.103.189:8080 \ -property.file=/home/kafka/spark2kafkaTest/spark2kafka.properties -``` - -從圖中可以看出10GB的資料處理完畢需要花費2分55秒 -![processing time](../../pictures/etl_experiment_1_1.png) - -#### 檢查 input output 資料的一致性 - -10GB資料共98090266筆資料。 -![number of records](../../pictures/etl_experiment_1_2.png) - -當資料經過etl處理完畢發往指定topic後。通過subscribe topic來消費所有該topic下的資料,以測定資料是否有缺失。 -可以看到資料的筆數是相同的。 -![number of consumer records](../../pictures/etl_experiment_1_3.png) - -抽查對比consumer到的資料,各欄位資料內容也與原資料一致,所以資料一致性方面未發現問題。 -![record comparison](../../pictures/etl_experiment_1_8.png) - -### 不平衡情景 -#### 不平衡情景下會誘發的問題 -以下圖爲例 - -實驗環境:testTopic用來接受etl產生的資料,分佈於B1, B2, B3 - costTopic用來對kafka叢集中的單一節點造成負載,分佈於B1 - -圖中左側爲不平衡情景,右側爲普通情景,方便直觀感受差別 -costTopic: 接受使一個節點較忙碌的資料。它只分布在B1上。 -testTopic: etl產生的資料會發往該topic。它分布在B1, B2, B3上。 -圖中的testTopic有三個是因為它顯示了該topic在三個節點中各自的流量。 -而costTopic之所以只有一個是因為只有B1一個節點接收到資料。 - -左側實驗開始時先向costTopic發送資料,使其到達節點的頻寬上線。在一段時間後啓動etl,可以看到因爲etl發送資料分走了原先costTopic所佔據的頻寬,造成其效能下降。等到etl運行完畢costTopic的效能恢復到開始狀態。 -左側數據處理完畢共花費3分40秒 - -右側實驗在普通情景下的效能即每秒處理的資料量要明顯高於左側,數據處理完畢的時間也更短總共花費3分鐘。這證明在kafka叢集不平衡的情景下,會影響到etl的效能。 -![imbalance kafka cluster](../../pictures/etl_experiment_1_9.png) - -#### 實驗過程 -在該情景下會用到上述的全部六臺機器,同時B1, B2, B3的網路頻寬將被設置爲2.5G,確保etl效能的變化在叢集高負載的情況下會有較明顯的體現。 -其中C1將被用來向B1發送資料,以確保B1處在高負載的狀態。 - -使用default partitioner進行測試 -不替換時總共花費了4min處理完畢數據 - -![processing time of default partitioner](../../pictures/etl_experiment_1_4.png) - -替換 spark 中的 kafka partitioner進行測試 -替換partitioner後的花費5min處理完畢數據 - -![processing time of strict partitioner](../../pictures/etl_experiment_1_5.png) - -所以結論替換partitioner後反而變成了負優化,進一步觀察分布在各Broker partition的offset。處在負載最重的B1的partitions反而吃到了最多的資料。這與設想的不符合,這一問題還需要再探究其發生原因。 - -![partition offset of strict partitioner](../../pictures/etl_experiment_1_6.png) - -## 結論 - -在普通情景下,擁有兩個worker的spark cluster中,使用standalone mode 啓動 astraea etl ,處理資料的平均速率爲58.5MB/s。 - -在kafka叢集不平衡情境下,etl的效能會收到影響,這會導致處理相同的資料要花費更多的時間表。以下爲該情景下,替換partitioner前後的效能對比。 - -| 吞吐/延遲比較 | default partitioner | strict partitioner | 改善 | -| ------------- |---------------------|--------------------| ---------------------------- | -| 吞吐量 | 42.7 MB/second | 34.1 MiB/second | 平均吞吐提升:約 0.80 倍 | diff --git a/docs/run_spark.md b/docs/run_spark.md deleted file mode 100644 index becc90fa61..0000000000 --- a/docs/run_spark.md +++ /dev/null @@ -1,14 +0,0 @@ -#TODO astraea#1365 撰寫start_spark.sh中文文件 - -```bash -# Run Spark Master -SPARK_PORT=8080 SPARK_UI_PORT=7077 docker/start_spark.sh \ -folder=/home/kafka/spark2kafkaTest/ImportcsvTest/source:/home/kafka/spark2kafkaTest/ImportcsvTest/source -``` - -```bash -# Run Spark Worker -SPARK_PORT=8081 SPARK_UI_PORT=7078 docker/start_spark.sh \ -master=spark://192.168.103.189:8080 \ -folder=/home/kafka/spark2kafkaTest/ImportcsvTest/source:/home/kafka/spark2kafkaTest/ImportcsvTest/source -``` \ No newline at end of file diff --git a/etl/build.gradle b/etl/build.gradle deleted file mode 100644 index c99e2b5df7..0000000000 --- a/etl/build.gradle +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -plugins { - id "com.github.johnrengelman.shadow" version "8.1.1" - id 'scala' -} - -apply from: "$rootDir/gradle/dependencies.gradle" - -repositories { - // Use Maven Central for resolving dependencies. - mavenCentral() -} - - -dependencies { - // Gradle module metadata for jackson-databind 2.13.4.1 references non-existent jackson-bom. - // Unfortunately, spark 3.3.1 uses this version. - // This will be removed when spark is updated to 3.3.2. https://issues.apache.org/jira/browse/SPARK-40886 - compileOnly 'com.fasterxml.jackson:jackson-bom:2.13.4.20221013' - // Astraea etl is run by spark framework, so we don't need to package spark-related dependencies - compileOnly libs["scala"] - compileOnly libs["spark-sql"] - compileOnly libs["spark-kafka"] - compileOnly libs["jackson-databind"] - implementation libs["kafka-client"] - implementation project(':common') - - - testImplementation libs["junit"] - // This will be removed when spark is updated to 3.3.2. https://issues.apache.org/jira/browse/SPARK-40886 - testImplementation enforcedPlatform('com.fasterxml.jackson:jackson-bom:2.13.4.20221013') - // there are unit tests requiring spark, so we add them back for test scope - testImplementation libs["spark-sql"] - testImplementation libs["spark-kafka"] - testImplementation project(':it') -} - -ext { - numberOfForks = project.hasProperty('maxParallelForks') ? maxParallelForks.toInteger() : Math.max((int) (Runtime.runtime.availableProcessors() / 2), 1) -} - -archivesBaseName = "astraea-etl" - -tasks.named('test') { - // spark can't run with JDK 17 as it uses internal class - // see https://lists.apache.org/thread/814cpb1rpp73zkhtv9t4mkzzrznl82yn - jvmArgs '--add-opens=java.base/sun.nio.ch=ALL-UNNAMED' - // Use JUnit Platform for unit tests. - useJUnitPlatform() - - maxParallelForks = numberOfForks - // make isolation for tests. It may be expensive but stability is first choice. - forkEvery = 1 - testLogging { - events "PASSED", "STARTED", "FAILED", "SKIPPED" - exceptionFormat = 'full' - } - - minHeapSize = "1024m" - maxHeapSize = "2048m" -} \ No newline at end of file diff --git a/etl/src/main/scala/org/astraea/etl/DataColumn.scala b/etl/src/main/scala/org/astraea/etl/DataColumn.scala deleted file mode 100644 index 552ccca173..0000000000 --- a/etl/src/main/scala/org/astraea/etl/DataColumn.scala +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.astraea.etl - -case class DataColumn(name: String, isPk: Boolean = false, dataType: DataType) diff --git a/etl/src/main/scala/org/astraea/etl/DataFrameProcessor.scala b/etl/src/main/scala/org/astraea/etl/DataFrameProcessor.scala deleted file mode 100644 index 18d85ccf2d..0000000000 --- a/etl/src/main/scala/org/astraea/etl/DataFrameProcessor.scala +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.astraea.etl - -import org.apache.spark.sql.expressions.UserDefinedFunction -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{StructField, StructType} -import org.apache.spark.sql.{DataFrame, Row, SparkSession} -import org.astraea.common.json.JsonConverter - -import scala.jdk.CollectionConverters._ -import scala.language.implicitConversions - -class DataFrameProcessor(dataFrame: DataFrame) { - - val defaultConverter: UserDefinedFunction = - udf[String, Map[String, String]]((value: Map[String, String]) => { - JsonConverter - .jackson() - .toJson( - value.asJava - ) - }) - - /** Turn the original DataFrame into a key-value table.Integrate all columns - * into one value->josh. If there are multiple primary keys, key will become - * keyA_keyB_.... - * - * {{{ - * Seq(Person("Michael","A.K", 29), Person("Andy","B.C", 30), Person("Justin","C.L", 19)) - * - * Person - * |-- FirstName: string - * |-- SecondName: string - * |-- Age: Integer - * - * Key:FirstName,SecondName - * - * // +-----------+---------------------------------------------------+ - * // | key| value| - * // +-----------+---------------------------------------------------+ - * // |Michael,A.K|{"FirstName":"Michael","SecondName":"A.K","Age":29}| - * // | Andy,B.C|{"FirstName":"Andy","SecondName":"B.C","Age":30} | - * // | Justin,C.L|{"FirstName":"Justin","SecondName":"C.L","Age":19} | - * // +-----------+---------------------------------------------------+ - * }}} - * - * @param cols - * cols metadata - * @return - * json df - */ - def csvToJSON(cols: Seq[DataColumn]): DataFrameProcessor = { - new DataFrameProcessor( - dataFrame - .withColumn( - "value", - defaultConverter( - map(cols.flatMap(c => List(lit(c.name), col(c.name))): _*) - ) - ) - .withColumn( - "key", - defaultConverter( - map( - cols - .filter(dataColumn => dataColumn.isPk) - .flatMap(c => List(lit(c.name), col(c.name))): _* - ) - ) - ) - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - ) - } - - def dataFrame(): DataFrame = { - dataFrame - } - - def toKafkaWriterBuilder(metadata: Metadata): SparkStreamWriter[Row] = { - SparkStreamWriter.writeToKafka(this, metadata) - } -} - -object DataFrameProcessor { - def builder(sparkSession: SparkSession) = new Builder(sparkSession) - def fromLocalCsv( - sparkSession: SparkSession, - metadata: Metadata - ): DataFrameProcessor = { - builder(sparkSession) - .source(metadata.sourcePath) - .columns(metadata.columns) - .cleanSource(metadata.cleanSource) - .recursiveFileLookup(metadata.recursiveFile) - .sourceArchiveDir(metadata.archivePath) - .buildFromCsv() - } - class Builder( - private val sparkSession: SparkSession - ) { - private val SOURCE_ARCHIVE_DIR = "sourceArchiveDir" - private val CLEAN_SOURCE = "cleanSource" - private val RECURSIVE_FILE_LOOK_UP = "recursiveFileLookup" - - private var _recursiveFileLookup: String = "" - private var _cleanSource: String = "" - private var _source: String = "" - private var _sourceArchiveDir: String = "" - private var _columns: Seq[DataColumn] = Seq.empty - - def recursiveFileLookup(r: String): Builder = { - _recursiveFileLookup = r - this - } - - def cleanSource(c: String): Builder = { - _cleanSource = c - this - } - - def source(s: String): Builder = { - _source = s - this - } - - def columns(c: Seq[DataColumn]): Builder = { - _columns = c - this - } - - def sourceArchiveDir(a: String): Builder = { - _sourceArchiveDir = a - this - } - - def buildFromCsv(): DataFrameProcessor = { - if (_cleanSource == "archive") { - if (_sourceArchiveDir.isBlank) - throw new IllegalArgumentException( - s"$SOURCE_ARCHIVE_DIR is blank.When you set cleanSource to 'archive', you must configure ArchiveDir in spark2kafka config" - ) - } - - val df = sparkSession.readStream - .option(CLEAN_SOURCE, _cleanSource) - .option(SOURCE_ARCHIVE_DIR, _sourceArchiveDir) - .option(RECURSIVE_FILE_LOOK_UP, _recursiveFileLookup) - .schema(schema(_columns)) - .csv(_source) - .filter(row => { - val bool = (0 until row.length).exists(i => !row.isNullAt(i)) - bool - }) - new DataFrameProcessor(df) - } - - private def schema(columns: Seq[DataColumn]): StructType = - StructType(columns.map { col => - if (col.dataType != DataType.StringType) - throw new IllegalArgumentException( - "Sorry, only string type is currently supported.Because a problem(astraea #1286) has led to the need to wrap the non-nullable type." - ) - StructField(col.name, col.dataType.sparkType) - }) - } -} diff --git a/etl/src/main/scala/org/astraea/etl/DataType.scala b/etl/src/main/scala/org/astraea/etl/DataType.scala deleted file mode 100644 index de776c4707..0000000000 --- a/etl/src/main/scala/org/astraea/etl/DataType.scala +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.astraea.etl - -sealed abstract class DataType( - typeName: String, - sparkDataType: org.apache.spark.sql.types.DataType -) extends Serializable { - def name: String = { - typeName - } - - def sparkType: org.apache.spark.sql.types.DataType = { - sparkDataType - } -} - -/** Column Types supported by astraea partitioner. */ -object DataType { - private val STRING_TYPE = "string" - private val BOOLEAN_TYPE = "boolean" - private val DATE_TYPE = "date" - private val DOUBLE_TYPE = "double" - private val BYTE_TYPE = "byte" - private val INTEGER_TYPE = "integer" - private val LONG_TYPE = "long" - private val SHORT_TYPE = "short" - private val TIMESTAMP_TYPE = "timestamp" - - case object StringType - extends DataType(STRING_TYPE, org.apache.spark.sql.types.StringType) - case object BooleanType - extends DataType(BOOLEAN_TYPE, org.apache.spark.sql.types.BooleanType) - case object DateType - extends DataType(DATE_TYPE, org.apache.spark.sql.types.DateType) - case object DoubleType - extends DataType(DOUBLE_TYPE, org.apache.spark.sql.types.DoubleType) - case object ByteType - extends DataType(BYTE_TYPE, org.apache.spark.sql.types.ByteType) - case object IntegerType - extends DataType(INTEGER_TYPE, org.apache.spark.sql.types.IntegerType) - case object LongType - extends DataType(LONG_TYPE, org.apache.spark.sql.types.LongType) - case object ShortType - extends DataType(SHORT_TYPE, org.apache.spark.sql.types.ShortType) - case object TimestampType - extends DataType(TIMESTAMP_TYPE, org.apache.spark.sql.types.TimestampType) - - /** @param str - * String that needs to be parsed as a DataType. - * @return - * DataType - */ - def of(str: String): DataType = { - val value = all.filter(_.name == str) - if (value.isEmpty) { - throw new IllegalArgumentException( - s"$str is not supported data type.The data types supported ${all.mkString(",")}." - ) - } - value.head - } - - /** @param map - * A set of string needs to be parsed as DataType - * @return - * Map[String, DataType] - */ - def of(map: Map[String, String]): Map[String, DataType] = { - map.map(x => (x._1, of(x._2))) - } - - /** @return - * All supported data types. - */ - def all: Seq[DataType] = { - Seq( - StringType, - BooleanType, - DateType, - DoubleType, - ByteType, - IntegerType, - LongType, - ShortType, - TimestampType - ) - } -} diff --git a/etl/src/main/scala/org/astraea/etl/Metadata.scala b/etl/src/main/scala/org/astraea/etl/Metadata.scala deleted file mode 100644 index fe012c7adb..0000000000 --- a/etl/src/main/scala/org/astraea/etl/Metadata.scala +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.astraea.etl - -import java.io.File -import java.nio.file.{Files, Path} -import java.util.Properties -import scala.jdk.CollectionConverters._ -import scala.util.Using - -/** Parameters required for Astraea ETL. - * - * @param sourcePath - * The data source path should be a directory. - * @param columns - * The CSV Column metadata.Contains column name, data type and is pk or not. - * @param kafkaBootstrapServers - * The Kafka bootstrap servers. - * @param topicName - * Set your topic name, if it is empty that will set it to "'spark'-'current - * time by hourDay'-'random number'". - * @param numberOfPartitions - * Set the number of topic partitions, if it is empty that will set it to 15. - * @param numberOfReplicas - * Set the number of topic replicas, if it is empty that will set it to 3. - * @param checkpoint - * Spark checkpoint path. - */ -case class Metadata private ( - sourcePath: String, - checkpoint: String, - columns: Seq[DataColumn], - kafkaBootstrapServers: String, - topicName: String, - topicConfigs: Map[String, String], - numberOfPartitions: Int, - numberOfReplicas: Short, - cleanSource: String, - recursiveFile: String, - archivePath: String -) - -object Metadata { - private[etl] val ARCHIVE_PATH = "archive.path" - private[etl] val SOURCE_PATH_KEY = "source.path" - private[etl] val CHECKPOINT_KEY = "checkpoint" - private[etl] val COLUMN_NAME_KEY = "column.names" - private[etl] val COLUMN_TYPES_KEY = "column.types" - private[etl] val CLEAN_SOURCE = "clean.source" - private[etl] val PRIMARY_KEY_KEY = "primary.keys" - private[etl] val KAFKA_BOOTSTRAP_SERVERS_KEY = "kafka.bootstrap.servers" - private[etl] val TOPIC_NAME_KEY = "topic.name" - private[etl] val TOPIC_CONFIGS_KEY = "topic.configs" - private[etl] val TOPIC_PARTITIONS_KEY = "topic.partitions" - private[etl] val TOPIC_REPLICAS_KEY = "topic.replicas" - private[etl] val RECURSIVE_FILE = "recursive.file" - - private[etl] val DEFAULT_PARTITIONS = 15 - private[etl] val DEFAULT_REPLICAS = 1.toShort - private[etl] val DEFAULT_RECURSIVE = "ture" - private[etl] val DEFAULT_CLEAN_SOURCE = "delete" - - // Parameters needed to configure ETL. - def of(path: Path): Metadata = { - // remove the empty/blank value - val properties = - readProp(path).asScala.filter(_._2.nonEmpty).filterNot(_._2.isBlank) - val columnNames = properties(COLUMN_NAME_KEY).split(",").toSeq - if (columnNames.isEmpty) - throw new IllegalArgumentException("columns must be defined") - val pks = properties - .get(PRIMARY_KEY_KEY) - .map(_.split(",").toSet) - .getOrElse(columnNames.toSet) - val types = properties - .get(COLUMN_TYPES_KEY) - .map(s => - s.split(",") - .map(t => DataType.of(t)) - .toSeq - ) - .getOrElse(Seq.empty) - Metadata( - sourcePath = properties(SOURCE_PATH_KEY), - checkpoint = properties.getOrElse( - CHECKPOINT_KEY, - Files.createTempDirectory("astraea-etl").toString - ), - columns = columnNames.zipWithIndex.map { case (c, index) => - DataColumn( - name = c, - isPk = pks.contains(c), - dataType = if (types.isEmpty) DataType.StringType else types(index) - ) - }, - kafkaBootstrapServers = properties(KAFKA_BOOTSTRAP_SERVERS_KEY), - topicName = - properties.getOrElse(TOPIC_NAME_KEY, "astraea-etl-" + Math.random()), - topicConfigs = properties - .get(TOPIC_CONFIGS_KEY) - .map(s => - s.split(",") - .map(item => item.split("=")(0) -> item.split("=")(1)) - .toMap - ) - .getOrElse(Map.empty), - numberOfPartitions = properties - .get(TOPIC_PARTITIONS_KEY) - .map(_.toInt) - .getOrElse(DEFAULT_PARTITIONS), - numberOfReplicas = properties - .get(TOPIC_REPLICAS_KEY) - .map(_.toShort) - .getOrElse(DEFAULT_REPLICAS), - cleanSource = properties.getOrElse(CLEAN_SOURCE, DEFAULT_CLEAN_SOURCE), - recursiveFile = properties.getOrElse(RECURSIVE_FILE, DEFAULT_RECURSIVE), - archivePath = properties.getOrElse(ARCHIVE_PATH, "") - ) - } - - private[this] def readProp(path: Path): Properties = { - val properties = new Properties() - Using(scala.io.Source.fromInputStream(Files.newInputStream(path))) { - bufferedSource => - properties.load(bufferedSource.reader()) - } - properties - } -} diff --git a/etl/src/main/scala/org/astraea/etl/Spark2Kafka.scala b/etl/src/main/scala/org/astraea/etl/Spark2Kafka.scala deleted file mode 100644 index 6e7afeae6d..0000000000 --- a/etl/src/main/scala/org/astraea/etl/Spark2Kafka.scala +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.astraea.etl - -import org.apache.spark.sql.SparkSession -import org.astraea.common.admin.Admin - -import java.io.File -import java.nio.file.Path -import scala.concurrent.duration.Duration -import scala.jdk.CollectionConverters._ -import scala.util.Using - -object Spark2Kafka { - def executor( - sparkSession: SparkSession, - metadata: Metadata, - duration: Duration - ): Unit = { - Using(Admin.of(metadata.kafkaBootstrapServers))( - _.creator() - .topic(metadata.topicName) - .configs(metadata.topicConfigs.asJava) - .numberOfPartitions(metadata.numberOfPartitions) - .numberOfReplicas(metadata.numberOfReplicas) - .run() - .toCompletableFuture - .join() - ) - - DataFrameProcessor - .fromLocalCsv(sparkSession, metadata) - .csvToJSON(metadata.columns) - .toKafkaWriterBuilder(metadata) - .start(duration) - } - - def main(args: Array[String]): Unit = { - executor( - SparkSession - .builder() - .appName("astraea etl") - .getOrCreate(), - Metadata.of(Path.of(args(0))), - Duration("1000 seconds") - ) - } -} diff --git a/etl/src/main/scala/org/astraea/etl/SparkStreamWriter.scala b/etl/src/main/scala/org/astraea/etl/SparkStreamWriter.scala deleted file mode 100644 index 12d8fa2051..0000000000 --- a/etl/src/main/scala/org/astraea/etl/SparkStreamWriter.scala +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.astraea.etl - -import org.apache.kafka.clients.producer.ProducerConfig -import org.apache.spark.sql.Row -import org.apache.spark.sql.streaming.{DataStreamWriter, OutputMode} - -import scala.concurrent.duration.Duration -class SparkStreamWriter[T](dataStreamWriter: DataStreamWriter[T]) { - def start(duration: Duration): Boolean = { - dataStreamWriter - .start() - .awaitTermination(duration.toMillis) - } -} - -object SparkStreamWriter { - def builder(dataFrameProcessor: DataFrameProcessor) = - new Builder(dataFrameProcessor) - - def writeToKafka( - dataFrameProcessor: DataFrameProcessor, - metadata: Metadata - ): SparkStreamWriter[Row] = { - builder(dataFrameProcessor) - .target(metadata.topicName) - .checkpoint(metadata.checkpoint) - .buildToKafka(metadata.kafkaBootstrapServers) - } - class Builder( - var dataFrameProcessor: DataFrameProcessor - ) { - private var _target: String = "" - private var _checkpoint: String = "" - - def buildToKafka( - bootstrap: String - ): SparkStreamWriter[Row] = { - new SparkStreamWriter[Row]( - dataFrameProcessor - .dataFrame() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .writeStream - .outputMode(OutputMode.Append()) - .format("kafka") - .option( - "kafka." + - ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, - bootstrap - ) - // Spark to kafka transfer support for StringSerializer and ByteSerializer in spark 3.3.0 . - .option( - ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, - "org.apache.kafka.common.serialization.StringSerializer" - ) - .option( - ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, - "org.apache.kafka.common.serialization.StringSerializer" - ) - .option("topic", _target) - .option(ProducerConfig.ACKS_CONFIG, "all") - .option(ProducerConfig.ENABLE_IDEMPOTENCE_CONFIG, "true") - .option("checkpointLocation", _checkpoint) - ) - } - - /** Target represents the destination of the data, for Kafka it represents - * the topic, for other targets it can represent the data path. - * - * @param target - * destination of the data - * @return - * Writer - */ - def target(target: String): Builder = { - _target = target - this - } - - def checkpoint( - checkpoint: String - ): Builder = { - _checkpoint = checkpoint - this - } - } -} diff --git a/etl/src/main/scala/org/astraea/etl/Utils.scala b/etl/src/main/scala/org/astraea/etl/Utils.scala deleted file mode 100644 index 84849758e7..0000000000 --- a/etl/src/main/scala/org/astraea/etl/Utils.scala +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.astraea.etl - -import org.astraea.common.admin.Admin - -import java.awt.geom.IllegalPathStateException -import java.io.File -import java.nio.file.{Files, Path} -import java.util.concurrent.CompletionStage -import scala.concurrent.{Future, Promise} -import scala.util.control.NonFatal -import scala.jdk.CollectionConverters._ - -object Utils { - def requireFolder(path: String): Path = { - val file = Path.of(path) - if (!Files.isDirectory(file)) { - throw new IllegalPathStateException( - s"$path is not a folder. The path should be a folder." - ) - } - file - } - - def requireFile(path: String): Path = { - val file = Path.of(path) - if (!Files.isRegularFile(file)) { - throw new IllegalPathStateException( - s"$path is not a file. The file does not exist." - ) - } - file - } - -} diff --git a/etl/src/test/scala/org/astraea/etl/DataFrameProcessorTest.scala b/etl/src/test/scala/org/astraea/etl/DataFrameProcessorTest.scala deleted file mode 100644 index 0bbd6ee6a6..0000000000 --- a/etl/src/test/scala/org/astraea/etl/DataFrameProcessorTest.scala +++ /dev/null @@ -1,313 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.astraea.etl - -import org.apache.spark.sql.functions.col -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.{Row, SparkSession} -import org.astraea.etl.DataType.{IntegerType, StringType} -import org.astraea.etl.FileCreator.{createCSV, generateCSVF, getCSVFile} -import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue} -import org.junit.jupiter.api.Test - -import java.io._ -import java.nio.file.{Files, Path} -import java.util.concurrent.TimeUnit -import scala.concurrent.duration.Duration -import scala.jdk.CollectionConverters._ - -class DataFrameProcessorTest { - @Test - def skipBlankLineTest(): Unit = { - val sourceDir = Files.createTempDirectory("source") - val dataDir = Files.createTempDirectory("data") - val checkoutDir = Files.createTempDirectory("checkpoint") - - val columnOne: List[String] = - List("A1", "B1", null, "D1") - val columnTwo: List[String] = - List("52", "36", null, "25") - val columnThree: List[String] = - List("fghgh", "gjgbn", null, "dfjf") - - val row = columnOne - .zip(columnTwo.zip(columnThree)) - .foldLeft(List.empty[List[String]]) { case (acc, (a, (b, c))) => - List(a, b, c) +: acc - } - .reverse - - createCSV(sourceDir, row, 0) - - val df = DataFrameProcessor.fromLocalCsv( - createSpark(), - Metadata( - sourcePath = sourceDir.toAbsolutePath.toString, - checkpoint = "", - columns = Seq( - DataColumn("RecordNumber", true, StringType), - DataColumn("Size", true, StringType), - DataColumn("Type", true, StringType) - ), - kafkaBootstrapServers = "", - topicName = "", - topicConfigs = Map.empty, - numberOfPartitions = 10, - numberOfReplicas = 1, - cleanSource = "delete", - recursiveFile = "true", - archivePath = "" - ) - ) - - df.dataFrame() - .writeStream - .format("csv") - .option("path", dataDir.toAbsolutePath.toString) - .option("checkpointLocation", checkoutDir.toAbsolutePath.toString) - .outputMode("append") - .start() - .awaitTermination(Duration(20, TimeUnit.SECONDS).toMillis) - - val writeFile = getCSVFile(dataDir).head - val lines = Files.readAllLines(writeFile) - - assertEquals("A1,52,fghgh", lines.get(0)) - assertEquals("B1,36,gjgbn", lines.get(1)) - assertEquals("D1,25,dfjf", lines.get(2)) - } - - @Test - def sparkReadCSVTest(): Unit = { - val sourceDir = Files.createTempDirectory("source") - generateCSVF(sourceDir, rows) - - val checkoutDir = Files.createTempDirectory("checkpoint") - val dataDir = Files.createTempDirectory("data") - - val csvDF = DataFrameProcessor.fromLocalCsv( - createSpark(), - Metadata( - sourcePath = sourceDir.toAbsolutePath.toString, - checkpoint = "", - columns = Seq( - DataColumn("RecordNumber", true, StringType), - DataColumn("Size", true, StringType), - DataColumn("Type", true, StringType) - ), - kafkaBootstrapServers = "", - topicName = "", - topicConfigs = Map.empty, - numberOfPartitions = 10, - numberOfReplicas = 1, - cleanSource = "delete", - recursiveFile = "true", - archivePath = "" - ) - ) - - assertTrue( - csvDF.dataFrame().isStreaming, - "sessions must be a streaming Dataset" - ) - - csvDF - .dataFrame() - .writeStream - .format("csv") - .option("path", dataDir.toAbsolutePath.toString) - .option("checkpointLocation", checkoutDir.toAbsolutePath.toString) - .outputMode("append") - .start() - .awaitTermination(Duration(20, TimeUnit.SECONDS).toMillis) - - val writeFile = getCSVFile(dataDir).head - val lines = Files.readAllLines(writeFile) - assertEquals("A1,52,fghgh", lines.get(0)) - assertEquals("B1,36,gjgbn", lines.get(1)) - assertEquals("C1,45,fgbhjf", lines.get(2)) - assertEquals("D1,25,dfjf", lines.get(3)) - } - - @Test - def csvToJSONTest(): Unit = { - val spark = SparkSession - .builder() - .master("local[2]") - .appName("Astraea ETL") - .getOrCreate() - import spark.implicits._ - - val columns = Seq( - DataColumn("name", isPk = true, dataType = StringType), - DataColumn("age", isPk = false, dataType = IntegerType) - ) - - val result = new DataFrameProcessor( - Seq(("Michael", 29)).toDF().toDF("name", "age") - ).csvToJSON(columns) - .dataFrame() - .collectAsList() - .asScala - .map(row => (row.getAs[String]("key"), row.getAs[String]("value"))) - .toMap - - assertEquals(1, result.size) - assertEquals( - "{\"age\":\"29\",\"name\":\"Michael\"}", - result("{\"name\":\"Michael\"}") - ) - - val resultExchange = new DataFrameProcessor( - Seq((29, "Michael")).toDF().toDF("age", "name") - ).csvToJSON(columns) - .dataFrame() - .collectAsList() - .asScala - .map(row => (row.getAs[String]("key"), row.getAs[String]("value"))) - .toMap - - assertEquals(1, resultExchange.size) - assertEquals( - "{\"age\":\"29\",\"name\":\"Michael\"}", - resultExchange("{\"name\":\"Michael\"}") - ) - } - - @Test - def csvToJsonMulKeysTest(): Unit = { - val spark = SparkSession - .builder() - .master("local[2]") - .appName("Astraea ETL") - .getOrCreate() - import spark.implicits._ - val columns = Seq( - DataColumn("firstName", isPk = true, DataType.StringType), - DataColumn("secondName", isPk = true, DataType.StringType), - DataColumn("age", isPk = false, dataType = IntegerType) - ) - val result = new DataFrameProcessor( - Seq(("Michael", "A", 29)).toDF().toDF("firstName", "secondName", "age") - ).csvToJSON(columns) - .dataFrame() - .collectAsList() - .asScala - .map(row => (row.getAs[String]("key"), row.getAs[String]("value"))) - .toMap - - assertEquals(1, result.size) - assertEquals( - "{\"age\":\"29\",\"firstName\":\"Michael\",\"secondName\":\"A\"}", - result("{\"firstName\":\"Michael\",\"secondName\":\"A\"}") - ) - } - - @Test def csvToJsonNullTest(): Unit = { - val spark = SparkSession - .builder() - .master("local[2]") - .appName("Astraea ETL") - .getOrCreate() - import spark.implicits._ - val columns = Seq( - DataColumn("firstName", isPk = true, DataType.StringType), - DataColumn("secondName", isPk = true, DataType.StringType), - DataColumn("age", isPk = false, dataType = IntegerType) - ) - val result = new DataFrameProcessor( - Seq(("Michael", "A", null)).toDF().toDF("firstName", "secondName", "age") - ).csvToJSON(columns) - .dataFrame() - .collectAsList() - .asScala - .map(row => (row.getAs[String]("key"), row.getAs[String]("value"))) - .toMap - - assertEquals(1, result.size) - assertEquals( - "{\"firstName\":\"Michael\",\"secondName\":\"A\"}", - result("{\"firstName\":\"Michael\",\"secondName\":\"A\"}") - ) - } - - @Test - def jsonToByteTest(): Unit = { - val spark = SparkSession - .builder() - .master("local[2]") - .appName("Astraea ETL") - .getOrCreate() - - var data = Seq(Row("A1", 52, "fghgh", "sfjojs", "zzz", "final", 5)) - (0 to 10000).iterator.foreach(_ => - data = data ++ Seq(Row("A1", 52, "fghgh", "sfjojs", "zzz", "final", 5)) - ) - - val structType = new StructType() - .add("name", "string") - .add("age", "integer") - .add("xx", "string") - .add("yy", "string") - .add("zz", "string") - .add("f", "string") - .add("fInt", "integer") - - val columns = Seq( - DataColumn("name", isPk = false, dataType = StringType), - DataColumn("age", isPk = false, dataType = IntegerType), - DataColumn("xx", isPk = false, dataType = StringType), - DataColumn("yy", isPk = false, dataType = StringType), - DataColumn("zz", isPk = false, dataType = StringType), - DataColumn("f", isPk = false, dataType = StringType), - DataColumn("fInt", isPk = false, dataType = IntegerType) - ) - - val json = new DataFrameProcessor( - spark.createDataFrame(spark.sparkContext.parallelize(data), structType) - ).csvToJSON(columns) - .dataFrame() - .withColumn("byte", col("value").cast("Byte")) - .selectExpr("CAST(byte AS BYTE)") - val head = json.head() - assertTrue(json.filter(_ != head).isEmpty) - } - - private[this] def rows: List[List[String]] = { - val columnOne: List[String] = - List("A1", "B1", "C1", "D1") - val columnTwo: List[String] = - List("52", "36", "45", "25") - val columnThree: List[String] = - List("fghgh", "gjgbn", "fgbhjf", "dfjf") - - columnOne - .zip(columnTwo.zip(columnThree)) - .foldLeft(List.empty[List[String]]) { case (acc, (a, (b, c))) => - List(a, b, c) +: acc - } - .reverse - } - - private def createSpark(): SparkSession = { - SparkSession - .builder() - .master("local[2]") - .appName("Astraea ETL") - .getOrCreate() - } -} diff --git a/etl/src/test/scala/org/astraea/etl/FileCreator.scala b/etl/src/test/scala/org/astraea/etl/FileCreator.scala deleted file mode 100644 index 2cdecc08db..0000000000 --- a/etl/src/test/scala/org/astraea/etl/FileCreator.scala +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.astraea.etl - -import org.astraea.common.csv.CsvWriter - -import java.io.{BufferedWriter, FileWriter} -import java.nio.file.{Files, Path} -import java.util.concurrent.TimeUnit -import scala.concurrent.ExecutionContext.Implicits.global -import scala.concurrent.Future -import scala.concurrent.duration.Duration -import scala.jdk.CollectionConverters._ -import scala.util.{Failure, Try} -object FileCreator { - def generateCSVF( - sourceDir: Path, - rows: List[List[String]] - ): Future[Boolean] = { - Future { generateCSV(sourceDir, rows) } - } - - def generateCSV(sourceDir: Path, rows: List[List[String]]): Boolean = { - createCSV(sourceDir, rows, 0) - Thread.sleep(Duration(12, TimeUnit.SECONDS).toMillis) - createCSV(sourceDir, rows, 1) - true - } - - def createCSV( - sourceDir: Path, - rows: List[List[String]], - int: Int - ): Try[Unit] = { - val str = - sourceDir.toAbsolutePath.toString + "/local_kafka" + "-" + int + ".csv" - val fileCSV2 = Files.createFile(Path.of(str)) - writeCsvFile(fileCSV2.toAbsolutePath.toString, rows) - } - - def writeCsvFile( - path: String, - rows: List[List[String]] - ): Try[Unit] = - Try( - CsvWriter - .builder(new BufferedWriter(new FileWriter(path))) - .build() - ).flatMap((csvWriter: CsvWriter) => - Try { - rows.map(_.asJava).foreach(csvWriter.append) - csvWriter.close() - } match { - case f @ Failure(_) => - Try(csvWriter.close()).recoverWith { case _ => - f - } - case success => - success - } - ) - - def getCSVFile(file: Path): Seq[Path] = { - Files - .list(file) - .filter(f => Files.isRegularFile(f)) - .filter(f => f.getFileName.toString.endsWith(".csv")) - .iterator() - .asScala - .toSeq - } -} diff --git a/etl/src/test/scala/org/astraea/etl/MetadataTest.scala b/etl/src/test/scala/org/astraea/etl/MetadataTest.scala deleted file mode 100644 index bd6f4ae8b4..0000000000 --- a/etl/src/test/scala/org/astraea/etl/MetadataTest.scala +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.astraea.etl - -import org.junit.jupiter.api.{Assertions, Test} - -import java.io.{File, FileOutputStream} -import java.nio.file.{Files, Path} -import java.util.Properties -import scala.util.Using - -class MetadataTest { - - @Test - def testTopicConfigs(): Unit = { - val path = save( - Map( - Metadata.SOURCE_PATH_KEY -> "/tmp/aa", - Metadata.TOPIC_NAME_KEY -> "bb", - Metadata.COLUMN_NAME_KEY -> "a,b", - Metadata.KAFKA_BOOTSTRAP_SERVERS_KEY -> "host:1122", - Metadata.TOPIC_CONFIGS_KEY -> "a=b,c=d" - ) - ) - val metadata = Metadata.of(path) - Assertions.assertEquals(2, metadata.topicConfigs.size) - Assertions.assertEquals("b", metadata.topicConfigs("a")) - Assertions.assertEquals("d", metadata.topicConfigs("c")) - } - - @Test - def testDefault(): Unit = { - val path = save( - Map( - Metadata.SOURCE_PATH_KEY -> "/tmp/aa", - Metadata.TOPIC_NAME_KEY -> "bb", - Metadata.COLUMN_NAME_KEY -> "a,b", - Metadata.KAFKA_BOOTSTRAP_SERVERS_KEY -> "host:1122" - ) - ) - val metadata = Metadata.of(path) - - Assertions.assertEquals("/tmp/aa", metadata.sourcePath) - Assertions.assertEquals("host:1122", metadata.kafkaBootstrapServers) - Assertions.assertEquals("bb", metadata.topicName) - Assertions.assertEquals(2, metadata.columns.size) - Assertions.assertEquals("a", metadata.columns.head.name) - Assertions.assertTrue(metadata.columns.head.isPk) - Assertions.assertEquals(DataType.StringType, metadata.columns.head.dataType) - Assertions.assertEquals("b", metadata.columns(1).name) - Assertions.assertTrue(metadata.columns(1).isPk) - Assertions.assertEquals(DataType.StringType, metadata.columns(1).dataType) - } - - private def save(props: Map[String, String]): Path = { - val prop = new Properties - props.foreach { case (k, v) => - prop.put(k, v) - } - val file = Files.createTempFile("test", "props") - Using(Files.newOutputStream(file)) { output => - prop.store(output, null) - } - file - } -} diff --git a/etl/src/test/scala/org/astraea/etl/Spark2KafkaTest.scala b/etl/src/test/scala/org/astraea/etl/Spark2KafkaTest.scala deleted file mode 100644 index 68ef733061..0000000000 --- a/etl/src/test/scala/org/astraea/etl/Spark2KafkaTest.scala +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.astraea.etl - -import org.apache.kafka.clients.consumer.ConsumerConfig -import org.apache.spark.sql.SparkSession -import org.astraea.common.consumer.{Consumer, Deserializer} -import org.astraea.etl.FileCreator.generateCSVF -import org.astraea.etl.Metadata.{ - ARCHIVE_PATH, - DEFAULT_RECURSIVE, - RECURSIVE_FILE -} -import org.astraea.etl.Spark2KafkaTest.{COL_NAMES, rows} -import org.astraea.it.Service -import org.junit.jupiter.api.Assertions.assertEquals -import org.junit.jupiter.api.{AfterAll, BeforeAll, Test} - -import java.nio.file.Files -import java.util -import scala.collection.immutable -import scala.concurrent.duration.Duration -import scala.jdk.CollectionConverters._ - -class Spark2KafkaTest { - @Test - def consumerDataTest(): Unit = { - val topic = new util.HashSet[String] - topic.add("testTopic") - - val consumer = - Consumer - .forTopics(topic) - .bootstrapServers(Spark2KafkaTest.SERVICE.bootstrapServers()) - .keyDeserializer(Deserializer.STRING) - .valueDeserializer(Deserializer.STRING) - .configs( - Map(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "earliest").asJava - ) - .build() - - val records = - Range - .inclusive(0, 5) - .flatMap(_ => consumer.poll(java.time.Duration.ofSeconds(1)).asScala) - .map(record => (record.key(), record.value())) - .toMap - - val rowData = s2kType(rows) - - records.foreach(records => assertEquals(records._2, rowData(records._1))) - } - - def s2kType(rows: List[List[String]]): Map[String, String] = { - val colNames = - COL_NAMES.split(",").map(_.split("=")).map(elem => elem(0)).toSeq - Range - .inclusive(0, 3) - .map(i => - ( - s"""{"${colNames.head}":"${rows( - i - ).head}","${colNames(1)}":"${rows(i)(1)}"}""", - s"""{"${colNames(2)}":"${rows( - i - )( - 2 - )}","${colNames.head}":"${rows( - i - ).head}","${colNames(1)}":"${rows(i)(1)}"}""" - ) - ) - .toMap - } -} - -object Spark2KafkaTest { - private val SERVICE = Service.builder.numberOfBrokers(3).build() - - @AfterAll - private def closeService(): Unit = SERVICE.close() - - private val COL_NAMES = - "FirstName=string,SecondName=string,Age=integer" - - @BeforeAll - def setup(): Unit = { - val sourceDir = Files.createTempDirectory("source") - val checkoutDir = Files.createTempDirectory("checkpoint") - generateCSVF(sourceDir, rows) - - val metadata = Metadata( - sourcePath = sourceDir.toAbsolutePath.toString, - checkpoint = checkoutDir.toAbsolutePath.toString, - columns = immutable.Seq( - DataColumn( - name = "FirstName", - isPk = true, - dataType = DataType.StringType - ), - DataColumn( - name = "SecondName", - isPk = true, - dataType = DataType.StringType - ), - DataColumn(name = "Age", dataType = DataType.StringType) - ), - kafkaBootstrapServers = SERVICE.bootstrapServers(), - topicName = "testTopic", - topicConfigs = Map("compression.type" -> "lz4"), - numberOfPartitions = 10, - numberOfReplicas = 1, - cleanSource = "delete", - recursiveFile = "true", - archivePath = "" - ) - - Spark2Kafka.executor( - SparkSession - .builder() - .master("local[2]") - .getOrCreate(), - metadata, - Duration("10 seconds") - ) - } - - private def rows: List[List[String]] = { - val columnOne: List[String] = - List("Michael", "Andy", "Justin", "") - val columnTwo: List[String] = - List("A.K", "B.C", "C.L", "") - val columnThree: List[String] = - List("29", "30", "19", "") - - columnOne - .zip(columnTwo.zip(columnThree)) - .foldLeft(List.empty[List[String]]) { case (acc, (a, (b, c))) => - List(a, b, c) +: acc - } - .reverse - } -} diff --git a/etl/src/test/scala/org/astraea/etl/UtilsTest.scala b/etl/src/test/scala/org/astraea/etl/UtilsTest.scala deleted file mode 100644 index 99b9fee374..0000000000 --- a/etl/src/test/scala/org/astraea/etl/UtilsTest.scala +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.astraea.etl - -import org.junit.jupiter.api.Assertions.assertThrows -import org.junit.jupiter.api.Test - -import java.awt.geom.IllegalPathStateException -class UtilsTest { - @Test def requireFolderTest(): Unit = { - assertThrows( - classOf[IllegalPathStateException], - () => Utils.requireFolder("??") - ) - } -} diff --git a/gradle/dependencies.gradle b/gradle/dependencies.gradle index 87cd3985e7..af67be773c 100644 --- a/gradle/dependencies.gradle +++ b/gradle/dependencies.gradle @@ -32,9 +32,7 @@ def versions = [ kafka : project.properties['kafka.version'] ?: "3.7.0", mockito : project.properties['mockito.version'] ?: "5.11.0", "mockito-inline" : project.properties['mockito-inline.version'] ?: "5.2.0", - scala : project.properties['scala.version'] ?: "2.13.13", slf4j : project.properties['slf4j.version'] ?: "2.0.12", - spark : project.properties['spark.version'] ?: "3.5.1", hadoop : project.properties["hadoop.version"] ?: "3.3.6", ] @@ -59,10 +57,7 @@ libs += [ "kafka-metadata" : "org.apache.kafka:kafka-metadata:${versions["kafka"]}", "mockito-core" : "org.mockito:mockito-core:${versions["mockito"]}", "mockito-inline" : "org.mockito:mockito-inline:${versions["mockito-inline"]}", - scala : "org.scala-lang:scala-library:${versions["scala"]}", "slf4j-nop" : "org.slf4j:slf4j-nop:${versions["slf4j"]}", - "spark-kafka" : "org.apache.spark:spark-sql-kafka-0-10_2.13:${versions["spark"]}", - "spark-sql" : "org.apache.spark:spark-sql_2.13:${versions["spark"]}", "hadoop-common" : "org.apache.hadoop:hadoop-common:${versions["hadoop"]}", "hadoop-client" : "org.apache.hadoop:hadoop-client:${versions["hadoop"]}", "hadoop-minicluster" : "org.apache.hadoop:hadoop-minicluster:${versions["hadoop"]}", diff --git a/settings.gradle b/settings.gradle index 2af44f31e2..a227433177 100644 --- a/settings.gradle +++ b/settings.gradle @@ -19,4 +19,4 @@ pluginManagement { } } rootProject.name = 'astraea' -include('app', "common", "connector", "etl", "fs", "gui", "it") +include('app', "common", "connector", "fs", "gui", "it")