From b4b242ff3c75c3d65f99d8c5c95c152cd17c9c00 Mon Sep 17 00:00:00 2001 From: Mark Dokter Date: Thu, 9 Feb 2023 00:55:31 +0100 Subject: [PATCH] [DAPHNE-#473] Improved Apache Arrow support Cleaned up the integration of Apache Arrow for Parquet reader support. * Arrow is now always built from the 11.0.0 release package * CMake code is now free from hard-coded paths * It seems we can drop the requirement to install the Boost libraries, as everything compiles just fine without them, and we don't use much of Arrow at the moment anyway Closes #473 --- CMakeLists.txt | 17 ++----- build.sh | 47 +++++++++---------- doc/DaphneDSLBuiltins.md | 2 +- doc/GettingStarted.md | 1 - doc/development/BuildingDaphne.md | 1 - pack.sh | 3 +- release.sh | 2 +- src/parser/config/ConfigParser.cpp | 2 +- src/runtime/distributed/worker/CMakeLists.txt | 2 + src/runtime/local/io/ReadParquet.h | 18 +++---- src/runtime/local/kernels/CMakeLists.txt | 3 +- src/runtime/local/kernels/Read.h | 12 +---- test.sh | 7 +-- test/runtime/local/io/ReadParquetTest.cpp | 4 -- thirdparty/patches/0004-arrow-git-log.patch | 24 ++++++++++ 15 files changed, 66 insertions(+), 79 deletions(-) create mode 100644 thirdparty/patches/0004-arrow-git-log.patch diff --git a/CMakeLists.txt b/CMakeLists.txt index 73ca1edbc..f689c0bb1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -129,20 +129,9 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER) endif() endif() -option(USE_ARROW "Whether to activate compilation of Arrow/Parquet features" OFF) -if(USE_ARROW) - find_package(Arrow CONFIG REQUIRED - PATHS thirdparty/installed/lib/cmake/arrow - NO_DEFAULT_PATH - ) - find_package(Parquet CONFIG REQUIRED - PATHS thirdparty/installed/lib/cmake/arrow - NO_DEFAULT_PATH - ) - link_libraries(arrow_shared parquet_shared) - add_definitions(-DUSE_ARROW) - message(STATUS "Arrow/Parquet enabled") -endif() + +find_package(Arrow REQUIRED) +find_package(Parquet REQUIRED) option(USE_FPGAOPENCL "Whether to activate compilation of FPGA OpenCL features" OFF) if(USE_FPGAOPENCL) diff
--git a/build.sh b/build.sh index 42246df06..3733165b0 100755 --- a/build.sh +++ b/build.sh @@ -46,7 +46,6 @@ function printHelp { echo " -nf, --no-fancy Suppress all colored and animated output" echo " -nd, --no-deps Avoid building third party dependencies at all" echo " -y, --yes Accept all prompts" - echo " --arrow Compile with support for Arrow/Parquet files" echo " --cuda Compile with support for CUDA ops" echo " --debug Compile with support for debug mode" echo " --fpgaopencl Compile with support for Intel PAC D5005 FPGA" @@ -393,7 +392,7 @@ openBlasVersion=0.3.19 abslVersion=20211102.0 grpcVersion=1.38.0 nlohmannjsonVersion=3.10.5 -arrowVersion=d9d78946607f36e25e9d812a5cc956bd00ab2bc9 +arrowVersion=11.0.0 #****************************************************************************** # Set some prefixes, paths and dirs @@ -427,7 +426,6 @@ par_clean="0" par_acceptAll="0" unknown_options="" BUILD_CUDA="-DUSE_CUDA=OFF" -BUILD_ARROW="-DUSE_ARROW=OFF" BUILD_FPGAOPENCL="-DUSE_FPGAOPENCL=OFF" BUILD_DEBUG="-DCMAKE_BUILD_TYPE=Release" WITH_DEPS=1 @@ -469,10 +467,6 @@ while [[ $# -gt 0 ]]; do echo using CUDA export BUILD_CUDA="-DUSE_CUDA=ON" ;; - --arrow) - echo using ARROW - BUILD_ARROW="-DUSE_ARROW=ON" - ;; --fpgaopencl) echo using FPGAOPENCL export BUILD_FPGAOPENCL="-DUSE_FPGAOPENCL=ON" @@ -718,24 +712,25 @@ if [ $WITH_DEPS -gt 0 ]; then #------------------------------------------------------------------------------ # Arrow / Parquet #------------------------------------------------------------------------------ - arrowDirName="arrow" - if [[ "$BUILD_ARROW" == "-DUSE_ARROW=ON" ]]; then - if ! is_dependency_downloaded "arrow_v${arrowVersion}"; then - rm -rf "${sourcePrefix:?}/${arrowDirName}" - git clone -n https://github.com/apache/arrow.git "${sourcePrefix}/${arrowDirName}" - cd "${sourcePrefix}/${arrowDirName}" - git checkout $arrowVersion - dependency_download_success "arrow_v${arrowVersion}" - fi - if ! 
is_dependency_installed "arrow_v${arrowVersion}"; then - cmake -G Ninja -S "${sourcePrefix}/${arrowDirName}/cpp" -B "${buildPrefix}/${arrowDirName}" \ - -DCMAKE_INSTALL_PREFIX="${installPrefix}" \ - -DARROW_CSV=ON -DARROW_FILESYSTEM=ON -DARROW_PARQUET=ON - cmake --build "${buildPrefix}/${arrowDirName}" --target install - dependency_install_success "arrow_v${arrowVersion}" - else - daphne_msg "No need to build Arrow again." - fi + arrowDirName="apache-arrow-$arrowVersion" + arrowArtifactFileName=$arrowDirName.tar.gz + if ! is_dependency_downloaded "arrow_v${arrowVersion}"; then + rm -rf "${sourcePrefix:?}/${arrowDirName}" + wget "https://dlcdn.apache.org/arrow/arrow-$arrowVersion/$arrowArtifactFileName" -qP "$cacheDir" + tar xzf "$cacheDir/$arrowArtifactFileName" --directory="$sourcePrefix" + daphne_msg "Applying 0004-arrow-git-log.patch" + patch -Np0 -i "$patchDir/0004-arrow-git-log.patch" -d "$sourcePrefix/$arrowDirName" + dependency_download_success "arrow_v${arrowVersion}" + fi + + if ! is_dependency_installed "arrow_v${arrowVersion}"; then + cmake -G Ninja -S "${sourcePrefix}/${arrowDirName}/cpp" -B "${buildPrefix}/${arrowDirName}" \ + -DCMAKE_INSTALL_PREFIX="${installPrefix}" \ + -DARROW_CSV=ON -DARROW_FILESYSTEM=ON -DARROW_PARQUET=ON + cmake --build "${buildPrefix}/${arrowDirName}" --target install + dependency_install_success "arrow_v${arrowVersion}" + else + daphne_msg "No need to build Arrow again." 
fi #------------------------------------------------------------------------------ @@ -821,7 +816,7 @@ daphne_msg "Build Daphne" cmake -S "$projectRoot" -B "$daphneBuildDir" -G Ninja -DANTLR_VERSION="$antlrVersion" \ -DCMAKE_PREFIX_PATH="$installPrefix" \ - $BUILD_CUDA $BUILD_ARROW $BUILD_FPGAOPENCL $BUILD_DEBUG + $BUILD_CUDA $BUILD_FPGAOPENCL $BUILD_DEBUG cmake --build "$daphneBuildDir" --target "$target" diff --git a/doc/DaphneDSLBuiltins.md b/doc/DaphneDSLBuiltins.md index a2307ac5a..8cea35c1e 100644 --- a/doc/DaphneDSLBuiltins.md +++ b/doc/DaphneDSLBuiltins.md @@ -392,7 +392,7 @@ The format is determined by the specified file name extension. Currently, the following formats are supported: - ".csv": comma-separated values - ".mtx": matrix market -- ".parquet": Parquet (requires DAPHNE to be built with `--arrow`) +- ".parquet": Apache Parquet format - ".dbdf": [DAPHNE's binary data format](/doc/BinaryFormat.md) For both reading and writing, file names can be specified as absolute or relative paths. diff --git a/doc/GettingStarted.md b/doc/GettingStarted.md index 5362b1f77..6ffe0787e 100644 --- a/doc/GettingStarted.md +++ b/doc/GettingStarted.md @@ -51,7 +51,6 @@ Newer versions should work as well, older versions might work as well. | java (e.g. 
openjdk) | 11 (1.7 should be fine) | | | gfortran | 9.3.0 | | | uuid-dev | | | -| libboost-dev | 1.71.0.0 | Only required when building with support for Arrow (`--arrow`) | | wget | | Used to fetch additional dependencies and other artefacts | | *** | *** | *** | | CUDA SDK | 11.7.1 | Optional for CUDA ops | diff --git a/doc/development/BuildingDaphne.md b/doc/development/BuildingDaphne.md index d5e5ef7a3..2a0d982f8 100644 --- a/doc/development/BuildingDaphne.md +++ b/doc/development/BuildingDaphne.md @@ -107,7 +107,6 @@ All possible options for the build script: | --debug | Compile the daphne binary with debug symbols | | --oneapi | Compile with support for accelerated operations using the OneAPI SDK | | --fpgaopencl | Compile with support for FPGA operations using the Intel FPGA SDK or OneAPI+FPGA Add-On | -| --arrow | Compile with support for Apache Arrow | ## 2. Extension ### Overview over the build script diff --git a/pack.sh b/pack.sh index 7b917e787..c080bac28 100755 --- a/pack.sh +++ b/pack.sh @@ -20,7 +20,7 @@ set -e function exit_with_usage { cat << EOF usage: pack.sh --version VERSION --feature FEATURE ---feature FEATURE......a feature flag like --cuda, --arrow, etc (omit or "none" for plain Daphne) +--feature FEATURE......a feature flag like --cuda, etc (omit or "none" for plain Daphne) EOF exit 1 } @@ -66,7 +66,6 @@ fi # shellcheck disable=SC2254 case "$FEATURE" in - arrow) ;& cuda) ;& debug) ;& fpgaopencl) diff --git a/release.sh b/release.sh index 6d148149d..6d4190aa8 100755 --- a/release.sh +++ b/release.sh @@ -26,7 +26,7 @@ usage: $0 --version VERSION --githash GIT_HASH [ --gpgkey GPG_KEY ] [ --artifact --artifact: If supplied, building the release artifact will be skipped and the script will only perform checksumming and optional signing. 
---feature FEATURE......a feature flag like --cuda, --arrow, etc (omit or "none" for plain Daphne) +--feature FEATURE......a feature flag like --cuda, etc (omit or "none" for plain Daphne) EOF exit 1 } diff --git a/src/parser/config/ConfigParser.cpp b/src/parser/config/ConfigParser.cpp index 120751b54..e4026e53f 100644 --- a/src/parser/config/ConfigParser.cpp +++ b/src/parser/config/ConfigParser.cpp @@ -30,7 +30,7 @@ bool ConfigParser::fileExists(const std::string& filename) { void ConfigParser::readUserConfig(const std::string& filename, DaphneUserConfig& config) { std::ifstream ifs(filename); - nlohmann::basic_json jf = nlohmann::json::parse(ifs); + auto jf = nlohmann::json::parse(ifs); //try { checkAnyUnexpectedKeys(jf, filename); // raise an error if the config JSON file contains any unexpected keys diff --git a/src/runtime/distributed/worker/CMakeLists.txt b/src/runtime/distributed/worker/CMakeLists.txt index 099c99a68..196cefc48 100644 --- a/src/runtime/distributed/worker/CMakeLists.txt +++ b/src/runtime/distributed/worker/CMakeLists.txt @@ -40,6 +40,8 @@ set(LIBS CallData Proto DaphneMetaDataParser + Arrow::arrow_shared + Parquet::parquet_shared ) add_library(WorkerImpl ${SOURCES}) diff --git a/src/runtime/local/io/ReadParquet.h b/src/runtime/local/io/ReadParquet.h index bc9836e21..5f9dce8f4 100644 --- a/src/runtime/local/io/ReadParquet.h +++ b/src/runtime/local/io/ReadParquet.h @@ -15,7 +15,6 @@ */ #pragma once -#ifdef USE_ARROW #include #include @@ -81,23 +80,22 @@ void readParquet(DTRes *&res, const char *filename, size_t numRows, size_t numCo // **************************************************************************** inline struct File *arrowToCsv(const char *filename){ - arrow::Status st; arrow::MemoryPool* pool = arrow::default_memory_pool(); arrow::fs::LocalFileSystem file_system; std::shared_ptr input = file_system.OpenInputFile(filename).ValueOrDie(); std::unique_ptr arrow_reader; - st = parquet::arrow::OpenFile(input, pool, &arrow_reader); - if 
(!st.ok()) { - // TODO: Handle error instantiating file reader... - return NULL; - } + if(!(parquet::arrow::OpenFile(input, pool, &arrow_reader).ok())) + throw std::runtime_error("Could not open Parquet file"); std::shared_ptr table; - st = arrow_reader->ReadTable(&table); + if(!(arrow_reader->ReadTable(&table)).ok()) + throw std::runtime_error("Could not read Parquet table"); auto output = arrow::io::BufferOutputStream::Create().ValueOrDie(); - arrow::csv::WriteCSV(*table, arrow::csv::WriteOptions::Defaults(), output.get()); + if(!(arrow::csv::WriteCSV(*table, arrow::csv::WriteOptions::Defaults(), output.get())).ok()) + throw std::runtime_error("Could not write from Parquet to CSV format"); + auto finishResult = output->Finish(); auto csv = finishResult.ValueOrDie()->ToString(); @@ -148,5 +146,3 @@ template struct ReadParquet> { closeFile(file); } }; - -#endif diff --git a/src/runtime/local/kernels/CMakeLists.txt b/src/runtime/local/kernels/CMakeLists.txt index 555dfc073..09c31683a 100644 --- a/src/runtime/local/kernels/CMakeLists.txt +++ b/src/runtime/local/kernels/CMakeLists.txt @@ -94,5 +94,6 @@ else() list(APPEND LIBS LLVMSupport) endif() -list(APPEND LIBS DaphneMetaDataParser MLIRDaphne MLIRDaphneTransforms) +list(APPEND LIBS Arrow::arrow_shared Parquet::parquet_shared DaphneMetaDataParser MLIRDaphne MLIRDaphneTransforms) + target_link_libraries(AllKernels PUBLIC ${LIBS}) diff --git a/src/runtime/local/kernels/Read.h b/src/runtime/local/kernels/Read.h index 6e9508dbc..f092d3377 100644 --- a/src/runtime/local/kernels/Read.h +++ b/src/runtime/local/kernels/Read.h @@ -91,15 +91,11 @@ struct Read> { case 1: readMM(res, filename); break; -#ifdef USE_ARROW case 2: if(res == nullptr) - res = DataObjectFactory::create>( - fmd.numRows, fmd.numCols, false - ); + res = DataObjectFactory::create>(fmd.numRows, fmd.numCols, false); readParquet(res, filename, fmd.numRows, fmd.numCols); break; -#endif case 3: readDaphne(res, filename); break; @@ -135,15 +131,11 @@ struct 
Read> { case 1: readMM(res, filename); break; -#ifdef USE_ARROW case 2: if(res == nullptr) - res = DataObjectFactory::create>( - fmd.numRows, fmd.numCols, fmd.numNonZeros, false - ); + res = DataObjectFactory::create>(fmd.numRows, fmd.numCols, fmd.numNonZeros, false); readParquet(res, filename,fmd.numRows, fmd.numCols,fmd.numNonZeros, false); break; -#endif case 3: readDaphne(res, filename); break; diff --git a/test.sh b/test.sh index c9f44c24c..e95ac110b 100755 --- a/test.sh +++ b/test.sh @@ -28,7 +28,6 @@ set -e catch2_options="" BUILD_CUDA="" -BUILD_ARROW="" BUILD_FPGAOPENCL="" BUILD_DEBUG="" @@ -40,10 +39,6 @@ while [[ $# -gt 0 ]]; do echo using CUDA export BUILD_CUDA="--cuda" ;; - --arrow) - echo using ARROW - BUILD_ARROW="--arrow" - ;; --fpgaopencl) echo using FPGAOPENCL export BUILD_FPGAOPENCL="--fpgaopencl" @@ -59,7 +54,7 @@ while [[ $# -gt 0 ]]; do done # Build tests. -./build.sh $BUILD_CUDA $BUILD_ARROW $BUILD_FPGAOPENCL $BUILD_DEBUG --target run_tests +./build.sh $BUILD_CUDA $BUILD_FPGAOPENCL $BUILD_DEBUG --target run_tests # Preparations for running DaphneLib (Python API) tests. export PYTHONPATH="$PYTHONPATH:$PWD/src/" diff --git a/test/runtime/local/io/ReadParquetTest.cpp b/test/runtime/local/io/ReadParquetTest.cpp index 397c42c8b..1d369749d 100644 --- a/test/runtime/local/io/ReadParquetTest.cpp +++ b/test/runtime/local/io/ReadParquetTest.cpp @@ -14,8 +14,6 @@ * limitations under the License. 
*/ -#ifdef USE_ARROW - #include #include #include @@ -85,5 +83,3 @@ TEMPLATE_PRODUCT_TEST_CASE("ReadParquet, DenseMatrix", TAG_IO, (DenseMatrix), (d DataObjectFactory::destroy(m); } - -#endif diff --git a/thirdparty/patches/0004-arrow-git-log.patch b/thirdparty/patches/0004-arrow-git-log.patch new file mode 100644 index 000000000..d2c183678 --- /dev/null +++ b/thirdparty/patches/0004-arrow-git-log.patch @@ -0,0 +1,24 @@ +--- cpp/cmake_modules/DefineOptions.cmake 2023-01-18 14:08:12.000000000 +0100 ++++ cpp/cmake_modules/DefineOptions.cmake 2023-02-08 23:45:20.907959122 +0100 + +@@ -729,7 +729,7 @@ + # Compute default values for omitted variables + + if(NOT ARROW_GIT_ID) ++ execute_process(COMMAND "git" " -c log.showSignature=false" "log" "-n1" "--format=%H" +- execute_process(COMMAND "git" "log" "-n1" "--format=%H" + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE ARROW_GIT_ID + OUTPUT_STRIP_TRAILING_WHITESPACE) +--- python/cmake_modules/DefineOptions.cmake 2023-01-18 14:08:12.000000000 +0100 ++++ python/cmake_modules/DefineOptions.cmake 2023-02-08 23:45:20.907959122 +0100 +@@ -729,7 +729,7 @@ + # Compute default values for omitted variables + + if(NOT ARROW_GIT_ID) ++ execute_process(COMMAND "git" " -c log.showSignature=false" "log" "-n1" "--format=%H" +- execute_process(COMMAND "git" "log" "-n1" "--format=%H" + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE ARROW_GIT_ID + OUTPUT_STRIP_TRAILING_WHITESPACE) +