[DAPHNE-daphne-eu#473] Improved Apache Arrow support
Cleaned up the integration of Apache Arrow for Parquet reader support.
* Arrow is now always built from the 11.0.0 release package
* The CMake code is now free of hard-coded paths
* The requirement to install the Boost libraries can apparently be dropped: everything compiles fine without them, and we currently use only a small part of Arrow anyway

Closes daphne-eu#473
corepointer committed Feb 15, 2023
1 parent b5b24e5 commit b4b242f
Showing 15 changed files with 66 additions and 79 deletions.
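Since Arrow and Parquet are now mandatory dependencies pinned to the 11.0.0 release, a quick sanity check of which Arrow a DAPHNE binary actually links against can be done via Arrow's build-info API. This is only an illustrative sketch, not code from this commit:

```cpp
#include <arrow/config.h>
#include <iostream>

int main() {
    // Report the Arrow version this binary was linked against; with the
    // pinned 11.0.0 release package this should print "Arrow 11.0.0".
    const arrow::BuildInfo &info = arrow::GetBuildInfo();
    std::cout << "Arrow " << info.version_string << std::endl;
    return 0;
}
```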
17 changes: 3 additions & 14 deletions CMakeLists.txt
@@ -129,20 +129,9 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER)
endif()
endif()

option(USE_ARROW "Whether to activate compilation of Arrow/Parquet features" OFF)
if(USE_ARROW)
find_package(Arrow CONFIG REQUIRED
PATHS thirdparty/installed/lib/cmake/arrow
NO_DEFAULT_PATH
)
find_package(Parquet CONFIG REQUIRED
PATHS thirdparty/installed/lib/cmake/arrow
NO_DEFAULT_PATH
)
link_libraries(arrow_shared parquet_shared)
add_definitions(-DUSE_ARROW)
message(STATUS "Arrow/Parquet enabled")
endif()

find_package(Arrow REQUIRED)
find_package(Parquet REQUIRED)

option(USE_FPGAOPENCL "Whether to activate compilation of FPGA OpenCL features" OFF)
if(USE_FPGAOPENCL)
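With `find_package(Arrow REQUIRED)` and `find_package(Parquet REQUIRED)` now unconditional, and the `Arrow::arrow_shared`/`Parquet::parquet_shared` targets linked where needed (see the kernels and distributed-worker CMakeLists below), runtime code can use the Arrow C++ API without the old `USE_ARROW` guard. A minimal, self-contained sketch (illustrative only; the column names and values are made up):

```cpp
#include <arrow/api.h>
#include <iostream>
#include <memory>

int main() {
    // Build two small columns in memory.
    arrow::Int64Builder idBuilder;
    arrow::DoubleBuilder valBuilder;
    if(!idBuilder.AppendValues({1, 2, 3}).ok() || !valBuilder.AppendValues({0.5, 1.5, 2.5}).ok())
        return 1;

    std::shared_ptr<arrow::Array> idArr, valArr;
    if(!idBuilder.Finish(&idArr).ok() || !valBuilder.Finish(&valArr).ok())
        return 1;

    // Assemble them into an arrow::Table and print it.
    auto schema = arrow::schema({arrow::field("id", arrow::int64()),
                                 arrow::field("val", arrow::float64())});
    auto table = arrow::Table::Make(schema, {idArr, valArr});
    std::cout << table->ToString() << std::endl;
    return 0;
}
```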
47 changes: 21 additions & 26 deletions build.sh
@@ -46,7 +46,6 @@ function printHelp {
echo " -nf, --no-fancy Suppress all colored and animated output"
echo " -nd, --no-deps Avoid building third party dependencies at all"
echo " -y, --yes Accept all prompts"
echo " --arrow Compile with support for Arrow/Parquet files"
echo " --cuda Compile with support for CUDA ops"
echo " --debug Compile with support for debug mode"
echo " --fpgaopencl Compile with support for Intel PAC D5005 FPGA"
@@ -393,7 +392,7 @@ openBlasVersion=0.3.19
abslVersion=20211102.0
grpcVersion=1.38.0
nlohmannjsonVersion=3.10.5
arrowVersion=d9d78946607f36e25e9d812a5cc956bd00ab2bc9
arrowVersion=11.0.0

#******************************************************************************
# Set some prefixes, paths and dirs
@@ -427,7 +426,6 @@ par_clean="0"
par_acceptAll="0"
unknown_options=""
BUILD_CUDA="-DUSE_CUDA=OFF"
BUILD_ARROW="-DUSE_ARROW=OFF"
BUILD_FPGAOPENCL="-DUSE_FPGAOPENCL=OFF"
BUILD_DEBUG="-DCMAKE_BUILD_TYPE=Release"
WITH_DEPS=1
@@ -469,10 +467,6 @@ while [[ $# -gt 0 ]]; do
echo using CUDA
export BUILD_CUDA="-DUSE_CUDA=ON"
;;
--arrow)
echo using ARROW
BUILD_ARROW="-DUSE_ARROW=ON"
;;
--fpgaopencl)
echo using FPGAOPENCL
export BUILD_FPGAOPENCL="-DUSE_FPGAOPENCL=ON"
@@ -718,24 +712,25 @@ if [ $WITH_DEPS -gt 0 ]; then
#------------------------------------------------------------------------------
# Arrow / Parquet
#------------------------------------------------------------------------------
arrowDirName="arrow"
if [[ "$BUILD_ARROW" == "-DUSE_ARROW=ON" ]]; then
if ! is_dependency_downloaded "arrow_v${arrowVersion}"; then
rm -rf "${sourcePrefix:?}/${arrowDirName}"
git clone -n https://github.com/apache/arrow.git "${sourcePrefix}/${arrowDirName}"
cd "${sourcePrefix}/${arrowDirName}"
git checkout $arrowVersion
dependency_download_success "arrow_v${arrowVersion}"
fi
if ! is_dependency_installed "arrow_v${arrowVersion}"; then
cmake -G Ninja -S "${sourcePrefix}/${arrowDirName}/cpp" -B "${buildPrefix}/${arrowDirName}" \
-DCMAKE_INSTALL_PREFIX="${installPrefix}" \
-DARROW_CSV=ON -DARROW_FILESYSTEM=ON -DARROW_PARQUET=ON
cmake --build "${buildPrefix}/${arrowDirName}" --target install
dependency_install_success "arrow_v${arrowVersion}"
else
daphne_msg "No need to build Arrow again."
fi
arrowDirName="apache-arrow-$arrowVersion"
arrowArtifactFileName=$arrowDirName.tar.gz
if ! is_dependency_downloaded "arrow_v${arrowVersion}"; then
rm -rf "${sourcePrefix:?}/${arrowDirName}"
wget "https://dlcdn.apache.org/arrow/arrow-$arrowVersion/$arrowArtifactFileName" -qP "$cacheDir"
tar xzf "$cacheDir/$arrowArtifactFileName" --directory="$sourcePrefix"
daphne_msg "Applying 0004-arrow-git-log.patch"
patch -Np0 -i "$patchDir/0004-arrow-git-log.patch" -d "$sourcePrefix/$arrowDirName"
dependency_download_success "arrow_v${arrowVersion}"
fi

if ! is_dependency_installed "arrow_v${arrowVersion}"; then
cmake -G Ninja -S "${sourcePrefix}/${arrowDirName}/cpp" -B "${buildPrefix}/${arrowDirName}" \
-DCMAKE_INSTALL_PREFIX="${installPrefix}" \
-DARROW_CSV=ON -DARROW_FILESYSTEM=ON -DARROW_PARQUET=ON
cmake --build "${buildPrefix}/${arrowDirName}" --target install
dependency_install_success "arrow_v${arrowVersion}"
else
daphne_msg "No need to build Arrow again."
fi

#------------------------------------------------------------------------------
@@ -821,7 +816,7 @@ daphne_msg "Build Daphne"

cmake -S "$projectRoot" -B "$daphneBuildDir" -G Ninja -DANTLR_VERSION="$antlrVersion" \
-DCMAKE_PREFIX_PATH="$installPrefix" \
$BUILD_CUDA $BUILD_ARROW $BUILD_FPGAOPENCL $BUILD_DEBUG
$BUILD_CUDA $BUILD_FPGAOPENCL $BUILD_DEBUG

cmake --build "$daphneBuildDir" --target "$target"

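The build.sh changes above fetch the Arrow 11.0.0 source release, apply 0004-arrow-git-log.patch, and build it with `ARROW_CSV`, `ARROW_FILESYSTEM` and `ARROW_PARQUET` enabled. Those components are exactly what the Parquet reader relies on; a rough standalone sketch of that read path ("example.parquet" is a placeholder, and error handling is reduced to early returns):

```cpp
#include <arrow/api.h>
#include <arrow/filesystem/localfs.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>

#include <iostream>
#include <memory>

int main() {
    // Open a local Parquet file via the Arrow filesystem layer
    // ("example.parquet" is not a file from this commit).
    arrow::fs::LocalFileSystem fs;
    std::shared_ptr<arrow::io::RandomAccessFile> input =
            fs.OpenInputFile("example.parquet").ValueOrDie();

    // Map the Parquet file to an Arrow table and report its shape.
    std::unique_ptr<parquet::arrow::FileReader> reader;
    if(!parquet::arrow::OpenFile(input, arrow::default_memory_pool(), &reader).ok())
        return 1;

    std::shared_ptr<arrow::Table> table;
    if(!reader->ReadTable(&table).ok())
        return 1;

    std::cout << table->num_rows() << " rows, " << table->num_columns() << " columns" << std::endl;
    return 0;
}
```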
2 changes: 1 addition & 1 deletion doc/DaphneDSLBuiltins.md
@@ -392,7 +392,7 @@ The format is determined by the specified file name extension.
Currently, the following formats are supported:
- ".csv": comma-separated values
- ".mtx": matrix market
- ".parquet": Parquet (requires DAPHNE to be built with `--arrow`)
- ".parquet": Apache Parquet format
- ".dbdf": [DAPHNE's binary data format](/doc/BinaryFormat.md)

For both reading and writing, file names can be specified as absolute or relative paths.
1 change: 0 additions & 1 deletion doc/GettingStarted.md
@@ -51,7 +51,6 @@ Newer versions should work as well, older versions might work as well.
| java (e.g. openjdk) | 11 (1.7 should be fine) | |
| gfortran | 9.3.0 | |
| uuid-dev | | |
| libboost-dev | 1.71.0.0 | Only required when building with support for Arrow (`--arrow`) |
| wget | | Used to fetch additional dependencies and other artefacts |
| *** | *** | *** |
| CUDA SDK | 11.7.1 | Optional for CUDA ops |
1 change: 0 additions & 1 deletion doc/development/BuildingDaphne.md
@@ -107,7 +107,6 @@ All possible options for the build script:
| --debug | Compile the daphne binary with debug symbols |
| --oneapi | Compile with support for accelerated operations using the OneAPI SDK |
| --fpgaopencl | Compile with support for FPGA operations using the Intel FPGA SDK or OneAPI+FPGA Add-On |
| --arrow | Compile with support for Apache Arrow |

## 2. Extension
### Overview over the build script
3 changes: 1 addition & 2 deletions pack.sh
@@ -20,7 +20,7 @@ set -e
function exit_with_usage {
cat << EOF
usage: pack.sh --version VERSION --feature FEATURE
--feature FEATURE......a feature flag like --cuda, --arrow, etc (omit or "none" for plain Daphne)
--feature FEATURE......a feature flag like --cuda, etc (omit or "none" for plain Daphne)
EOF
exit 1
}
@@ -66,7 +66,6 @@ fi

# shellcheck disable=SC2254
case "$FEATURE" in
arrow) ;&
cuda) ;&
debug) ;&
fpgaopencl)
2 changes: 1 addition & 1 deletion release.sh
@@ -26,7 +26,7 @@ usage: $0 --version VERSION --githash GIT_HASH [ --gpgkey GPG_KEY ] [ --artifact
--artifact: If supplied, building the release artifact will be skipped and the script will only perform
checksumming and optional signing.
--feature FEATURE......a feature flag like --cuda, --arrow, etc (omit or "none" for plain Daphne)
--feature FEATURE......a feature flag like --cuda, etc (omit or "none" for plain Daphne)
EOF
exit 1
}
2 changes: 1 addition & 1 deletion src/parser/config/ConfigParser.cpp
@@ -30,7 +30,7 @@ bool ConfigParser::fileExists(const std::string& filename) {

void ConfigParser::readUserConfig(const std::string& filename, DaphneUserConfig& config) {
std::ifstream ifs(filename);
nlohmann::basic_json jf = nlohmann::json::parse(ifs);
auto jf = nlohmann::json::parse(ifs);

//try {
checkAnyUnexpectedKeys(jf, filename); // raise an error if the config JSON file contains any unexpected keys
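The ConfigParser change above simply lets the compiler deduce the parsed JSON type. For reference, a minimal standalone sketch of the same nlohmann::json idiom (the file name and key are placeholders, not taken from DAPHNE's config schema):

```cpp
#include <nlohmann/json.hpp>

#include <fstream>
#include <iostream>

int main() {
    // Parse a JSON config file; nlohmann::json::parse throws on malformed input.
    // "UserConfig.json" and "use_cuda" are placeholders for this sketch only.
    std::ifstream ifs("UserConfig.json");
    auto jf = nlohmann::json::parse(ifs);

    if(jf.contains("use_cuda") && jf["use_cuda"].get<bool>())
        std::cout << "CUDA enabled in config" << std::endl;
    return 0;
}
```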
2 changes: 2 additions & 0 deletions src/runtime/distributed/worker/CMakeLists.txt
@@ -40,6 +40,8 @@ set(LIBS
CallData
Proto
DaphneMetaDataParser
Arrow::arrow_shared
Parquet::parquet_shared
)

add_library(WorkerImpl ${SOURCES})
18 changes: 7 additions & 11 deletions src/runtime/local/io/ReadParquet.h
@@ -15,7 +15,6 @@
*/

#pragma once
#ifdef USE_ARROW

#include <runtime/local/datastructures/DataObjectFactory.h>
#include <runtime/local/datastructures/DenseMatrix.h>
@@ -81,23 +80,22 @@ void readParquet(DTRes *&res, const char *filename, size_t numRows, size_t numCo
// ****************************************************************************

inline struct File *arrowToCsv(const char *filename){
arrow::Status st;
arrow::MemoryPool* pool = arrow::default_memory_pool();
arrow::fs::LocalFileSystem file_system;
std::shared_ptr<arrow::io::RandomAccessFile> input = file_system.OpenInputFile(filename).ValueOrDie();

std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
st = parquet::arrow::OpenFile(input, pool, &arrow_reader);
if (!st.ok()) {
// TODO: Handle error instantiating file reader...
return NULL;
}
if(!(parquet::arrow::OpenFile(input, pool, &arrow_reader).ok()))
throw std::runtime_error("Could not open Parquet file");

std::shared_ptr<arrow::Table> table;
st = arrow_reader->ReadTable(&table);
if(!(arrow_reader->ReadTable(&table)).ok())
throw std::runtime_error("Could not read Parquet table");

auto output = arrow::io::BufferOutputStream::Create().ValueOrDie();
arrow::csv::WriteCSV(*table, arrow::csv::WriteOptions::Defaults(), output.get());
if(!(arrow::csv::WriteCSV(*table, arrow::csv::WriteOptions::Defaults(), output.get())).ok())
throw std::runtime_error("Could not write from Parquet to CSV format");

auto finishResult = output->Finish();

auto csv = finishResult.ValueOrDie()->ToString();
@@ -148,5 +146,3 @@ template <typename VT> struct ReadParquet<DenseMatrix<VT>> {
closeFile(file);
}
};

#endif
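arrowToCsv() now throws on Arrow errors instead of returning NULL. A small helper along these lines (not part of this commit, purely illustrative) keeps such call sites compact while preserving Arrow's own error message, shown here around the same CSV bridge the reader uses:

```cpp
#include <arrow/api.h>
#include <arrow/csv/writer.h>
#include <arrow/io/api.h>

#include <memory>
#include <stdexcept>
#include <string>

// Illustrative helper (not part of this commit): turn a failed arrow::Status
// into a std::runtime_error that carries Arrow's error message.
inline void throwIfNotOk(const arrow::Status &st, const std::string &what) {
    if(!st.ok())
        throw std::runtime_error(what + ": " + st.ToString());
}

// Convert an Arrow table to CSV text using the same Arrow CSV writer as
// arrowToCsv() above.
inline std::string tableToCsv(const std::shared_ptr<arrow::Table> &table) {
    auto output = arrow::io::BufferOutputStream::Create().ValueOrDie();
    throwIfNotOk(arrow::csv::WriteCSV(*table, arrow::csv::WriteOptions::Defaults(), output.get()),
                 "Could not write from Parquet to CSV format");
    return output->Finish().ValueOrDie()->ToString();
}
```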
3 changes: 2 additions & 1 deletion src/runtime/local/kernels/CMakeLists.txt
@@ -94,5 +94,6 @@ else()
list(APPEND LIBS LLVMSupport)
endif()

list(APPEND LIBS DaphneMetaDataParser MLIRDaphne MLIRDaphneTransforms)
list(APPEND LIBS Arrow::arrow_shared Parquet::parquet_shared DaphneMetaDataParser MLIRDaphne MLIRDaphneTransforms)

target_link_libraries(AllKernels PUBLIC ${LIBS})
12 changes: 2 additions & 10 deletions src/runtime/local/kernels/Read.h
@@ -91,15 +91,11 @@ struct Read<DenseMatrix<VT>> {
case 1:
readMM(res, filename);
break;
#ifdef USE_ARROW
case 2:
if(res == nullptr)
res = DataObjectFactory::create<DenseMatrix<VT>>(
fmd.numRows, fmd.numCols, false
);
res = DataObjectFactory::create<DenseMatrix<VT>>(fmd.numRows, fmd.numCols, false);
readParquet(res, filename, fmd.numRows, fmd.numCols);
break;
#endif
case 3:
readDaphne(res, filename);
break;
@@ -135,15 +131,11 @@ struct Read<CSRMatrix<VT>> {
case 1:
readMM(res, filename);
break;
#ifdef USE_ARROW
case 2:
if(res == nullptr)
res = DataObjectFactory::create<CSRMatrix<VT>>(
fmd.numRows, fmd.numCols, fmd.numNonZeros, false
);
res = DataObjectFactory::create<CSRMatrix<VT>>(fmd.numRows, fmd.numCols, fmd.numNonZeros, false);
readParquet(res, filename,fmd.numRows, fmd.numCols,fmd.numNonZeros, false);
break;
#endif
case 3:
readDaphne(res, filename);
break;
7 changes: 1 addition & 6 deletions test.sh
@@ -28,7 +28,6 @@ set -e

catch2_options=""
BUILD_CUDA=""
BUILD_ARROW=""
BUILD_FPGAOPENCL=""
BUILD_DEBUG=""

@@ -40,10 +39,6 @@ while [[ $# -gt 0 ]]; do
echo using CUDA
export BUILD_CUDA="--cuda"
;;
--arrow)
echo using ARROW
BUILD_ARROW="--arrow"
;;
--fpgaopencl)
echo using FPGAOPENCL
export BUILD_FPGAOPENCL="--fpgaopencl"
@@ -59,7 +54,7 @@ while [[ $# -gt 0 ]]; do
done

# Build tests.
./build.sh $BUILD_CUDA $BUILD_ARROW $BUILD_FPGAOPENCL $BUILD_DEBUG --target run_tests
./build.sh $BUILD_CUDA $BUILD_FPGAOPENCL $BUILD_DEBUG --target run_tests

# Preparations for running DaphneLib (Python API) tests.
export PYTHONPATH="$PYTHONPATH:$PWD/src/"
4 changes: 0 additions & 4 deletions test/runtime/local/io/ReadParquetTest.cpp
@@ -14,8 +14,6 @@
* limitations under the License.
*/

#ifdef USE_ARROW

#include <runtime/local/datastructures/Frame.h>
#include <runtime/local/datastructures/DenseMatrix.h>
#include <runtime/local/datastructures/CSRMatrix.h>
@@ -85,5 +83,3 @@ TEMPLATE_PRODUCT_TEST_CASE("ReadParquet, DenseMatrix", TAG_IO, (DenseMatrix), (d

DataObjectFactory::destroy(m);
}

#endif
24 changes: 24 additions & 0 deletions thirdparty/patches/0004-arrow-git-log.patch
@@ -0,0 +1,24 @@
--- cpp/cmake_modules/DefineOptions.cmake 2023-01-18 14:08:12.000000000 +0100
+++ cpp/cmake_modules/DefineOptions.cmake 2023-02-08 23:45:20.907959122 +0100

@@ -729,7 +729,7 @@
# Compute default values for omitted variables

if(NOT ARROW_GIT_ID)
+ execute_process(COMMAND "git" " -c log.showSignature=false" "log" "-n1" "--format=%H"
- execute_process(COMMAND "git" "log" "-n1" "--format=%H"
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE ARROW_GIT_ID
OUTPUT_STRIP_TRAILING_WHITESPACE)
--- python/cmake_modules/DefineOptions.cmake 2023-01-18 14:08:12.000000000 +0100
+++ python/cmake_modules/DefineOptions.cmake 2023-02-08 23:45:20.907959122 +0100
@@ -729,7 +729,7 @@
# Compute default values for omitted variables

if(NOT ARROW_GIT_ID)
+ execute_process(COMMAND "git" " -c log.showSignature=false" "log" "-n1" "--format=%H"
- execute_process(COMMAND "git" "log" "-n1" "--format=%H"
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE ARROW_GIT_ID
OUTPUT_STRIP_TRAILING_WHITESPACE)
