Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
split up generated kernels.cpp #829
Browse files Browse the repository at this point in the history
This patch splits up kernels.cpp and CUDAkernels.cpp into multiple
translation units, one per kernel. This improves compilation times; see
PR #829 for details.

closes #516.
philipportner committed Oct 5, 2024
1 parent 8d614f0 commit 529f0cb
Showing 12 changed files with 131 additions and 94 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -32,6 +32,7 @@ set(CMAKE_BUILD_WITH_INSTALL_NAME_DIR ON)
# Project-wide C++ defaults: C++20 (user-overridable through the cache),
# no silent fallback to an older standard, and position-independent code.
set(CMAKE_CXX_STANDARD 20 CACHE STRING "C++ standard to conform to")
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
# Default value for the OPTIMIZE_DEPENDENCIES property of targets created below.
set(CMAKE_OPTIMIZE_DEPENDENCIES 1)

# Per-configuration compiler flags.
# The variable name and its value must be separate arguments: the former
# `set(NAME="value")` spelling is parsed as a single argument, so it unset a
# variable with a nonsense name and never touched CMAKE_CXX_FLAGS_DEBUG.
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
44 changes: 26 additions & 18 deletions src/runtime/local/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -18,20 +18,21 @@
set(CMAKE_CXX_STANDARD 20 CACHE STRING "C++ standard to conform to")
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
find_package(Python3 REQUIRED COMPONENTS Interpreter)

# The library of pre-compiled CUDA kernels
if(USE_CUDA AND CMAKE_CUDA_COMPILER)
set(CMAKE_CUDA_STANDARD 20)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
# Generate the CUDA kernel instantiations at configure time. The generator is
# given the output prefix ".../CUDAkernels" and the CUDA catalog path; the
# resulting CUDA*.cpp files are picked up by the glob that follows.
execute_process(
    COMMAND
        ${Python3_EXECUTABLE} genKernelInst.py kernels.json
        ${PROJECT_BINARY_DIR}/src/runtime/local/kernels/CUDAkernels
        ${PROJECT_SOURCE_DIR}/lib/CUDAcatalog.json CUDA
    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/
    RESULT_VARIABLE CUDA_KERNEL_CODEGEN_RESULT)
# execute_process() does not stop the configure step on failure by itself;
# without this check a broken generator run would only show up later as
# missing or stale kernel sources.
if(NOT CUDA_KERNEL_CODEGEN_RESULT EQUAL 0)
    message(FATAL_ERROR
        "genKernelInst.py failed to generate the CUDA kernels (result: ${CUDA_KERNEL_CODEGEN_RESULT})")
endif()
# Re-run the configure step (and with it the generator) whenever the kernel
# description or the generator script changes.
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS
    ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/kernels.json
    ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/genKernelInst.py)

add_custom_command(
OUTPUT ${PROJECT_BINARY_DIR}/src/runtime/local/kernels/CUDAkernels.cpp ${PROJECT_SOURCE_DIR}/lib/CUDAcatalog.json
COMMAND python3 ARGS genKernelInst.py kernels.json
${PROJECT_BINARY_DIR}/src/runtime/local/kernels/CUDAkernels.cpp ${PROJECT_SOURCE_DIR}/lib/CUDAcatalog.json CUDA
MAIN_DEPENDENCY ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/kernels.json
DEPENDS ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/genKernelInst.py
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/
)
file(GLOB CUDA_CODEGEN_CPP_FILES CONFIGURE_DEPENDS
"${PROJECT_BINARY_DIR}/src/runtime/local/kernels/CUDA*.cpp")

set(PREFIX ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/CUDA)
set(CUDAKernels_SRC
@@ -59,7 +60,7 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER)
${PREFIX}/Solve.cpp
${PREFIX}/Syrk.cu
${PREFIX}/Transpose.cpp
${PROJECT_BINARY_DIR}/src/runtime/local/kernels/CUDAkernels.cpp
${CUDA_CODEGEN_CPP_FILES}
${PROJECT_SOURCE_DIR}/src/runtime/local/vectorized/TasksCUDA.cpp
${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/VectorizedPipeline.h
${PROJECT_SOURCE_DIR}/src/runtime/local/vectorized/WorkerGPU.h
@@ -77,24 +78,28 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER)
set_target_properties(CUDAKernels PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/lib)
endif()

add_custom_command(
OUTPUT ${PROJECT_BINARY_DIR}/src/runtime/local/kernels/kernels.cpp ${PROJECT_SOURCE_DIR}/lib/catalog.json
COMMAND python3 ARGS genKernelInst.py kernels.json ${PROJECT_BINARY_DIR}/src/runtime/local/kernels/kernels.cpp ${PROJECT_SOURCE_DIR}/lib/catalog.json CPP
MAIN_DEPENDENCY ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/kernels.json
DEPENDS ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/genKernelInst.py
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/
)
# Generate the CPU kernel instantiations at configure time. The generator is
# given the output prefix ".../kernels" and the kernel catalog path; the
# resulting .cpp files are picked up by the glob that follows.
execute_process(
    COMMAND
        ${Python3_EXECUTABLE} genKernelInst.py kernels.json
        ${PROJECT_BINARY_DIR}/src/runtime/local/kernels/kernels
        ${PROJECT_SOURCE_DIR}/lib/catalog.json CPP
    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/
    RESULT_VARIABLE KERNEL_CODEGEN_RESULT)
# execute_process() does not stop the configure step on failure by itself;
# without this check a broken generator run would only show up later as
# missing or stale kernel sources.
if(NOT KERNEL_CODEGEN_RESULT EQUAL 0)
    message(FATAL_ERROR
        "genKernelInst.py failed to generate the kernels (result: ${KERNEL_CODEGEN_RESULT})")
endif()
# Re-run the configure step (and with it the generator) whenever the kernel
# description or the generator script changes.
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS
    ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/kernels.json
    ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/genKernelInst.py)

# Collect the generated CPU kernel translation units. CONFIGURE_DEPENDS makes
# the build re-check the glob so newly generated files are noticed.
file(GLOB CODEGEN_CPP_FILES CONFIGURE_DEPENDS
    "${PROJECT_BINARY_DIR}/src/runtime/local/kernels/*.cpp")
# The CUDA generator run writes its CUDA*.cpp files into this same directory;
# those belong to the CUDA kernel library only, so keep them out of this list.
if(CODEGEN_CPP_FILES)
    list(FILTER CODEGEN_CPP_FILES EXCLUDE REGEX "/CUDA[^/]*\\.cpp$")
endif()

list(APPEND LIBS DataStructures IO BLAS::BLAS MLIRParser)

set(PREFIX ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/)
set(HEADERS_cpp_kernels
${PREFIX}/MatMul.h
)

set(SOURCES_cpp_kernels
${PREFIX}/MatMul.cpp
${PROJECT_SOURCE_DIR}/src/runtime/local/instrumentation/KernelInstrumentation.cpp
${PROJECT_BINARY_DIR}/src/runtime/local/kernels/kernels.cpp
${CODEGEN_CPP_FILES}
${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/CreateDaphneContext.cpp
${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/Pooling.cpp
${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/VectorizedPipeline.h
@@ -104,7 +109,9 @@ set(SOURCES_cpp_kernels
${PROJECT_SOURCE_DIR}/src/runtime/local/vectorized/WorkerCPU.h
)
# The library of pre-compiled kernels. Will be linked into the JIT-compiled user program.
add_library(AllKernels SHARED ${SOURCES_cpp_kernels} ${HEADERS_cpp_kernels})
# Compile all kernel sources once into an object library. An OBJECT library
# produces no library file, so it needs no LIBRARY_OUTPUT_DIRECTORY.
add_library(KernelObjLib OBJECT ${SOURCES_cpp_kernels} ${HEADERS_cpp_kernels})
# The shared kernel library is assembled from those objects and placed in
# the project's lib/ directory.
add_library(AllKernels SHARED $<TARGET_OBJECTS:KernelObjLib>)
set_target_properties(AllKernels PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/lib)

if(USE_CUDA AND CMAKE_CUDA_COMPILER)
@@ -130,4 +137,5 @@ if(USE_HDFS)
find_library(LIBHDFS3 NAMES libhdfs3.so HINTS ${PROJECT_BINARY_DIR}/installed/lib REQUIRED)
endif()

# Both targets get the same dependency list: KernelObjLib is where the kernel
# sources are compiled, AllKernels is the shared library built from its objects.
target_link_libraries(KernelObjLib PUBLIC ${LIBS} ${MPI_LIBRARIES} ${PAPI_LIB} ${HWLOC_LIB} ${LIBHDFS3})
target_link_libraries(AllKernels PUBLIC ${LIBS} ${MPI_LIBRARIES} ${PAPI_LIB} ${HWLOC_LIB} ${LIBHDFS3})
2 changes: 1 addition & 1 deletion src/runtime/local/kernels/Cartesian.h
Original file line number Diff line number Diff line change
@@ -30,7 +30,7 @@ void cartesianSet(ValueTypeCode vtcType, Frame *&res, const Frame *arg, const in
}
}

void cartesian(Frame *&res, const Frame *lhs, const Frame *rhs, DCTX(ctx)) {
inline void cartesian(Frame *&res, const Frame *lhs, const Frame *rhs, DCTX(ctx)) {
const size_t numRowRhs = rhs->getNumRows();
const size_t numRowLhs = lhs->getNumRows();
const size_t totalRows = numRowRhs * numRowLhs;
6 changes: 3 additions & 3 deletions src/runtime/local/kernels/CreateFrame.h
Original file line number Diff line number Diff line change
@@ -30,8 +30,8 @@
// Convenience function
// ****************************************************************************

void createFrame(Frame *&res, Structure **colMats, size_t numColMats, const char **labels, size_t numLabels,
DCTX(ctx)) {
inline void createFrame(Frame *&res, Structure **colMats, size_t numColMats, const char **labels, size_t numLabels,
DCTX(ctx)) {
std::vector<Structure *> colMatsVec;
for (size_t c = 0; c < numColMats; c++)
colMatsVec.push_back(colMats[c]);
@@ -46,4 +46,4 @@ void createFrame(Frame *&res, Structure **colMats, size_t numColMats, const char
delete[] labelsStr;
}

#endif // SRC_RUNTIME_LOCAL_KERNELS_CREATEFRAME_H
#endif // SRC_RUNTIME_LOCAL_KERNELS_CREATEFRAME_H
2 changes: 1 addition & 1 deletion src/runtime/local/kernels/Group.h
Original file line number Diff line number Diff line change
@@ -117,7 +117,7 @@ template <typename VTRes, typename VTArg> struct ColumnGroupAgg {
}
};

std::string myStringifyGroupEnum(mlir::daphne::GroupEnum val) {
inline std::string myStringifyGroupEnum(mlir::daphne::GroupEnum val) {
using mlir::daphne::GroupEnum;
switch (val) {
case GroupEnum::COUNT:
2 changes: 1 addition & 1 deletion src/runtime/local/kernels/InnerJoin.h
Original file line number Diff line number Diff line change
@@ -73,7 +73,7 @@ bool innerJoinProbeIf(
// Convenience function
// ****************************************************************************

void innerJoin(
inline void innerJoin(
// results
Frame *&res,
// input frames
4 changes: 2 additions & 2 deletions src/runtime/local/kernels/Quantize.h
Original file line number Diff line number Diff line change
@@ -42,7 +42,7 @@ template <class DTRes, class DTArg> void quantize(DTRes *&res, const DTArg *arg,
Quantize<DTRes, DTArg>::apply(res, arg, min, max, ctx);
}

void calc_quantization_params(float min, float max, float &scale, uint8_t &quantized_zero) {
inline void calc_quantization_params(float min, float max, float &scale, uint8_t &quantized_zero) {
// Make sure that 0 is included
min = (min > 0) ? 0 : min;
max = (max < 0) ? 0 : max;
@@ -63,7 +63,7 @@ void calc_quantization_params(float min, float max, float &scale, uint8_t &quant
}
}

uint8_t quantize_value(float a, float scale, uint8_t quantized_zero) {
inline uint8_t quantize_value(float a, float scale, uint8_t quantized_zero) {
// Map
float value = static_cast<float>(quantized_zero) + a / scale;

4 changes: 2 additions & 2 deletions src/runtime/local/kernels/SetColLabels.h
Original file line number Diff line number Diff line change
@@ -30,7 +30,7 @@
// Convenience function
// ****************************************************************************

void setColLabels(Frame *&res, const Frame *arg, const char **labels, size_t numLabels, DCTX(ctx)) {
inline void setColLabels(Frame *&res, const Frame *arg, const char **labels, size_t numLabels, DCTX(ctx)) {
const size_t numCols = arg->getNumCols();
if (numLabels != numCols)
throw std::runtime_error("the number of given labels does not match "
@@ -51,4 +51,4 @@ void setColLabels(Frame *&res, const Frame *arg, const char **labels, size_t num
delete[] labelsStr;
}

#endif // SRC_RUNTIME_LOCAL_KERNELS_SETCOLLABELS_H
#endif // SRC_RUNTIME_LOCAL_KERNELS_SETCOLLABELS_H
4 changes: 2 additions & 2 deletions src/runtime/local/kernels/SetColLabelsPrefix.h
Original file line number Diff line number Diff line change
@@ -30,7 +30,7 @@
// Convenience function
// ****************************************************************************

void setColLabelsPrefix(Frame *&res, const Frame *arg, const char *prefix, DCTX(ctx)) {
inline void setColLabelsPrefix(Frame *&res, const Frame *arg, const char *prefix, DCTX(ctx)) {
const size_t numCols = arg->getNumCols();
const std::string *oldLabels = arg->getLabels();
std::string *newLabels = new std::string[numCols];
@@ -50,4 +50,4 @@ void setColLabelsPrefix(Frame *&res, const Frame *arg, const char *prefix, DCTX(
delete[] newLabels;
}

#endif // SRC_RUNTIME_LOCAL_KERNELS_SETCOLLABELSPREFIX_H
#endif // SRC_RUNTIME_LOCAL_KERNELS_SETCOLLABELSPREFIX_H
4 changes: 3 additions & 1 deletion src/runtime/local/kernels/Stop.h
Original file line number Diff line number Diff line change
@@ -18,4 +18,6 @@

#include <runtime/local/context/DaphneContext.h>

void stop(const char *message, DCTX(ctx)) { throw std::runtime_error(std::string("system stopped: ") + message); }
/**
 * @brief Stops execution by throwing a `std::runtime_error`.
 *
 * The exception text is the fixed prefix "system stopped: " followed by
 * `message`. The function never returns normally.
 *
 * @param message Text appended to the fixed prefix.
 * @param ctx Context parameter declared through the DCTX macro; not used here.
 */
inline void stop(const char *message, DCTX(ctx)) {
    std::string what("system stopped: ");
    what += message;
    throw std::runtime_error(what);
}
4 changes: 2 additions & 2 deletions src/runtime/local/kernels/ThetaJoin.h
Original file line number Diff line number Diff line change
@@ -365,8 +365,8 @@ template <> class ThetaJoin<Frame, Frame, Frame> {
}
};

void thetaJoin(Frame *&res, const Frame *lhs, const Frame *rhs, const char **lhsOn, size_t numLhsOn, const char **rhsOn,
size_t numRhsOn, CompareOperation *cmp, size_t numCmp, DCTX(ctx)) {
/**
 * @brief Convenience wrapper around `ThetaJoin<Frame, Frame, Frame>::apply`.
 *
 * Forwards every argument except `ctx` unchanged to the frame specialization.
 * Declared `inline` because it is defined in a header that is included from
 * several translation units.
 *
 * @param res Result frame, passed by reference to `apply`.
 * @param lhs Left-hand-side input frame.
 * @param rhs Right-hand-side input frame.
 * @param lhsOn Array of C strings for the left side (presumably column labels -- TODO confirm).
 * @param numLhsOn Count passed alongside `lhsOn`.
 * @param rhsOn Array of C strings for the right side (presumably column labels -- TODO confirm).
 * @param numRhsOn Count passed alongside `rhsOn`.
 * @param cmp Array of compare operations.
 * @param numCmp Count passed alongside `cmp`.
 * @param ctx Context parameter declared through the DCTX macro; not forwarded to `apply`.
 */
inline void thetaJoin(Frame *&res, const Frame *lhs, const Frame *rhs, const char **lhsOn, size_t numLhsOn,
const char **rhsOn, size_t numRhsOn, CompareOperation *cmp, size_t numCmp, DCTX(ctx)) {
ThetaJoin<Frame, Frame, Frame>::apply(res, lhs, rhs, lhsOn, numLhsOn, rhsOn, numRhsOn, cmp, numCmp);
}
#endif // SRC_RUNTIME_LOCAL_KERNELS_THETAJOIN_H
Loading

0 comments on commit 529f0cb

Please sign in to comment.