Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve compile times #829

Merged
merged 1 commit into from
Oct 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ set(CMAKE_BUILD_WITH_INSTALL_NAME_DIR ON)
# Project-wide C++ defaults. The standard is a cache entry, so it can be
# overridden at configure time (e.g. -DCMAKE_CXX_STANDARD=23).
set(CMAKE_CXX_STANDARD 20 CACHE STRING "C++ standard to conform to")
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
# Initial value for the OPTIMIZE_DEPENDENCIES property of targets created
# after this point.
set(CMAKE_OPTIMIZE_DEPENDENCIES 1)

# Append per-configuration compiler flags. The variable name and its value
# must be two separate arguments: written as NAME="value" (no space), set()
# receives a single argument and the flags are never stored in the variable.
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
Expand Down
44 changes: 26 additions & 18 deletions src/runtime/local/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,21 @@
# C++ defaults for the kernel libraries defined in this directory.
set(CMAKE_CXX_STANDARD 20 CACHE STRING "C++ standard to conform to")
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
# Locate a Python 3 interpreter; configuration fails if none is found.
# Python3_EXECUTABLE is used below to run genKernelInst.py.
find_package(Python3 REQUIRED COMPONENTS Interpreter)

# The library of pre-compiled CUDA kernels
if(USE_CUDA AND CMAKE_CUDA_COMPILER)
set(CMAKE_CUDA_STANDARD 20)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
execute_process(
COMMAND
${Python3_EXECUTABLE} genKernelInst.py kernels.json
${PROJECT_BINARY_DIR}/src/runtime/local/kernels/CUDAkernels
${PROJECT_SOURCE_DIR}/lib/CUDAcatalog.json CUDA
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/)

add_custom_command(
OUTPUT ${PROJECT_BINARY_DIR}/src/runtime/local/kernels/CUDAkernels.cpp ${PROJECT_SOURCE_DIR}/lib/CUDAcatalog.json
COMMAND python3 ARGS genKernelInst.py kernels.json
${PROJECT_BINARY_DIR}/src/runtime/local/kernels/CUDAkernels.cpp ${PROJECT_SOURCE_DIR}/lib/CUDAcatalog.json CUDA
MAIN_DEPENDENCY ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/kernels.json
DEPENDS ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/genKernelInst.py
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/
)
file(GLOB CUDA_CODEGEN_CPP_FILES CONFIGURE_DEPENDS
"${PROJECT_BINARY_DIR}/src/runtime/local/kernels/CUDA*.cpp")

set(PREFIX ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/CUDA)
set(CUDAKernels_SRC
Expand Down Expand Up @@ -59,7 +60,7 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER)
${PREFIX}/Solve.cpp
${PREFIX}/Syrk.cu
${PREFIX}/Transpose.cpp
${PROJECT_BINARY_DIR}/src/runtime/local/kernels/CUDAkernels.cpp
${CUDA_CODEGEN_CPP_FILES}
${PROJECT_SOURCE_DIR}/src/runtime/local/vectorized/TasksCUDA.cpp
${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/VectorizedPipeline.h
${PROJECT_SOURCE_DIR}/src/runtime/local/vectorized/WorkerGPU.h
Expand All @@ -77,24 +78,28 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER)
set_target_properties(CUDAKernels PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/lib)
endif()

add_custom_command(
OUTPUT ${PROJECT_BINARY_DIR}/src/runtime/local/kernels/kernels.cpp ${PROJECT_SOURCE_DIR}/lib/catalog.json
COMMAND python3 ARGS genKernelInst.py kernels.json ${PROJECT_BINARY_DIR}/src/runtime/local/kernels/kernels.cpp ${PROJECT_SOURCE_DIR}/lib/catalog.json CPP
MAIN_DEPENDENCY ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/kernels.json
DEPENDS ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/genKernelInst.py
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/
)
execute_process(
COMMAND
${Python3_EXECUTABLE} genKernelInst.py kernels.json
${PROJECT_BINARY_DIR}/src/runtime/local/kernels/kernels
${PROJECT_SOURCE_DIR}/lib/catalog.json CPP
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/)

file(GLOB CODEGEN_CPP_FILES CONFIGURE_DEPENDS
"${PROJECT_BINARY_DIR}/src/runtime/local/kernels/*.cpp")
# message("CODEGEN_CPP_FILES: ${CODEGEN_CPP_FILES}")

list(APPEND LIBS DataStructures IO BLAS::BLAS MLIRParser)

set(PREFIX ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/)
set(HEADERS_cpp_kernels
${PREFIX}/MatMul.h
)

set(SOURCES_cpp_kernels
${PREFIX}/MatMul.cpp
${PROJECT_SOURCE_DIR}/src/runtime/local/instrumentation/KernelInstrumentation.cpp
${PROJECT_BINARY_DIR}/src/runtime/local/kernels/kernels.cpp
${CODEGEN_CPP_FILES}
${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/CreateDaphneContext.cpp
${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/Pooling.cpp
${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/VectorizedPipeline.h
Expand All @@ -104,7 +109,9 @@ set(SOURCES_cpp_kernels
${PROJECT_SOURCE_DIR}/src/runtime/local/vectorized/WorkerCPU.h
)
# The library of pre-compiled kernels. Will be linked into the JIT-compiled user program.
add_library(AllKernels SHARED ${SOURCES_cpp_kernels} ${HEADERS_cpp_kernels})
add_library(KernelObjLib OBJECT ${SOURCES_cpp_kernels} ${HEADERS_cpp_kernels})
set_target_properties(KernelObjLib PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/lib)
add_library(AllKernels SHARED $<TARGET_OBJECTS:KernelObjLib>)
set_target_properties(AllKernels PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/lib)

if(USE_CUDA AND CMAKE_CUDA_COMPILER)
Expand All @@ -130,4 +137,5 @@ if(USE_HDFS)
find_library(LIBHDFS3 NAMES libhdfs3.so HINTS ${PROJECT_BINARY_DIR}/installed/lib REQUIRED)
endif()

target_link_libraries(KernelObjLib PUBLIC ${LIBS} ${MPI_LIBRARIES} ${PAPI_LIB} ${HWLOC_LIB} ${LIBHDFS3})
target_link_libraries(AllKernels PUBLIC ${LIBS} ${MPI_LIBRARIES} ${PAPI_LIB} ${HWLOC_LIB} ${LIBHDFS3})
2 changes: 1 addition & 1 deletion src/runtime/local/kernels/Cartesian.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ void cartesianSet(ValueTypeCode vtcType, Frame *&res, const Frame *arg, const in
}
}

void cartesian(Frame *&res, const Frame *lhs, const Frame *rhs, DCTX(ctx)) {
inline void cartesian(Frame *&res, const Frame *lhs, const Frame *rhs, DCTX(ctx)) {
const size_t numRowRhs = rhs->getNumRows();
const size_t numRowLhs = lhs->getNumRows();
const size_t totalRows = numRowRhs * numRowLhs;
Expand Down
6 changes: 3 additions & 3 deletions src/runtime/local/kernels/CreateFrame.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
// Convenience function
// ****************************************************************************

void createFrame(Frame *&res, Structure **colMats, size_t numColMats, const char **labels, size_t numLabels,
DCTX(ctx)) {
inline void createFrame(Frame *&res, Structure **colMats, size_t numColMats, const char **labels, size_t numLabels,
DCTX(ctx)) {
std::vector<Structure *> colMatsVec;
for (size_t c = 0; c < numColMats; c++)
colMatsVec.push_back(colMats[c]);
Expand All @@ -46,4 +46,4 @@ void createFrame(Frame *&res, Structure **colMats, size_t numColMats, const char
delete[] labelsStr;
}

#endif // SRC_RUNTIME_LOCAL_KERNELS_CREATEFRAME_H
#endif // SRC_RUNTIME_LOCAL_KERNELS_CREATEFRAME_H
2 changes: 1 addition & 1 deletion src/runtime/local/kernels/Group.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ template <typename VTRes, typename VTArg> struct ColumnGroupAgg {
}
};

std::string myStringifyGroupEnum(mlir::daphne::GroupEnum val) {
inline std::string myStringifyGroupEnum(mlir::daphne::GroupEnum val) {
using mlir::daphne::GroupEnum;
switch (val) {
case GroupEnum::COUNT:
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/local/kernels/InnerJoin.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ bool innerJoinProbeIf(
// Convenience function
// ****************************************************************************

void innerJoin(
inline void innerJoin(
// results
Frame *&res,
// input frames
Expand Down
4 changes: 2 additions & 2 deletions src/runtime/local/kernels/Quantize.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ template <class DTRes, class DTArg> void quantize(DTRes *&res, const DTArg *arg,
Quantize<DTRes, DTArg>::apply(res, arg, min, max, ctx);
}

void calc_quantization_params(float min, float max, float &scale, uint8_t &quantized_zero) {
inline void calc_quantization_params(float min, float max, float &scale, uint8_t &quantized_zero) {
// Make sure that 0 is included
min = (min > 0) ? 0 : min;
max = (max < 0) ? 0 : max;
Expand All @@ -63,7 +63,7 @@ void calc_quantization_params(float min, float max, float &scale, uint8_t &quant
}
}

uint8_t quantize_value(float a, float scale, uint8_t quantized_zero) {
inline uint8_t quantize_value(float a, float scale, uint8_t quantized_zero) {
// Map
float value = static_cast<float>(quantized_zero) + a / scale;

Expand Down
4 changes: 2 additions & 2 deletions src/runtime/local/kernels/SetColLabels.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
// Convenience function
// ****************************************************************************

void setColLabels(Frame *&res, const Frame *arg, const char **labels, size_t numLabels, DCTX(ctx)) {
inline void setColLabels(Frame *&res, const Frame *arg, const char **labels, size_t numLabels, DCTX(ctx)) {
const size_t numCols = arg->getNumCols();
if (numLabels != numCols)
throw std::runtime_error("the number of given labels does not match "
Expand All @@ -51,4 +51,4 @@ void setColLabels(Frame *&res, const Frame *arg, const char **labels, size_t num
delete[] labelsStr;
}

#endif // SRC_RUNTIME_LOCAL_KERNELS_SETCOLLABELS_H
#endif // SRC_RUNTIME_LOCAL_KERNELS_SETCOLLABELS_H
4 changes: 2 additions & 2 deletions src/runtime/local/kernels/SetColLabelsPrefix.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
// Convenience function
// ****************************************************************************

void setColLabelsPrefix(Frame *&res, const Frame *arg, const char *prefix, DCTX(ctx)) {
inline void setColLabelsPrefix(Frame *&res, const Frame *arg, const char *prefix, DCTX(ctx)) {
const size_t numCols = arg->getNumCols();
const std::string *oldLabels = arg->getLabels();
std::string *newLabels = new std::string[numCols];
Expand All @@ -50,4 +50,4 @@ void setColLabelsPrefix(Frame *&res, const Frame *arg, const char *prefix, DCTX(
delete[] newLabels;
}

#endif // SRC_RUNTIME_LOCAL_KERNELS_SETCOLLABELSPREFIX_H
#endif // SRC_RUNTIME_LOCAL_KERNELS_SETCOLLABELSPREFIX_H
4 changes: 3 additions & 1 deletion src/runtime/local/kernels/Stop.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,6 @@

#include <runtime/local/context/DaphneContext.h>

void stop(const char *message, DCTX(ctx)) { throw std::runtime_error(std::string("system stopped: ") + message); }
inline void stop(const char *message, DCTX(ctx)) {
throw std::runtime_error(std::string("system stopped: ") + message);
}
4 changes: 2 additions & 2 deletions src/runtime/local/kernels/ThetaJoin.h
Original file line number Diff line number Diff line change
Expand Up @@ -365,8 +365,8 @@ template <> class ThetaJoin<Frame, Frame, Frame> {
}
};

void thetaJoin(Frame *&res, const Frame *lhs, const Frame *rhs, const char **lhsOn, size_t numLhsOn, const char **rhsOn,
size_t numRhsOn, CompareOperation *cmp, size_t numCmp, DCTX(ctx)) {
inline void thetaJoin(Frame *&res, const Frame *lhs, const Frame *rhs, const char **lhsOn, size_t numLhsOn,
const char **rhsOn, size_t numRhsOn, CompareOperation *cmp, size_t numCmp, DCTX(ctx)) {
ThetaJoin<Frame, Frame, Frame>::apply(res, lhs, rhs, lhsOn, numLhsOn, rhsOn, numRhsOn, cmp, numCmp);
}
#endif // SRC_RUNTIME_LOCAL_KERNELS_THETAJOIN_H
Loading