diff --git a/CMakeLists.txt b/CMakeLists.txt index 8aa705dc5..8664e3520 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -145,6 +145,19 @@ if(USE_ARROW) message(STATUS "Arrow/Parquet enabled") endif() +option(USE_FPGAOPENCL "Whether to activate compilation of FPGA OpenCL features" OFF) +if(USE_FPGAOPENCL) + if(NOT DEFINED ENV{QUARTUSDIR}) + message(SEND_ERROR "Intel(R) Quartus installation directory should be defined by QUARTUSDIR varaiable (e.g. /opt/intel/intelFPGA_pro/21.4/)") + execute_process(COMMAND sleep 10) + endif() + include_directories($ENV{QUARTUSDIR}/hld/examples_aoc/common/inc/ $ENV{QUARTUSDIR}/hld/host/include/) + message(STATUS "cmake: using FPGA") + add_definitions(-DUSE_FPGAOPENCL) +endif() + +set(CMAKE_VERBOSE_MAKEFILE ON) + # ***************************************************************************** # Project-specific include directories # ***************************************************************************** @@ -170,6 +183,10 @@ add_subdirectory(src/runtime/distributed/worker) add_subdirectory(src/runtime/local/datastructures) add_subdirectory(src/runtime/local/io) add_subdirectory(src/runtime/local/kernels) +if(USE_FPGAOPENCL) + add_subdirectory(src/runtime/local/kernels/FPGAOPENCL) +endif() add_subdirectory(src/util) + add_subdirectory(test) diff --git a/build.sh b/build.sh index 0da089f64..18114ad86 100755 --- a/build.sh +++ b/build.sh @@ -381,6 +381,7 @@ par_acceptAll="0" unknown_options="" BUILD_CUDA="-DUSE_CUDA=OFF" BUILD_ARROW="-DUSE_ARROW=OFF" +BUILD_FPGAOPENCL="-DUSE_FPGAOPENCL=OFF" BUILD_DEBUG="-DCMAKE_BUILD_TYPE=Release" while [[ $# -gt 0 ]]; do @@ -414,6 +415,10 @@ while [[ $# -gt 0 ]]; do echo using ARROW BUILD_ARROW="-DUSE_ARROW=ON" ;; + --fpgaopencl) + echo using FPGAOPENCL + export BUILD_FPGAOPENCL="-DUSE_FPGAOPENCL=ON" + ;; --debug) echo building DEBUG version export BUILD_DEBUG="-DCMAKE_BUILD_TYPE=Debug" @@ -701,6 +706,17 @@ else daphne_msg "No need to build MLIR/LLVM again." fi +if [[ $BUILD_FPGAOPENCL = *"ON"* ]]; then + FPGAOPENCL_BISTREAM_DIR="$projectRoot/src/runtime/local/kernels/FPGAOPENCL/bitstreams" + FPGAOPENCL_BISTREAM_URL="https://github.com/daphne-eu/supplemental-binaries/raw/main/fpga_bitstreams/" + if [ ! -d $FPGAOPENCL_BISTREAM_DIR ]; then + echo fetching FPGA bitstreams + mkdir -p $FPGAOPENCL_BISTREAM_DIR + cd $FPGAOPENCL_BISTREAM_DIR + wget $FPGAOPENCL_BISTREAM_URL/sgemm.aocx + cd - + fi +fi # ***************************************************************************** # Build DAPHNE target. @@ -708,7 +724,7 @@ fi daphne_msg "Build Daphne" -cmake -S "$projectRoot" -B "$daphneBuildDir" -G Ninja $BUILD_CUDA $BUILD_ARROW $BUILD_DEBUG \ +cmake -S "$projectRoot" -B "$daphneBuildDir" -G Ninja $BUILD_CUDA $BUILD_ARROW $BUILD_FPGAOPENCL $BUILD_DEBUG \ -DCMAKE_PREFIX_PATH="$installPrefix" -DANTLR_VERSION="$antlrVersion" \ -DMLIR_DIR="$buildPrefix/$llvmName/lib/cmake/mlir/" \ -DLLVM_DIR="$buildPrefix/$llvmName/lib/cmake/llvm/" diff --git a/doc/FPGAconfiguration.md b/doc/FPGAconfiguration.md new file mode 100644 index 000000000..8cd702d0d --- /dev/null +++ b/doc/FPGAconfiguration.md @@ -0,0 +1,62 @@ + + +# FPGA configuration for usage in DAPHNE + + +### System requirments + +Daphne build script for FPGA kernels support requires additional QUARTUSDIR system variable definition. +Example command is presented in fpga-build-env.sh or in the following command: + +export QUARTUSDIR=/opt/intel/intelFPGA_pro/21.4 + +To build the Daphne with the FPGA support -fpgaopencl flag has to be used: + + ./build.sh --fpgaopenc + + +To run developed or precompiled, included in Daphne repository FPGA OpenCL kernels an installedand configured FPGA device is required. +Our example kernels have been tested using Intel(R) PAC D5005 card (https://www.intel.com/content/www/us/en/products/sku/193921/intel-fpga-pac-d5005/specifications.html) + +DAPHNE contains some example linear algebra kernels developed using T2SP framework(https://github.com/IntelLabs/t2sp/blob/master/README.md). +Example precompiled FPGA kernels can be usedon DAPHNE DSL description level. +To prepare the system for the precompiled FPGA kernels some FPGA and OpenCL system variables are required. +The easiest way to set up required varables is to use the init_opencl.sh script from installed Intel(R) Quartus sowtware or from the +Intel(R) OpenCL RTE or Intel(R) OpenCL SDK packages. + +Example script usage: +source /opt/intel/intelFPGA_pro/21.4/hld/init_opencl.sh + +For additional details please look into https://www.intel.com/content/www/us/en/docs/programmable/683550/18-1/standard-edition-getting-started-guide.html +or https://www.intel.com/content/www/us/en/software/programmable/sdk-for-opencl/overview.html. + + +### Precompiled FPGA kernels usage + +To use a precompiled FPGA kernel a FPGA image is required (*.aocx). FPGA device has to programmed with particular image which contains required kernel implementation. +Example FPGA programming command using example FPGA image: + + aocl program acl0 src/runtime/local/kernels/FPGAOPENCL/bitstreams/sgemm.aocx + + +Additionally the BITSTREAM variable has to be defind in the system. +Please look into the following example: + + export BITSTREAM=src/runtime/local/kernels/FPGAOPENCL/bitstreams/sgemm.aocx + +When another FPGA image contains implementation for another required computational kernel then FPGA device has to be reprogrammed and BITSTREAM variable value has to be changed. + diff --git a/fpga-build-env.sh b/fpga-build-env.sh new file mode 100644 index 000000000..ca3958fc3 --- /dev/null +++ b/fpga-build-env.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +#initialize Intel FPGA OpenCL environment +export QUARTUSDIR=/opt/intel/intelFPGA_pro/21.4 +source $QUARTUSDIR/hld/init_opencl.sh +echo $INTELFPGAOCLSDKROOT +export ALTERAOCLSDKROOT=$INTELFPGAOCLSDKROOT +#set up BITSTREAM variable for required FPGA image (can be different different for varius implemented kernels) +export BITSTREAM=src/runtime/local/kernels/FPGAOPENCL/bitstreams/sgemm.aocx # SGEMM computational kernel + diff --git a/scripts/examples/fpga-gemm.daph b/scripts/examples/fpga-gemm.daph new file mode 100644 index 000000000..55fcc743a --- /dev/null +++ b/scripts/examples/fpga-gemm.daph @@ -0,0 +1,43 @@ +/* + * Copyright 2021 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +############## IMPORTANT ################# +# EXAMPLE command to run the script: +# +# env BITSTREAM=../../src/runtime/local/kernels/FPGAOPENCL/bistreams/sgemm.aocx ./build/bin/daphne -fpgaopencl scripts/examples/fpga-gemm.daph +# +# WHERE: +# -fpgaopencl is a flag required for FPGA usage +# BITSTREAM variable point out required FPGA image with its location +######################################### + +# Creating input matrices +m = rand(448,1024, as.f32(2.0), as.f32(2.0), 1.0, -1); +//m = rand(896,8192, as.f32(2.0), as.f32(2.0), 1.0, -1); +//m = rand(448,16384, as.f32(2.0), as.f32(2.0), 1.0, -1); + +m2 = rand(1024,512, as.f32(1.0), as.f32(1.0), 1.0, -1); +//m2 = rand(8192,1024, as.f32(1.0), as.f32(1.0), 1.0, -1); +//m2 = rand(16384,1024, as.f32(1.0), as.f32(1.0), 1.0, -1); + +# test prints +//print(m[0,]); +//print(m2[0,]); + +Z = m @ m2; +//print(Z[,0]); + +print("Bye!"); diff --git a/src/api/cli/CMakeLists.txt b/src/api/cli/CMakeLists.txt index 316045715..3faad34b9 100644 --- a/src/api/cli/CMakeLists.txt +++ b/src/api/cli/CMakeLists.txt @@ -32,6 +32,10 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER) list(APPEND LIB_DEPS CUDAKernels) endif() +if(USE_FPGAOPENCL) + list(APPEND LIB_DEPS FPGAOPENCLKernels) +endif() + add_llvm_executable(daphne daphne.cpp DaphneUserConfig.h DEPENDS ${LIB_DEPS}) llvm_update_compile_flags(daphne) diff --git a/src/api/cli/DaphneUserConfig.h b/src/api/cli/DaphneUserConfig.h index 37870f0db..0ee08d76e 100644 --- a/src/api/cli/DaphneUserConfig.h +++ b/src/api/cli/DaphneUserConfig.h @@ -40,6 +40,7 @@ struct DaphneUserConfig { bool pinWorkers = false; bool hyperthreadingEnabled = false; bool debugMultiThreading = false; + bool use_fpgaopencl = false; bool debug_llvm = false; bool explain_kernels = false; @@ -65,6 +66,11 @@ struct DaphneUserConfig { // ToDo: This is an arbitrary default taken from sample code // int cublas_workspace_size = 1024 * 1024 * 4; #endif +#ifdef USE_FPGAOPENCL + std::vector fpga_devices; +#endif + + std::string libdir; std::vector library_paths; }; diff --git a/src/api/cli/daphne.cpp b/src/api/cli/daphne.cpp index c9fa22e9b..3225f30f4 100644 --- a/src/api/cli/daphne.cpp +++ b/src/api/cli/daphne.cpp @@ -176,6 +176,10 @@ main(int argc, char** argv) "cuda", cat(daphneOptions), desc("Use CUDA") ); + opt fpgaopencl( + "fpgaopencl", cat(daphneOptions), + desc("Use FPGAOPENCL") + ); opt libDir( "libdir", cat(daphneOptions), desc("The directory containing kernel libraries") @@ -324,6 +328,11 @@ main(int argc, char** argv) } } + if(fpgaopencl) { + user_config.use_fpgaopencl = true; + } + + // add this after the cli args loop to work around args order if(!user_config.libdir.empty() && user_config.use_cuda) user_config.library_paths.push_back(user_config.libdir + "/libCUDAKernels.so"); diff --git a/src/compiler/execution/DaphneIrExecutor.cpp b/src/compiler/execution/DaphneIrExecutor.cpp index 94e857c27..412ca894d 100644 --- a/src/compiler/execution/DaphneIrExecutor.cpp +++ b/src/compiler/execution/DaphneIrExecutor.cpp @@ -135,6 +135,12 @@ bool DaphneIrExecutor::runPasses(mlir::ModuleOp module) pm.addNestedPass(mlir::daphne::createMarkCUDAOpsPass(userConfig_)); #endif +#ifdef USE_FPGAOPENCL + if(userConfig_.use_fpgaopencl) + pm.addNestedPass(mlir::daphne::createMarkFPGAOPENCLOpsPass(userConfig_)); +#endif + + if(userConfig_.use_obj_ref_mgnt) pm.addNestedPass(mlir::daphne::createManageObjRefsPass()); if(userConfig_.explain_obj_ref_mgnt) @@ -182,6 +188,14 @@ std::unique_ptr DaphneIrExecutor::createExecutionEngine(m } } #endif + +#ifdef USE_FPGAOPENCL + if(userConfig_.use_fpgaopencl) { + if(userConfig_.libdir.empty()) { + sharedLibRefs.push_back("build/src/runtime/local/kernels/libFPGAOPENCLKernels.so"); + } + } +#endif registerLLVMDialectTranslation(context_); // module.dump(); auto maybeEngine = mlir::ExecutionEngine::create( diff --git a/src/compiler/lowering/CMakeLists.txt b/src/compiler/lowering/CMakeLists.txt index b4d0aba3c..887c86037 100644 --- a/src/compiler/lowering/CMakeLists.txt +++ b/src/compiler/lowering/CMakeLists.txt @@ -17,6 +17,7 @@ add_mlir_dialect_library(MLIRDaphneTransforms DistributeComputationsPass.cpp DistributePipelinesPass.cpp MarkCUDAOpsPass.cpp + MarkFPGAOPENCLOpsPass.cpp InsertDaphneContextPass.cpp ManageObjRefsPass.cpp LowerToLLVMPass.cpp diff --git a/src/compiler/lowering/InsertDaphneContextPass.cpp b/src/compiler/lowering/InsertDaphneContextPass.cpp index 56e9c4055..de6ed0b01 100644 --- a/src/compiler/lowering/InsertDaphneContextPass.cpp +++ b/src/compiler/lowering/InsertDaphneContextPass.cpp @@ -63,6 +63,13 @@ void InsertDaphneContextPass::runOnFunction() if (user_config.use_distributed){ builder.create(loc); } +#ifdef USE_FPGAOPENCL + if(user_config.use_fpgaopencl) { + builder.create(loc); + } +#endif + + // Insert a DestroyDaphneContextOp as the last operation in the block, but // before the block's terminator. builder.setInsertionPoint(b.getTerminator()); diff --git a/src/compiler/lowering/MarkFPGAOPENCLOpsPass.cpp b/src/compiler/lowering/MarkFPGAOPENCLOpsPass.cpp new file mode 100644 index 000000000..da8a9223d --- /dev/null +++ b/src/compiler/lowering/MarkFPGAOPENCLOpsPass.cpp @@ -0,0 +1,57 @@ +/* + * Copyright 2021 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifdef USE_FPGAOPENCL +#include "compiler/CompilerUtils.h" +#include "ir/daphneir/Daphne.h" +#include "ir/daphneir/Passes.h" +#include + +#include + +using namespace mlir; + +struct MarkFPGAOPENCLOpsPass : public PassWrapper { + + /** + * @brief User configuration influencing the rewrite pass + */ + const DaphneUserConfig& cfg; + + explicit MarkFPGAOPENCLOpsPass(const DaphneUserConfig& cfg) : cfg(cfg) { + } + + void runOnFunction() final; + + bool checkUseFPGAOPENCL(Operation* op) const { +// std::cout << "checkUseFPGAOPENCL: " << op->getName().getStringRef().str() << std::endl; + return op->hasTrait(); + } +}; + +void MarkFPGAOPENCLOpsPass::runOnFunction() { + getFunction()->walk([&](Operation* op) { + OpBuilder builder(op); + if(checkUseFPGAOPENCL(op)) { + op->setAttr("fpgaopencl_device", builder.getI32IntegerAttr(0)); + } + WalkResult::advance(); + }); +} + +std::unique_ptr daphne::createMarkFPGAOPENCLOpsPass(const DaphneUserConfig& cfg) { + return std::make_unique(cfg); +} +#endif diff --git a/src/compiler/lowering/RewriteToCallKernelOpPass.cpp b/src/compiler/lowering/RewriteToCallKernelOpPass.cpp index 3f0297d16..dd968b9d2 100644 --- a/src/compiler/lowering/RewriteToCallKernelOpPass.cpp +++ b/src/compiler/lowering/RewriteToCallKernelOpPass.cpp @@ -153,6 +153,10 @@ namespace // else // std::cout << "attr = null: " << op->getName().getStringRef().str() << std::endl; } + else if(op->hasAttr("fpgaopencl_device")) { + callee << "FPGAOPENCL"; + } + callee << '_' << op->getName().stripDialect().data(); diff --git a/src/ir/daphneir/Daphne.h b/src/ir/daphneir/Daphne.h index c3b941d00..6c56bd5c4 100644 --- a/src/ir/daphneir/Daphne.h +++ b/src/ir/daphneir/Daphne.h @@ -47,6 +47,12 @@ #include #include +namespace mlir::OpTrait { + template + class FPGAOPENCLSupport : public TraitBase { + }; +} + namespace mlir::daphne { enum class MatrixRepresentation { Dense = 0, @@ -67,4 +73,4 @@ namespace mlir::daphne { #define GET_OP_CLASSES #include "ir/daphneir/DaphneOps.h.inc" -#endif //SRC_IR_DAPHNEIR_DAPHNE_H \ No newline at end of file +#endif //SRC_IR_DAPHNEIR_DAPHNE_H diff --git a/src/ir/daphneir/DaphneOps.td b/src/ir/daphneir/DaphneOps.td index 29ae28d75..a423f8878 100644 --- a/src/ir/daphneir/DaphneOps.td +++ b/src/ir/daphneir/DaphneOps.td @@ -28,6 +28,7 @@ include "ir/daphneir/DaphneInferTypesOpInterface.td" include "ir/daphneir/DaphneVectorizableOpInterface.td" include "ir/daphneir/DaphneShapeInferenceTraits.td" include "ir/daphneir/CUDASupport.td" +include "ir/daphneir/FPGAOPENCLSupport.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Interfaces/ControlFlowInterfaces.td" @@ -142,7 +143,7 @@ def Daphne_MatMulOp : Daphne_Op<"matMul", [ DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, CUDASupport + DeclareOpInterfaceMethods, CUDASupport, FPGAOPENCLSupport ]> { let arguments = (ins MatrixOf<[NumScalar]>:$lhs, MatrixOf<[NumScalar]>:$rhs, BoolScalar:$transa, BoolScalar:$transb); let results = (outs MatrixOf<[NumScalar]>:$res); @@ -1212,6 +1213,12 @@ def Daphne_CreateDistributedContextOp : Daphne_Op<"createDistributedContext", [] let results = (outs); } +def Daphne_CreateFPGAContextOp : Daphne_Op<"createFPGAContext", [FPGAOPENCLSupport]> { + let arguments = (ins); + let results = (outs); +} + + // **************************************************************************** // Vectorized operations // **************************************************************************** diff --git a/src/ir/daphneir/FPGAOPENCLSupport.td b/src/ir/daphneir/FPGAOPENCLSupport.td new file mode 100644 index 000000000..4322b6aa5 --- /dev/null +++ b/src/ir/daphneir/FPGAOPENCLSupport.td @@ -0,0 +1,24 @@ +/* + * Copyright 2021 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SRC_IR_DAPHNEIR_FPGAOPENCLSUPPORT_TD +#define SRC_IR_DAPHNEIR_FPGAOPENCLSUPPORT_TD + +include "mlir/IR/OpBase.td" + +def FPGAOPENCLSupport : NativeOpTrait<"FPGAOPENCLSupport">; + +#endif // SRC_IR_DAPHNEIR_FPGAOPENCLSUPPORT_TD diff --git a/src/ir/daphneir/Passes.h b/src/ir/daphneir/Passes.h index 3aa1d72cb..6f91bb21d 100644 --- a/src/ir/daphneir/Passes.h +++ b/src/ir/daphneir/Passes.h @@ -57,6 +57,10 @@ namespace mlir::daphne { std::unique_ptr createMarkCUDAOpsPass(const DaphneUserConfig& cfg); #endif +#ifdef USE_FPGAOPENCL + std::unique_ptr createMarkFPGAOPENCLOpsPass(const DaphneUserConfig& cfg); +#endif + #define GEN_PASS_REGISTRATION #include "ir/daphneir/Passes.h.inc" } // namespace mlir::daphne diff --git a/src/runtime/local/context/DaphneContext.h b/src/runtime/local/context/DaphneContext.h index cfd262f5b..af5ed0e37 100644 --- a/src/runtime/local/context/DaphneContext.h +++ b/src/runtime/local/context/DaphneContext.h @@ -24,6 +24,13 @@ #include "IContext.h" +#ifdef USE_FPGAOPENCL + #include "FPGAContext.h" +#endif + + + + // This macro is intended to be used in kernel function signatures, such that // we can change the ubiquitous DaphneContext parameter in a single place, if // required. @@ -47,6 +54,8 @@ struct DaphneContext { std::vector> cuda_contexts; + std::vector> fpga_contexts; + std::unique_ptr distributed_context; @@ -67,7 +76,13 @@ struct DaphneContext { for (auto& ctx : cuda_contexts) { ctx->destroy(); } + for (auto& ctx : fpga_contexts) { + ctx->destroy(); + } cuda_contexts.clear(); + fpga_contexts.clear(); + + } #ifdef USE_CUDA @@ -76,12 +91,23 @@ struct DaphneContext { return cuda_contexts[dev_id].get(); } #endif +#ifdef USE_FPGAOPENCL + // ToDo: in a multi device setting this should use a find call instead of a direct [] access + [[nodiscard]] FPGAContext* getFPGAContext(int dev_id) const { + // std::cout<<"inside getFPGAContext"<(fpga_contexts[dev_id].get()); + } +#endif + + + [[nodiscard]] bool useCUDA() const { return !cuda_contexts.empty(); } + [[nodiscard]] bool useFPGA() const { return !fpga_contexts.empty(); } [[nodiscard]] IContext *getDistributedContext() const { return distributed_context.get(); } [[maybe_unused]] [[nodiscard]] DaphneUserConfig getUserConfig() const { return config; } -}; +}; \ No newline at end of file diff --git a/src/runtime/local/context/FPGAContext.cpp b/src/runtime/local/context/FPGAContext.cpp new file mode 100644 index 000000000..14131617e --- /dev/null +++ b/src/runtime/local/context/FPGAContext.cpp @@ -0,0 +1,181 @@ +/* + * Copyright 2021 The DAPHNE Consortium + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "AOCLUtils/aocl_utils.h" +#include "CL/opencl.h" +#include "runtime/local/context/FPGAContext.h" +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include + +using namespace std; +using namespace aocl_utils; + +#define DPRINTF(...) \ + printf(__VA_ARGS__); \ + fflush(stdout); + +#define CHECK(status) \ + if (status != CL_SUCCESS) { \ + printf("error %d in line %d.\n", status, __LINE__); \ + exit(1); \ + } + +void FPGAContext::destroy() { +#ifndef NDEBUG + std::cout << "Destroying FPGA context..." << std::endl; +#endif +} + +void FPGAContext::init() { +#ifndef NDEBUG + std::cout << "creating FPGA context..." << std::endl; + DPRINTF("\n===== Host-CPU setting up the OpenCL platform and device ======\n\n"); + unsigned int buf_uint; +#endif + cl_int status; + char buffer[4096]; + int device_found = 0; + + // Use clGetPlatformIDs() to retrieve the number of platforms + status = clGetPlatformIDs(0, NULL, &numPlatforms); +#ifndef NDEBUG + DPRINTF("Number of platforms = %d\n", numPlatforms); +#endif + // Allocate enough space for each platform + platforms = (cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id)); +#ifndef NDEBUG + DPRINTF("Allocated space for Platform\n"); +#endif + status = clGetPlatformIDs(numPlatforms, platforms, NULL); + CHECK(status); +#ifndef NDEBUG + DPRINTF("Filled in platforms\n"); + DPRINTF("Initializing IDs\n"); +#endif + for (int i = 0; i < (int)numPlatforms; i++) { + status = clGetDeviceIDs(platforms[i], + CL_DEVICE_TYPE_ALL, + maxDevices, + devices, + &numDevices); + + if (status == CL_SUCCESS) { + clGetPlatformInfo(platforms[i], + CL_PLATFORM_NAME, + 4096, + buffer, + NULL); +#if defined(ALTERA_CL) + if (strstr(buffer, "Altera") != NULL) { + device_found = 1; + } +// DPRINTF("%s\n", buffer); +#elif defined(NVIDIA_CL) + if (strstr(buffer, "NVIDIA") != NULL) { + device_found = 1; + } +#else + if (strstr(buffer, "Intel") != NULL) { + device_found = 1; + } +#endif +#ifndef NDEBUG + DPRINTF("Platform found : %s\n", buffer); +#endif + device_found = 1; + } + } + if (!device_found) { + DPRINTF("failed to find a OpenCL device\n"); + exit(-1); + } +#ifndef NDEBUG + DPRINTF("Total number of devices: %d", numDevices); + for (unsigned int i = 0; i < numDevices; i++) { + clGetDeviceInfo(devices[i], + CL_DEVICE_NAME, + 4096, + buffer, + NULL); + DPRINTF("\nDevice Name: %s\n", buffer); + clGetDeviceInfo(devices[i], + CL_DEVICE_VENDOR, + 4096, + buffer, + NULL); + DPRINTF("Device Vendor: %s\n", buffer); + clGetDeviceInfo(devices[i], + CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(buf_uint), + &buf_uint, + NULL); + DPRINTF("Device Computing Units: %u\n", buf_uint); + clGetDeviceInfo(devices[i], + CL_DEVICE_GLOBAL_MEM_SIZE, + sizeof(unsigned long), + &buffer, + NULL); + DPRINTF("Global Memory Size: %li\n", *((unsigned long*)buffer)); + clGetDeviceInfo(devices[i], + CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(unsigned long), + &buffer, + NULL); + DPRINTF("Global Memory Allocation Size: %li\n\n", *((unsigned long*)buffer)); + } +#endif + //---------------------------------------------- + // Create a context +#ifndef NDEBUG + DPRINTF("\n===== Host-CPU setting up the OpenCL command queues ======\n\n"); +#endif + context = clCreateContext( + NULL, + 1, + devices, + NULL, + NULL, + &status); + CHECK(status); +} + +std::unique_ptr FPGAContext::createFpgaContext(int device_id) { + +/* if(FPGAContext::numDevices < 1) { + std::cerr << "Not creating requested FPGA context. No FPGA devices available." << std::endl; + return nullptr; + } + + if(device_id >= (int)numDevices) { + std::cerr << "Requested device ID " << device_id << " >= device count "<(new FPGAContext(device_id)); + ctx->init(); + + return ctx; +} + + diff --git a/src/runtime/local/context/FPGAContext.h b/src/runtime/local/context/FPGAContext.h new file mode 100644 index 000000000..ff3366cd1 --- /dev/null +++ b/src/runtime/local/context/FPGAContext.h @@ -0,0 +1,68 @@ +/* + * Copyright 2021 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +//#include +#include "IContext.h" +//#include +//#include +#include +#include +#include +#include +#include + +class FPGAContext : public IContext { + int device_id = -1; + size_t mem_budget = 0; + + + explicit FPGAContext(int id) : device_id(id) { + //std::cout<<"fpga context constructor"< createFpgaContext(int id); + +// [[nodiscard]] cublasHandle_t getCublasHandle() const { return cublas_handle; } +// [[nodiscard]] cusparseHandle_t getCusparseHandle() const { return cusparse_handle; } + + // [[nodiscard]] const cudaDeviceProp* getDeviceProperties() const { return &device_properties; } + + +// size_t getMemBudget() { return mem_budget; } + + +private: + void init(); +}; diff --git a/src/runtime/local/kernels/FPGAOPENCL/CMakeLists.txt b/src/runtime/local/kernels/FPGAOPENCL/CMakeLists.txt new file mode 100644 index 000000000..09b68d56d --- /dev/null +++ b/src/runtime/local/kernels/FPGAOPENCL/CMakeLists.txt @@ -0,0 +1,53 @@ +# Copyright 2021 The DAPHNE Consortium +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Specifies how to generate the file "kernels.cpp" (which resides in the build +# directory) as the basis for the pre-compiled kernels library. + +# * more properly set up paths (ugly relative include path below) +#cmake_minimum_required(VERSION 3.21) + +#project(FPGAOPENCLkernels) + +include_directories($ENV{QUARTUSDIR}/hld/examples_aoc/common/inc/ $ENV{QUARTUSDIR}/hld/host/include/) + +set(PREFIX ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/FPGAOPENCL) +#message(STATUS "cmake fpga test! $ENV{QUARTUSDIR}") +add_custom_command( + OUTPUT ${PROJECT_BINARY_DIR}/FPGAOPENCLkernels.cpp + COMMAND python3 ARGS genKernelInst.py kernels.json + ${PROJECT_BINARY_DIR}/FPGAOPENCLkernels.cpp FPGAOPENCL + MAIN_DEPENDENCY ${PREFIX}/../kernels.json + DEPENDS ${PREFIX}/../genKernelInst.py + WORKING_DIRECTORY ${PREFIX}/.. +) + +#set(PREFIX ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/FPGAOPENCL) +set(FPGAOPENCLKernels_SRC + ${PREFIX}/../../context/FPGAContext.cpp + ${PREFIX}/../../context/FPGAContext.h + ${PREFIX}/CreateFPGAContext.h + ${PREFIX}/MatMul.h + ${PREFIX}/gemm_interface.cpp + ${PREFIX}/gemm_interface.h + ${PROJECT_BINARY_DIR}/FPGAOPENCLkernels.cpp +) + +add_library(FPGAOPENCLKernels SHARED ${FPGAOPENCLKernels_SRC}) +set_target_properties(FPGAOPENCLKernels PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/src/runtime/local/kernels) + +target_include_directories(FPGAOPENCLKernels PUBLIC ${PROJECT_SOURCE_DIR}/src/) + +target_link_libraries(FPGAOPENCLKernels PUBLIC AllKernels LLVMSupport $ENV{QUARTUSDIR}/hld/linux64/lib/libOpenCL.so) + diff --git a/src/runtime/local/kernels/FPGAOPENCL/CreateFPGAContext.h b/src/runtime/local/kernels/FPGAOPENCL/CreateFPGAContext.h new file mode 100644 index 000000000..e208eedf9 --- /dev/null +++ b/src/runtime/local/kernels/FPGAOPENCL/CreateFPGAContext.h @@ -0,0 +1,31 @@ +/* + * Copyright 2021 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "runtime/local/context/DaphneContext.h" +#include "runtime/local/context/FPGAContext.h" + +// **************************************************************************** +// Convenience function +// **************************************************************************** + +namespace FPGAOPENCL { + static void createFPGAContext(DCTX(ctx)) { + // ToDo: one context per device + ctx->fpga_contexts.emplace_back(FPGAContext::createFpgaContext(0)); + } +} diff --git a/src/runtime/local/kernels/FPGAOPENCL/MatMul.h b/src/runtime/local/kernels/FPGAOPENCL/MatMul.h new file mode 100644 index 000000000..f03a7ce1b --- /dev/null +++ b/src/runtime/local/kernels/FPGAOPENCL/MatMul.h @@ -0,0 +1,180 @@ +/* + * Copyright 2021 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SRC_RUNTIME_LOCAL_KERNELS_MATMUL_H +#define SRC_RUNTIME_LOCAL_KERNELS_MATMUL_H + +#include +#include +#include + +#include +#include + +#include +#include +#include "gemm_interface.h" + +namespace FPGAOPENCL { +// **************************************************************************** +// Struct for partial template specialization +// **************************************************************************** + +template +struct MatMul { +// static void apply(DTRes *& res, const DTLhs * lhs, const DTRhs * rhs, DCTX(ctx)) = delete; + static void apply(DTRes *& res, const DTLhs * lhs, const DTRhs * rhs, bool transa, bool transb, DCTX(ctx)) = delete; +}; + +// **************************************************************************** +// Convenience function +// **************************************************************************** + +template +void matMul(DTRes *& res, const DTLhs * lhs, const DTRhs * rhs, bool transa, bool transb, DCTX(ctx)) { + MatMul::apply(res, lhs, rhs,transa, transb, ctx); +} + +// **************************************************************************** +// (Partial) template specializations for different data/value types +// **************************************************************************** + +// ---------------------------------------------------------------------------- +// DenseMatrix <- DenseMatrix, DenseMatrix +// ---------------------------------------------------------------------------- + +template<> +struct MatMul, DenseMatrix, DenseMatrix> { + static void apply(DenseMatrix *& res, const DenseMatrix * lhs, const DenseMatrix * rhs, bool transa, bool transb,DCTX(ctx)) { + const size_t nr1 = lhs->getNumRows(); + const size_t nc1 = lhs->getNumCols(); + const size_t nc2 = rhs->getNumCols(); +#ifndef NDEBUG + const size_t nr2 = rhs->getNumRows(); + assert((nc1 == nr2) && "#cols of lhs and #rows of rhs must be the same"); +#endif +// printf("\ntest MatMul f32 \n"); +// Parameters of the systolic array in the bitstream. Do not change. + +#define II 32 +#define JJ 32 +#define KK 32 +#define III 14 +#define JJJ 16 +#define KKK 16 + +#ifndef NDEBUG + assert((nr1%(II*III)==0) && "lhs #rows number must be a multiple of 448"); + assert((nc1%(JJ*JJJ)==0 && nc1>512 && nr2%(JJ*JJJ)==0 && nc1>512) && "#cols of lhs and #rows of rhs must be a multiple of 512 (and minimum 1024)"); + assert((nc2%(KK*KKK)==0) && "#cols of rhs must be a multiple of 512"); +#endif +// Testing purpose only: help define the sizes of test inputs +// Can be arbitrarily set. +// matrix a: 10K * 2K +// matrix b: 2K * 8K + +//#define OUTERMOST_I 1//32 +//#define OUTERMOST_J 1//32 +//#define OUTERMOST_K 2//4 + +//#define TYPE float + +#define ACL_ALIGNMENT 64 +//void *acl_aligned_malloc(size_t size) { +// void *result = NULL; +// posix_memalign(&result, ACL_ALIGNMENT, size); +// return result; +//} + const int OUTERMOST_I = ceil(nr1/448); + const int OUTERMOST_J = ceil(nc2/512); + const int OUTERMOST_K = ceil(nc1/512); + + float *A, *B, *C; + void *aa=NULL,*bb=NULL,*cc=NULL; + const int TOTAL_I = III * II * OUTERMOST_I; + const int TOTAL_J = JJJ * JJ * OUTERMOST_J; + const int TOTAL_K = KKK * KK * OUTERMOST_K; + + long int num_elem_A = (long int)TOTAL_I*TOTAL_K; + long int num_elem_B = (long int)TOTAL_K*TOTAL_J; + long int num_elem_C = (long int)TOTAL_I*TOTAL_J; + + posix_memalign(&aa,ACL_ALIGNMENT,num_elem_A * sizeof(float)); + A=(float*)aa; + if (A==NULL) + perror("Failed malloc of matrix A"); + posix_memalign(&bb,ACL_ALIGNMENT,num_elem_B * sizeof(float)); + B=(float*)bb; + if (B==NULL) + perror("Failed malloc of matrix B"); + posix_memalign(&cc,ACL_ALIGNMENT,num_elem_C * sizeof(float)); + C=(float*)cc; + if (C==NULL) + perror("Failed malloc of matrix C"); + + + // printf("\nbefore memcpy()\n"); + + memcpy(A,lhs->getValues(),num_elem_A * sizeof(float));//sizeof(lhs)); + memcpy(B,rhs->getValues(),num_elem_B * sizeof(float));//sizeof(rhs)); + + //printf("\nA values %f\n",*A); + //printf("\nB values %f\n",*B); + sgemm(A, B, C, OUTERMOST_I, OUTERMOST_J, OUTERMOST_K, ctx); + + // printf("\nC values %f\n",*C); + + if(res == nullptr) + res = DataObjectFactory::create>(nr1, nc2, false); + + + //printf("\nres: %p\n", res); + //printf("\nres->getValues(): %p\n", res->getValues()); + memcpy(res->getValues(),C,num_elem_C * sizeof(float));//sizeof(C) + // printf("\nres memcpy2\n"); + + } +}; +/* TODO +template<> +struct MatMul, DenseMatrix, DenseMatrix> { + static void apply(DenseMatrix *& res, const DenseMatrix * lhs, const DenseMatrix * rhs, bool transa, bool transb, DCTX(ctx)) { + const size_t nr1 = lhs->getNumRows(); + const size_t nc1 = lhs->getNumCols(); + const size_t nc2 = rhs->getNumCols(); +#ifndef NDEBUG + const size_t nr2 = rhs->getNumRows(); + assert((nc1 == nr2) && "#cols of lhs and #rows of rhs must be the same"); +#endif + if(res == nullptr) + res = DataObjectFactory::create>(nr1, nc2, false); + + if(nr1 == 1 && nc2 == 1) // Vector-Vector + res->set(0, 0, cblas_ddot(nc1, lhs->getValues(), 1, rhs->getValues(), rhs->getRowSkip())); + else if(nc2 == 1) // Matrix-Vector + cblas_dgemv(CblasRowMajor, CblasNoTrans, nr1, nc1, 1, lhs->getValues(), + lhs->getRowSkip(), rhs->getValues(), rhs->getRowSkip(), 0, + res->getValues(), res->getRowSkip()); + else // Matrix-Matrix + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, nr1, nc2, nc1, + 1, lhs->getValues(), lhs->getRowSkip(), rhs->getValues(), + rhs->getRowSkip(), 0, res->getValues(), res->getRowSkip()); + } +}; +*/ +} + +#endif //SRC_RUNTIME_LOCAL_KERNELS_MATMUL_H diff --git a/src/runtime/local/kernels/FPGAOPENCL/gemm_interface.cpp b/src/runtime/local/kernels/FPGAOPENCL/gemm_interface.cpp new file mode 100755 index 000000000..cc198a5ce --- /dev/null +++ b/src/runtime/local/kernels/FPGAOPENCL/gemm_interface.cpp @@ -0,0 +1,558 @@ +// Copyright (C) 2013-2019 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + + +// This file is modified from /glob/development-tools/versions/fpgasupportstack/a10/1.2.1/intelFPGA_pro/hld/examples_aoc/matrix_mult/host/src/main.cpp + +#include "AOCLUtils/aocl_utils.h" +#include "CL/opencl.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +// Parameters of the systolic array +#define II 32 +#define JJ 32 +#define KK 32 +#define III 14 +#define JJJ 16 +#define KKK 16 + +using namespace aocl_utils; + +#define TYPE float + +#define STR_HELPER(x) #x +#define STR(x) STR_HELPER(x) + +#define DPRINTF(...) \ + printf(__VA_ARGS__); \ + fflush(stdout); + +#define NUM_QUEUES_TO_CREATE 6 +#define NUM_KERNELS_TO_CREATE 6 + +#define CHECK(status) \ + if (status != CL_SUCCESS) { \ + printf("error %d in line %d.\n", status, __LINE__); \ + exit(1); \ + } + +#define ACL_ALIGNMENT 64 +void *acl_aligned_malloc(size_t size) { + void *result = NULL; + posix_memalign(&result, ACL_ALIGNMENT, size); + return result; +} + +void cleanup() {} + +const char *kernel_name[] = { + "kernel_A_loader", + "kernel_B_loader", + "kernel_unloader_WAIT_FINISH", + "kernel_A_feeder", + "kernel_B_feeder", + "kernel_Out" +}; + +double compute_kernel_execution_time(cl_event &event, double &start_d, double &end_d) { + cl_ulong start, end; + + clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); + clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); + + start_d = (double)1.0e-9 * start; + end_d = (double)1.0e-9 * end; + //return (double)(end-start); + return (double)1.0e-9 * (end - start); // nanoseconds to seconds +} + + +int sgemm(const float *A, const float *B, float *C, const int OUTERMOST_I, const int OUTERMOST_J, const int OUTERMOST_K, DCTX(ctx)) { + const int TOTAL_I = III * II * OUTERMOST_I; + const int TOTAL_J = JJJ * JJ * OUTERMOST_J; + const int TOTAL_K = KKK * KK * OUTERMOST_K; + + long int num_elem_A = (long int)TOTAL_I*TOTAL_K; + long int num_elem_B = (long int)TOTAL_K*TOTAL_J; + long int num_elem_C = (long int)TOTAL_I*TOTAL_J; + + float *serialized_A, *serialized_B; + if ((serialized_A = (float *)acl_aligned_malloc(num_elem_A * sizeof(float))) == NULL) { + perror("Failed malloc of matrix serialized_A"); + } + if ((serialized_B = (float *)acl_aligned_malloc(num_elem_B * sizeof(float))) == NULL) { + perror("Failed malloc of matrix serialized_A"); + } + + // Serialize A + long int addr = 0; + for (int i = 0; i < TOTAL_I; i++) + for (int k = 0; k < TOTAL_K; k++) { + serialized_A[addr++] = A[k + i*TOTAL_K]; + } + // Serialize B + addr = 0; + for (int j = 0; j < TOTAL_J; j++) + for (int k = 0; k < TOTAL_K; k++) { + serialized_B[addr++] = B[j+k*TOTAL_J]; + } + + + cl_int status; + auto fctx = ctx->getFPGAContext(0); + + //---------------------------------------------- + // Create command queues + //--------------------------------------------- + + cl_command_queue cmdQueue[NUM_QUEUES_TO_CREATE + 1]; // extra queue for reading buffer D + + // Create a command queue using clCreateCommandQueue(), + // and associate it with the device you want to execute on + for (int i = 0; i < NUM_QUEUES_TO_CREATE; i++) { + //fDPRINTF(stdout,"cmdQueue i = %d\n", i); + cmdQueue[i] = clCreateCommandQueue( + fctx->context, + fctx->devices[0], + CL_QUEUE_PROFILING_ENABLE, + &status); + CHECK(status); + } + + //fDPRINTF(stdout,"cmdQueue i = %d, a queue for reading the C buffer\n", i); + cmdQueue[NUM_QUEUES_TO_CREATE] = clCreateCommandQueue( + fctx->context, + fctx->devices[0], + CL_QUEUE_PROFILING_ENABLE, + &status); + CHECK(status); + + //---------------------------------------------- + // Create device buffers + //---------------------------------------------- + cl_mem input_A_buf; + cl_mem input_B_buf; + cl_mem output_C_buf; +#ifndef NDEBUG + DPRINTF("\n===== Host-CPU transferring W and X to the FPGA device global memory (DDR4) via PCIe ======\n\n"); +#endif + input_A_buf = clCreateBuffer( + fctx->context, + CL_MEM_READ_ONLY, + num_elem_A * sizeof(cl_float), + NULL, + &status); + CHECK(status); + + input_B_buf = clCreateBuffer( + fctx->context, + CL_MEM_READ_ONLY, + num_elem_B * sizeof(cl_float), + NULL, + &status); + CHECK(status); + + output_C_buf = clCreateBuffer( + fctx->context, + CL_MEM_WRITE_ONLY, + num_elem_C * sizeof(cl_float), + NULL, + &status); + CHECK(status); + + //---------------------------------------------- + // Write host data to device buffers + //---------------------------------------------- + // blocking writes + status = clEnqueueWriteBuffer( + cmdQueue[0], + input_A_buf, + CL_TRUE, + 0, + num_elem_A * sizeof(cl_float), + serialized_A, + 0, + NULL, + NULL); + CHECK(status); + + status = clEnqueueWriteBuffer( + cmdQueue[1], + input_B_buf, + CL_TRUE, + 0, + num_elem_B * sizeof(cl_float), + serialized_B, + 0, + NULL, + NULL); + CHECK(status); + + //---------------------------------------------- + // Create the program from binaries + //---------------------------------------------- + //DPRINTF("\n===== Host-CPU setting up OpenCL program and kernels ======\n\n"); + + cl_program program; + size_t binary_length; + const unsigned char *binary; + + fflush(stdout); + // create the program using binary already compiled offline using aoc (i.e. the .aocx file) + char *aocx_file = getenv("BITSTREAM"); + FILE *fp = fopen(aocx_file, "rb"); + + if (fp == NULL) { + DPRINTF("Failed to open the AOCX file (fopen).\n"); + return -1; + } + + fseek(fp, 0, SEEK_END); + binary_length = ftell(fp); + binary = (unsigned char *)malloc(sizeof(unsigned char) * binary_length); + assert(binary && "Malloc failed"); + rewind(fp); + + if (fread((void *)binary, binary_length, 1, fp) == 0) { + DPRINTF("Failed to read from the AOCX file (fread).\n"); + return -1; + } + fclose(fp); + + //DPRINTF("Create program with binary\n"); + // Create a program using clCreateProgramWithBinary() + program = clCreateProgramWithBinary( + fctx->context, + 1, + fctx->devices, + &binary_length, + (const unsigned char **)&binary, + &status, + NULL); + CHECK(status); + + //---------------------------------------------- + // Create the kernel + //---------------------------------------------- + status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); + if (status != CL_SUCCESS) { + char log[128 * 1024] = {0}; + clGetProgramBuildInfo( + program, + fctx->devices[0], + CL_PROGRAM_BUILD_LOG, 128 * 1024, log, NULL); + CHECK(status); + } + + cl_kernel kernel[NUM_KERNELS_TO_CREATE]; + + for (int j = 0; j < NUM_KERNELS_TO_CREATE; j++) { + kernel[j] = clCreateKernel(program, (const char *)kernel_name[j], &status); + CHECK(status); + } +#ifndef NDEBUG + DPRINTF("All kernels created\n"); +#endif + // A_loader + status = clSetKernelArg( + kernel[0], + 0, + sizeof(int), + &TOTAL_K); + CHECK(status); + status = clSetKernelArg( + kernel[0], + 1, + sizeof(int), + &TOTAL_I); + CHECK(status); + status = clSetKernelArg( + kernel[0], + 2, + sizeof(int), + &TOTAL_J); + CHECK(status); + status = clSetKernelArg( + kernel[0], + 3, + sizeof(cl_mem), + &input_A_buf); + CHECK(status); + // B_loader + status = clSetKernelArg( + kernel[1], + 0, + sizeof(int), + &TOTAL_K); + CHECK(status); + status = clSetKernelArg( + kernel[1], + 1, + sizeof(int), + &TOTAL_I); + CHECK(status); + status = clSetKernelArg( + kernel[1], + 2, + sizeof(int), + &TOTAL_J); + CHECK(status); + status = clSetKernelArg( + kernel[1], + 3, + sizeof(cl_mem), + &input_B_buf); + CHECK(status); + // unloader + status = clSetKernelArg( + kernel[2], + 0, + sizeof(int), + &TOTAL_I); + CHECK(status); + status = clSetKernelArg( + kernel[2], + 1, + sizeof(int), + &TOTAL_J); + CHECK(status); + status = clSetKernelArg( + kernel[2], + 2, + sizeof(cl_mem), + &output_C_buf); + CHECK(status); + // A_feeder + status = clSetKernelArg( + kernel[3], + 0, + sizeof(int), + &TOTAL_K); + CHECK(status); + status = clSetKernelArg( + kernel[3], + 1, + sizeof(int), + &TOTAL_I); + CHECK(status); + status = clSetKernelArg( + kernel[3], + 2, + sizeof(int), + &TOTAL_J); + CHECK(status); + // B_feeder + status = clSetKernelArg( + kernel[4], + 0, + sizeof(int), + &TOTAL_K); + CHECK(status); + status = clSetKernelArg( + kernel[4], + 1, + sizeof(int), + &TOTAL_I); + CHECK(status); + status = clSetKernelArg( + kernel[4], + 2, + sizeof(int), + &TOTAL_J); + CHECK(status); + // Out + status = clSetKernelArg( + kernel[5], + 0, + sizeof(int), + &TOTAL_K); + CHECK(status); + status = clSetKernelArg( + kernel[5], + 1, + sizeof(int), + &TOTAL_I); + CHECK(status); + status = clSetKernelArg( + kernel[5], + 2, + sizeof(int), + &TOTAL_J); + CHECK(status); + + //---------------------------------------------- + // Configure the work-item structure (using only tasks atm) + //---------------------------------------------- + + // Define the number of threads that will be created + // as well as the number of work groups + size_t globalWorkSize[1]; + size_t localWorkSize[1]; + + //---------------------------------------------- + // Enqueue the kernel for execution + //---------------------------------------------- + + // all kernels are always tasks + globalWorkSize[0] = 1; + localWorkSize[0] = 1; + + cl_event kernel_exec_event[NUM_KERNELS_TO_CREATE]; + +#ifndef NDEBUG + DPRINTF("\n===== Host-CPU enqeuing the OpenCL kernels to the FPGA device ======\n\n"); +#endif + for (int i = 0; i < NUM_KERNELS_TO_CREATE; i++) { + // Alternatively, can use clEnqueueTaskKernel +#ifndef NDEBUG + DPRINTF("clEnqueueNDRangeKernel[%d]: %s!\n", i, kernel_name[i]); +#endif + status = clEnqueueNDRangeKernel( + cmdQueue[i], + kernel[i], + 1, + NULL, + globalWorkSize, + localWorkSize, + 0, + NULL, + &kernel_exec_event[i]); + CHECK(status); + } +#ifndef NDEBUG + DPRINTF(" *** FPGA execution started!\n"); +#endif + for (int i = 0; i < NUM_KERNELS_TO_CREATE; i++) { + status = clFlush(cmdQueue[i]); + CHECK(status); + } + + for (int i = 0; i < NUM_QUEUES_TO_CREATE; i++) { +#ifndef NDEBUG + DPRINTF("cmd queue: %d\n", i); +#endif + fflush(stdout); + status = clFinish(cmdQueue[i]); + CHECK(status); + } +#ifndef NDEBUG + DPRINTF(" *** FPGA execution finished!\n"); + DPRINTF("\n\n"); +//#endif + + double k_start_time[NUM_KERNELS_TO_CREATE]; + double k_end_time[NUM_KERNELS_TO_CREATE]; + double k_exec_time[NUM_KERNELS_TO_CREATE]; + double max_time = 0; + for (int i = 0; i < NUM_KERNELS_TO_CREATE; i++) { + k_exec_time[i] = compute_kernel_execution_time(kernel_exec_event[i], k_start_time[i], k_end_time[i]); + if (k_exec_time[i] > max_time) { + max_time = k_exec_time[i]; + } + } +//#ifndef NDEBUG + DPRINTF("Time taken: %lf sec\n\n", max_time); + + printf("\n===== Reporting measured throughput ======\n\n"); +//#endif + double k_earliest_start_time = k_start_time[0]; + double k_latest_end_time = k_end_time[0]; + + for (int i = 1; i < NUM_KERNELS_TO_CREATE; i++) { + if (k_start_time[i] < k_earliest_start_time) + k_earliest_start_time = k_start_time[i]; + + if (k_end_time[i] > k_latest_end_time) + k_latest_end_time = k_end_time[i]; + } + + // IMPORTANT: we care about the finish time of drain_C, once data is drained we are done + k_latest_end_time = k_end_time[NUM_KERNELS_TO_CREATE - 1]; + + for (int i = 0; i < NUM_KERNELS_TO_CREATE; i++) { + printf(" Kernel execution time on FPGA: %s, \n \t\t\t\t\t\t\t\t\texec time = %.5f s, start=%.5f s, end=%.5f s\n", kernel_name[i], k_exec_time[i], k_start_time[i], k_end_time[i]); + } +//#endif + + double k_overall_exec_time = k_latest_end_time - k_earliest_start_time; +//#ifndef NDEBUG + printf("\n"); + printf(" Loader kernels start time\t\t= %.5f s\n", k_earliest_start_time); + printf(" Unloader kernels end time\t\t= %.5f s\n", k_latest_end_time); + printf(" FPGA GEMM exec time\t\t= %.5f s\n", k_overall_exec_time); + + // multiplied by 1.0e-9 to get G-FLOPs + printf("\n"); + + double num_operations = (double)2.0 * (TOTAL_K) * (double)(TOTAL_I) * (double)(TOTAL_J); + + printf(" # operations = %.0f\n", num_operations ); + printf(" Throughput: %.5f GFLOPS\n", (double)1.0e-9 * num_operations / k_overall_exec_time); + + DPRINTF("\n===== Host-CPU transferring result matrix C from the FPGA device global memory (DDR4) via PCIe ======\n\n"); +#endif + // Read the results back from the device, blocking read + float *serialized_Z; + if ((serialized_Z = (float *)acl_aligned_malloc(num_elem_C * sizeof(float))) == NULL) { + perror("Failed malloc of matrix serialized_Z"); + } + + clEnqueueReadBuffer( + //cmdQueue[KID_DRAIN_MAT_C], + cmdQueue[NUM_KERNELS_TO_CREATE], // using a special queue for reading buffer C + output_C_buf, + CL_TRUE, + 0, + num_elem_C * sizeof(cl_float), + serialized_Z, + 0, + NULL, + NULL); + CHECK(status); + + // Deserialize Z + addr = 0; + for (int i = 0; i < TOTAL_I; i++) + for (int j = 0; j < TOTAL_J; j++) { + C[j + i*TOTAL_J] = serialized_Z[addr++]; + } + return 0; +} + diff --git a/src/runtime/local/kernels/FPGAOPENCL/gemm_interface.h b/src/runtime/local/kernels/FPGAOPENCL/gemm_interface.h new file mode 100755 index 000000000..0a8b4de9d --- /dev/null +++ b/src/runtime/local/kernels/FPGAOPENCL/gemm_interface.h @@ -0,0 +1,10 @@ +#ifndef SGEMM_INTERFACE +#define SGEMM_INTERFACE +#include + + +extern int sgemm(const float *A, const float *B, float *C, const int OUTERMOST_I, const int OUTERMOST_J, const int OUTERMOST_K, DCTX(ctx)); + +#endif + + diff --git a/src/runtime/local/kernels/genKernelInst.py b/src/runtime/local/kernels/genKernelInst.py index 1b6708092..2566f4f33 100755 --- a/src/runtime/local/kernels/genKernelInst.py +++ b/src/runtime/local/kernels/genKernelInst.py @@ -204,7 +204,6 @@ def printHelp(): print() printHelp() sys.exit(1) - # Parse arguments. inFilePath = sys.argv[1] outFilePath = sys.argv[2] diff --git a/src/runtime/local/kernels/kernels.json b/src/runtime/local/kernels/kernels.json index 1087fc0bd..fc8931bfe 100644 --- a/src/runtime/local/kernels/kernels.json +++ b/src/runtime/local/kernels/kernels.json @@ -433,6 +433,21 @@ }] }, + { + "kernelTemplate": { + "header": "CreateFPGAContext.h", + "opName": "createFPGAContext", + "returnType": "void", + "templateParams": [], + "runtimeParams": [] + }, + "api": [ + { + "name": ["FPGAOPENCL"], + "instantiations": [[]] + + }] + }, { "kernelTemplate": { "header": "CreateDistributedContext.h", @@ -1079,6 +1094,12 @@ [["DenseMatrix", "float"], ["DenseMatrix", "float"], ["DenseMatrix", "float"]], [["DenseMatrix", "double"], ["DenseMatrix", "double"], ["DenseMatrix", "double"]] ] + }, + { + "name": ["FPGAOPENCL"], + "instantiations": [ + [["DenseMatrix", "float"], ["DenseMatrix", "float"], ["DenseMatrix", "float"]] + ] } ] }, @@ -2812,7 +2833,7 @@ { "name": ["CPP"], "instantiations": [ - [["DenseMatrix", "double"]] + [["DenseMatrix", "double"]] ] } ]