diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8aa705dc5..8664e3520 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -145,6 +145,19 @@ if(USE_ARROW)
     message(STATUS "Arrow/Parquet enabled")
 endif()
 
+option(USE_FPGAOPENCL "Whether to activate compilation of FPGA OpenCL features" OFF)
+if(USE_FPGAOPENCL)
+	if(NOT DEFINED ENV{QUARTUSDIR})
+		message(SEND_ERROR "Intel(R) Quartus installation directory should be defined by QUARTUSDIR varaiable (e.g. /opt/intel/intelFPGA_pro/21.4/)")
+		execute_process(COMMAND sleep 10)
+	endif()
+	include_directories($ENV{QUARTUSDIR}/hld/examples_aoc/common/inc/ $ENV{QUARTUSDIR}/hld/host/include/)
+	message(STATUS "cmake: using FPGA")
+	add_definitions(-DUSE_FPGAOPENCL)
+endif()
+
+set(CMAKE_VERBOSE_MAKEFILE ON)
+
 # *****************************************************************************
 # Project-specific include directories
 # *****************************************************************************
@@ -170,6 +183,10 @@ add_subdirectory(src/runtime/distributed/worker)
 add_subdirectory(src/runtime/local/datastructures)
 add_subdirectory(src/runtime/local/io)
 add_subdirectory(src/runtime/local/kernels)
+if(USE_FPGAOPENCL)
+    add_subdirectory(src/runtime/local/kernels/FPGAOPENCL)
+endif() 
 add_subdirectory(src/util)
 
+
 add_subdirectory(test)
diff --git a/build.sh b/build.sh
index 0da089f64..18114ad86 100755
--- a/build.sh
+++ b/build.sh
@@ -381,6 +381,7 @@ par_acceptAll="0"
 unknown_options=""
 BUILD_CUDA="-DUSE_CUDA=OFF"
 BUILD_ARROW="-DUSE_ARROW=OFF"
+BUILD_FPGAOPENCL="-DUSE_FPGAOPENCL=OFF"
 BUILD_DEBUG="-DCMAKE_BUILD_TYPE=Release"
 
 while [[ $# -gt 0 ]]; do
@@ -414,6 +415,10 @@ while [[ $# -gt 0 ]]; do
             echo using ARROW
             BUILD_ARROW="-DUSE_ARROW=ON"
             ;;
+        --fpgaopencl)
+            echo using FPGAOPENCL
+            export BUILD_FPGAOPENCL="-DUSE_FPGAOPENCL=ON"
+            ;;
         --debug)
             echo building DEBUG version
             export BUILD_DEBUG="-DCMAKE_BUILD_TYPE=Debug"
@@ -701,6 +706,17 @@ else
     daphne_msg "No need to build MLIR/LLVM again."
 fi
 
+if [[ $BUILD_FPGAOPENCL = *"ON"* ]]; then
+  FPGAOPENCL_BISTREAM_DIR="$projectRoot/src/runtime/local/kernels/FPGAOPENCL/bitstreams"
+  FPGAOPENCL_BISTREAM_URL="https://github.com/daphne-eu/supplemental-binaries/raw/main/fpga_bitstreams/"
+  if [ ! -d $FPGAOPENCL_BISTREAM_DIR ]; then
+    echo fetching FPGA bitstreams
+    mkdir -p $FPGAOPENCL_BISTREAM_DIR
+    cd $FPGAOPENCL_BISTREAM_DIR
+    wget $FPGAOPENCL_BISTREAM_URL/sgemm.aocx
+    cd -
+  fi
+fi
 
 # *****************************************************************************
 # Build DAPHNE target.
@@ -708,7 +724,7 @@ fi
 
 daphne_msg "Build Daphne"
 
-cmake -S "$projectRoot" -B "$daphneBuildDir" -G Ninja $BUILD_CUDA $BUILD_ARROW $BUILD_DEBUG \
+cmake -S "$projectRoot" -B "$daphneBuildDir" -G Ninja $BUILD_CUDA $BUILD_ARROW $BUILD_FPGAOPENCL  $BUILD_DEBUG \
   -DCMAKE_PREFIX_PATH="$installPrefix" -DANTLR_VERSION="$antlrVersion"  \
   -DMLIR_DIR="$buildPrefix/$llvmName/lib/cmake/mlir/" \
   -DLLVM_DIR="$buildPrefix/$llvmName/lib/cmake/llvm/"
diff --git a/doc/FPGAconfiguration.md b/doc/FPGAconfiguration.md
new file mode 100644
index 000000000..8cd702d0d
--- /dev/null
+++ b/doc/FPGAconfiguration.md
@@ -0,0 +1,62 @@
+<!--
+Copyright 2021 The DAPHNE Consortium
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# FPGA configuration for usage in DAPHNE 
+
+
+### System requirments
+
+Daphne build script for FPGA kernels support requires additional QUARTUSDIR system variable definition.
+Example command is presented in fpga-build-env.sh or in the following command:
+
+export QUARTUSDIR=/opt/intel/intelFPGA_pro/21.4
+
+To build the Daphne with the FPGA support -fpgaopencl flag has to be used:
+
+ ./build.sh --fpgaopenc 
+
+
+To run developed or precompiled, included in Daphne repository FPGA OpenCL kernels an installedand configured  FPGA device is required.
+Our example kernels have been tested using Intel(R) PAC D5005 card (https://www.intel.com/content/www/us/en/products/sku/193921/intel-fpga-pac-d5005/specifications.html)
+
+DAPHNE contains some example linear algebra kernels developed using T2SP framework(https://github.com/IntelLabs/t2sp/blob/master/README.md). 
+Example precompiled FPGA kernels can be usedon  DAPHNE DSL description level. 
+To prepare the system for the precompiled FPGA kernels some FPGA and OpenCL system variables are required. 
+The easiest way to set up required varables is to use the init_opencl.sh script from installed Intel(R) Quartus sowtware or from the 
+Intel(R) OpenCL RTE or Intel(R) OpenCL SDK packages. 
+
+Example script usage:
+source /opt/intel/intelFPGA_pro/21.4/hld/init_opencl.sh
+
+For additional details please look into https://www.intel.com/content/www/us/en/docs/programmable/683550/18-1/standard-edition-getting-started-guide.html
+or https://www.intel.com/content/www/us/en/software/programmable/sdk-for-opencl/overview.html. 
+
+
+### Precompiled FPGA kernels usage 
+
+To use a precompiled FPGA kernel a FPGA image is required (*.aocx). FPGA device has to programmed with particular image which contains required kernel implementation.
+Example FPGA programming command using example FPGA image:
+
+ aocl program acl0 src/runtime/local/kernels/FPGAOPENCL/bitstreams/sgemm.aocx   
+
+
+Additionally the BITSTREAM variable has to be defind in the system.
+Please look into the following example:
+
+ export BITSTREAM=src/runtime/local/kernels/FPGAOPENCL/bitstreams/sgemm.aocx 
+
+When another FPGA image contains implementation for another required computational kernel then FPGA device has to be reprogrammed and BITSTREAM variable value has to be changed.
+
diff --git a/fpga-build-env.sh b/fpga-build-env.sh
new file mode 100644
index 000000000..ca3958fc3
--- /dev/null
+++ b/fpga-build-env.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+#initialize Intel FPGA OpenCL environment
+export QUARTUSDIR=/opt/intel/intelFPGA_pro/21.4
+source $QUARTUSDIR/hld/init_opencl.sh
+echo $INTELFPGAOCLSDKROOT
+export ALTERAOCLSDKROOT=$INTELFPGAOCLSDKROOT
+#set up BITSTREAM variable for required FPGA image (can be different different for varius implemented kernels)
+export BITSTREAM=src/runtime/local/kernels/FPGAOPENCL/bitstreams/sgemm.aocx  # SGEMM computational kernel
+
diff --git a/scripts/examples/fpga-gemm.daph b/scripts/examples/fpga-gemm.daph
new file mode 100644
index 000000000..55fcc743a
--- /dev/null
+++ b/scripts/examples/fpga-gemm.daph
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2021 The DAPHNE Consortium
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+############## IMPORTANT #################
+# EXAMPLE command to run the script: 
+# 
+# env BITSTREAM=../../src/runtime/local/kernels/FPGAOPENCL/bistreams/sgemm.aocx  ./build/bin/daphne -fpgaopencl scripts/examples/fpga-gemm.daph
+# 
+# WHERE: 
+# -fpgaopencl is a flag required for FPGA usage
+# BITSTREAM variable point out required FPGA image with its location 
+#########################################
+
+# Creating input matrices
+m  = rand(448,1024, as.f32(2.0), as.f32(2.0), 1.0, -1);
+//m  = rand(896,8192, as.f32(2.0), as.f32(2.0), 1.0, -1);
+//m  = rand(448,16384, as.f32(2.0), as.f32(2.0), 1.0, -1);
+
+m2 = rand(1024,512, as.f32(1.0), as.f32(1.0), 1.0, -1);
+//m2 = rand(8192,1024, as.f32(1.0), as.f32(1.0), 1.0, -1);
+//m2 = rand(16384,1024, as.f32(1.0), as.f32(1.0), 1.0, -1);
+
+# test prints
+//print(m[0,]);
+//print(m2[0,]);
+
+Z = m @ m2;
+//print(Z[,0]);
+
+print("Bye!");
diff --git a/src/api/cli/CMakeLists.txt b/src/api/cli/CMakeLists.txt
index 316045715..3faad34b9 100644
--- a/src/api/cli/CMakeLists.txt
+++ b/src/api/cli/CMakeLists.txt
@@ -32,6 +32,10 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER)
     list(APPEND LIB_DEPS CUDAKernels)
 endif()
 
+if(USE_FPGAOPENCL)
+    list(APPEND LIB_DEPS FPGAOPENCLKernels)
+endif()
+
 add_llvm_executable(daphne daphne.cpp DaphneUserConfig.h DEPENDS ${LIB_DEPS})
 
 llvm_update_compile_flags(daphne)
diff --git a/src/api/cli/DaphneUserConfig.h b/src/api/cli/DaphneUserConfig.h
index 37870f0db..0ee08d76e 100644
--- a/src/api/cli/DaphneUserConfig.h
+++ b/src/api/cli/DaphneUserConfig.h
@@ -40,6 +40,7 @@ struct DaphneUserConfig {
     bool pinWorkers = false;
     bool hyperthreadingEnabled = false;
     bool debugMultiThreading = false;
+    bool use_fpgaopencl = false;
 
     bool debug_llvm = false;
     bool explain_kernels = false;
@@ -65,6 +66,11 @@ struct DaphneUserConfig {
     // ToDo: This is an arbitrary default taken from sample code
 //    int cublas_workspace_size = 1024 * 1024 * 4;
 #endif
+#ifdef USE_FPGAOPENCL
+    std::vector<int> fpga_devices;
+#endif
+    
+    
     std::string libdir;
     std::vector<std::string> library_paths;
 };
diff --git a/src/api/cli/daphne.cpp b/src/api/cli/daphne.cpp
index c9fa22e9b..3225f30f4 100644
--- a/src/api/cli/daphne.cpp
+++ b/src/api/cli/daphne.cpp
@@ -176,6 +176,10 @@ main(int argc, char** argv)
             "cuda", cat(daphneOptions),
             desc("Use CUDA")
     );
+    opt<bool> fpgaopencl(
+            "fpgaopencl", cat(daphneOptions),
+            desc("Use FPGAOPENCL")
+    );
     opt<string> libDir(
             "libdir", cat(daphneOptions),
             desc("The directory containing kernel libraries")
@@ -324,6 +328,11 @@ main(int argc, char** argv)
         }
     }
 
+    if(fpgaopencl) {
+        user_config.use_fpgaopencl = true;
+    }
+
+
     // add this after the cli args loop to work around args order
     if(!user_config.libdir.empty() && user_config.use_cuda)
             user_config.library_paths.push_back(user_config.libdir + "/libCUDAKernels.so");
diff --git a/src/compiler/execution/DaphneIrExecutor.cpp b/src/compiler/execution/DaphneIrExecutor.cpp
index 94e857c27..412ca894d 100644
--- a/src/compiler/execution/DaphneIrExecutor.cpp
+++ b/src/compiler/execution/DaphneIrExecutor.cpp
@@ -135,6 +135,12 @@ bool DaphneIrExecutor::runPasses(mlir::ModuleOp module)
             pm.addNestedPass<mlir::FuncOp>(mlir::daphne::createMarkCUDAOpsPass(userConfig_));
 #endif
 
+#ifdef USE_FPGAOPENCL
+        if(userConfig_.use_fpgaopencl)
+            pm.addNestedPass<mlir::FuncOp>(mlir::daphne::createMarkFPGAOPENCLOpsPass(userConfig_));
+#endif
+
+
         if(userConfig_.use_obj_ref_mgnt)
             pm.addNestedPass<mlir::FuncOp>(mlir::daphne::createManageObjRefsPass());
         if(userConfig_.explain_obj_ref_mgnt)
@@ -182,6 +188,14 @@ std::unique_ptr<mlir::ExecutionEngine> DaphneIrExecutor::createExecutionEngine(m
             }
         }
 #endif
+ 
+#ifdef USE_FPGAOPENCL
+        if(userConfig_.use_fpgaopencl) {
+            if(userConfig_.libdir.empty()) {
+                sharedLibRefs.push_back("build/src/runtime/local/kernels/libFPGAOPENCLKernels.so");
+            }
+        }
+#endif
         registerLLVMDialectTranslation(context_);
         // module.dump();
         auto maybeEngine = mlir::ExecutionEngine::create(
diff --git a/src/compiler/lowering/CMakeLists.txt b/src/compiler/lowering/CMakeLists.txt
index b4d0aba3c..887c86037 100644
--- a/src/compiler/lowering/CMakeLists.txt
+++ b/src/compiler/lowering/CMakeLists.txt
@@ -17,6 +17,7 @@ add_mlir_dialect_library(MLIRDaphneTransforms
     DistributeComputationsPass.cpp
     DistributePipelinesPass.cpp
     MarkCUDAOpsPass.cpp
+    MarkFPGAOPENCLOpsPass.cpp
     InsertDaphneContextPass.cpp
     ManageObjRefsPass.cpp
     LowerToLLVMPass.cpp
diff --git a/src/compiler/lowering/InsertDaphneContextPass.cpp b/src/compiler/lowering/InsertDaphneContextPass.cpp
index 56e9c4055..de6ed0b01 100644
--- a/src/compiler/lowering/InsertDaphneContextPass.cpp
+++ b/src/compiler/lowering/InsertDaphneContextPass.cpp
@@ -63,6 +63,13 @@ void InsertDaphneContextPass::runOnFunction()
     if (user_config.use_distributed){
         builder.create<daphne::CreateDistributedContextOp>(loc);
     }
+#ifdef USE_FPGAOPENCL
+    if(user_config.use_fpgaopencl) {
+        builder.create<daphne::CreateFPGAContextOp>(loc);
+    }
+#endif
+
+ 
     // Insert a DestroyDaphneContextOp as the last operation in the block, but
     // before the block's terminator.
     builder.setInsertionPoint(b.getTerminator());
diff --git a/src/compiler/lowering/MarkFPGAOPENCLOpsPass.cpp b/src/compiler/lowering/MarkFPGAOPENCLOpsPass.cpp
new file mode 100644
index 000000000..da8a9223d
--- /dev/null
+++ b/src/compiler/lowering/MarkFPGAOPENCLOpsPass.cpp
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2021 The DAPHNE Consortium
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+#ifdef USE_FPGAOPENCL
+#include "compiler/CompilerUtils.h"
+#include "ir/daphneir/Daphne.h"
+#include "ir/daphneir/Passes.h"
+#include <mlir/IR/BlockAndValueMapping.h>
+
+#include <iostream>
+
+using namespace mlir;
+
+struct MarkFPGAOPENCLOpsPass : public PassWrapper<MarkFPGAOPENCLOpsPass, FunctionPass> {
+
+    /**
+     * @brief User configuration influencing the rewrite pass
+     */
+    const DaphneUserConfig& cfg;
+
+    explicit MarkFPGAOPENCLOpsPass(const DaphneUserConfig& cfg) : cfg(cfg) {
+    }
+
+    void runOnFunction() final;
+
+    bool checkUseFPGAOPENCL(Operation* op) const {
+//        std::cout << "checkUseFPGAOPENCL: " << op->getName().getStringRef().str() << std::endl;
+        return op->hasTrait<mlir::OpTrait::FPGAOPENCLSupport>();
+    }
+};
+
+void MarkFPGAOPENCLOpsPass::runOnFunction() {
+    getFunction()->walk([&](Operation* op) {
+        OpBuilder builder(op);
+        if(checkUseFPGAOPENCL(op)) {
+            op->setAttr("fpgaopencl_device", builder.getI32IntegerAttr(0));
+        }
+        WalkResult::advance();
+    });
+}
+
+std::unique_ptr<Pass> daphne::createMarkFPGAOPENCLOpsPass(const DaphneUserConfig& cfg) {
+    return std::make_unique<MarkFPGAOPENCLOpsPass>(cfg);
+}
+#endif 
diff --git a/src/compiler/lowering/RewriteToCallKernelOpPass.cpp b/src/compiler/lowering/RewriteToCallKernelOpPass.cpp
index 3f0297d16..dd968b9d2 100644
--- a/src/compiler/lowering/RewriteToCallKernelOpPass.cpp
+++ b/src/compiler/lowering/RewriteToCallKernelOpPass.cpp
@@ -153,6 +153,10 @@ namespace
 //                else
 //                    std::cout << "attr = null: " << op->getName().getStringRef().str() << std::endl;
             }
+	    else if(op->hasAttr("fpgaopencl_device")) {
+		 callee << "FPGAOPENCL";
+	    }
+		    
 
             callee << '_' << op->getName().stripDialect().data();
 
diff --git a/src/ir/daphneir/Daphne.h b/src/ir/daphneir/Daphne.h
index c3b941d00..6c56bd5c4 100644
--- a/src/ir/daphneir/Daphne.h
+++ b/src/ir/daphneir/Daphne.h
@@ -47,6 +47,12 @@
 #include <utility>
 #include <vector>
 
+namespace mlir::OpTrait {
+    template<class ConcreteOp>
+    class FPGAOPENCLSupport : public TraitBase<ConcreteOp, FPGAOPENCLSupport> {
+    };
+}
+
 namespace mlir::daphne {
     enum class MatrixRepresentation {
         Dense = 0,
@@ -67,4 +73,4 @@ namespace mlir::daphne {
 #define GET_OP_CLASSES
 #include "ir/daphneir/DaphneOps.h.inc"
 
-#endif //SRC_IR_DAPHNEIR_DAPHNE_H
\ No newline at end of file
+#endif //SRC_IR_DAPHNEIR_DAPHNE_H
diff --git a/src/ir/daphneir/DaphneOps.td b/src/ir/daphneir/DaphneOps.td
index 29ae28d75..a423f8878 100644
--- a/src/ir/daphneir/DaphneOps.td
+++ b/src/ir/daphneir/DaphneOps.td
@@ -28,6 +28,7 @@ include "ir/daphneir/DaphneInferTypesOpInterface.td"
 include "ir/daphneir/DaphneVectorizableOpInterface.td"
 include "ir/daphneir/DaphneShapeInferenceTraits.td"
 include "ir/daphneir/CUDASupport.td"
+include "ir/daphneir/FPGAOPENCLSupport.td"
 
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/Interfaces/ControlFlowInterfaces.td"
@@ -142,7 +143,7 @@ def Daphne_MatMulOp : Daphne_Op<"matMul", [
     DeclareOpInterfaceMethods<VectorizableOpInterface>,
     DeclareOpInterfaceMethods<InferTypesOpInterface>,
     DeclareOpInterfaceMethods<InferShapeOpInterface>,
-    DeclareOpInterfaceMethods<InferSparsityOpInterface>, CUDASupport
+    DeclareOpInterfaceMethods<InferSparsityOpInterface>, CUDASupport, FPGAOPENCLSupport
 ]> {
     let arguments = (ins MatrixOf<[NumScalar]>:$lhs, MatrixOf<[NumScalar]>:$rhs, BoolScalar:$transa, BoolScalar:$transb);
     let results = (outs MatrixOf<[NumScalar]>:$res);
@@ -1212,6 +1213,12 @@ def Daphne_CreateDistributedContextOp : Daphne_Op<"createDistributedContext", []
     let results = (outs);
 }
 
+def Daphne_CreateFPGAContextOp : Daphne_Op<"createFPGAContext", [FPGAOPENCLSupport]> {
+    let arguments = (ins);
+    let results = (outs);
+}
+
+
 // ****************************************************************************
 // Vectorized operations
 // ****************************************************************************
diff --git a/src/ir/daphneir/FPGAOPENCLSupport.td b/src/ir/daphneir/FPGAOPENCLSupport.td
new file mode 100644
index 000000000..4322b6aa5
--- /dev/null
+++ b/src/ir/daphneir/FPGAOPENCLSupport.td
@@ -0,0 +1,24 @@
+/*
+ *  Copyright 2021 The DAPHNE Consortium
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#ifndef SRC_IR_DAPHNEIR_FPGAOPENCLSUPPORT_TD
+#define SRC_IR_DAPHNEIR_FPGAOPENCLSUPPORT_TD
+
+include "mlir/IR/OpBase.td"
+
+def FPGAOPENCLSupport : NativeOpTrait<"FPGAOPENCLSupport">;
+
+#endif // SRC_IR_DAPHNEIR_FPGAOPENCLSUPPORT_TD 
diff --git a/src/ir/daphneir/Passes.h b/src/ir/daphneir/Passes.h
index 3aa1d72cb..6f91bb21d 100644
--- a/src/ir/daphneir/Passes.h
+++ b/src/ir/daphneir/Passes.h
@@ -57,6 +57,10 @@ namespace mlir::daphne {
     std::unique_ptr<Pass> createMarkCUDAOpsPass(const DaphneUserConfig& cfg);
 #endif
 
+#ifdef USE_FPGAOPENCL
+    std::unique_ptr<Pass> createMarkFPGAOPENCLOpsPass(const DaphneUserConfig& cfg);
+#endif
+
 #define GEN_PASS_REGISTRATION
 #include "ir/daphneir/Passes.h.inc"
 } // namespace mlir::daphne
diff --git a/src/runtime/local/context/DaphneContext.h b/src/runtime/local/context/DaphneContext.h
index cfd262f5b..af5ed0e37 100644
--- a/src/runtime/local/context/DaphneContext.h
+++ b/src/runtime/local/context/DaphneContext.h
@@ -24,6 +24,13 @@
 
 #include "IContext.h"
 
+#ifdef USE_FPGAOPENCL
+    #include "FPGAContext.h"
+#endif
+
+
+
+
 // This macro is intended to be used in kernel function signatures, such that
 // we can change the ubiquitous DaphneContext parameter in a single place, if
 // required.
@@ -47,6 +54,8 @@ struct DaphneContext {
 
 
     std::vector<std::unique_ptr<IContext>> cuda_contexts;
+    std::vector<std::unique_ptr<IContext>> fpga_contexts;
+
 
     std::unique_ptr<IContext> distributed_context;
 
@@ -67,7 +76,13 @@ struct DaphneContext {
         for (auto& ctx : cuda_contexts) {
             ctx->destroy();
         }
+        for (auto& ctx : fpga_contexts) {
+            ctx->destroy();
+        }
         cuda_contexts.clear();
+        fpga_contexts.clear();
+
+
     }
 
 #ifdef USE_CUDA
@@ -76,12 +91,23 @@ struct DaphneContext {
         return cuda_contexts[dev_id].get();
     }
 #endif
+#ifdef USE_FPGAOPENCL
+    // ToDo: in a multi device setting this should use a find call instead of a direct [] access
+    [[nodiscard]] FPGAContext* getFPGAContext(int dev_id) const {
+ //	std::cout<<"inside getFPGAContext"<<std::endl;
+       return dynamic_cast<FPGAContext*>(fpga_contexts[dev_id].get());
+    }
+#endif
+ 
+
+
 
     [[nodiscard]] bool useCUDA() const { return !cuda_contexts.empty(); }
+    [[nodiscard]] bool useFPGA() const { return !fpga_contexts.empty(); }
 
     [[nodiscard]] IContext *getDistributedContext() const {
         return distributed_context.get();
     }
     
     [[maybe_unused]] [[nodiscard]] DaphneUserConfig getUserConfig() const { return config; }
-};
+};
\ No newline at end of file
diff --git a/src/runtime/local/context/FPGAContext.cpp b/src/runtime/local/context/FPGAContext.cpp
new file mode 100644
index 000000000..14131617e
--- /dev/null
+++ b/src/runtime/local/context/FPGAContext.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright 2021 The DAPHNE Consortium
+ 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "AOCLUtils/aocl_utils.h"
+#include "CL/opencl.h"
+#include "runtime/local/context/FPGAContext.h"
+//#include <cstdio>
+//#include <cstdlib>
+//#include <cstring>
+//#include <fstream>
+//#include <iomanip>
+//#include <iostream>
+//#include <math.h>
+//#include <sstream>
+//#include <stdint.h>
+//#include <stdio.h>
+
+using namespace std;
+using namespace aocl_utils;
+
+#define DPRINTF(...)     \
+    printf(__VA_ARGS__); \
+    fflush(stdout);
+
+#define CHECK(status)                                       \
+    if (status != CL_SUCCESS) {                             \
+        printf("error %d in line %d.\n", status, __LINE__); \
+        exit(1);                                            \
+    }
+
+void FPGAContext::destroy() {
+#ifndef NDEBUG
+    std::cout << "Destroying FPGA context..." << std::endl;
+#endif
+}
+
+void FPGAContext::init() {
+#ifndef NDEBUG
+    std::cout << "creating FPGA context..." << std::endl;
+    DPRINTF("\n===== Host-CPU setting up the OpenCL platform and device ======\n\n");
+    unsigned int buf_uint;
+#endif
+    cl_int status;
+    char buffer[4096];
+    int device_found = 0;
+
+    // Use clGetPlatformIDs() to retrieve the  number of platforms
+    status = clGetPlatformIDs(0, NULL, &numPlatforms);
+#ifndef NDEBUG
+    DPRINTF("Number of platforms = %d\n", numPlatforms);
+#endif
+    // Allocate enough space for each platform
+    platforms = (cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id));
+#ifndef NDEBUG
+    DPRINTF("Allocated space for Platform\n");
+#endif
+    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
+    CHECK(status);
+#ifndef NDEBUG
+    DPRINTF("Filled in platforms\n");
+    DPRINTF("Initializing IDs\n");
+#endif
+    for (int i = 0; i < (int)numPlatforms; i++) {
+        status = clGetDeviceIDs(platforms[i],
+                                CL_DEVICE_TYPE_ALL,
+                                maxDevices,
+                                devices,
+                                &numDevices);
+
+        if (status == CL_SUCCESS) {
+            clGetPlatformInfo(platforms[i],
+                              CL_PLATFORM_NAME,
+                              4096,
+                              buffer,
+                              NULL);
+#if defined(ALTERA_CL)
+            if (strstr(buffer, "Altera") != NULL) {
+                device_found = 1;
+            }
+//            DPRINTF("%s\n", buffer);
+#elif defined(NVIDIA_CL)
+            if (strstr(buffer, "NVIDIA") != NULL) {
+                device_found = 1;
+            }
+#else
+            if (strstr(buffer, "Intel") != NULL) {
+                device_found = 1;
+            }
+#endif
+#ifndef NDEBUG
+            DPRINTF("Platform found : %s\n", buffer);
+#endif
+	    device_found = 1;
+        }
+    }
+    if (!device_found) {
+        DPRINTF("failed to find a OpenCL device\n");
+        exit(-1);
+    }
+#ifndef NDEBUG
+    DPRINTF("Total number of devices: %d", numDevices);
+    for (unsigned int i = 0; i < numDevices; i++) {
+        clGetDeviceInfo(devices[i],
+                        CL_DEVICE_NAME,
+                        4096,
+                        buffer,
+                        NULL);
+        DPRINTF("\nDevice Name: %s\n", buffer);
+        clGetDeviceInfo(devices[i],
+                        CL_DEVICE_VENDOR,
+                        4096,
+                        buffer,
+                        NULL);
+        DPRINTF("Device Vendor: %s\n", buffer);
+        clGetDeviceInfo(devices[i],
+                        CL_DEVICE_MAX_COMPUTE_UNITS,
+                        sizeof(buf_uint),
+                        &buf_uint,
+                        NULL);
+        DPRINTF("Device Computing Units: %u\n", buf_uint);
+        clGetDeviceInfo(devices[i],
+                        CL_DEVICE_GLOBAL_MEM_SIZE,
+                        sizeof(unsigned long),
+                        &buffer,
+                        NULL);
+        DPRINTF("Global Memory Size: %li\n", *((unsigned long*)buffer));
+        clGetDeviceInfo(devices[i],
+                        CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                        sizeof(unsigned long),
+                        &buffer,
+                        NULL);
+        DPRINTF("Global Memory Allocation Size: %li\n\n", *((unsigned long*)buffer));
+    }
+#endif
+    //----------------------------------------------
+    // Create a context
+#ifndef NDEBUG
+    DPRINTF("\n===== Host-CPU setting up the OpenCL command queues ======\n\n");
+#endif
+    context = clCreateContext(
+        NULL,
+        1,
+        devices,
+        NULL,
+        NULL,
+        &status);
+    CHECK(status);
+}
+
+std::unique_ptr<IContext> FPGAContext::createFpgaContext(int device_id) {
+
+/*    	if(FPGAContext::numDevices < 1) {
+        std::cerr << "Not creating requested FPGA context. No FPGA devices available." << std::endl;
+        return nullptr;
+    }
+
+    if(device_id >= (int)numDevices) {
+        std::cerr << "Requested device ID " << device_id << " >= device count "<<std::endl;// << device_count << std::endl;
+        return nullptr;
+    }
+*/
+    auto ctx = std::unique_ptr<FPGAContext>(new FPGAContext(device_id));
+    ctx->init();
+
+    return ctx;
+}
+
+
diff --git a/src/runtime/local/context/FPGAContext.h b/src/runtime/local/context/FPGAContext.h
new file mode 100644
index 000000000..ff3366cd1
--- /dev/null
+++ b/src/runtime/local/context/FPGAContext.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2021 The DAPHNE Consortium
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+//#include <api/cli/DaphneUserConfig.h>
+#include "IContext.h"
+//#include <runtime/local/kernels/CUDA/HostUtils.h>
+//#include <cublasLt.h>
+#include <AOCLUtils/aocl_utils.h>
+#include <CL/opencl.h>
+#include <cassert>
+#include <iostream>
+#include <memory>
+
+class FPGAContext : public IContext {
+    int device_id = -1;
+    size_t mem_budget = 0;
+
+
+    explicit FPGAContext(int id) : device_id(id) {
+	    //std::cout<<"fpga context constructor"<<std::endl; 
+	    }
+public:
+    cl_uint numPlatforms = 0;
+    cl_platform_id *platforms = NULL;
+    cl_uint maxDevices = 4;
+    cl_device_id devices[4];//maxDevices];
+    cl_uint numDevices = 0;
+    cl_context context = NULL;
+
+
+
+
+
+    FPGAContext() = delete;
+    FPGAContext(const FPGAContext&) = delete;
+    FPGAContext& operator=(const FPGAContext&) = delete;
+    ~FPGAContext() = default;
+
+    void destroy() override;
+    static std::unique_ptr<IContext> createFpgaContext(int id);
+
+//    [[nodiscard]] cublasHandle_t getCublasHandle() const { return cublas_handle; }
+//    [[nodiscard]] cusparseHandle_t getCusparseHandle() const { return cusparse_handle; }
+
+  //  [[nodiscard]] const cudaDeviceProp* getDeviceProperties() const { return &device_properties; }
+
+
+//    size_t getMemBudget() { return mem_budget; }
+
+
+private:
+    void init();
+};
diff --git a/src/runtime/local/kernels/FPGAOPENCL/CMakeLists.txt b/src/runtime/local/kernels/FPGAOPENCL/CMakeLists.txt
new file mode 100644
index 000000000..09b68d56d
--- /dev/null
+++ b/src/runtime/local/kernels/FPGAOPENCL/CMakeLists.txt
@@ -0,0 +1,53 @@
+# Copyright 2021 The DAPHNE Consortium
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Specifies how to generate the file "kernels.cpp" (which resides in the build
+# directory) as the basis for the pre-compiled kernels library.
+
+#       * more properly set up paths (ugly relative include path below)
+#cmake_minimum_required(VERSION 3.21)
+
+#project(FPGAOPENCLkernels)
+
+include_directories($ENV{QUARTUSDIR}/hld/examples_aoc/common/inc/ $ENV{QUARTUSDIR}/hld/host/include/)
+
+set(PREFIX ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/FPGAOPENCL)
+#message(STATUS "cmake fpga test! $ENV{QUARTUSDIR}")
+add_custom_command(
+    OUTPUT ${PROJECT_BINARY_DIR}/FPGAOPENCLkernels.cpp
+    COMMAND python3 ARGS genKernelInst.py kernels.json
+    ${PROJECT_BINARY_DIR}/FPGAOPENCLkernels.cpp FPGAOPENCL
+    MAIN_DEPENDENCY ${PREFIX}/../kernels.json
+    DEPENDS ${PREFIX}/../genKernelInst.py
+    WORKING_DIRECTORY ${PREFIX}/..
+)
+
+#set(PREFIX ${PROJECT_SOURCE_DIR}/src/runtime/local/kernels/FPGAOPENCL)
+set(FPGAOPENCLKernels_SRC
+	${PREFIX}/../../context/FPGAContext.cpp
+ 	${PREFIX}/../../context/FPGAContext.h
+	${PREFIX}/CreateFPGAContext.h
+        ${PREFIX}/MatMul.h
+	${PREFIX}/gemm_interface.cpp
+	${PREFIX}/gemm_interface.h
+	${PROJECT_BINARY_DIR}/FPGAOPENCLkernels.cpp
+)
+
+add_library(FPGAOPENCLKernels SHARED ${FPGAOPENCLKernels_SRC})
+set_target_properties(FPGAOPENCLKernels PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/src/runtime/local/kernels)
+
+target_include_directories(FPGAOPENCLKernels PUBLIC ${PROJECT_SOURCE_DIR}/src/)
+
+target_link_libraries(FPGAOPENCLKernels PUBLIC AllKernels LLVMSupport $ENV{QUARTUSDIR}/hld/linux64/lib/libOpenCL.so)
+
diff --git a/src/runtime/local/kernels/FPGAOPENCL/CreateFPGAContext.h b/src/runtime/local/kernels/FPGAOPENCL/CreateFPGAContext.h
new file mode 100644
index 000000000..e208eedf9
--- /dev/null
+++ b/src/runtime/local/kernels/FPGAOPENCL/CreateFPGAContext.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2021 The DAPHNE Consortium
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "runtime/local/context/DaphneContext.h"
+#include "runtime/local/context/FPGAContext.h"
+
+// ****************************************************************************
+// Convenience function
+// ****************************************************************************
+
+namespace FPGAOPENCL {
+    static void createFPGAContext(DCTX(ctx)) {
+        // ToDo: one context per device
+        ctx->fpga_contexts.emplace_back(FPGAContext::createFpgaContext(0));
+    }
+}
diff --git a/src/runtime/local/kernels/FPGAOPENCL/MatMul.h b/src/runtime/local/kernels/FPGAOPENCL/MatMul.h
new file mode 100644
index 000000000..f03a7ce1b
--- /dev/null
+++ b/src/runtime/local/kernels/FPGAOPENCL/MatMul.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright 2021 The DAPHNE Consortium
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_RUNTIME_LOCAL_KERNELS_MATMUL_H
+#define SRC_RUNTIME_LOCAL_KERNELS_MATMUL_H
+
+#include <runtime/local/context/DaphneContext.h>
+#include <runtime/local/datastructures/DataObjectFactory.h>
+#include <runtime/local/datastructures/DenseMatrix.h>
+
+#include <cblas.h>
+#include <math.h>
+
+#include <cassert>
+#include <cstddef>
+#include "gemm_interface.h"
+
+namespace FPGAOPENCL {
+// ****************************************************************************
+// Struct for partial template specialization
+// ****************************************************************************
+
+template<class DTRes, class DTLhs, class DTRhs>
+struct MatMul {
+//   static void apply(DTRes *& res, const DTLhs * lhs, const DTRhs * rhs, DCTX(ctx)) = delete;
+   static void apply(DTRes *& res, const DTLhs * lhs, const DTRhs * rhs, bool transa, bool transb, DCTX(ctx)) = delete;
+};
+
+// ****************************************************************************
+// Convenience function
+// ****************************************************************************
+
+template<class DTRes, class DTLhs, class DTRhs>
+void matMul(DTRes *& res, const DTLhs * lhs, const DTRhs * rhs, bool transa, bool transb, DCTX(ctx)) {
+    MatMul<DTRes, DTLhs, DTRhs>::apply(res, lhs, rhs,transa, transb, ctx);
+}
+
+// ****************************************************************************
+// (Partial) template specializations for different data/value types
+// ****************************************************************************
+
+// ----------------------------------------------------------------------------
+// DenseMatrix <- DenseMatrix, DenseMatrix
+// ----------------------------------------------------------------------------
+
+template<>
+struct MatMul<DenseMatrix<float>, DenseMatrix<float>, DenseMatrix<float>> {
+    static void apply(DenseMatrix<float> *& res, const DenseMatrix<float> * lhs, const DenseMatrix<float> * rhs, bool transa, bool transb,DCTX(ctx)) {
+        const size_t nr1 = lhs->getNumRows();
+        const size_t nc1 = lhs->getNumCols();
+        const size_t nc2 = rhs->getNumCols();
+#ifndef NDEBUG
+        const size_t nr2 = rhs->getNumRows();
+        assert((nc1 == nr2) && "#cols of lhs and #rows of rhs must be the same");        
+#endif
+//	printf("\ntest MatMul f32 \n");
+// Parameters of the systolic array in the bitstream. Do not change.
+
+#define II   32
+#define JJ   32
+#define KK   32
+#define III  14
+#define JJJ  16
+#define KKK  16
+
+#ifndef NDEBUG
+	assert((nr1%(II*III)==0) && "lhs #rows number must be a multiple of 448");        
+	assert((nc1%(JJ*JJJ)==0 && nc1>512 && nr2%(JJ*JJJ)==0 && nc1>512) && "#cols of lhs and #rows of rhs must be a multiple of 512 (and minimum 1024)");        
+	assert((nc2%(KK*KKK)==0) && "#cols of rhs must be a multiple of 512");        
+#endif       
+// Testing purpose only: help define the sizes of test inputs
+// Can be arbitrarily set.
+// matrix a: 10K * 2K
+// matrix b: 2K * 8K
+
+//#define OUTERMOST_I 1//32
+//#define OUTERMOST_J 1//32
+//#define OUTERMOST_K 2//4
+
+//#define TYPE float
+
+#define ACL_ALIGNMENT 64
+//void *acl_aligned_malloc(size_t size) {
+//    void *result = NULL;
+//    posix_memalign(&result, ACL_ALIGNMENT, size);
+//    return result;
+//}
+    const int OUTERMOST_I = ceil(nr1/448);
+    const int OUTERMOST_J = ceil(nc2/512);
+    const int OUTERMOST_K = ceil(nc1/512);
+
+    float *A, *B, *C;
+    void *aa=NULL,*bb=NULL,*cc=NULL;
+    const int TOTAL_I = III * II * OUTERMOST_I;
+    const int TOTAL_J = JJJ * JJ * OUTERMOST_J;
+    const int TOTAL_K = KKK * KK * OUTERMOST_K;
+    
+    long int num_elem_A = (long int)TOTAL_I*TOTAL_K;
+    long int num_elem_B = (long int)TOTAL_K*TOTAL_J;
+    long int num_elem_C = (long int)TOTAL_I*TOTAL_J;
+    
+    posix_memalign(&aa,ACL_ALIGNMENT,num_elem_A * sizeof(float));
+    A=(float*)aa;
+    if (A==NULL)
+       perror("Failed malloc of matrix A");
+    posix_memalign(&bb,ACL_ALIGNMENT,num_elem_B * sizeof(float));
+    B=(float*)bb;
+    if (B==NULL)
+       perror("Failed malloc of matrix B");
+    posix_memalign(&cc,ACL_ALIGNMENT,num_elem_C * sizeof(float));
+    C=(float*)cc;
+    if (C==NULL)
+       perror("Failed malloc of matrix C");
+
+
+    // printf("\nbefore memcpy()\n");
+   
+    memcpy(A,lhs->getValues(),num_elem_A * sizeof(float));//sizeof(lhs));
+    memcpy(B,rhs->getValues(),num_elem_B * sizeof(float));//sizeof(rhs));
+     
+    //printf("\nA values %f\n",*A);
+    //printf("\nB values %f\n",*B);
+    sgemm(A, B, C, OUTERMOST_I, OUTERMOST_J, OUTERMOST_K, ctx);
+ 
+  //  printf("\nC values %f\n",*C);
+ 
+    if(res == nullptr)
+       res = DataObjectFactory::create<DenseMatrix<float>>(nr1, nc2, false);
+
+
+    //printf("\nres: %p\n", res);
+    //printf("\nres->getValues(): %p\n", res->getValues());
+    memcpy(res->getValues(),C,num_elem_C * sizeof(float));//sizeof(C)
+   // printf("\nres memcpy2\n");
+
+   }
+};
+/* TODO
+template<>
+struct MatMul<DenseMatrix<double>, DenseMatrix<double>, DenseMatrix<double>> {
+    static void apply(DenseMatrix<double> *& res, const DenseMatrix<double> * lhs, const DenseMatrix<double> * rhs, bool transa, bool transb, DCTX(ctx)) {
+        const size_t nr1 = lhs->getNumRows();
+        const size_t nc1 = lhs->getNumCols();
+        const size_t nc2 = rhs->getNumCols();
+#ifndef NDEBUG
+        const size_t nr2 = rhs->getNumRows();
+        assert((nc1 == nr2) && "#cols of lhs and #rows of rhs must be the same");
+#endif 
+        if(res == nullptr)
+            res = DataObjectFactory::create<DenseMatrix<double>>(nr1, nc2, false);
+
+        if(nr1 == 1 && nc2 == 1) // Vector-Vector
+            res->set(0, 0, cblas_ddot(nc1, lhs->getValues(), 1, rhs->getValues(), rhs->getRowSkip()));
+        else if(nc2 == 1)        // Matrix-Vector
+            cblas_dgemv(CblasRowMajor, CblasNoTrans, nr1, nc1, 1, lhs->getValues(),
+                lhs->getRowSkip(), rhs->getValues(), rhs->getRowSkip(), 0,
+                res->getValues(), res->getRowSkip());
+        else                     // Matrix-Matrix
+            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, nr1, nc2, nc1,
+                1, lhs->getValues(), lhs->getRowSkip(), rhs->getValues(),
+                rhs->getRowSkip(), 0, res->getValues(), res->getRowSkip());
+    }
+};
+*/
+}
+
+#endif //SRC_RUNTIME_LOCAL_KERNELS_MATMUL_H
diff --git a/src/runtime/local/kernels/FPGAOPENCL/gemm_interface.cpp b/src/runtime/local/kernels/FPGAOPENCL/gemm_interface.cpp
new file mode 100755
index 000000000..cc198a5ce
--- /dev/null
+++ b/src/runtime/local/kernels/FPGAOPENCL/gemm_interface.cpp
@@ -0,0 +1,558 @@
+// Copyright (C) 2013-2019 Altera Corporation, San Jose, California, USA. All rights reserved.
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this
+// software and associated documentation files (the "Software"), to deal in the Software
+// without restriction, including without limitation the rights to use, copy, modify, merge,
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
+// whom the Software is furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of California and
+// by the laws of the United States of America.
+
+
+// This file is modified from /glob/development-tools/versions/fpgasupportstack/a10/1.2.1/intelFPGA_pro/hld/examples_aoc/matrix_mult/host/src/main.cpp
+
+#include "AOCLUtils/aocl_utils.h"
+#include "CL/opencl.h"
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <math.h>
+#include <sstream>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+#include <sys/time.h>
+#include <time.h>
+#include <runtime/local/context/FPGAContext.h>
+#include <runtime/local/context/DaphneContext.h>
+#include <runtime/local/kernels/FPGAOPENCL/gemm_interface.h>
+
+
+
+// Parameters of the systolic array
+#define II   32
+#define JJ   32
+#define KK   32
+#define III  14
+#define JJJ  16
+#define KKK  16
+
+using namespace aocl_utils;
+
+#define TYPE float
+
+#define STR_HELPER(x) #x
+#define STR(x) STR_HELPER(x)
+
+#define DPRINTF(...)     \
+    printf(__VA_ARGS__); \
+    fflush(stdout);
+
+#define NUM_QUEUES_TO_CREATE    6
+#define NUM_KERNELS_TO_CREATE   6
+
+#define CHECK(status)                                       \
+    if (status != CL_SUCCESS) {                             \
+        printf("error %d in line %d.\n", status, __LINE__); \
+        exit(1);                                            \
+    }
+
+#define ACL_ALIGNMENT 64
+void *acl_aligned_malloc(size_t size) {
+    void *result = NULL;
+    posix_memalign(&result, ACL_ALIGNMENT, size);
+    return result;
+}
+
+void cleanup() {}
+
+const char *kernel_name[] = {
+    "kernel_A_loader",
+    "kernel_B_loader",
+    "kernel_unloader_WAIT_FINISH",
+    "kernel_A_feeder",
+    "kernel_B_feeder",
+    "kernel_Out"
+};
+
+double compute_kernel_execution_time(cl_event &event, double &start_d, double &end_d) {
+    cl_ulong start, end;
+
+    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
+    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
+
+    start_d = (double)1.0e-9 * start;
+    end_d = (double)1.0e-9 * end;
+    //return (double)(end-start);
+    return (double)1.0e-9 * (end - start); // nanoseconds to seconds
+}
+
+
+int sgemm(const float *A, const float *B, float *C, const int OUTERMOST_I, const int OUTERMOST_J, const int OUTERMOST_K, DCTX(ctx)) {
+    const int TOTAL_I = III * II * OUTERMOST_I;
+    const int TOTAL_J = JJJ * JJ * OUTERMOST_J;
+    const int TOTAL_K = KKK * KK * OUTERMOST_K;
+    
+    long int num_elem_A = (long int)TOTAL_I*TOTAL_K;
+    long int num_elem_B = (long int)TOTAL_K*TOTAL_J;
+    long int num_elem_C = (long int)TOTAL_I*TOTAL_J;
+
+    float *serialized_A, *serialized_B;
+    if ((serialized_A = (float *)acl_aligned_malloc(num_elem_A * sizeof(float))) == NULL) {
+        perror("Failed malloc of matrix serialized_A");
+    }
+    if ((serialized_B = (float *)acl_aligned_malloc(num_elem_B * sizeof(float))) == NULL) {
+        perror("Failed malloc of matrix serialized_A");
+    }
+
+    // Serialize A
+    long int addr = 0;
+    for (int i = 0; i < TOTAL_I; i++)
+        for (int k = 0; k < TOTAL_K; k++) {
+            serialized_A[addr++] = A[k + i*TOTAL_K];
+        }
+    // Serialize B
+    addr = 0;
+    for (int j = 0; j < TOTAL_J; j++)
+        for (int k = 0; k < TOTAL_K; k++) {
+            serialized_B[addr++] = B[j+k*TOTAL_J];
+        }
+
+
+    cl_int status;
+    auto fctx = ctx->getFPGAContext(0);    
+
+    //----------------------------------------------
+    // Create command queues
+    //---------------------------------------------
+
+    cl_command_queue cmdQueue[NUM_QUEUES_TO_CREATE + 1]; // extra queue for reading buffer D
+
+    // Create a command queue using clCreateCommandQueue(),
+    // and associate it with the device you want to execute on
+    for (int i = 0; i < NUM_QUEUES_TO_CREATE; i++) {
+        //fDPRINTF(stdout,"cmdQueue i = %d\n", i);
+        cmdQueue[i] = clCreateCommandQueue(
+            fctx->context,
+            fctx->devices[0],
+            CL_QUEUE_PROFILING_ENABLE,
+            &status);
+        CHECK(status);
+    }
+
+    //fDPRINTF(stdout,"cmdQueue i = %d, a queue for reading the C buffer\n", i);
+    cmdQueue[NUM_QUEUES_TO_CREATE] = clCreateCommandQueue(
+        fctx->context,
+        fctx->devices[0],
+        CL_QUEUE_PROFILING_ENABLE,
+        &status);
+    CHECK(status);
+
+    //----------------------------------------------
+    // Create device buffers
+    //----------------------------------------------
+    cl_mem input_A_buf;
+    cl_mem input_B_buf;
+    cl_mem output_C_buf;
+#ifndef NDEBUG
+    DPRINTF("\n===== Host-CPU transferring W and X to the FPGA device global memory (DDR4) via PCIe ======\n\n");
+#endif
+    input_A_buf = clCreateBuffer(
+        fctx->context,
+        CL_MEM_READ_ONLY,
+        num_elem_A * sizeof(cl_float),
+        NULL,
+        &status);
+    CHECK(status);
+
+    input_B_buf = clCreateBuffer(
+        fctx->context,
+        CL_MEM_READ_ONLY,
+        num_elem_B * sizeof(cl_float),
+        NULL,
+        &status);
+    CHECK(status);
+
+    output_C_buf = clCreateBuffer(
+        fctx->context,
+        CL_MEM_WRITE_ONLY,
+        num_elem_C * sizeof(cl_float),
+        NULL,
+        &status);
+    CHECK(status);
+
+    //----------------------------------------------
+    // Write host data to device buffers
+    //----------------------------------------------
+    // blocking writes
+    status = clEnqueueWriteBuffer(
+        cmdQueue[0],
+        input_A_buf,
+        CL_TRUE,
+        0,
+        num_elem_A * sizeof(cl_float),
+        serialized_A,
+        0,
+        NULL,
+        NULL);
+    CHECK(status);
+
+    status = clEnqueueWriteBuffer(
+        cmdQueue[1],
+        input_B_buf,
+        CL_TRUE,
+        0,
+        num_elem_B * sizeof(cl_float),
+        serialized_B,
+        0,
+        NULL,
+        NULL);
+    CHECK(status);
+
+    //----------------------------------------------
+    // Create the program from binaries
+    //----------------------------------------------
+    //DPRINTF("\n===== Host-CPU setting up OpenCL program and kernels ======\n\n");
+
+    cl_program program;
+    size_t binary_length;
+    const unsigned char *binary;
+
+    fflush(stdout);
+    // create the program using binary already compiled offline using aoc (i.e. the .aocx file)
+    char *aocx_file = getenv("BITSTREAM");
+    FILE *fp = fopen(aocx_file, "rb");
+
+    if (fp == NULL) {
+        DPRINTF("Failed to open the AOCX file (fopen).\n");
+        return -1;
+    }
+
+    fseek(fp, 0, SEEK_END);
+    binary_length = ftell(fp);
+    binary = (unsigned char *)malloc(sizeof(unsigned char) * binary_length);
+    assert(binary && "Malloc failed");
+    rewind(fp);
+
+    if (fread((void *)binary, binary_length, 1, fp) == 0) {
+        DPRINTF("Failed to read from the AOCX file (fread).\n");
+        return -1;
+    }
+    fclose(fp);
+
+    //DPRINTF("Create program with binary\n");
+    // Create a program using clCreateProgramWithBinary()
+    program = clCreateProgramWithBinary(
+        fctx->context,
+        1,
+        fctx->devices,
+        &binary_length,
+        (const unsigned char **)&binary,
+        &status,
+        NULL);
+    CHECK(status);
+
+    //----------------------------------------------
+    // Create the kernel
+    //----------------------------------------------
+    status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+    if (status != CL_SUCCESS) {
+        char log[128 * 1024] = {0};
+        clGetProgramBuildInfo(
+		program, 
+		fctx->devices[0], 
+		CL_PROGRAM_BUILD_LOG, 128 * 1024, log, NULL);
+        CHECK(status);
+    }
+
+    cl_kernel kernel[NUM_KERNELS_TO_CREATE];
+
+    for (int j = 0; j < NUM_KERNELS_TO_CREATE; j++) {
+        kernel[j] = clCreateKernel(program, (const char *)kernel_name[j], &status);
+        CHECK(status);
+    }
+#ifndef NDEBUG
+    DPRINTF("All kernels created\n");
+#endif
+    // A_loader
+    status = clSetKernelArg(
+        kernel[0],
+        0,
+        sizeof(int),
+	&TOTAL_K);
+    CHECK(status);
+    status = clSetKernelArg(
+        kernel[0],
+        1,
+        sizeof(int),
+	&TOTAL_I);
+    CHECK(status);
+    status = clSetKernelArg(
+        kernel[0],
+        2,
+        sizeof(int),
+	&TOTAL_J);
+    CHECK(status);
+    status = clSetKernelArg(
+        kernel[0],
+        3,
+        sizeof(cl_mem),
+	&input_A_buf);
+    CHECK(status);
+    // B_loader
+    status = clSetKernelArg(
+        kernel[1],
+        0,
+        sizeof(int),
+	&TOTAL_K);
+    CHECK(status);
+    status = clSetKernelArg(
+        kernel[1],
+        1,
+        sizeof(int),
+	&TOTAL_I);
+    CHECK(status);
+    status = clSetKernelArg(
+        kernel[1],
+        2,
+        sizeof(int),
+	&TOTAL_J);
+    CHECK(status);
+    status = clSetKernelArg(
+        kernel[1],
+        3,
+        sizeof(cl_mem),
+	&input_B_buf);
+    CHECK(status);
+    // unloader
+    status = clSetKernelArg(
+        kernel[2],
+        0,
+        sizeof(int),
+	&TOTAL_I);
+    CHECK(status);
+    status = clSetKernelArg(
+        kernel[2],
+        1,
+        sizeof(int),
+	&TOTAL_J);
+    CHECK(status);
+    status = clSetKernelArg(
+        kernel[2],
+        2,
+        sizeof(cl_mem),
+	&output_C_buf);
+    CHECK(status);
+    // A_feeder
+    status = clSetKernelArg(
+        kernel[3],
+        0,
+        sizeof(int),
+	&TOTAL_K);
+    CHECK(status);
+    status = clSetKernelArg(
+        kernel[3],
+        1,
+        sizeof(int),
+	&TOTAL_I);
+    CHECK(status);
+    status = clSetKernelArg(
+        kernel[3],
+        2,
+        sizeof(int),
+	&TOTAL_J);
+    CHECK(status);
+    // B_feeder
+    status = clSetKernelArg(
+        kernel[4],
+        0,
+        sizeof(int),
+	&TOTAL_K);
+    CHECK(status);
+    status = clSetKernelArg(
+        kernel[4],
+        1,
+        sizeof(int),
+	&TOTAL_I);
+    CHECK(status);
+    status = clSetKernelArg(
+        kernel[4],
+        2,
+        sizeof(int),
+	&TOTAL_J);
+    CHECK(status);
+    // Out
+    status = clSetKernelArg(
+        kernel[5],
+        0,
+        sizeof(int),
+	&TOTAL_K);
+    CHECK(status);
+    status = clSetKernelArg(
+        kernel[5],
+        1,
+        sizeof(int),
+	&TOTAL_I);
+    CHECK(status);
+    status = clSetKernelArg(
+        kernel[5],
+        2,
+        sizeof(int),
+	&TOTAL_J);
+    CHECK(status);
+
+    //----------------------------------------------
+    // Configure the work-item structure (using only tasks atm)
+    //----------------------------------------------
+
+    // Define the number of threads that will be created
+    // as well as the number of work groups
+    size_t globalWorkSize[1];
+    size_t localWorkSize[1];
+
+    //----------------------------------------------
+    // Enqueue the kernel for execution
+    //----------------------------------------------
+
+    // all kernels are always tasks
+    globalWorkSize[0] = 1;
+    localWorkSize[0] = 1;
+
+    cl_event kernel_exec_event[NUM_KERNELS_TO_CREATE];
+
+#ifndef NDEBUG
+    DPRINTF("\n===== Host-CPU enqeuing the OpenCL kernels to the FPGA device ======\n\n");
+#endif
+    for (int i = 0; i < NUM_KERNELS_TO_CREATE; i++) {
+        // Alternatively, can use clEnqueueTaskKernel
+#ifndef NDEBUG
+        DPRINTF("clEnqueueNDRangeKernel[%d]: %s!\n", i, kernel_name[i]);
+#endif
+	status = clEnqueueNDRangeKernel(
+            cmdQueue[i],
+            kernel[i],
+            1,
+            NULL,
+            globalWorkSize,
+            localWorkSize,
+            0,
+            NULL,
+            &kernel_exec_event[i]);
+        CHECK(status);
+    }
+#ifndef NDEBUG
+    DPRINTF(" *** FPGA execution started!\n");
+#endif    
+    for (int i = 0; i < NUM_KERNELS_TO_CREATE; i++) {
+        status = clFlush(cmdQueue[i]);
+        CHECK(status);
+    }
+
+    for (int i = 0; i < NUM_QUEUES_TO_CREATE; i++) {
+#ifndef NDEBUG
+        DPRINTF("cmd queue: %d\n", i);
+#endif    
+        fflush(stdout);
+        status = clFinish(cmdQueue[i]);
+        CHECK(status);
+    }
+#ifndef NDEBUG
+    DPRINTF(" *** FPGA execution finished!\n");
+    DPRINTF("\n\n");
+//#endif    
+ 
+    double k_start_time[NUM_KERNELS_TO_CREATE];
+    double k_end_time[NUM_KERNELS_TO_CREATE];
+    double k_exec_time[NUM_KERNELS_TO_CREATE];
+    double max_time = 0;
+    for (int i = 0; i < NUM_KERNELS_TO_CREATE; i++) {
+        k_exec_time[i] = compute_kernel_execution_time(kernel_exec_event[i], k_start_time[i], k_end_time[i]);
+        if (k_exec_time[i] > max_time) {
+            max_time = k_exec_time[i];
+        }
+    }
+//#ifndef NDEBUG
+    DPRINTF("Time taken: %lf sec\n\n", max_time);
+
+    printf("\n===== Reporting measured throughput ======\n\n");
+//#endif    
+    double k_earliest_start_time = k_start_time[0];
+    double k_latest_end_time = k_end_time[0];
+    
+    for (int i = 1; i < NUM_KERNELS_TO_CREATE; i++) {
+        if (k_start_time[i] < k_earliest_start_time)
+            k_earliest_start_time = k_start_time[i];
+
+        if (k_end_time[i] > k_latest_end_time)
+            k_latest_end_time = k_end_time[i];
+    }
+
+    // IMPORTANT: we care about the finish time of drain_C, once data is drained we are done
+    k_latest_end_time = k_end_time[NUM_KERNELS_TO_CREATE - 1];
+
+    for (int i = 0; i < NUM_KERNELS_TO_CREATE; i++) {
+        printf("  Kernel execution time on FPGA: %s, \n   \t\t\t\t\t\t\t\t\texec time = %.5f s, start=%.5f s, end=%.5f s\n", kernel_name[i], k_exec_time[i], k_start_time[i], k_end_time[i]);
+    }
+//#endif
+ 
+    double k_overall_exec_time = k_latest_end_time - k_earliest_start_time;
+//#ifndef NDEBUG
+    printf("\n");
+    printf("  Loader kernels start time\t\t= %.5f s\n", k_earliest_start_time);
+    printf("  Unloader kernels end time\t\t= %.5f s\n", k_latest_end_time);
+    printf("  FPGA GEMM exec time\t\t= %.5f s\n", k_overall_exec_time);
+
+    // multiplied by 1.0e-9 to get G-FLOPs
+    printf("\n");
+
+    double num_operations = (double)2.0 * (TOTAL_K) * (double)(TOTAL_I) * (double)(TOTAL_J);
+
+    printf("  # operations = %.0f\n", num_operations );
+    printf("  Throughput: %.5f GFLOPS\n", (double)1.0e-9 * num_operations / k_overall_exec_time);
+
+    DPRINTF("\n===== Host-CPU transferring result matrix C from the FPGA device global memory (DDR4) via PCIe ======\n\n");
+#endif
+    // Read the results back from the device, blocking read
+    float *serialized_Z;
+    if ((serialized_Z = (float *)acl_aligned_malloc(num_elem_C * sizeof(float))) == NULL) {
+        perror("Failed malloc of matrix serialized_Z");
+    }
+
+    clEnqueueReadBuffer(
+        //cmdQueue[KID_DRAIN_MAT_C],
+        cmdQueue[NUM_KERNELS_TO_CREATE], // using a special queue for reading buffer C
+        output_C_buf,
+        CL_TRUE,
+        0,
+        num_elem_C * sizeof(cl_float),
+        serialized_Z,
+        0,
+        NULL,
+        NULL);
+    CHECK(status);
+
+    // Deserialize Z
+    addr = 0;
+    for (int i = 0; i < TOTAL_I; i++)
+        for (int j = 0; j < TOTAL_J; j++) {
+            C[j + i*TOTAL_J] = serialized_Z[addr++];
+        }
+    return 0;
+}
+
diff --git a/src/runtime/local/kernels/FPGAOPENCL/gemm_interface.h b/src/runtime/local/kernels/FPGAOPENCL/gemm_interface.h
new file mode 100755
index 000000000..0a8b4de9d
--- /dev/null
+++ b/src/runtime/local/kernels/FPGAOPENCL/gemm_interface.h
@@ -0,0 +1,10 @@
+#ifndef SGEMM_INTERFACE
+#define SGEMM_INTERFACE
+#include <runtime/local/context/DaphneContext.h>
+
+
+extern int sgemm(const float *A, const float *B, float *C, const int OUTERMOST_I, const int OUTERMOST_J, const int OUTERMOST_K, DCTX(ctx));
+
+#endif
+
+
diff --git a/src/runtime/local/kernels/genKernelInst.py b/src/runtime/local/kernels/genKernelInst.py
index 1b6708092..2566f4f33 100755
--- a/src/runtime/local/kernels/genKernelInst.py
+++ b/src/runtime/local/kernels/genKernelInst.py
@@ -204,7 +204,6 @@ def printHelp():
         print()
         printHelp()
         sys.exit(1)
-
     # Parse arguments.
     inFilePath = sys.argv[1]
     outFilePath = sys.argv[2]
diff --git a/src/runtime/local/kernels/kernels.json b/src/runtime/local/kernels/kernels.json
index 1087fc0bd..fc8931bfe 100644
--- a/src/runtime/local/kernels/kernels.json
+++ b/src/runtime/local/kernels/kernels.json
@@ -433,6 +433,21 @@
 
             }]
     },
+    {
+        "kernelTemplate": {
+            "header": "CreateFPGAContext.h",
+            "opName": "createFPGAContext",
+            "returnType": "void",
+            "templateParams": [],
+            "runtimeParams": []
+        },
+        "api": [
+            {
+                "name":  ["FPGAOPENCL"],
+                "instantiations": [[]]
+
+            }]
+    },
     {
         "kernelTemplate": {
             "header": "CreateDistributedContext.h",
@@ -1079,6 +1094,12 @@
                     [["DenseMatrix", "float"], ["DenseMatrix", "float"], ["DenseMatrix", "float"]],
                     [["DenseMatrix", "double"], ["DenseMatrix", "double"], ["DenseMatrix", "double"]]
                 ]
+            },
+             {
+                "name":  ["FPGAOPENCL"],
+                "instantiations": [
+                    [["DenseMatrix", "float"], ["DenseMatrix", "float"], ["DenseMatrix", "float"]]
+                ]
             }
         ]
     },
@@ -2812,7 +2833,7 @@
             {
                 "name":  ["CPP"],
                 "instantiations": [
-                    [["DenseMatrix", "double"]]                    
+                    [["DenseMatrix", "double"]]
                 ]
             }
         ]