From dc2ed63d2bdf49bf77e1608ab698f72b0a395f8e Mon Sep 17 00:00:00 2001 From: Philipp Ortner Date: Fri, 10 Nov 2023 19:46:45 +0100 Subject: [PATCH] MLIR-based code generation pipeline (#633) As an alternative way to implement DAPHNE operators, this commit provides an MLIR-based code generation pipeline which progressively lowers the DaphneIR available after parsing the DaphneDSL script to operations in either the same dialect or operations from other dialects. With that, DAPHNE can optionally replace certain kernels by generating code directly, and also perform a hybrid compilation approach where we mix kernel calls with code generation in order to exploit advantages of both, precompiled kernel libraries and code generation. This includes the following features: - Documentation (Codegen.md). - Code generation passes src/compiler/lowering/. - Kernels to facilitate interoperability between the memref-dialect and the DenseMatrix runtime object. - CLI arguments to enable code generation (--mlir_codegen and --mlir_codegen --mlir_hybrid_codegen) and explain-passes (--explain mlir_codegen). - Script-level tests that run daphne with a DaphneDSL script as input in test/api/cli/codegen/. - llvm-lit-based unit tests of the IR produced from passes/pass pipelines in test/codegen/ using intermediate IR as input (.mlir files). - The daphne-opt tool, DAPHNE's own version of the mlir-opt tool with its source files in daphne-opt/ and a new build target daphne-opt. daphne-opt is included since the lit tests require the daphne-opt target. With daphne-opt one can test passes in isolation by providing the input IR for a pass to daphne-opt and the correct flag to run the pass (or a pass pipeline) on the IR. The daphne-opt tool inherits all the functionality of the mlir-opt tool. 
The list of affected operations (only when daphne is executed with the --codegen flag): MatMulOp, MapOp, Ew(Sqrt|Abs|Add|Sub|Mul|Div|Pow|Mod)Op, AllAggSumOp, ConvertDenseMatrixToMemRefOp, ConvertMemRefToDenseMatrixOp. daphne --mlir_codegen --mlir_hybrid_codegen can be used to disable code generation for certain operators in order to run a hybrid compilation pipeline (currently only the MatMulLoweringPass is affected by this.) --- .github/workflows/main.yml | 5 +- .gitignore | 26 ++ CMakeLists.txt | 1 + UserConfig.json | 2 + containers/daphne.Dockerfile | 2 +- daphne-opt/CMakeLists.txt | 45 ++ daphne-opt/daphne-opt.cpp | 62 +++ daphne-opt/daphne-opt.h | 24 + doc/Codegen.md | 100 +++++ doc/GettingStarted.md | 47 +- install-ubuntu-packages.sh | 3 +- src/api/cli/DaphneUserConfig.h | 4 + src/api/internal/daphne_internal.cpp | 20 +- src/compiler/execution/DaphneIrExecutor.cpp | 412 +++++++++++------- src/compiler/execution/DaphneIrExecutor.h | 3 + src/compiler/explanation/PrintIRPass.cpp | 27 +- .../inference/AdaptTypesToKernelsPass.cpp | 4 + src/compiler/inference/InferencePass.cpp | 5 +- .../SelectMatrixRepresentationsPass.cpp | 5 +- src/compiler/lowering/AggAllOpLowering.cpp | 180 ++++++++ src/compiler/lowering/CMakeLists.txt | 11 + src/compiler/lowering/DaphneOptPass.cpp | 102 +++++ .../lowering/DistributeComputationsPass.cpp | 3 + .../lowering/DistributePipelinesPass.cpp | 3 + src/compiler/lowering/EwOpsLowering.cpp | 344 +++++++++++++++ src/compiler/lowering/LowerToLLVMPass.cpp | 59 +-- src/compiler/lowering/ManageObjRefsPass.cpp | 78 ++-- src/compiler/lowering/MapOpLowering.cpp | 146 +++++++ src/compiler/lowering/MatMulOpLowering.cpp | 236 ++++++++++ src/compiler/lowering/ModOpLowering.cpp | 226 ++++++++++ src/compiler/lowering/RewriteSqlOpPass.cpp | 3 + .../lowering/RewriteToCallKernelOpPass.cpp | 16 +- .../SpecializeGenericFunctionsPass.cpp | 3 + .../WhileLoopInvariantCodeMotionPass.cpp | 3 + src/compiler/utils/CMakeLists.txt | 3 +- 
src/compiler/utils/CompilerUtils.cpp | 8 + src/compiler/utils/CompilerUtils.h | 5 +- src/compiler/utils/LoweringUtils.cpp | 188 ++++++++ src/compiler/utils/LoweringUtils.h | 65 +++ src/ir/daphneir/CMakeLists.txt | 2 +- src/ir/daphneir/Daphne.h | 1 + src/ir/daphneir/DaphneDialect.cpp | 91 +++- .../DaphneDistributableOpInterface.cpp | 5 +- src/ir/daphneir/DaphneOps.td | 40 +- .../DaphneVectorizableOpInterface.cpp | 3 + src/ir/daphneir/Passes.h | 7 + src/ir/daphneir/Passes.td | 23 +- src/parser/config/ConfigParser.cpp | 6 +- src/parser/config/JsonParams.h | 6 +- src/runtime/local/kernels/BinaryOpCode.h | 36 +- .../kernels/ConvertDenseMatrixToMemRef.h | 38 ++ .../kernels/ConvertMemRefToDenseMatrix.h | 32 ++ src/runtime/local/kernels/MatMul.h | 1 - src/runtime/local/kernels/genKernelInst.py | 5 +- src/runtime/local/kernels/kernels.json | 81 +++- test/CMakeLists.txt | 19 +- test/api/cli/Utils.h | 27 +- test/api/cli/codegen/AggAllTest.cpp | 33 ++ test/api/cli/codegen/EwBinaryScalarTest.cpp | 75 ++++ test/api/cli/codegen/EwOpLoopFusionTest.cpp | 42 ++ test/api/cli/codegen/MapOpTest.cpp | 37 ++ test/api/cli/codegen/MatMulTest.cpp | 49 +++ test/api/cli/codegen/abs.daphne | 7 + test/api/cli/codegen/add.daphne | 8 + test/api/cli/codegen/div.daphne | 8 + test/api/cli/codegen/fusion.daphne | 11 + test/api/cli/codegen/log.daphne | 8 + test/api/cli/codegen/map.daphne | 10 + test/api/cli/codegen/matmul.daphne | 9 + test/api/cli/codegen/matvec.daphne | 9 + test/api/cli/codegen/mul.daphne | 8 + test/api/cli/codegen/pow.daphne | 8 + test/api/cli/codegen/sub.daphne | 8 + test/api/cli/codegen/sum_aggall.daphne | 5 + test/codegen/.gitignore | 2 + test/codegen/CodegenTest.cpp | 40 ++ test/codegen/daphne_opt.mlir | 18 + test/codegen/ew.mlir | 105 +++++ test/codegen/fusion.mlir | 29 ++ test/codegen/lit.cfg | 17 + test/codegen/mapop.mlir | 26 ++ test/codegen/matmul.mlir | 32 ++ test/codegen/run-lit.py | 4 + test/codegen/sum_agg.mlir | 26 ++ test/tags.h | 1 + 85 files changed, 3206 
insertions(+), 331 deletions(-) create mode 100644 daphne-opt/CMakeLists.txt create mode 100644 daphne-opt/daphne-opt.cpp create mode 100644 daphne-opt/daphne-opt.h create mode 100644 doc/Codegen.md create mode 100644 src/compiler/lowering/AggAllOpLowering.cpp create mode 100644 src/compiler/lowering/DaphneOptPass.cpp create mode 100644 src/compiler/lowering/EwOpsLowering.cpp create mode 100644 src/compiler/lowering/MapOpLowering.cpp create mode 100644 src/compiler/lowering/MatMulOpLowering.cpp create mode 100644 src/compiler/lowering/ModOpLowering.cpp create mode 100644 src/compiler/utils/LoweringUtils.cpp create mode 100644 src/compiler/utils/LoweringUtils.h create mode 100644 src/runtime/local/kernels/ConvertDenseMatrixToMemRef.h create mode 100644 src/runtime/local/kernels/ConvertMemRefToDenseMatrix.h create mode 100644 test/api/cli/codegen/AggAllTest.cpp create mode 100644 test/api/cli/codegen/EwBinaryScalarTest.cpp create mode 100644 test/api/cli/codegen/EwOpLoopFusionTest.cpp create mode 100644 test/api/cli/codegen/MapOpTest.cpp create mode 100644 test/api/cli/codegen/MatMulTest.cpp create mode 100644 test/api/cli/codegen/abs.daphne create mode 100644 test/api/cli/codegen/add.daphne create mode 100644 test/api/cli/codegen/div.daphne create mode 100644 test/api/cli/codegen/fusion.daphne create mode 100644 test/api/cli/codegen/log.daphne create mode 100644 test/api/cli/codegen/map.daphne create mode 100644 test/api/cli/codegen/matmul.daphne create mode 100644 test/api/cli/codegen/matvec.daphne create mode 100644 test/api/cli/codegen/mul.daphne create mode 100644 test/api/cli/codegen/pow.daphne create mode 100644 test/api/cli/codegen/sub.daphne create mode 100644 test/api/cli/codegen/sum_aggall.daphne create mode 100644 test/codegen/.gitignore create mode 100644 test/codegen/CodegenTest.cpp create mode 100644 test/codegen/daphne_opt.mlir create mode 100644 test/codegen/ew.mlir create mode 100644 test/codegen/fusion.mlir create mode 100644 test/codegen/lit.cfg 
create mode 100644 test/codegen/mapop.mlir create mode 100644 test/codegen/matmul.mlir create mode 100644 test/codegen/run-lit.py create mode 100644 test/codegen/sum_agg.mlir diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b1208e667..e267a64bf 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -47,8 +47,7 @@ jobs: - name: Testing run: | mkdir --parents src/api/python/tmp - PYTHONPATH="$PYTHONPATH:$PWD/src/" bin/run_tests - + LD_LIBRARY_PATH=$PWD/lib:$LD_LIBRARY_PATH PATH=$PWD/bin:/usr/lib/llvm-10/bin:$PATH PYTHONPATH="$PYTHONPATH:$PWD/src/:/usr/lib/llvm-10/build/utils/lit/" bin/run_tests - name: "List generated files" run: | @@ -64,4 +63,4 @@ jobs: name: daphne path: | bin/ - lib/ \ No newline at end of file + lib/ diff --git a/.gitignore b/.gitignore index ca9e5dec4..7110d9658 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,9 @@ build_*/ /lib /tmp +# runtime dump +**/*.ll + # documentation build output doc_build/ @@ -25,7 +28,26 @@ __pycache__/ .idea/ .clion.source.upload.marker +# local test/dev scripts tmpdaphne.daphne +*.daphne +*.mlir +*.log + +# tags file +tags +tags.lock +tags.temp + +# clangd cache +.cache/ + +# gdb +.gdb_history + +# compile commands +compile_commands.json + # release scripts output /artifacts @@ -36,3 +58,7 @@ profiler/ precompiled-dependencies/ /cmake*/ /data + +# Allow .daphne and .mlir files in test/ +!test/**/*.mlir +!test/**/*.daphne diff --git a/CMakeLists.txt b/CMakeLists.txt index 3d4940066..51c895637 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -182,4 +182,5 @@ add_subdirectory(src/util) add_dependencies(CompilerUtils MLIRDaphneTransformsIncGen) +add_subdirectory(daphne-opt) add_subdirectory(test) diff --git a/UserConfig.json b/UserConfig.json index 8f8e73075..5443be282 100644 --- a/UserConfig.json +++ b/UserConfig.json @@ -3,6 +3,7 @@ "use_vectorized_exec": false, "use_obj_ref_mgnt": true, "cuda_fuse_any": false, + "use_mlir_codegen": false, 
"vectorized_single_queue": false, "debug_llvm": false, "explain_kernels": false, @@ -14,6 +15,7 @@ "explain_type_adaptation": false, "explain_vectorized": false, "explain_obj_ref_mgnt": false, + "explain_mlir_codegen": false, "taskPartitioningScheme": "STATIC", "numberOfThreads": -1, "minimumTaskSize": 1, diff --git a/containers/daphne.Dockerfile b/containers/daphne.Dockerfile index 28b138aa4..96ac02124 100644 --- a/containers/daphne.Dockerfile +++ b/containers/daphne.Dockerfile @@ -64,7 +64,7 @@ LABEL "org.opencontainers.image.version"="$TIMESTAMP" LABEL "org.opencontainers.image.created"="${CREATION_DATE}" LABEL "org.opencontainers.image.revision"="${GIT_HASH}" RUN apt-get -qq -y update && apt-get -y upgrade && apt-get -y --no-install-recommends install \ - libtinfo6 libssl1.1 zlib1g python3-numpy python3-pandas \ + libtinfo6 libssl1.1 zlib1g python3-numpy python3-pandas\ && apt-get clean && rm -rf /var/lib/apt/lists/* COPY --from=daphne-build $DAPHNE_DIR/bin/* /usr/local/bin COPY --from=daphne-build $DAPHNE_DIR/lib/* /usr/local/lib diff --git a/daphne-opt/CMakeLists.txt b/daphne-opt/CMakeLists.txt new file mode 100644 index 000000000..b89da923a --- /dev/null +++ b/daphne-opt/CMakeLists.txt @@ -0,0 +1,45 @@ +# Copyright 2023 The DAPHNE Consortium +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) +get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) +set(LIBS + ${dialect_libs} + ${conversion_libs} + + MLIRDaphne + MLIRAnalysis + MLIRCallInterfaces + MLIRCastInterfaces + MLIRExecutionEngine + MLIRIR + # MLIRLLVMCommonConversion + MLIRLLVMToLLVMIRTranslation + # MLIRMemRefDialect + # MLIRLLVMDialect + MLIRParser + MLIRPass + MLIRSideEffectInterfaces + MLIRSupport + MLIRTargetLLVMIRExport + MLIRTransforms + MLIROptLib + ) +add_llvm_executable(daphne-opt daphne-opt.cpp) +set_target_properties(daphne-opt PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/bin) + +llvm_update_compile_flags(daphne-opt) +target_link_libraries(daphne-opt PRIVATE ${LIBS}) + +mlir_check_all_link_libraries(daphne-opt) diff --git a/daphne-opt/daphne-opt.cpp b/daphne-opt/daphne-opt.cpp new file mode 100644 index 000000000..380f0b5cf --- /dev/null +++ b/daphne-opt/daphne-opt.cpp @@ -0,0 +1,62 @@ +/* + * Copyright 2023 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "daphne-opt.h" + +#include + +#include "ir/daphneir/Passes.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/ToolOutputFile.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/Passes.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Linalg/Passes.h" +#include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/InitAllDialects.h" +#include "mlir/InitAllPasses.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Support/FileUtilities.h" +#include "mlir/Tools/mlir-opt/MlirOptMain.h" + +int main(int argc, char **argv) { + mlir::registerAllPasses(); + // NOTE: One can also register standalone passes here. + mlir::daphne::registerDaphnePasses(); + + mlir::DialectRegistry registry; + registry.insert(); + // Add the following to include *all* MLIR Core dialects, or selectively + // include what you need like above. You only need to register dialects that + // will be *parsed* by the tool, not the one generated + // registerAllDialects(registry); + + return mlir::asMainReturnCode(mlir::MlirOptMain( + argc, argv, "Standalone DAPHNE optimizing compiler driver\n", + registry)); +} diff --git a/daphne-opt/daphne-opt.h b/daphne-opt/daphne-opt.h new file mode 100644 index 000000000..3b0f77bea --- /dev/null +++ b/daphne-opt/daphne-opt.h @@ -0,0 +1,24 @@ +/* + * Copyright 2023 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef DAPHNEOPT_DAPHNEOP_H +#define DAPHNEOPT_DAPHNEOP_H + +#include "mlir/IR/Dialect.h" + +#include "ir/daphneir/Daphne.h" + +#endif // DAPHNEOPT_DAPHNEOP_H diff --git a/doc/Codegen.md b/doc/Codegen.md new file mode 100644 index 000000000..8d690c1bc --- /dev/null +++ b/doc/Codegen.md @@ -0,0 +1,100 @@ +# Code Generation with MLIR + +This document describes the process of directly generating code with the MLIR +framework. + +## Motivation + +DAPHNE provides a kernel for (almost) every DaphneIR operation which reside in +`src/runtime/local/kernels/`. These are precompiled as a shared library and +linked during compile-time. Even though these kernels can be highly optimized +and thus achieve great runtime characteristics, they may not provide a desired +level of extensibility for custom value types. They may also be lacking +information only available at compile-time that could enable further +optimizations. Additionally, through the process of progressively lowering the +input IR, the code generation pipeline may enable more optimization +possibilities such as operator or loop fusion. + + +As an alternative way to implement our operators we provide the code generation +pipeline which progressively lowers the DaphneIR available after parsing the +DaphneDSL script to operations in either the same dialect or operations from +other dialects. 
With that, we can optionally replace certain kernels by +generating code directly, and also perform a hybrid compilation approach where +we mix kernel calls with code generation in order to exploit advantages of +both, precompiled kernel libraries and code generation. Code generation passes +are found in `src/compiler/lowering/`. + + +## Guidelines + +Currently, the code generation pipeline is enabled with the CLI flag +`--mlir-codegen`. This adds the following passes that perform transformations and +lowerings: + +- [DenseMatrixOptPass](src/compiler/lowering/DaphneOptPass.cpp) +- [MatMulOpLoweringPass](src/compiler/lowering/MatMulOpLowering.cpp) +- [AggAllLoweringPass](src/compiler/lowering/AggAllOpLowering.cpp) +- [MapOpLoweringPass](src/compiler/lowering/MapOpLowering.cpp) +- InlinerPass +- [LowerEwOpPass](src/compiler/lowering/EwOpsLowering.cpp) +- ConvertMathToLLVMPass +- [ModOpLoweringPass](src/compiler/lowering/ModOpLowering.cpp) +- Canonicalizer +- CSE +- LoopFusion +- AffineScalarReplacement +- LowerAffinePass + +These passes are added in the `DaphneIrExecutor::buildCodegenPipeline` +function. The `--mlir-hybrid-codegen` flag disables the `MatMulOpLoweringPass` since the +kernel implementation vastly outperforms the generated code of this pass. + + +#### Runtime Interoperability + +Runtime interoperability with the `DenseMatrix` object is achieved with two +kernels in `src/runtime/local/kernels/ConvertDenseMatrixToMemRef.h` and +`src/runtime/local/kernels/ConvertMemRefToDenseMatrix.h` and the corresponding +DaphneOps `Daphne_ConvertMemRefToDenseMatrix` and +`Daphne_ConvertDenseMatrixToMemRef`. These kernels define how a MemRef is +passed to a kernel and how a kernel can return a `StridedMemRefType`. + + +#### Debugging + +In order to enable our debug `PrintIRPass` pass, one has to add `--explain +mlir_codegen` when running `daphne`. Additionally, it is recommended to use the +`daphne-opt` tool to test passes in isolation. 
One just has to provide the +input IR for a pass to `daphne-opt` and the correct flag to run the pass (or +multiple passes) on the IR. `daphne-opt` provides all the functionality of the +`mlir-opt` tool. + +`daphne-opt --lower-ew --debug-only=dialect-conversion ew.mlir` performs the +`LowerEwOpPass` on the input file `ew.mlir` while providing dialect conversion +debug information. + + + +#### Testing + +To test the generated code, there currently are two different approaches. + +End-to-end tests can be found under `test/api/cli/codegen/` and are part of the +existing Catch2 test-suite with its own tag, `TAG_CODEGEN`. + +Additionally, there are tests that check the generated IR by running the +`llvm-lit`, `daphne-opt`, and `FileCheck` utilities. These tests reside under +`test/codegen/`. They are `.mlir` files containing the input IR of a +certain pass, or pass pipeline, and the `llvm-lit` directive at the top of the +file (`RUN:`). In that line we specify how `llvm-lit` executes the test, e.g., +`// RUN: daphne-opt --lower-ew %s | FileCheck %s`, means that `daphne-opt` is +called with the `--lower-ew` flag and the current file as input, the output of +that, in addition to the file itself, is piped to `FileCheck`. `FileCheck` uses +the comments in the `.mlir` file to check for certain conditions, e.g., `// +CHECK-NOT: daphne.ewAdd` looks through the IR and fails if `daphne.ewAdd` can be +found. These `llvm-lit` tests are all run by the `codegen` testcase in +`test/codegen/CodegenTest.cpp`. + + +All codegen tests can be executed by running `bin/run_tests '[codegen]'`. 
diff --git a/doc/GettingStarted.md b/doc/GettingStarted.md index 98d02b0f0..20d072d05 100644 --- a/doc/GettingStarted.md +++ b/doc/GettingStarted.md @@ -42,29 +42,30 @@ launching DAPHNE via Docker (see below) should work the same way as in a native ### Software -| tool/lib | version known to work (*) | comment | -|--------------------------------------|---------------------------|-----------------------------------------------------------------------------------------------------------------------------------------| -| GCC/G++ | 9.3.0 | Last checked version: 12.2 | -| clang | 10.0.0 | | -| cmake | 3.20 | On Ubuntu 20.04, install by `sudo snap install cmake --classic` to fulfill the version requirement; `apt` provides only version 3.16.3. | -| git | 2.25.1 | | -| libssl-dev | 1.1.1 | Dependency introduced while optimizing grpc build (which used to build ssl unnecessarily) | -| libpfm4-dev | 4.10 | This dependency is needed for profiling support [DAPHNE-#479] | -| lld | 10.0.0 | | -| ninja | 1.10.0 | | -| pkg-config | 0.29.1 | | -| python3 | 3.8.5 | | -| numpy | 1.19.5 | | -| pandas | 0.25.3 | | -| java (e.g. 
openjdk) | 11 (1.7 should be fine) | | -| gfortran | 9.3.0 | | -| uuid-dev | | | -| wget | | Used to fetch additional dependencies and other artefacts | -| jq | | json commandline processor used in docker image generation scripts | -| *** | *** | *** | -| CUDA SDK | 11.7.1 | Optional for CUDA ops | -| OneAPI SDK | 2022.x | Optional for OneAPI ops | -| Intel FPGA SDK or OneAPI FPGA Add-On | 2022.x | Optional for FPGAOPENCL ops | +| tool/lib | version known to work (*) | comment | +|--------------------------------------|------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------| +| GCC/G++ | 9.3.0 | Last checked version: 12.2 | +| clang | 10.0.0 | | +| cmake | 3.20 | On Ubuntu 20.04, install by `sudo snap install cmake --classic` to fulfill the version requirement; `apt` provides only version 3.16.3. | +| git | 2.25.1 | | +| libssl-dev | 1.1.1 | Dependency introduced while optimizing grpc build (which used to build ssl unnecessarily) | +| libpfm4-dev | 4.10 | This dependency is needed for profiling support [DAPHNE-#479] | +| lld | 10.0.0 | | +| ninja | 1.10.0 | | +| pkg-config | 0.29.1 | | +| python3 | 3.8.5 | | +| numpy | 1.19.5 | | +| pandas | 0.25.3 | | +| java (e.g. openjdk) | 11 (1.7 should be fine) | | +| gfortran | 9.3.0 | | +| uuid-dev | | | +| llvm-10-tools | 10, 15 | On Ubuntu 22.04 you may need to install a newer `llvm-*-tools` version, such as `llvm-15-tools`. 
| +| wget | | Used to fetch additional dependencies and other artefacts | +| jq | | json commandline processor used in docker image generation scripts | +| *** | *** | *** | +| CUDA SDK | 11.7.1 | Optional for CUDA ops | +| OneAPI SDK | 2022.x | Optional for OneAPI ops | +| Intel FPGA SDK or OneAPI FPGA Add-On | 2022.x | Optional for FPGAOPENCL ops | ### Hardware diff --git a/install-ubuntu-packages.sh b/install-ubuntu-packages.sh index 8281493d9..6644005db 100644 --- a/install-ubuntu-packages.sh +++ b/install-ubuntu-packages.sh @@ -15,5 +15,6 @@ # limitations under the License. # This is a convenience script to install the required packages on Ubuntu 20+ systems to compile DAPHNE +# On Ubuntu 22.04 you may need to change the version of llvm-10-tools to a newer one, such as llvm-15-tools. sudo apt install build-essential clang cmake git libssl-dev libpfm4-dev lld ninja-build pkg-config python3-numpy \ - python3-pandas default-jdk-headless gfortran uuid-dev wget unzip jq + python3-pandas default-jdk-headless gfortran uuid-dev wget unzip jq llvm-10-tools diff --git a/src/api/cli/DaphneUserConfig.h b/src/api/cli/DaphneUserConfig.h index 92a5e6b23..3b7a2de93 100644 --- a/src/api/cli/DaphneUserConfig.h +++ b/src/api/cli/DaphneUserConfig.h @@ -42,6 +42,8 @@ struct DaphneUserConfig { bool use_obj_ref_mgnt = true; bool use_ipa_const_propa = true; bool use_phy_op_selection = true; + bool use_mlir_codegen = false; + bool use_mlir_hybrid_codegen = false; bool cuda_fuse_any = false; bool vectorized_single_queue = false; bool prePartitionRows = false; @@ -63,6 +65,8 @@ struct DaphneUserConfig { bool explain_type_adaptation = false; bool explain_vectorized = false; bool explain_obj_ref_mgnt = false; + bool explain_mlir_codegen = false; + SelfSchedulingScheme taskPartitioningScheme = STATIC; QueueTypeOption queueSetupScheme = CENTRALIZED; VictimSelectionLogic victimSelection = SEQPRI; diff --git a/src/api/internal/daphne_internal.cpp b/src/api/internal/daphne_internal.cpp index 
138012c79..5ba81007c 100644 --- a/src/api/internal/daphne_internal.cpp +++ b/src/api/internal/daphne_internal.cpp @@ -256,6 +256,14 @@ int startDAPHNE(int argc, const char** argv, DaphneLibResult* daphneLibRes, int "libdir", cat(daphneOptions), desc("The directory containing kernel libraries") ); + static opt mlirCodegen( + "mlir-codegen", cat(daphneOptions), + desc("Enables lowering of certain DaphneIR operations on DenseMatrix to low-level MLIR operations.") + ); + static opt performHybridCodegen( + "mlir-hybrid-codegen", cat(daphneOptions), + desc("Enables prototypical hybrid code generation combining pre-compiled kernels and MLIR code generation.") + ); enum ExplainArgs { kernels, @@ -268,7 +276,8 @@ int startDAPHNE(int argc, const char** argv, DaphneLibResult* daphneLibRes, int phy_op_selection, type_adaptation, vectorized, - obj_ref_mgnt + obj_ref_mgnt, + mlir_codegen }; static llvm::cl::list explainArgList( @@ -286,7 +295,8 @@ int startDAPHNE(int argc, const char** argv, DaphneLibResult* daphneLibRes, int clEnumVal(vectorized, "Show DaphneIR after vectorization"), clEnumVal(obj_ref_mgnt, "Show DaphneIR after managing object references"), clEnumVal(kernels, "Show DaphneIR after kernel lowering"), - clEnumVal(llvm, "Show DaphneIR after llvm lowering")), + clEnumVal(llvm, "Show DaphneIR after llvm lowering"), + clEnumVal(mlir_codegen, "Show DaphneIR after MLIR codegen")), CommaSeparated); static llvm::cl::list scriptArgs1( @@ -367,6 +377,9 @@ int startDAPHNE(int argc, const char** argv, DaphneLibResult* daphneLibRes, int user_config.use_obj_ref_mgnt = !noObjRefMgnt; user_config.use_ipa_const_propa = !noIPAConstPropa; user_config.use_phy_op_selection = !noPhyOpSelection; + user_config.use_mlir_codegen = mlirCodegen; + user_config.use_mlir_hybrid_codegen = performHybridCodegen; + if(!libDir.getValue().empty()) user_config.libdir = libDir.getValue(); user_config.library_paths.push_back(user_config.libdir + "/libAllKernels.so"); @@ -428,6 +441,9 @@ int 
startDAPHNE(int argc, const char** argv, DaphneLibResult* daphneLibRes, int case obj_ref_mgnt: user_config.explain_obj_ref_mgnt = true; break; + case mlir_codegen: + user_config.explain_mlir_codegen = true; + break; } } diff --git a/src/compiler/execution/DaphneIrExecutor.cpp b/src/compiler/execution/DaphneIrExecutor.cpp index 2376ad20b..1c5ab19f5 100644 --- a/src/compiler/execution/DaphneIrExecutor.cpp +++ b/src/compiler/execution/DaphneIrExecutor.cpp @@ -14,234 +14,310 @@ * limitations under the License. */ +#include "DaphneIrExecutor.h" + #include #include -#include "DaphneIrExecutor.h" +#include +#include + +#include +#include +#include #include "llvm/Support/TargetSelect.h" +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" +#include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h" +#include "mlir/Conversion/MathToLLVM/MathToLLVM.h" #include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h" #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/Passes.h" #include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Bufferization/Transforms/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Linalg/Passes.h" +#include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/ExecutionEngine/ExecutionEngine.h" #include "mlir/ExecutionEngine/OptUtils.h" #include "mlir/IR/BuiltinOps.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Transforms/Passes.h" -#include -#include #include "mlir/Support/LogicalResult.h" #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" +#include "mlir/Transforms/Passes.h" -#include -#include -#include - -DaphneIrExecutor::DaphneIrExecutor(bool selectMatrixRepresentations, DaphneUserConfig cfg) : userConfig_(std::move(cfg)), - 
selectMatrixRepresentations_(selectMatrixRepresentations) { +DaphneIrExecutor::DaphneIrExecutor(bool selectMatrixRepresentations, + DaphneUserConfig cfg) + : userConfig_(std::move(cfg)), + selectMatrixRepresentations_(selectMatrixRepresentations) { // register loggers - if(userConfig_.log_ptr != nullptr) - userConfig_.log_ptr->registerLoggers(); + if (userConfig_.log_ptr != nullptr) userConfig_.log_ptr->registerLoggers(); context_.getOrLoadDialect(); context_.getOrLoadDialect(); context_.getOrLoadDialect(); context_.getOrLoadDialect(); context_.getOrLoadDialect(); + context_.getOrLoadDialect(); + context_.getOrLoadDialect(); + context_.getOrLoadDialect(); + context_.getOrLoadDialect(); llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); } -bool DaphneIrExecutor::runPasses(mlir::ModuleOp module) -{ - // FIXME: operations in `template` functions (functions with unknown inputs) can't be verified +bool DaphneIrExecutor::runPasses(mlir::ModuleOp module) { + // FIXME: operations in `template` functions (functions with unknown inputs) + // can't be verified // as their type constraints are not met. - //if (failed(mlir::verify(module))) { - //module->emitError("failed to verify the module right after parsing"); - //return false; + // if (failed(mlir::verify(module))) { + // module->emitError("failed to verify the module right after parsing"); + // return false; //} - if (module) { - // This flag is really useful to figure out why the lowering failed - llvm::DebugFlag = userConfig_.debug_llvm; - { - mlir::PassManager pm(&context_); - // TODO Enable the verifier for all passes where it is possible. - // Originally, it was only turned off for the SpecializeGenericFunctionsPass. 
- pm.enableVerifier(false); - - if(userConfig_.explain_parsing) - pm.addPass(mlir::daphne::createPrintIRPass("IR after parsing:")); - - pm.addPass(mlir::createCanonicalizerPass()); - pm.addPass(mlir::createCSEPass()); - if(userConfig_.explain_parsing_simplified) - pm.addPass(mlir::daphne::createPrintIRPass("IR after parsing and some simplifications:")); - - pm.addPass(mlir::daphne::createRewriteSqlOpPass()); // calls SQL Parser - if(userConfig_.explain_sql) - pm.addPass(mlir::daphne::createPrintIRPass("IR after SQL parsing:")); - - pm.addPass(mlir::daphne::createSpecializeGenericFunctionsPass(userConfig_)); - if(userConfig_.explain_property_inference) - pm.addPass(mlir::daphne::createPrintIRPass("IR after inference:")); - - if(failed(pm.run(module))) { - module->dump(); - module->emitError("module pass error"); - return false; - } - } + if (!module) return false; + + // This flag is really useful to figure out why the lowering failed + llvm::DebugFlag = userConfig_.debug_llvm; + { mlir::PassManager pm(&context_); + // TODO Enable the verifier for all passes where it is possible. + // Originally, it was only turned off for the + // SpecializeGenericFunctionsPass. + pm.enableVerifier(false); + + if (userConfig_.explain_parsing) + pm.addPass(mlir::daphne::createPrintIRPass("IR after parsing:")); - // Note that property inference and canonicalization have already been done - // in the SpecializeGenericFunctionsPass, so actually, it's not necessary - // here anymore. - // TODO There is a cyclic dependency between (shape) inference and - // constant folding (included in canonicalization), at the moment we - // run only three iterations of both passes (see #173). 
- pm.addNestedPass(mlir::daphne::createInferencePass()); pm.addPass(mlir::createCanonicalizerPass()); + pm.addPass(mlir::createCSEPass()); + if (userConfig_.explain_parsing_simplified) + pm.addPass(mlir::daphne::createPrintIRPass( + "IR after parsing and some simplifications:")); - if(selectMatrixRepresentations_) - pm.addNestedPass(mlir::daphne::createSelectMatrixRepresentationsPass()); - if(userConfig_.explain_select_matrix_repr) - pm.addPass(mlir::daphne::createPrintIRPass("IR after selecting matrix representations:")); + pm.addPass(mlir::daphne::createRewriteSqlOpPass()); // calls SQL Parser + if (userConfig_.explain_sql) + pm.addPass( + mlir::daphne::createPrintIRPass("IR after SQL parsing:")); - if(userConfig_.use_phy_op_selection) { - pm.addPass(mlir::daphne::createPhyOperatorSelectionPass()); - pm.addPass(mlir::createCSEPass()); + pm.addPass( + mlir::daphne::createSpecializeGenericFunctionsPass(userConfig_)); + if (userConfig_.explain_property_inference) + pm.addPass(mlir::daphne::createPrintIRPass("IR after inference:")); + + if (failed(pm.run(module))) { + module->dump(); + module->emitError("module pass error"); + return false; } - if(userConfig_.explain_phy_op_selection) - pm.addPass(mlir::daphne::createPrintIRPass("IR after selecting physical operators:")); + } - pm.addNestedPass(mlir::daphne::createAdaptTypesToKernelsPass()); - if(userConfig_.explain_type_adaptation) - pm.addPass(mlir::daphne::createPrintIRPass("IR after type adaptation:")); + mlir::PassManager pm(&context_); + // Note that property inference and canonicalization have already been done + // in the SpecializeGenericFunctionsPass, so actually, it's not necessary + // here anymore. + + // TODO There is a cyclic dependency between (shape) inference and + // constant folding (included in canonicalization), at the moment we + // run only three iterations of both passes (see #173). 
+ pm.addNestedPass(mlir::daphne::createInferencePass()); + pm.addPass(mlir::createCanonicalizerPass()); + + if (selectMatrixRepresentations_) + pm.addNestedPass( + mlir::daphne::createSelectMatrixRepresentationsPass()); + if (userConfig_.explain_select_matrix_repr) + pm.addPass(mlir::daphne::createPrintIRPass( + "IR after selecting matrix representations:")); + + if (userConfig_.use_phy_op_selection) { + pm.addPass(mlir::daphne::createPhyOperatorSelectionPass()); + pm.addPass(mlir::createCSEPass()); + } + if (userConfig_.explain_phy_op_selection) + pm.addPass(mlir::daphne::createPrintIRPass( + "IR after selecting physical operators:")); + + pm.addNestedPass( + mlir::daphne::createAdaptTypesToKernelsPass()); + if (userConfig_.explain_type_adaptation) + pm.addPass( + mlir::daphne::createPrintIRPass("IR after type adaptation:")); #if 0 - if (userConfig_.use_distributed) { - pm.addPass(mlir::daphne::createDistributeComputationsPass()); - //pm.addPass(mlir::daphne::createPrintIRPass("IR after distribution:")); - pm.addPass(mlir::createCSEPass()); - //pm.addPass(mlir::daphne::createPrintIRPass("IR after distribution - CSE:")); - pm.addPass(mlir::createCanonicalizerPass()); - //pm.addPass(mlir::daphne::createPrintIRPass("IR after distribution - canonicalization:")); - pm.addNestedPass(mlir::daphne::createWhileLoopInvariantCodeMotionPass()); - //pm.addPass(mlir::daphne::createPrintIRPass("IR after distribution - WhileLICM:")); - } + if (userConfig_.use_distributed) { + pm.addPass(mlir::daphne::createDistributeComputationsPass()); + //pm.addPass(mlir::daphne::createPrintIRPass("IR after distribution")); + pm.addPass(mlir::createCSEPass()); + //pm.addPass(mlir::daphne::createPrintIRPass("IR after distribution - CSE")); + pm.addPass(mlir::createCanonicalizerPass()); + //pm.addPass(mlir::daphne::createPrintIRPass("IR after distribution - canonicalization")); + pm.addNestedPass(mlir::daphne::createWhileLoopInvariantCodeMotionPass()); + 
//pm.addPass(mlir::daphne::createPrintIRPass("IR after distribution - WhileLICM")); + } #endif - - // For now, in order to use the distributed runtime we also require the vectorized engine to be enabled - // to create pipelines. Therefore, *if* distributed runtime is enabled, we need to make a vectorization pass. - if(userConfig_.use_vectorized_exec || userConfig_.use_distributed) { - // TODO: add inference here if we have rewrites that could apply to vectorized pipelines due to smaller sizes - pm.addNestedPass(mlir::daphne::createVectorizeComputationsPass()); - pm.addPass(mlir::createCanonicalizerPass()); - } - if(userConfig_.explain_vectorized) - pm.addPass(mlir::daphne::createPrintIRPass("IR after vectorization:")); - - if (userConfig_.use_distributed) - pm.addPass(mlir::daphne::createDistributePipelinesPass()); - if (userConfig_.enable_profiling) - pm.addNestedPass(mlir::daphne::createProfilingPass()); + // For now, in order to use the distributed runtime we also require the + // vectorized engine to be enabled to create pipelines. Therefore, *if* + // distributed runtime is enabled, we need to make a vectorization pass. 
+ if (userConfig_.use_vectorized_exec || userConfig_.use_distributed) { + // TODO: add inference here if we have rewrites that could apply to + // vectorized pipelines due to smaller sizes + pm.addNestedPass( + mlir::daphne::createVectorizeComputationsPass()); + pm.addPass(mlir::createCanonicalizerPass()); + } + if (userConfig_.explain_vectorized) + pm.addPass(mlir::daphne::createPrintIRPass("IR after vectorization:")); - pm.addNestedPass(mlir::daphne::createInsertDaphneContextPass(userConfig_)); + if (userConfig_.use_distributed) + pm.addPass(mlir::daphne::createDistributePipelinesPass()); + + if (userConfig_.use_mlir_codegen || userConfig_.use_mlir_hybrid_codegen) buildCodegenPipeline(pm); + + if (userConfig_.enable_profiling) + pm.addNestedPass( + mlir::daphne::createProfilingPass()); + + pm.addNestedPass( + mlir::daphne::createInsertDaphneContextPass(userConfig_)); #ifdef USE_CUDA - if(userConfig_.use_cuda) - pm.addNestedPass(mlir::daphne::createMarkCUDAOpsPass(userConfig_)); + if (userConfig_.use_cuda) + pm.addNestedPass( + mlir::daphne::createMarkCUDAOpsPass(userConfig_)); #endif #ifdef USE_FPGAOPENCL - if(userConfig_.use_fpgaopencl) - pm.addNestedPass(mlir::daphne::createMarkFPGAOPENCLOpsPass(userConfig_)); + if (userConfig_.use_fpgaopencl) + pm.addNestedPass( + mlir::daphne::createMarkFPGAOPENCLOpsPass(userConfig_)); #endif - // Tidy up the IR before managing object reference counters with IncRefOp and DecRefOp. - // This is important, because otherwise, an SSA value whose references are managed could - // be cleared away by common subexpression elimination (CSE), while retaining its - // IncRefOps/DecRefOps, which could lead to double frees etc. - pm.addPass(mlir::createCanonicalizerPass()); - pm.addPass(mlir::createCSEPass()); + // Tidy up the IR before managing object reference counters with IncRefOp + // and DecRefOp. 
This is important, because otherwise, an SSA value whose + // references are managed could be cleared away by common subexpression + // elimination (CSE), while retaining its IncRefOps/DecRefOps, which could + // lead to double frees etc. + pm.addPass(mlir::createCanonicalizerPass()); + pm.addPass(mlir::createCSEPass()); - if(userConfig_.use_obj_ref_mgnt) - pm.addNestedPass(mlir::daphne::createManageObjRefsPass()); - if(userConfig_.explain_obj_ref_mgnt) - pm.addPass(mlir::daphne::createPrintIRPass("IR after managing object references:")); + if (userConfig_.use_obj_ref_mgnt) + pm.addNestedPass( + mlir::daphne::createManageObjRefsPass()); + if (userConfig_.explain_obj_ref_mgnt) + pm.addPass(mlir::daphne::createPrintIRPass( + "IR after managing object references:")); - pm.addNestedPass(mlir::daphne::createRewriteToCallKernelOpPass()); - if(userConfig_.explain_kernels) - pm.addPass(mlir::daphne::createPrintIRPass("IR after kernel lowering:")); + pm.addNestedPass( + mlir::daphne::createRewriteToCallKernelOpPass()); + if (userConfig_.explain_kernels) + pm.addPass( + mlir::daphne::createPrintIRPass("IR after kernel lowering:")); - pm.addPass(mlir::createConvertSCFToCFPass()); - pm.addNestedPass(mlir::LLVM::createRequestCWrappersPass()); - pm.addPass(mlir::daphne::createLowerToLLVMPass(userConfig_)); - pm.addPass(mlir::createReconcileUnrealizedCastsPass()); - if(userConfig_.explain_llvm) - pm.addPass(mlir::daphne::createPrintIRPass("IR after llvm lowering:")); + pm.addPass(mlir::createConvertSCFToCFPass()); + pm.addNestedPass( + mlir::LLVM::createRequestCWrappersPass()); + pm.addPass(mlir::daphne::createLowerToLLVMPass(userConfig_)); + pm.addPass(mlir::createReconcileUnrealizedCastsPass()); + if (userConfig_.explain_llvm) + pm.addPass(mlir::daphne::createPrintIRPass("IR after llvm lowering:")); - if (failed(pm.run(module))) { - module->dump(); - module->emitError("module pass error"); - return false; - } - return true; + if (failed(pm.run(module))) { + module->dump(); + 
module->emitError("module pass error"); + return false; } - return false; + + return true; } -std::unique_ptr DaphneIrExecutor::createExecutionEngine(mlir::ModuleOp module) -{ - if (module) { - // An optimization pipeline to use within the execution engine. - auto optPipeline = mlir::makeOptimizingTransformer(0, 0, nullptr); - std::vector sharedLibRefs; - // This next line adds to our Linux platform lock-in - std::string daphne_executable_dir(std::filesystem::canonical("/proc/self/exe").parent_path()); - if(userConfig_.libdir.empty()) { - sharedLibRefPaths.push_back(std::string(daphne_executable_dir + "/../lib/libAllKernels.so")); - sharedLibRefs.emplace_back(sharedLibRefPaths.back()); - } - else { - sharedLibRefs.insert(sharedLibRefs.end(), userConfig_.library_paths.begin(), userConfig_.library_paths.end()); - } +std::unique_ptr DaphneIrExecutor::createExecutionEngine( + mlir::ModuleOp module) { + if (!module) return nullptr; + // An optimization pipeline to use within the execution engine. 
+ unsigned optLevel = 0; + unsigned sizeLevel = 0; + llvm::TargetMachine *targetMachine = nullptr; + auto optPipeline = mlir::makeOptimizingTransformer(optLevel, sizeLevel, targetMachine); + std::vector sharedLibRefs; + // This next line adds to our Linux platform lock-in + std::string daphne_executable_dir( + std::filesystem::canonical("/proc/self/exe").parent_path()); + if (userConfig_.libdir.empty()) { + sharedLibRefPaths.push_back( + std::string(daphne_executable_dir + "/../lib/libAllKernels.so")); + sharedLibRefs.emplace_back(sharedLibRefPaths.back()); + } else { + sharedLibRefs.insert(sharedLibRefs.end(), + userConfig_.library_paths.begin(), + userConfig_.library_paths.end()); + } #ifdef USE_CUDA - if(userConfig_.use_cuda) { - sharedLibRefPaths.push_back(std::string(daphne_executable_dir + "/../lib/libCUDAKernels.so")); - sharedLibRefs.emplace_back(sharedLibRefPaths.back()); - } + if (userConfig_.use_cuda) { + sharedLibRefPaths.push_back( + std::string(daphne_executable_dir + "/../lib/libCUDAKernels.so")); + sharedLibRefs.emplace_back(sharedLibRefPaths.back()); + } #endif - + #ifdef USE_FPGAOPENCL - if(userConfig_.use_fpgaopencl) { - sharedLibRefPaths.push_back(std::string(daphne_executable_dir + "/../lib/libFPGAOPENCLKernels.so")); - sharedLibRefs.emplace_back(sharedLibRefPaths.back()); - } + if (userConfig_.use_fpgaopencl) { + sharedLibRefPaths.push_back(std::string( + daphne_executable_dir + "/../lib/libFPGAOPENCLKernels.so")); + sharedLibRefs.emplace_back(sharedLibRefPaths.back()); + } #endif - registerLLVMDialectTranslation(context_); - // module.dump(); - mlir::ExecutionEngineOptions options; - options.llvmModuleBuilder = nullptr; - options.transformer = optPipeline; - options.jitCodeGenOptLevel = llvm::CodeGenOpt::Level::Default; - options.sharedLibPaths = llvm::ArrayRef(sharedLibRefs); - options.enableObjectDump = true; - options.enableGDBNotificationListener = true; - options.enablePerfNotificationListener = true; - auto maybeEngine = 
mlir::ExecutionEngine::create(module, options); - - if (!maybeEngine) { - llvm::errs() << "Failed to create JIT-Execution engine: " - << maybeEngine.takeError(); - return nullptr; - } - return std::move(maybeEngine.get()); + registerLLVMDialectTranslation(context_); + // module.dump(); + mlir::ExecutionEngineOptions options; + options.llvmModuleBuilder = nullptr; + options.transformer = optPipeline; + options.jitCodeGenOptLevel = llvm::CodeGenOpt::Level::Default; + options.sharedLibPaths = llvm::ArrayRef(sharedLibRefs); + options.enableObjectDump = true; + options.enableGDBNotificationListener = true; + options.enablePerfNotificationListener = true; + auto maybeEngine = mlir::ExecutionEngine::create(module, options); + + if (!maybeEngine) { + llvm::errs() << "Failed to create JIT-Execution engine: " + << maybeEngine.takeError(); + return nullptr; } - return nullptr; + return std::move(maybeEngine.get()); +} + +void DaphneIrExecutor::buildCodegenPipeline(mlir::PassManager &pm) { + if (userConfig_.explain_mlir_codegen) + pm.addPass( + mlir::daphne::createPrintIRPass("IR before codegen pipeline")); + + pm.addPass(mlir::daphne::createDaphneOptPass()); + + if (!userConfig_.use_mlir_hybrid_codegen) { + pm.addPass(mlir::daphne::createMatMulOpLoweringPass()); + } + + pm.addPass(mlir::daphne::createAggAllOpLoweringPass()); + pm.addPass(mlir::daphne::createMapOpLoweringPass()); + pm.addPass(mlir::createInlinerPass()); + + pm.addPass(mlir::daphne::createEwOpLoweringPass()); + pm.addPass(mlir::createConvertMathToLLVMPass()); + pm.addPass(mlir::daphne::createModOpLoweringPass()); + pm.addPass(mlir::createCanonicalizerPass()); + pm.addPass(mlir::createCSEPass()); + pm.addNestedPass(mlir::createLoopFusionPass()); + pm.addNestedPass( + mlir::createAffineScalarReplacementPass()); + pm.addPass(mlir::createLowerAffinePass()); + + if (userConfig_.explain_mlir_codegen) + pm.addPass( + mlir::daphne::createPrintIRPass("IR after codegen pipeline")); } diff --git 
a/src/compiler/execution/DaphneIrExecutor.h b/src/compiler/execution/DaphneIrExecutor.h index 05d32d7b1..ef1c32d13 100644 --- a/src/compiler/execution/DaphneIrExecutor.h +++ b/src/compiler/execution/DaphneIrExecutor.h @@ -19,6 +19,7 @@ #include "mlir/IR/BuiltinOps.h" #include "mlir/ExecutionEngine/ExecutionEngine.h" #include +#include "mlir/Pass/PassManager.h" class DaphneIrExecutor { @@ -36,5 +37,7 @@ class DaphneIrExecutor bool selectMatrixRepresentations_; // Storage for lib paths needed for StringRefs std::vector sharedLibRefPaths; + + void buildCodegenPipeline(mlir::PassManager &); }; diff --git a/src/compiler/explanation/PrintIRPass.cpp b/src/compiler/explanation/PrintIRPass.cpp index 6dabf88d6..3adf1bf5b 100644 --- a/src/compiler/explanation/PrintIRPass.cpp +++ b/src/compiler/explanation/PrintIRPass.cpp @@ -17,34 +17,39 @@ #include #include -#include #include +#include using namespace mlir; /** * @brief A compiler pass that simply prints the IR. - * + * * Useful for manual testing and debugging, since this pass can easily be * integrated after any other pass to have a look at the IR. 
*/ class PrintIRPass : public PassWrapper> { - std::string message; - -public: - PrintIRPass(const std::string message) : message(message) { - // - } - + + public: + PrintIRPass(const std::string message) : message(message) {} + void runOnOperation() final; + + StringRef getArgument() const final { return "print-ir"; } + StringRef getDescription() const final { + return "Pass for debugging purposes, prints the IR at the current " + "stage in the compilation pipeline."; + } }; void PrintIRPass::runOnOperation() { std::cerr << message << std::endl; - + auto module = getOperation(); - module.dump(); + OpPrintingFlags flags = {}; + flags.enableDebugInfo(/*enable=*/false, /*prettyForm=*/false); + module.print(llvm::errs(), flags); } std::unique_ptr daphne::createPrintIRPass(const std::string message) { diff --git a/src/compiler/inference/AdaptTypesToKernelsPass.cpp b/src/compiler/inference/AdaptTypesToKernelsPass.cpp index 94a261866..22812ba9d 100644 --- a/src/compiler/inference/AdaptTypesToKernelsPass.cpp +++ b/src/compiler/inference/AdaptTypesToKernelsPass.cpp @@ -40,6 +40,10 @@ using namespace mlir; struct AdaptTypesToKernelsPass : public PassWrapper> { void runOnOperation() final; + StringRef getArgument() const final { return "adapt-types-to-kernels"; } + StringRef getDescription() const final { + return "TODO"; + } }; void AdaptTypesToKernelsPass::runOnOperation() diff --git a/src/compiler/inference/InferencePass.cpp b/src/compiler/inference/InferencePass.cpp index 414a2afb6..c0af79ac0 100644 --- a/src/compiler/inference/InferencePass.cpp +++ b/src/compiler/inference/InferencePass.cpp @@ -519,8 +519,11 @@ class InferencePass : public PassWrapper daphne::createInferencePass(daphne::InferenceConfig cfg) { return std::make_unique(cfg); -} \ No newline at end of file +} diff --git a/src/compiler/inference/SelectMatrixRepresentationsPass.cpp b/src/compiler/inference/SelectMatrixRepresentationsPass.cpp index 11f74280b..9049b0a9c 100644 --- 
a/src/compiler/inference/SelectMatrixRepresentationsPass.cpp +++ b/src/compiler/inference/SelectMatrixRepresentationsPass.cpp @@ -161,6 +161,9 @@ class SelectMatrixRepresentationsPass : public PassWrappergetOperandTypes())); } + StringRef getArgument() const final { return "select-matrix-representations"; } + StringRef getDescription() const final { return "TODO"; } + static bool returnsKnownProperties(Operation *op) { return llvm::any_of(op->getResultTypes(), [](Type rt) { if(auto mt = rt.dyn_cast()) @@ -172,4 +175,4 @@ class SelectMatrixRepresentationsPass : public PassWrapper daphne::createSelectMatrixRepresentationsPass() { return std::make_unique(); -} \ No newline at end of file +} diff --git a/src/compiler/lowering/AggAllOpLowering.cpp b/src/compiler/lowering/AggAllOpLowering.cpp new file mode 100644 index 000000000..f3f16f861 --- /dev/null +++ b/src/compiler/lowering/AggAllOpLowering.cpp @@ -0,0 +1,180 @@ +/* + * Copyright 2023 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include "compiler/utils/CompilerUtils.h" +#include "compiler/utils/LoweringUtils.h" +#include "ir/daphneir/Daphne.h" +#include "ir/daphneir/Passes.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/ArrayRef.h" +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" +#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" +#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" +#include "mlir/Conversion/LLVMCommon/ConversionTarget.h" +#include "mlir/Conversion/LLVMCommon/LoweringOptions.h" +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Conversion/LinalgToStandard/LinalgToStandard.h" +#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" +#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/Transforms/FuncConversions.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/IR/BuiltinDialect.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/UseDefLists.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/DialectConversion.h" + +using namespace mlir; + +class SumAllOpLowering : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + daphne::AllAggSumOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + mlir::daphne::MatrixType matrixType = + adaptor.getArg().getType().dyn_cast(); + + auto loc = op->getLoc(); + auto nR = matrixType.getNumRows(); + auto nC = matrixType.getNumCols(); + + auto matrixElementType = matrixType.getElementType(); + auto 
memRefType = mlir::MemRefType::get({nR, nC}, matrixElementType); + auto memRef = rewriter.create( + op->getLoc(), memRefType, adaptor.getArg()); + + Value sum = rewriter.create( + loc, rewriter.getF64Type(), rewriter.getF64FloatAttr(0)); + + SmallVector loopIvs; + SmallVector forOps; + auto outerLoop = + rewriter.create(loc, 0, nR, 1, ValueRange{sum}); + for (Operation &nested : *outerLoop.getBody()) { + rewriter.eraseOp(&nested); + } + loopIvs.push_back(outerLoop.getInductionVar()); + // outer loop body + rewriter.setInsertionPointToStart(outerLoop.getBody()); + Value sum_iter = rewriter.create( + loc, rewriter.getF64Type(), rewriter.getF64FloatAttr(0)); + // inner loop + auto innerLoop = + rewriter.create(loc, 0, nC, 1, ValueRange{sum_iter}); + for (Operation &nested : *innerLoop.getBody()) { + rewriter.eraseOp(&nested); + } + loopIvs.push_back(innerLoop.getInductionVar()); + // inner loop body + rewriter.setInsertionPointToStart(innerLoop.getBody()); + // load value from memref + auto elementLoad = + rewriter.create(loc, memRef, loopIvs); + // sum loop iter arg and memref value + mlir::Value inner_sum = rewriter.create( + loc, innerLoop.getRegionIterArgs()[0], elementLoad); + // yield inner loop result + rewriter.setInsertionPointToEnd(innerLoop.getBody()); + rewriter.create(loc, inner_sum); + // yield outer loop result + rewriter.setInsertionPointToEnd(outerLoop.getBody()); + mlir::Value outer_sum = rewriter.create( + loc, outerLoop.getRegionIterArgs()[0], innerLoop.getResult(0)); + rewriter.create(loc, outer_sum); + + rewriter.setInsertionPointAfter(outerLoop); + rewriter.create(loc, adaptor.getArg()); + // replace sumAll op with result of loops + rewriter.replaceOp(op, outerLoop.getResult(0)); + + return success(); + } +}; + +namespace { +/** + * @brief Lowers the daphne::AggAll operator to a set of affine loops and + * performs the aggregation on a MemRef which is created from the input + * DenseMatrix. 
+ * + * This rewrite may enable loop fusion of the produced affine loops by + * running the loop fusion pass. + */ +struct AggAllLoweringPass + : public mlir::PassWrapper> { + explicit AggAllLoweringPass() {} + + StringRef getArgument() const final { return "lower-agg"; } + StringRef getDescription() const final { + return "Lowers AggAll operators to a set of affine loops and performs " + "the aggregation on a MemRef which is created from the input " + "DenseMatrix."; + } + + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() final; +}; +} // end anonymous namespace + +void AggAllLoweringPass::runOnOperation() { + mlir::ConversionTarget target(getContext()); + mlir::RewritePatternSet patterns(&getContext()); + LowerToLLVMOptions llvmOptions(&getContext()); + LLVMTypeConverter typeConverter(&getContext(), llvmOptions); + + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + + target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); + + target.addIllegalOp(); + + patterns.insert(&getContext()); + auto module = getOperation(); + if (failed(applyPartialConversion(module, target, std::move(patterns)))) { + signalPassFailure(); + } +} + +std::unique_ptr mlir::daphne::createAggAllOpLoweringPass() { + return std::make_unique(); +} diff --git a/src/compiler/lowering/CMakeLists.txt b/src/compiler/lowering/CMakeLists.txt index 0484a8b5c..6b9ac25af 100644 --- a/src/compiler/lowering/CMakeLists.txt +++ b/src/compiler/lowering/CMakeLists.txt @@ -27,6 +27,12 @@ add_mlir_dialect_library(MLIRDaphneTransforms SpecializeGenericFunctionsPass.cpp VectorizeComputationsPass.cpp WhileLoopInvariantCodeMotionPass.cpp + DaphneOptPass.cpp + EwOpsLowering.cpp + ModOpLowering.cpp + MapOpLowering.cpp + MatMulOpLowering.cpp + AggAllOpLowering.cpp DEPENDS MLIRDaphneOpsIncGen @@ -35,9 +41,14 @@ 
add_mlir_dialect_library(MLIRDaphneTransforms LINK_COMPONENTS Core ) + target_link_libraries(MLIRDaphneTransforms PUBLIC CompilerUtils + MLIRSCFToControlFlow MLIRArithToLLVM + MLIRMemRefToLLVM + MLIRAffineToStandard + MLIRLinalgToStandard MLIRControlFlowToLLVM MLIRFuncToLLVM MLIRFuncTransforms diff --git a/src/compiler/lowering/DaphneOptPass.cpp b/src/compiler/lowering/DaphneOptPass.cpp new file mode 100644 index 000000000..8795962e2 --- /dev/null +++ b/src/compiler/lowering/DaphneOptPass.cpp @@ -0,0 +1,102 @@ +#include "compiler/utils/CompilerUtils.h" +#include "compiler/utils/LoweringUtils.h" +#include "ir/daphneir/Daphne.h" +#include "ir/daphneir/Passes.h" +#include "llvm/Support/Debug.h" +#include "mlir/Conversion/LLVMCommon/LoweringOptions.h" +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/BuiltinDialect.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/DialectConversion.h" + +#define DEBUG_TYPE "dm-opt" + +using namespace mlir; + +class IntegerModOpt : public mlir::OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + [[nodiscard]] static bool optimization_viable(mlir::daphne::EwModOp op) { + if (!op.getRhs().getType().isUnsignedInteger()) return false; + + std::pair isConstant = + CompilerUtils::isConstant(op.getRhs()); + // Apply (lhs % rhs) to (lhs & (rhs - 1)) optimization when rhs is a power of two + return isConstant.first && (isConstant.second & (isConstant.second - 1)) == 0; + } + + mlir::LogicalResult matchAndRewrite( + mlir::daphne::EwModOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + mlir::Value cst_one = rewriter.create( + op.getLoc(), static_cast(1)); + mlir::Value sub = rewriter.create( + 
op.getLoc(), adaptor.getRhs(), cst_one); + mlir::Value andOp = rewriter.create( + op.getLoc(), adaptor.getLhs(), sub); + rewriter.replaceOp(op, andOp); + return success(); + } +}; + +namespace { +/** + * @brief This pass transforms operations (currently limited to the EwModOp) in + * the DaphneDialect to a different set of operations also from the + * DaphneDialect. + */ +struct DenseMatrixOptPass + : public mlir::PassWrapper> { + explicit DenseMatrixOptPass() {} + + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() final; + + StringRef getArgument() const final { return "opt-daphne"; } + StringRef getDescription() const final { + return "Performs optimizations on the DaphneIR by transforming " + "operations in the DaphneDialect to a set of other operation " + "also from the DaphneDialect."; + } +}; +} // end anonymous namespace + +void DenseMatrixOptPass::runOnOperation() { + mlir::ConversionTarget target(getContext()); + mlir::RewritePatternSet patterns(&getContext()); + mlir::LowerToLLVMOptions llvmOptions(&getContext()); + mlir::LLVMTypeConverter typeConverter(&getContext(), llvmOptions); + + typeConverter.addConversion([](Type type) { return type; }); + + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + + target.addDynamicallyLegalOp( + [&](mlir::daphne::EwModOp op) { + return !IntegerModOpt::optimization_viable(op); + }); + + patterns.insert(typeConverter, &getContext()); + + auto module = getOperation(); + if (failed(applyPartialConversion(module, target, std::move(patterns)))) { + signalPassFailure(); + } +} + +std::unique_ptr mlir::daphne::createDaphneOptPass() { + return std::make_unique(); +} diff --git a/src/compiler/lowering/DistributeComputationsPass.cpp b/src/compiler/lowering/DistributeComputationsPass.cpp index d57a00a62..09b57a9a2 100644 --- a/src/compiler/lowering/DistributeComputationsPass.cpp +++ 
b/src/compiler/lowering/DistributeComputationsPass.cpp @@ -73,6 +73,9 @@ struct DistributeComputationsPass : public PassWrapper> { void runOnOperation() final; + + StringRef getArgument() const final { return "distribute-computation"; } + StringRef getDescription() const final { return "TODO"; } }; } diff --git a/src/compiler/lowering/DistributePipelinesPass.cpp b/src/compiler/lowering/DistributePipelinesPass.cpp index ae4ce4698..d4ea14468 100644 --- a/src/compiler/lowering/DistributePipelinesPass.cpp +++ b/src/compiler/lowering/DistributePipelinesPass.cpp @@ -67,6 +67,9 @@ struct DistributePipelinesPass : public PassWrapper> { void runOnOperation() final; + + StringRef getArgument() const final { return "distribute-pipelines"; } + StringRef getDescription() const final { return "TODO"; } }; void DistributePipelinesPass::runOnOperation() diff --git a/src/compiler/lowering/EwOpsLowering.cpp b/src/compiler/lowering/EwOpsLowering.cpp new file mode 100644 index 000000000..d892fdfe8 --- /dev/null +++ b/src/compiler/lowering/EwOpsLowering.cpp @@ -0,0 +1,344 @@ +/* + * Copyright 2023 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include "compiler/utils/CompilerUtils.h" +#include "compiler/utils/LoweringUtils.h" +#include "ir/daphneir/Daphne.h" +#include "ir/daphneir/Passes.h" +#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" +#include "mlir/Conversion/LLVMCommon/LoweringOptions.h" +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/BuiltinDialect.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/IR/UseDefLists.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/DialectConversion.h" + +using namespace mlir; + +template +struct UnaryOpLowering : public mlir::OpConversionPattern { + using OpAdaptor = typename mlir::OpConversionPattern::OpAdaptor; + + public: + UnaryOpLowering(mlir::TypeConverter &typeConverter, mlir::MLIRContext *ctx) + : mlir::OpConversionPattern(typeConverter, ctx) { + this->setDebugName("EwDaphneOpsLowering"); + } + + mlir::LogicalResult matchAndRewrite( + UnaryOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + mlir::Type type = op.getType(); + + if (type.isa()) { + rewriter.replaceOpWithNewOp(op.getOperation(), + adaptor.getOperands()); + } else if (type.isa()) { + rewriter.replaceOpWithNewOp(op.getOperation(), + adaptor.getOperands()); + } else { + return mlir::failure(); + } + return mlir::success(); + } +}; + +template +class BinaryOpLowering final : public mlir::OpConversionPattern { + using OpAdaptor = typename mlir::OpConversionPattern::OpAdaptor; + + public: + BinaryOpLowering(mlir::TypeConverter &typeConverter, mlir::MLIRContext *ctx) + : 
mlir::OpConversionPattern(typeConverter, ctx) { + this->setDebugName("EwDaphneOpLowering"); + } + + mlir::LogicalResult convertEwScalar( + BinaryOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + auto lhs = adaptor.getLhs(); + auto rhs = adaptor.getRhs(); + auto loc = op.getLoc(); + + if (lhs.getType().template isa() && + rhs.getType().template isa()) { + rewriter.replaceOpWithNewOp(op.getOperation(), + adaptor.getOperands()); + return mlir::success(); + } + + Value castedLhs = this->typeConverter->materializeTargetConversion( + rewriter, loc, + rewriter.getIntegerType( + adaptor.getRhs().getType().getIntOrFloatBitWidth()), + ValueRange{adaptor.getLhs()}); + + Value castedRhs = this->typeConverter->materializeTargetConversion( + rewriter, loc, + rewriter.getIntegerType( + adaptor.getRhs().getType().getIntOrFloatBitWidth()), + ValueRange{adaptor.getRhs()}); + + Value binaryOp = rewriter.create(loc, castedLhs, castedRhs); + + Value res = this->typeConverter->materializeSourceConversion( + rewriter, loc, lhs.getType(), ValueRange{binaryOp}); + + rewriter.replaceOp(op, res); + return mlir::success(); + } + + mlir::LogicalResult matchAndRewrite( + BinaryOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + auto lhs = adaptor.getLhs(); + auto rhs = adaptor.getRhs(); + + // no matrix + if (!lhs.getType().template isa() && + !rhs.getType().template isa()) + return convertEwScalar(op, adaptor, rewriter); + + // for now assume matrix is LHS and RHS is non matrix + mlir::daphne::MatrixType lhsMatrixType = + adaptor.getLhs() + .getType() + .template dyn_cast(); + auto matrixElementType = lhsMatrixType.getElementType(); + auto lhsRows = lhsMatrixType.getNumRows(); + auto lhsCols = lhsMatrixType.getNumCols(); + auto lhsMemRefType = + mlir::MemRefType::get({lhsRows, lhsCols}, matrixElementType); + + mlir::Type elementType{}; + mlir::Value memRefLhs = + rewriter.create( + op->getLoc(), lhsMemRefType, 
adaptor.getLhs()); + + mlir::Value memRefRhs{}; + bool isMatrixMatrix = + rhs.getType().template isa(); + + if (isMatrixMatrix) { + memRefRhs = + rewriter.create( + op->getLoc(), lhsMemRefType, adaptor.getRhs()); + elementType = lhsMemRefType.getElementType(); + } else { + elementType = rhs.getType(); + } + + mlir::Value outputMemRef = + insertMemRefAlloc(lhsMemRefType, op->getLoc(), rewriter); + + SmallVector lowerBounds(/*Rank=*/2, /*Value=*/0); + SmallVector steps(/*Rank=*/2, /*Value=*/1); + buildAffineLoopNest( + rewriter, op.getLoc(), lowerBounds, + {lhsMatrixType.getNumRows(), lhsMatrixType.getNumCols()}, steps, + [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { + mlir::Value loadLhs = + nestedBuilder.create(loc, memRefLhs, ivs); + mlir::Value binaryOp{}; + + if (adaptor.getRhs() + .getType() + .template isa()) { + binaryOp = nestedBuilder.create(loc, loadLhs, + adaptor.getRhs()); + + nestedBuilder.create(loc, binaryOp, + outputMemRef, ivs); + return; + } + + mlir::Value rhs{}; + if (isMatrixMatrix) + rhs = + nestedBuilder.create(loc, memRefRhs, ivs); + else + rhs = adaptor.getRhs(); + + // is integer + if (elementType.isInteger( + elementType.getIntOrFloatBitWidth())) { + Value castedLhs = + this->typeConverter->materializeTargetConversion( + nestedBuilder, loc, + nestedBuilder.getIntegerType( + lhsMemRefType.getElementTypeBitWidth()), + ValueRange{loadLhs}); + + Value castedRhs = + this->typeConverter->materializeTargetConversion( + nestedBuilder, loc, + nestedBuilder.getIntegerType( + lhsMemRefType.getElementTypeBitWidth()), + ValueRange{rhs}); + + binaryOp = + nestedBuilder.create(loc, castedLhs, castedRhs); + Value castedRes = + this->typeConverter->materializeSourceConversion( + nestedBuilder, loc, elementType, + ValueRange{binaryOp}); + nestedBuilder.create(loc, castedRes, + outputMemRef, ivs); + } else { + // is float + binaryOp = nestedBuilder.create(loc, loadLhs, rhs); + nestedBuilder.create(loc, binaryOp, + outputMemRef, ivs); + } + 
}); + mlir::Value output = convertMemRefToDenseMatrix( + op->getLoc(), rewriter, outputMemRef, op.getType()); + + rewriter.replaceOp(op, output); + return mlir::success(); + } +}; + +// clang-format off +// math::sqrt only supports floating point, DAPHNE promotes argument type of sqrt to f32/64 +using SqrtOpLowering = UnaryOpLowering; +using AbsOpLowering = UnaryOpLowering; +using AddOpLowering = BinaryOpLowering; +using SubOpLowering = BinaryOpLowering; +using MulOpLowering = BinaryOpLowering; +using DivOpLowering = BinaryOpLowering; +using PowOpLowering = BinaryOpLowering; +// clang-format on + +namespace { +/** + * @brief This pass lowers element-wise operations to affine loop + * structures and arithmetic operations. + * + * This rewrite may enable loop fusion of the produced affine loops by + * running the loop fusion pass. + */ +struct EwOpLoweringPass + : public mlir::PassWrapper> { + explicit EwOpLoweringPass() {} + + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() final; + + StringRef getArgument() const final { return "lower-ew"; } + StringRef getDescription() const final { + return "This pass lowers element-wise operations to affine-loop " + "structures and arithmetic operations."; + } +}; +} // end anonymous namespace + +void populateLowerEwOpConversionPatterns(mlir::LLVMTypeConverter &typeConverter, + mlir::RewritePatternSet &patterns) { + // clang-format off + patterns.insert< + AddOpLowering, + SubOpLowering, + MulOpLowering, + SqrtOpLowering, + AbsOpLowering, + DivOpLowering, + PowOpLowering>(typeConverter, patterns.getContext()); + // clang-format on +} + +void EwOpLoweringPass::runOnOperation() { + mlir::ConversionTarget target(getContext()); + mlir::RewritePatternSet patterns(&getContext()); + mlir::LowerToLLVMOptions llvmOptions(&getContext()); + mlir::LLVMTypeConverter typeConverter(&getContext(), llvmOptions); + + typeConverter.addConversion(convertInteger); + 
typeConverter.addConversion(convertFloat); + typeConverter.addConversion([](Type type) { return type; }); + typeConverter.addArgumentMaterialization(materializeCastFromIllegal); + typeConverter.addSourceMaterialization(materializeCastToIllegal); + typeConverter.addTargetMaterialization(materializeCastFromIllegal); + + target.addLegalDialect(); + + target.addDynamicallyLegalOp( + [](Operation *op) { + return op->getOperandTypes()[0].isa(); + }); + + target.addDynamicallyLegalOp([](Operation *op) { + if (op->getOperandTypes()[0].isa() && + op->getOperandTypes()[1].isa()) { + mlir::daphne::MatrixType lhs = + op->getOperandTypes()[0] + .template dyn_cast(); + mlir::daphne::MatrixType rhs = + op->getOperandTypes()[1] + .template dyn_cast(); + if (lhs.getNumRows() != rhs.getNumRows() || + lhs.getNumCols() != rhs.getNumCols() || + lhs.getNumRows() == -1 || lhs.getNumCols() == -1) + return true; + + return false; + } + + if (op->getOperandTypes()[0].isa()) { + mlir::daphne::MatrixType lhsMatrixType = + op->getOperandTypes()[0].dyn_cast(); + return lhsMatrixType.getNumRows() == -1 || lhsMatrixType.getNumCols() == -1; + } + + return false; + }); + + populateLowerEwOpConversionPatterns(typeConverter, patterns); + + auto module = getOperation(); + if (failed(applyPartialConversion(module, target, std::move(patterns)))) + signalPassFailure(); +} + +std::unique_ptr mlir::daphne::createEwOpLoweringPass() { + return std::make_unique(); +} diff --git a/src/compiler/lowering/LowerToLLVMPass.cpp b/src/compiler/lowering/LowerToLLVMPass.cpp index 6baa7e4ce..6fd9c975e 100644 --- a/src/compiler/lowering/LowerToLLVMPass.cpp +++ b/src/compiler/lowering/LowerToLLVMPass.cpp @@ -18,16 +18,23 @@ #include "ir/daphneir/Passes.h" #include "compiler/utils/CompilerUtils.h" +#include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h" + +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" #include 
"mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" +#include "mlir/Conversion/LinalgToStandard/LinalgToStandard.h" +#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Func/Transforms/FuncConversions.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Transforms/DialectConversion.h" #include @@ -41,35 +48,6 @@ using namespace mlir; // be combined into a single variadic result. const std::string ATTR_HASVARIADICRESULTS = "hasVariadicResults"; -#if 0 -// At the moment, all of these operations are lowered to kernel calls. -template -struct BinaryOpLowering : public OpConversionPattern -{ - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(BinaryOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override - { - Type type = op.getType(); - if (type.isa()) { - rewriter.replaceOpWithNewOp(op.getOperation(), adaptor.getOperands()); - } - else if (type.isa()) { - rewriter.replaceOpWithNewOp(op.getOperation(), adaptor.getOperands()); - } - else { - return failure(); - } - return success(); - } -}; -using AddOpLowering = BinaryOpLowering; -using SubOpLowering = BinaryOpLowering; -using MulOpLowering = BinaryOpLowering; -#endif - struct ReturnOpLowering : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -308,16 +286,18 @@ class CallKernelOpLowering : public OpConversionPattern auto loc = op.getLoc(); auto inputOutputTypes = getLLVMInputOutputTypes( - loc, rewriter.getContext(), typeConverter, - op.getResultTypes(), 
ValueRange(adaptor.getOperands()).getTypes(), - hasVarRes, rewriter.getIndexType()); + loc, rewriter.getContext(), typeConverter, op.getResultTypes(), + ValueRange(adaptor.getOperands()).getTypes(), hasVarRes, + rewriter.getIndexType()); // create function protoype and get `FlatSymbolRefAttr` to it auto kernelRef = getOrInsertFunctionAttr( - rewriter, module, op.getCalleeAttr().getValue(), - getKernelFuncSignature(rewriter.getContext(), inputOutputTypes)); + rewriter, module, op.getCalleeAttr().getValue(), + getKernelFuncSignature(rewriter.getContext(), inputOutputTypes)); - auto kernelOperands = allocOutputReferences(loc, rewriter, adaptor.getOperands(), inputOutputTypes, op->getNumResults(), hasVarRes); + auto kernelOperands = allocOutputReferences( + loc, rewriter, adaptor.getOperands(), inputOutputTypes, + op->getNumResults(), hasVarRes); // call function // The kernel call has an empty list of return types, because our @@ -934,6 +914,7 @@ void DaphneLowerToLLVMPass::runOnOperation() RewritePatternSet patterns(&getContext()); LowerToLLVMOptions llvmOptions(&getContext()); + // llvmOptions.useBarePtrCallConv = true; LLVMTypeConverter typeConverter(&getContext(), llvmOptions); typeConverter.addConversion([&](daphne::MatrixType t) { @@ -985,9 +966,13 @@ void DaphneLowerToLLVMPass::runOnOperation() LLVMConversionTarget target(getContext()); // populate dialect conversions - arith::populateArithToLLVMConversionPatterns(typeConverter, patterns); - populateFuncToLLVMConversionPatterns(typeConverter, patterns); + mlir::linalg::populateLinalgToStandardConversionPatterns(patterns); + populateAffineToStdConversionPatterns(patterns); + populateSCFToControlFlowConversionPatterns(patterns); + mlir::arith::populateArithToLLVMConversionPatterns(typeConverter, patterns); + populateFinalizeMemRefToLLVMConversionPatterns(typeConverter, patterns); cf::populateControlFlowToLLVMConversionPatterns(typeConverter, patterns); + populateFuncToLLVMConversionPatterns(typeConverter, 
patterns); populateReturnOpTypeConversionPattern(patterns, typeConverter); target.addLegalOp(); diff --git a/src/compiler/lowering/ManageObjRefsPass.cpp b/src/compiler/lowering/ManageObjRefsPass.cpp index b819912b6..90120163f 100644 --- a/src/compiler/lowering/ManageObjRefsPass.cpp +++ b/src/compiler/lowering/ManageObjRefsPass.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -26,11 +27,11 @@ using namespace mlir; /** * @brief Inserts DaphneIR operations for managing the reference counters of * runtime data objects. - * + * * Thus, it takes care of freeing data objects (e.g., intermediate results) at * the right points. The operations employed for reference management are * `IncRefOp` and `DecRefOp`. - * + * * The core ideas are: * - We decrease the reference counter of each SSA value (block argument or * op result) to prevent memory leaks. @@ -48,12 +49,23 @@ struct ManageObjRefsPass : public PassWrapper(builder.getUnknownLoc(), + v.getDefiningOp()->getOperand(0)); +} + /** * @brief Inserts a `DecRefOp` in the right place, to decrease the reference * counter of the given value. - * + * * @param builder * @param v */ @@ -62,21 +74,22 @@ void processValue(OpBuilder builder, Value v) { // removed soon anyway). // We only need to manage the reference counters of DAPHNE data objects // like matrices and frames (not of scalars). + + Operation* defOp = v.getDefiningOp(); + if (defOp && llvm::isa(defOp)) + processMemRefInterop(builder, v); + if(!v.getType().isa()) return; - - Operation * defOp = v.getDefiningOp(); - Operation * decRefAfterOp = nullptr; - if(v.use_empty()) { + Operation* decRefAfterOp = nullptr; + if (v.use_empty()) { // If the given SSA value has no uses, we want to decrease its // reference counter directly after its definition (nullptr for block // args). Note that ideally, there should be no unused SSA values. 
- if(defOp) - decRefAfterOp = defOp; + if (defOp) decRefAfterOp = defOp; // else: decRefAfterOp stays nullptr - } - else { + } else { // If the given SSA value has uses, we need to find the last of them. // Note that the iterator over the uses provided by the value does not // seem to follow any useful order, in general, so we need to find out @@ -85,26 +98,15 @@ void processValue(OpBuilder builder, Value v) { // value in the block where the value was defined, to simplify things. // So if the user of the value is in a descendant block, we need to // find its parent op in the block where the given value is defined. - Operation * lastUseOp = nullptr; - // TODO What about Block::findAncestorInBlock()? - for(OpOperand & use : v.getUses()) { - Operation * thisUseOp = use.getOwner(); - // Find parent op in the block where v is defined. - while(thisUseOp->getBlock() != v.getParentBlock()) - thisUseOp = thisUseOp->getParentOp(); - // Determine if this is a later use. - if(!lastUseOp || lastUseOp->isBeforeInBlock(thisUseOp)) - lastUseOp = thisUseOp; - } - decRefAfterOp = lastUseOp; + decRefAfterOp = findLastUseOfSSAValue(v); } // At this point, decRefAfterOp is nullptr, or the last user of v, or the // defining op of v. - + if(decRefAfterOp) { // The given value is used and/or an OpResult. - + // Don't insert a DecRefOp if the last user is a terminator. if(decRefAfterOp->hasTrait()) // The value is handed out of its block (e.g., return, yield, ...). @@ -116,7 +118,7 @@ void processValue(OpBuilder builder, Value v) { // runtime is on the main branch. // Don't insert a DecRefOp if there is already one. Currently, this can // happen only on the distributed worker, since the IR it gets already - // contains + // contains if(isa(decRefAfterOp)) return; @@ -136,7 +138,7 @@ void processValue(OpBuilder builder, Value v) { else builder.setInsertionPointToStart(pb); } - + // Finally create the DecRefOp. 
builder.create(builder.getUnknownLoc(), v); } @@ -144,9 +146,9 @@ void processValue(OpBuilder builder, Value v) { /** * @brief Inserts an `IncRefOp` for the given value if its type is a DAPHNE * data type (matrix, frame). - * + * * If the type is unknown, throw an exception. - * + * * @param v * @param b */ @@ -164,7 +166,7 @@ void incRefIfObj(Value v, OpBuilder & b) { /** * @brief Inserts an `IncRefOp` for each operand of the given operation whose * type is a DAPHNE data type (matrix, frame), right before the operation. - * + * * @param op * @param b */ @@ -177,7 +179,7 @@ void incRefArgs(Operation& op, OpBuilder & b) { /** * @brief Manages the reference counters of all values defined in the given * block by inserting `IncRefOp` and `DecRefOp` in the right places. - * + * * @param builder * @param b */ @@ -185,14 +187,14 @@ void processBlock(OpBuilder builder, Block * b) { // Make sure that the reference counters of block arguments are decreased. for(BlockArgument& arg : b->getArguments()) processValue(builder, arg); - + // Make sure the the reference counters of op results are decreased, and // Increase the reference counters of operands where necessary. for(Operation& op : b->getOperations()) { // 1) Increase the reference counters of operands, if necessary. // TODO We could use traits to identify those cases. - + // Casts that will not call a kernel. if(auto co = dyn_cast(op)) { if(co.isTrivialCast() || co.isRemovePropertyCast()) @@ -228,13 +230,13 @@ void processBlock(OpBuilder builder, Block * b) { // Note: We do not increase the reference counters of the arguments // of vectorized pipelines, because internally, a pipeline processes // views into its inputs. These are individual data objects. - - + + // 2) Make sure the the reference counters of op results are decreased. for(Value v : op.getResults()) processValue(builder, v); - - + + // 3) Recurse into the op, if it has regions. 
for(Region& r : op.getRegions()) for(Block& b2 : r.getBlocks()) @@ -252,4 +254,4 @@ void ManageObjRefsPass::runOnOperation() std::unique_ptr daphne::createManageObjRefsPass() { return std::make_unique(); -} \ No newline at end of file +} diff --git a/src/compiler/lowering/MapOpLowering.cpp b/src/compiler/lowering/MapOpLowering.cpp new file mode 100644 index 000000000..27fff5dcc --- /dev/null +++ b/src/compiler/lowering/MapOpLowering.cpp @@ -0,0 +1,146 @@ +/* + * Copyright 2023 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "compiler/utils/CompilerUtils.h" +#include "compiler/utils/LoweringUtils.h" +#include "ir/daphneir/Daphne.h" +#include "ir/daphneir/Passes.h" +#include "mlir/Conversion/LLVMCommon/LoweringOptions.h" +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/DialectConversion.h" + +using namespace mlir; + +class InlineMapOpLowering + : public mlir::OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult matchAndRewrite( + mlir::daphne::MapOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + auto loc = op->getLoc(); + + mlir::daphne::MatrixType lhsMatrixType = + op->getOperandTypes().front().dyn_cast(); + auto matrixElementType = lhsMatrixType.getElementType(); + auto lhsMemRefType = mlir::MemRefType::get( + {lhsMatrixType.getNumRows(), lhsMatrixType.getNumCols()}, matrixElementType); + + mlir::Value lhs = + rewriter.create( + loc, lhsMemRefType, adaptor.getArg()); + mlir::ModuleOp module = op->getParentOfType(); + func::FuncOp udfFuncOp = + module.lookupSymbol(op.getFunc()); + + SmallVector loopIvs; + + auto outerLoop = + rewriter.create(loc, 0, lhsMatrixType.getNumRows(), 1); + for (Operation &nested : *outerLoop.getBody()) { + rewriter.eraseOp(&nested); + } + loopIvs.push_back(outerLoop.getInductionVar()); + + // outer loop body + rewriter.setInsertionPointToStart(outerLoop.getBody()); + auto innerLoop = + rewriter.create(loc, 0, lhsMatrixType.getNumCols(), 1); + for (Operation &nested : *innerLoop.getBody()) { + rewriter.eraseOp(&nested); + } + loopIvs.push_back(innerLoop.getInductionVar()); + rewriter.create(loc); + 
rewriter.setInsertionPointToStart(innerLoop.getBody()); + + // inner loop body + mlir::Value lhsValue = rewriter.create(loc, lhs, loopIvs); + mlir::Value res = + rewriter.create(loc, udfFuncOp, ValueRange{lhsValue}) + ->getResult(0); + rewriter.create(loc, res, lhs, loopIvs); + rewriter.create(loc); + + rewriter.setInsertionPointAfter(outerLoop); + mlir::Value output = convertMemRefToDenseMatrix(op->getLoc(), rewriter, + lhs, op.getType()); + rewriter.replaceOp(op, output); + return mlir::success(); + } +}; + +namespace { +/** + * @brief The MapOpLoweringPass rewrites the daphne::MapOp operator + * to a set of perfectly nested affine loops and inserts for each element a call + * to the UDF assigned to the daphne::MapOp. + * + * This rewrite enables subsequent inlining pass to completely replace + * the daphne::MapOp by inlining the produced CallOps from this pass. + */ +struct MapOpLoweringPass + : public mlir::PassWrapper> { + explicit MapOpLoweringPass() {} + + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() final; + + StringRef getArgument() const final { return "lower-map"; } + StringRef getDescription() const final { + return "Lowers the daphne.mapOp operation to" + "a set of affine loops, directly calling the UDF. 
" + "Subsequent use of the inlining pass may inline the call to the " + "UDF."; + } +}; +} // end anonymous namespace + +void MapOpLoweringPass::runOnOperation() { + mlir::ConversionTarget target(getContext()); + mlir::RewritePatternSet patterns(&getContext()); + mlir::LowerToLLVMOptions llvmOptions(&getContext()); + mlir::LLVMTypeConverter typeConverter(&getContext(), llvmOptions); + + target.addLegalDialect(); + + target.addIllegalOp(); + + patterns.insert(&getContext()); + auto module = getOperation(); + if (failed(applyPartialConversion(module, target, std::move(patterns)))) { + signalPassFailure(); + } +} + +std::unique_ptr mlir::daphne::createMapOpLoweringPass() { + return std::make_unique(); +} diff --git a/src/compiler/lowering/MatMulOpLowering.cpp b/src/compiler/lowering/MatMulOpLowering.cpp new file mode 100644 index 000000000..6c401e266 --- /dev/null +++ b/src/compiler/lowering/MatMulOpLowering.cpp @@ -0,0 +1,236 @@ +/* + * Copyright 2023 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include "compiler/utils/CompilerUtils.h" +#include "compiler/utils/LoweringUtils.h" +#include "ir/daphneir/Daphne.h" +#include "ir/daphneir/Passes.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/ArrayRef.h" +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" +#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" +#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" +#include "mlir/Conversion/LLVMCommon/ConversionTarget.h" +#include "mlir/Conversion/LLVMCommon/LoweringOptions.h" +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Conversion/LinalgToStandard/LinalgToStandard.h" +#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" +#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/Transforms/FuncConversions.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/IR/BuiltinDialect.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/UseDefLists.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/DialectConversion.h" + +using namespace mlir; + +static constexpr int ROW = 0; +static constexpr int COL = 1; + +void affineMatMul(mlir::Value &lhs, mlir::Value &rhs, mlir::Value &output, + ConversionPatternRewriter &rewriter, mlir::Location loc, + ArrayRef lhsShape, ArrayRef rhsShape, + mlir::MLIRContext *ctx) { + SmallVector loopIvs; + + // row loop + auto rowLoop = rewriter.create(loc, 0, lhsShape[ROW], 1); + for (Operation &nested : *rowLoop.getBody()) { + rewriter.eraseOp(&nested); + } + + // row loop body + 
rewriter.setInsertionPointToStart(rowLoop.getBody()); + + // fma loop + auto innerLoop = rewriter.create(loc, 0, rhsShape[ROW], 1); + for (Operation &nested : *innerLoop.getBody()) { + rewriter.eraseOp(&nested); + } + rewriter.setInsertionPointToStart(innerLoop.getBody()); + + // col loop + auto colLoop = rewriter.create(loc, 0, rhsShape[COL], 1); + for (Operation &nested : *colLoop.getBody()) { + rewriter.eraseOp(&nested); + } + + // col loop body + rewriter.setInsertionPointToStart(colLoop.getBody()); + + loopIvs.push_back(rowLoop.getInductionVar()); + loopIvs.push_back(colLoop.getInductionVar()); + loopIvs.push_back(innerLoop.getInductionVar()); + + // load + mlir::Value a = rewriter.create( + loc, lhs, ValueRange{loopIvs[0], loopIvs[2]}); + mlir::Value b = rewriter.create( + loc, rhs, ValueRange{loopIvs[2], loopIvs[1]}); + mlir::Value c = rewriter.create( + loc, output, ValueRange{loopIvs[0], loopIvs[1]}); + + // fma + mlir::Value fma = rewriter.create(loc, a, b, c); + + // store + rewriter.create(loc, fma, output, + ValueRange{loopIvs[0], loopIvs[1]}); + + // AffineYieldOp at end of loop blocks + rewriter.setInsertionPointToEnd(rowLoop.getBody()); + rewriter.create(loc); + rewriter.setInsertionPointToEnd(colLoop.getBody()); + rewriter.create(loc); + rewriter.setInsertionPointToEnd(innerLoop.getBody()); + rewriter.create(loc); + rewriter.setInsertionPointAfter(rowLoop); +} + +class MatMulLowering : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + daphne::MatMulOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op->getLoc(); + mlir::daphne::MatrixType lhsMatrixType = + adaptor.getLhs().getType().dyn_cast(); + mlir::daphne::MatrixType rhsMatrixType = + adaptor.getRhs().getType().dyn_cast(); + + auto lhsRows = lhsMatrixType.getNumRows(); + auto lhsCols = lhsMatrixType.getNumCols(); + + auto rhsRows = rhsMatrixType.getNumRows(); + auto rhsCols = 
rhsMatrixType.getNumCols(); + + auto matrixElementType = lhsMatrixType.getElementType(); + + // TODO(phil): if shape is unknown, e.g., row/col = -1 we currently + // can't create a MemRefType + auto lhsMemRefType = + mlir::MemRefType::get({lhsRows, lhsCols}, matrixElementType); + auto rhsMemRefType = + mlir::MemRefType::get({rhsRows, rhsCols}, matrixElementType); + + mlir::MemRefType outputMemRefType = + mlir::MemRefType::get({lhsRows, rhsCols}, matrixElementType); + + // daphne::Matrix -> memref + mlir::Value lhs = + rewriter.create( + op->getLoc(), lhsMemRefType, adaptor.getLhs()); + mlir::Value rhs = + rewriter.create( + op->getLoc(), rhsMemRefType, adaptor.getRhs()); + + // Alloc output memref + mlir::Value outputMemRef = + insertMemRefAlloc(outputMemRefType, loc, rewriter); + + // Fill the output MemRef + affineFillMemRef(0.0, rewriter, loc, outputMemRefType.getShape(), + op->getContext(), outputMemRef, matrixElementType); + // Do the actual MatMul with hand built codegen + affineMatMul(lhs, rhs, outputMemRef, rewriter, loc, + lhsMemRefType.getShape(), rhsMemRefType.getShape(), + op->getContext()); + + mlir::Value DM = convertMemRefToDenseMatrix(loc, rewriter, outputMemRef, + op.getType()); + + rewriter.replaceOp(op, DM); + return success(); + } +}; + +namespace { +/** + * @brief The MatMulLoweringPass rewrites the MatMulOp from the DaphneDialect + * to a affine loop structure implementing a naive iterative matrix + * multiplication. + * + * The naive iterative algorithm is simply a perfectly nested + * loop algorithm running in O(n^3) performing the 3 load operations in it's + * inner loop body, calculates an FMA and stores the result in the output + * matrix. 
+ */ +struct MatMulLoweringPass + : public mlir::PassWrapper> { + explicit MatMulLoweringPass() {} + + StringRef getArgument() const final { return "lower-mm"; } + StringRef getDescription() const final { + return "This pass lowers the MatMulOp to an affine loop structure " + "performing a naive iterative matrix multiplication."; + } + + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() final; +}; +} // end anonymous namespace + +void MatMulLoweringPass::runOnOperation() { + mlir::ConversionTarget target(getContext()); + mlir::RewritePatternSet patterns(&getContext()); + LowerToLLVMOptions llvmOptions(&getContext()); + LLVMTypeConverter typeConverter(&getContext(), llvmOptions); + + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + + target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); + + target.addIllegalOp(); + + patterns.insert(&getContext()); + auto module = getOperation(); + if (failed(applyPartialConversion(module, target, std::move(patterns)))) { + signalPassFailure(); + } +} + +std::unique_ptr mlir::daphne::createMatMulOpLoweringPass() { + return std::make_unique(); +} diff --git a/src/compiler/lowering/ModOpLowering.cpp b/src/compiler/lowering/ModOpLowering.cpp new file mode 100644 index 000000000..05fdf7ea4 --- /dev/null +++ b/src/compiler/lowering/ModOpLowering.cpp @@ -0,0 +1,226 @@ +/* + * Copyright 2023 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compiler/utils/CompilerUtils.h" +#include "compiler/utils/LoweringUtils.h" +#include "ir/daphneir/Daphne.h" +#include "ir/daphneir/Passes.h" +#include "mlir/Conversion/LLVMCommon/LoweringOptions.h" +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/BuiltinDialect.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/DialectConversion.h" + +using namespace mlir; + +class EwModOpLowering + : public mlir::OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + [[nodiscard]] bool optimization_viable(mlir::Value divisor) const { + std::pair isConstant = + CompilerUtils::isConstant(divisor); + return isConstant.first && (isConstant.second & (isConstant.second - 1)) == 0; + } + + void optimizeEwModOp(mlir::Value memRef, mlir::Value divisor, + ArrayRef shape, + ConversionPatternRewriter &rewriter, + Location loc) const { + // divisor - 1 + mlir::Value cst_one = rewriter.create( + loc, rewriter.getI64Type(), rewriter.getI64IntegerAttr(1)); + + auto casted_divisor = typeConverter->materializeTargetConversion( + rewriter, loc, rewriter.getI64Type(), ValueRange{divisor}); + + mlir::Value rhs = + rewriter.create(loc, casted_divisor, cst_one); + + SmallVector lowerBounds(/*Rank=*/2, /*Value=*/0); + SmallVector steps(/*Rank=*/2, 
/*Value=*/1); + buildAffineLoopNest( + rewriter, loc, lowerBounds, shape, steps, + [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { + mlir::Value load = + nestedBuilder.create(loc, memRef, ivs); + mlir::Value res{}; + + Value castedLhs = + this->typeConverter->materializeTargetConversion( + nestedBuilder, loc, + nestedBuilder.getIntegerType( + divisor.getType().getIntOrFloatBitWidth()), + ValueRange{load}); + + res = nestedBuilder.create(loc, castedLhs, rhs); + Value castedRes = + this->typeConverter->materializeSourceConversion( + nestedBuilder, loc, divisor.getType(), ValueRange{res}); + + nestedBuilder.create(loc, castedRes, memRef, + ivs); + }); + } + + void lowerEwModOp(mlir::Value memRef, mlir::Value divisor, + ArrayRef shape, + ConversionPatternRewriter &rewriter, Location loc) const { + SmallVector lowerBounds(/*Rank=*/2, /*Value=*/0); + SmallVector steps(/*Rank=*/2, /*Value=*/1); + buildAffineLoopNest( + rewriter, loc, lowerBounds, shape, steps, + [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { + mlir::Value load = + nestedBuilder.create(loc, memRef, ivs); + mlir::Value res{}; + + // this is enough since divisor will be casted to float if + // matrix is float + if (divisor.getType().isa()) { + res = + nestedBuilder.create(loc, load, divisor); + nestedBuilder.create(loc, res, memRef, ivs); + return; + } + + Value castedLhs = + this->typeConverter->materializeTargetConversion( + nestedBuilder, loc, + nestedBuilder.getIntegerType( + divisor.getType().getIntOrFloatBitWidth()), + ValueRange{load}); + + Value castedRhs = + this->typeConverter->materializeTargetConversion( + nestedBuilder, loc, + nestedBuilder.getIntegerType( + divisor.getType().getIntOrFloatBitWidth()), + ValueRange{divisor}); + + res = nestedBuilder.create(loc, castedLhs, + castedRhs); + Value castedRes = + this->typeConverter->materializeSourceConversion( + nestedBuilder, loc, divisor.getType(), ValueRange{res}); + + nestedBuilder.create(loc, castedRes, memRef, + 
ivs); + }); + } + + mlir::LogicalResult matchAndRewrite( + mlir::daphne::EwModOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + mlir::daphne::MatrixType lhsTensor = + adaptor.getLhs().getType().dyn_cast(); + auto lhsRows = lhsTensor.getNumRows(); + auto lhsCols = lhsTensor.getNumCols(); + + auto lhsMemRefType = mlir::MemRefType::get({lhsRows, lhsCols}, + lhsTensor.getElementType()); + + // daphne::Matrix -> memref + mlir::Value lhs = + rewriter.create( + op->getLoc(), lhsMemRefType, adaptor.getLhs()); + mlir::Value rhs = adaptor.getRhs(); + + if (optimization_viable(rhs)) + optimizeEwModOp(lhs, rhs, + {lhsTensor.getNumRows(), lhsTensor.getNumCols()}, + rewriter, op->getLoc()); + else + lowerEwModOp(lhs, rhs, + {lhsTensor.getNumRows(), lhsTensor.getNumCols()}, + rewriter, op->getLoc()); + + mlir::Value output = convertMemRefToDenseMatrix(op->getLoc(), rewriter, + lhs, op.getType()); + rewriter.replaceOp(op, output); + return success(); + } +}; + +namespace { +/** + * @brief Performs an integer mod optimization on the EwModOp operator by + * lowering to an affine loop structure and performing the mod op on values + * loaded from a MemRef. + * + * If possible, we additionally perform the integer modulo optimization by + * replacing the modulo with an bitwise AND and a subtraction. 
+ */ +struct ModOpLoweringPass + : public mlir::PassWrapper> { + explicit ModOpLoweringPass() {} + + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry + .insert(); + } + void runOnOperation() final; + + StringRef getArgument() const final { return "lower-mod"; } + StringRef getDescription() const final { + return "Performs an integer mod optimization on the EwModOp operator " + "by lowering to an affine loop structure" + "and performing the mod op on values loaded from a MemRef."; + } +}; +} // end anonymous namespace + +void ModOpLoweringPass::runOnOperation() { + mlir::ConversionTarget target(getContext()); + mlir::RewritePatternSet patterns(&getContext()); + mlir::LowerToLLVMOptions llvmOptions(&getContext()); + mlir::LLVMTypeConverter typeConverter(&getContext(), llvmOptions); + + typeConverter.addConversion(convertInteger); + typeConverter.addConversion(convertFloat); + typeConverter.addConversion([](Type type) { return type; }); + typeConverter.addArgumentMaterialization(materializeCastFromIllegal); + typeConverter.addSourceMaterialization(materializeCastToIllegal); + typeConverter.addTargetMaterialization(materializeCastFromIllegal); + + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + + target.addIllegalOp(); + + patterns.insert(typeConverter, &getContext()); + auto module = getOperation(); + if (failed(applyPartialConversion(module, target, std::move(patterns)))) { + signalPassFailure(); + } +} + +std::unique_ptr mlir::daphne::createModOpLoweringPass() { + return std::make_unique(); +} diff --git a/src/compiler/lowering/RewriteSqlOpPass.cpp b/src/compiler/lowering/RewriteSqlOpPass.cpp index 401544ac3..9c3d2d32d 100644 --- a/src/compiler/lowering/RewriteSqlOpPass.cpp +++ b/src/compiler/lowering/RewriteSqlOpPass.cpp @@ -85,6 +85,9 @@ namespace : public PassWrapper > { void runOnOperation() final; + + 
StringRef getArgument() const final { return "rewrite-sqlop"; } + StringRef getDescription() const final { return "TODO"; } }; } diff --git a/src/compiler/lowering/RewriteToCallKernelOpPass.cpp b/src/compiler/lowering/RewriteToCallKernelOpPass.cpp index b9e78f319..4454aaec8 100644 --- a/src/compiler/lowering/RewriteToCallKernelOpPass.cpp +++ b/src/compiler/lowering/RewriteToCallKernelOpPass.cpp @@ -18,9 +18,14 @@ #include "ir/daphneir/Daphne.h" #include "ir/daphneir/Passes.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/IR/BuiltinDialect.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/IR/IRMapping.h" @@ -364,6 +369,7 @@ namespace // Inject the current DaphneContext as the last input parameter to // all kernel calls, unless it's a CreateDaphneContextOp. + if(!llvm::isa(op)) newOperands.push_back(dctx); @@ -494,8 +500,12 @@ void RewriteToCallKernelOpPass::runOnOperation() // Specification of (il)legal dialects/operations. All DaphneIR operations // but those explicitly marked as legal will be replaced by CallKernelOp. 
ConversionTarget target(getContext()); - target.addLegalDialect(); - target.addLegalOp(); + target.addLegalDialect(); + + target.addLegalOp(); target.addIllegalDialect(); target.addLegalOp< daphne::ConstantOp, @@ -504,6 +514,8 @@ void RewriteToCallKernelOpPass::runOnOperation() daphne::CreateVariadicPackOp, daphne::StoreVariadicPackOp, daphne::VectorizedPipelineOp, + scf::ForOp, + memref::LoadOp, daphne::GenericCallOp, daphne::MapOp >(); diff --git a/src/compiler/lowering/SpecializeGenericFunctionsPass.cpp b/src/compiler/lowering/SpecializeGenericFunctionsPass.cpp index 453f65525..15ebd9b03 100644 --- a/src/compiler/lowering/SpecializeGenericFunctionsPass.cpp +++ b/src/compiler/lowering/SpecializeGenericFunctionsPass.cpp @@ -387,6 +387,9 @@ namespace { public: void runOnOperation() final; + + StringRef getArgument() const final { return "specialize-generic-funcs"; } + StringRef getDescription() const final { return "TODO"; } }; } diff --git a/src/compiler/lowering/WhileLoopInvariantCodeMotionPass.cpp b/src/compiler/lowering/WhileLoopInvariantCodeMotionPass.cpp index 58c042af0..8e933155e 100644 --- a/src/compiler/lowering/WhileLoopInvariantCodeMotionPass.cpp +++ b/src/compiler/lowering/WhileLoopInvariantCodeMotionPass.cpp @@ -36,6 +36,9 @@ using namespace mlir; struct WhileLoopInvariantCodeMotionPass : public PassWrapper > { void runOnOperation() final; + + StringRef getArgument() const final { return "while-loop-invariant-code-motion"; } + StringRef getDescription() const final { return "TODO"; } }; void WhileLoopInvariantCodeMotionPass::runOnOperation() { diff --git a/src/compiler/utils/CMakeLists.txt b/src/compiler/utils/CMakeLists.txt index a7acd88bf..73e8cd7c0 100644 --- a/src/compiler/utils/CMakeLists.txt +++ b/src/compiler/utils/CMakeLists.txt @@ -14,9 +14,10 @@ add_library(CompilerUtils STATIC CompilerUtils.cpp + LoweringUtils.cpp TypePrinting.cpp ) target_link_libraries(CompilerUtils PUBLIC DaphneMetaDataParser -) \ No newline at end of file +) diff --git 
a/src/compiler/utils/CompilerUtils.cpp b/src/compiler/utils/CompilerUtils.cpp index 9ec231f55..43fb800f1 100644 --- a/src/compiler/utils/CompilerUtils.cpp +++ b/src/compiler/utils/CompilerUtils.cpp @@ -57,6 +57,14 @@ std::pair CompilerUtils::isConstant(mlir::Value v) { ); } + +template<> +std::pair CompilerUtils::isConstant(mlir::Value v) { + return isConstantHelper( + v, [](mlir::IntegerAttr attr){return attr.getValue().getLimitedValue();} + ); +} + template<> std::pair CompilerUtils::isConstant(mlir::Value v) { return isConstantHelper( diff --git a/src/compiler/utils/CompilerUtils.h b/src/compiler/utils/CompilerUtils.h index b934f55ea..13e4973b6 100644 --- a/src/compiler/utils/CompilerUtils.h +++ b/src/compiler/utils/CompilerUtils.h @@ -178,6 +178,9 @@ struct CompilerUtils { return "Descriptor"; else if(t.isa()) return "Target"; + else if(auto memRefType = t.dyn_cast()) { + return "StridedMemRefType_" + mlirTypeToCppTypeName(memRefType.getElementType(), false) + "_2"; + } std::string typeName; llvm::raw_string_ostream rsos(typeName); @@ -261,4 +264,4 @@ struct CompilerUtils { return vt; } -}; \ No newline at end of file +}; diff --git a/src/compiler/utils/LoweringUtils.cpp b/src/compiler/utils/LoweringUtils.cpp new file mode 100644 index 000000000..943dbd304 --- /dev/null +++ b/src/compiler/utils/LoweringUtils.cpp @@ -0,0 +1,188 @@ +/* + * Copyright 2023 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "LoweringUtils.h" + +#include + +#include "ir/daphneir/Daphne.h" +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Dialect/Affine/Passes.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Transforms/Passes.h" + +/// Insert an allocation for the given MemRefType. +mlir::Value insertMemRefAlloc(mlir::MemRefType type, mlir::Location loc, + mlir::PatternRewriter &rewriter) { + auto alloc = rewriter.create(loc, type); + + // Make sure to allocate at the beginning of the block. + auto *parentBlock = alloc->getBlock(); + alloc->moveBefore(&parentBlock->front()); + + return alloc; +} + +void insertMemRefDealloc(mlir::Value memref, mlir::Location loc, + mlir::PatternRewriter &rewriter) { + auto dealloc = rewriter.create(loc, memref); + dealloc->moveBefore(&memref.getParentBlock()->back()); +} + +// TODO(phil) try to provide function templates to remove duplication +void affineFillMemRefInt(int value, mlir::ConversionPatternRewriter &rewriter, + mlir::Location loc, mlir::ArrayRef shape, + mlir::MLIRContext *ctx, mlir::Value memRef, + mlir::Type elemType) { + constexpr int ROW = 0; + constexpr int COL = 1; + mlir::Value fillValue = rewriter.create( + loc, rewriter.getI64Type(), rewriter.getI64IntegerAttr(value)); + + llvm::SmallVector loopIvs; + + auto outerLoop = rewriter.create(loc, 0, shape[ROW], 1); + for (mlir::Operation &nested : *outerLoop.getBody()) { + rewriter.eraseOp(&nested); + } + loopIvs.push_back(outerLoop.getInductionVar()); + + // outer loop body + rewriter.setInsertionPointToStart(outerLoop.getBody()); + auto innerLoop = rewriter.create(loc, 0, shape[COL], 1); + for (mlir::Operation &nested : *innerLoop.getBody()) { + rewriter.eraseOp(&nested); + } + loopIvs.push_back(innerLoop.getInductionVar()); + rewriter.create(loc); + rewriter.setInsertionPointToStart(innerLoop.getBody()); + rewriter.create(loc, fillValue, memRef, loopIvs); + + rewriter.create(loc); + 
rewriter.setInsertionPointAfter(outerLoop); +} + +void affineFillMemRef(double value, mlir::ConversionPatternRewriter &rewriter, + mlir::Location loc, mlir::ArrayRef shape, + mlir::MLIRContext *ctx, mlir::Value memRef, + mlir::Type elemType) { + constexpr int ROW = 0; + constexpr int COL = 1; + mlir::Value fillValue = rewriter.create( + loc, elemType, rewriter.getFloatAttr(elemType, value)); + + llvm::SmallVector loopIvs; + + auto outerLoop = rewriter.create(loc, 0, shape[ROW], 1); + for (mlir::Operation &nested : *outerLoop.getBody()) { + rewriter.eraseOp(&nested); + } + loopIvs.push_back(outerLoop.getInductionVar()); + + // outer loop body + rewriter.setInsertionPointToStart(outerLoop.getBody()); + auto innerLoop = rewriter.create(loc, 0, shape[COL], 1); + for (mlir::Operation &nested : *innerLoop.getBody()) { + rewriter.eraseOp(&nested); + } + loopIvs.push_back(innerLoop.getInductionVar()); + rewriter.create(loc); + rewriter.setInsertionPointToStart(innerLoop.getBody()); + rewriter.create(loc, fillValue, memRef, loopIvs); + + rewriter.create(loc); + rewriter.setInsertionPointAfter(outerLoop); +} + +mlir::Value convertMemRefToDenseMatrix( + mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, + mlir::Value memRef, mlir::Type type) { + auto extractStridedMetadataOp = + rewriter.create(loc, memRef); + // aligned ptr (memref.data) + mlir::Value alignedPtr = + rewriter.create(loc, + memRef); + // offset + mlir::Value offset = extractStridedMetadataOp.getOffset(); + // strides + mlir::ResultRange strides = extractStridedMetadataOp.getStrides(); + // sizes + mlir::ResultRange sizes = extractStridedMetadataOp.getSizes(); + + return rewriter.create( + loc, type, alignedPtr, offset, sizes[0], sizes[1], strides[0], + strides[1]); +} + +mlir::Type convertFloat(mlir::FloatType floatType) { + return mlir::IntegerType::get(floatType.getContext(), + floatType.getIntOrFloatBitWidth()); +} + +mlir::Type convertInteger(mlir::IntegerType intType) { + return 
mlir::IntegerType::get(intType.getContext(), + intType.getIntOrFloatBitWidth()); +} + +llvm::Optional materializeCastFromIllegal(mlir::OpBuilder &builder, + mlir::Type type, + mlir::ValueRange inputs, + mlir::Location loc) { + mlir::Type fromType = getElementTypeOrSelf(inputs[0].getType()); + mlir::Type toType = getElementTypeOrSelf(type); + + if ((!fromType.isSignedInteger() && !fromType.isUnsignedInteger()) || + !toType.isSignlessInteger()) + return std::nullopt; + // Use unrealized conversion casts to do signful->signless conversions. + return builder + .create(loc, type, inputs[0]) + ->getResult(0); +} + +llvm::Optional materializeCastToIllegal(mlir::OpBuilder &builder, + mlir::Type type, + mlir::ValueRange inputs, + mlir::Location loc) { + mlir::Type fromType = getElementTypeOrSelf(inputs[0].getType()); + mlir::Type toType = getElementTypeOrSelf(type); + + if (!fromType.isSignlessInteger() || + (!toType.isSignedInteger() && !toType.isUnsignedInteger())) + return std::nullopt; + // Use unrealized conversion casts to do signless->signful conversions. + return builder + .create(loc, type, inputs[0]) + ->getResult(0); +} + +mlir::Operation *findLastUseOfSSAValue(mlir::Value &v) { + mlir::Operation *lastUseOp = nullptr; + + for (mlir::OpOperand &use : v.getUses()) { + mlir::Operation *thisUseOp = use.getOwner(); + // Find parent op in the block where v is defined. + while (thisUseOp->getBlock() != v.getParentBlock()) + thisUseOp = thisUseOp->getParentOp(); + // Determine if this is a later use. 
+ if (!lastUseOp || lastUseOp->isBeforeInBlock(thisUseOp)) + lastUseOp = thisUseOp; + } + + return lastUseOp; +} diff --git a/src/compiler/utils/LoweringUtils.h b/src/compiler/utils/LoweringUtils.h new file mode 100644 index 000000000..5555b1324 --- /dev/null +++ b/src/compiler/utils/LoweringUtils.h @@ -0,0 +1,65 @@ +/* + * Copyright 2023 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/Visitors.h" +#include "mlir/Transforms/DialectConversion.h" + +mlir::Value insertMemRefAlloc(mlir::MemRefType type, mlir::Location loc, + mlir::PatternRewriter &rewriter); + +void insertMemRefDealloc(mlir::Value memref, mlir::Location loc, + mlir::PatternRewriter &rewriter); + +void affineFillMemRefInt(int value, mlir::ConversionPatternRewriter &rewriter, + mlir::Location loc, mlir::ArrayRef shape, + mlir::MLIRContext *ctx, mlir::Value memRef, + mlir::Type elemType); + +void affineFillMemRef(double value, mlir::ConversionPatternRewriter &rewriter, + mlir::Location loc, mlir::ArrayRef shape, + mlir::MLIRContext *ctx, mlir::Value memRef, + mlir::Type elemType); + +mlir::Value convertMemRefToDenseMatrix(mlir::Location, + mlir::ConversionPatternRewriter &, + 
mlir::Value memRef, mlir::Type); + +llvm::Optional materializeCastFromIllegal(mlir::OpBuilder &builder, + mlir::Type type, + mlir::ValueRange inputs, + mlir::Location loc); + +llvm::Optional materializeCastToIllegal(mlir::OpBuilder &builder, + mlir::Type type, + mlir::ValueRange inputs, + mlir::Location loc); + +mlir::Type convertFloat(mlir::FloatType floatType); + +mlir::Type convertInteger(mlir::IntegerType intType); + +mlir::Operation *findLastUseOfSSAValue(mlir::Value &v); diff --git a/src/ir/daphneir/CMakeLists.txt b/src/ir/daphneir/CMakeLists.txt index c6ef724e3..6036aefcd 100644 --- a/src/ir/daphneir/CMakeLists.txt +++ b/src/ir/daphneir/CMakeLists.txt @@ -52,4 +52,4 @@ add_mlir_dialect_library(MLIRDaphne LINK_LIBS PUBLIC CompilerUtils -) \ No newline at end of file +) diff --git a/src/ir/daphneir/Daphne.h b/src/ir/daphneir/Daphne.h index f5cd35985..73a2e6b23 100644 --- a/src/ir/daphneir/Daphne.h +++ b/src/ir/daphneir/Daphne.h @@ -33,6 +33,7 @@ #include "mlir/IR/AttrTypeSubElements.h" #pragma GCC diagnostic pop +#include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Interfaces/SideEffectInterfaces.h" diff --git a/src/ir/daphneir/DaphneDialect.cpp b/src/ir/daphneir/DaphneDialect.cpp index ded725c4f..205e7c4e9 100644 --- a/src/ir/daphneir/DaphneDialect.cpp +++ b/src/ir/daphneir/DaphneDialect.cpp @@ -16,24 +16,33 @@ #include #include + #include + +#include "mlir/Support/LogicalResult.h" #define GET_OP_CLASSES #include #define GET_TYPEDEF_CLASSES -#include +#include +#include +#include + #include +#include +#include "llvm/ADT/ArrayRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/Builders.h" -#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/BuiltinOps.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Dialect.h" #include "mlir/IR/DialectImplementation.h" +#include "mlir/IR/FunctionImplementation.h" #include "mlir/IR/OpDefinition.h" #include 
"mlir/IR/OpImplementation.h" #include "mlir/IR/Operation.h" +#include "mlir/IR/PatternMatch.h" #include "mlir/IR/SymbolTable.h" -#include "mlir/IR/BuiltinOps.h" #include "mlir/Interfaces/CallInterfaces.h" #include "mlir/Interfaces/CastInterfaces.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" @@ -41,12 +50,46 @@ #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Interfaces/VectorInterfaces.h" #include "mlir/Interfaces/ViewLikeInterface.h" +#include "mlir/Transforms/InliningUtils.h" #include #include #include #include +struct DaphneInlinerInterface : public mlir::DialectInlinerInterface { + using DialectInlinerInterface::DialectInlinerInterface; + + bool isLegalToInline(mlir::Operation *call, mlir::Operation *callable, + bool wouldBeCloned) const final { + return true; + } + + bool isLegalToInline(mlir::Operation *, mlir::Region *, bool, mlir::IRMapping &) const final { + return true; + } + + bool isLegalToInline(mlir::Region *, mlir::Region *, bool, mlir::IRMapping &) const final { + return true; + } + + void handleTerminator(mlir::Operation *op, + mlir::ArrayRef valuesToRepl) const final { + auto returnOp = mlir::dyn_cast(op); + + // Replace the values directly with the return operands. 
+ assert(returnOp.getNumOperands() == valuesToRepl.size()); + for (const auto &it : llvm::enumerate(returnOp.getOperands())) + valuesToRepl[it.index()].replaceAllUsesWith(it.value()); + } + + mlir::Operation *materializeCallConversion(mlir::OpBuilder &builder, mlir::Value input, + mlir::Type resultType, + mlir::Location conversionLoc) const final { + return builder.create(conversionLoc, resultType, input); + } +}; + void mlir::daphne::DaphneDialect::initialize() { addOperations< @@ -57,6 +100,7 @@ void mlir::daphne::DaphneDialect::initialize() #define GET_TYPEDEF_LIST #include >(); + addInterfaces(); } mlir::Operation *mlir::daphne::DaphneDialect::materializeConstant(OpBuilder &builder, @@ -179,6 +223,9 @@ mlir::Type mlir::daphne::DaphneDialect::parseType(mlir::DialectAsmParser &parser else if (keyword == "String") { return StringType::get(parser.getBuilder().getContext()); } + else if (keyword == "DaphneContext") { + return mlir::daphne::DaphneContextType::get(parser.getBuilder().getContext()); + } else { parser.emitError(parser.getCurrentLocation()) << "Parsing failed, keyword `" << keyword << "` not recognized!"; return nullptr; @@ -363,6 +410,7 @@ ::mlir::LogicalResult mlir::daphne::MatrixType::verify( // Value type is known. 
|| elementType.isSignedInteger(64) || elementType.isUnsignedInteger(8) + || elementType.isUnsignedInteger(64) || elementType.isF32() || elementType.isF64() || elementType.isIndex() @@ -783,6 +831,10 @@ mlir::OpFoldResult mlir::daphne::EwAndOp::fold(FoldAdaptor adaptor) { return {}; } +mlir::OpFoldResult mlir::daphne::EwBitwiseAndOp::fold(FoldAdaptor adaptor) { + return {}; +} + mlir::OpFoldResult mlir::daphne::EwOrOp::fold(FoldAdaptor adaptor) { ArrayRef operands = adaptor.getOperands(); auto boolOp = [](const bool &a, const bool &b) { return a || b; }; @@ -1323,4 +1375,35 @@ mlir::LogicalResult mlir::daphne::CondOp::canonicalize(mlir::daphne::CondOp op, return mlir::success(); } -} \ No newline at end of file +} + +mlir::LogicalResult mlir::daphne::ConvertDenseMatrixToMemRef::canonicalize( + mlir::daphne::ConvertDenseMatrixToMemRef op, + mlir::PatternRewriter &rewriter) { + // removes unnecessary conversions of MemRef -> DM -> MemRef + mlir::Operation *dmNode = op->getOperand(0).getDefiningOp(); + + if (!llvm::isa(dmNode)) + return failure(); + + mlir::Operation *originalMemRefOp = + dmNode->getPrevNode()->getOperand(0).getDefiningOp(); + op.replaceAllUsesWith(originalMemRefOp); + + rewriter.eraseOp(op); + if (dmNode->getUsers().empty()) rewriter.eraseOp(dmNode); + + return mlir::success(); +} + +mlir::LogicalResult mlir::daphne::ConvertMemRefToDenseMatrix::canonicalize( + mlir::daphne::ConvertMemRefToDenseMatrix op, + mlir::PatternRewriter &rewriter) { + mlir::Operation *extractPtr = op->getPrevNode(); + auto srcMemRef = extractPtr->getOperand(0).getDefiningOp(); + extractPtr->moveAfter(srcMemRef); + op->moveAfter(extractPtr); + + return mlir::success(); +} + diff --git a/src/ir/daphneir/DaphneDistributableOpInterface.cpp b/src/ir/daphneir/DaphneDistributableOpInterface.cpp index 20ee390a0..416179a6c 100644 --- a/src/ir/daphneir/DaphneDistributableOpInterface.cpp +++ b/src/ir/daphneir/DaphneDistributableOpInterface.cpp @@ -134,6 +134,9 @@ IMPL_EWBINARYOP(EwAndOp) 
IMPL_EWBINARYOP(EwOrOp) IMPL_EWBINARYOP(EwXorOp) +// Bitwise +IMPL_EWBINARYOP(EwBitwiseAndOp); + // Strings IMPL_EWBINARYOP(EwConcatOp) @@ -170,4 +173,4 @@ std::vector daphne::RowAggMaxOp::createEquivalentDistributedDAG( std::vector daphne::RowAggMaxOp::getOperandDistrPrimitives() { return {false}; -} \ No newline at end of file +} diff --git a/src/ir/daphneir/DaphneOps.td b/src/ir/daphneir/DaphneOps.td index b16310932..e4cd6a96b 100644 --- a/src/ir/daphneir/DaphneOps.td +++ b/src/ir/daphneir/DaphneOps.td @@ -34,8 +34,11 @@ include "ir/daphneir/DaphneTypeInferenceTraits.td" include "ir/daphneir/CUDASupport.td" include "ir/daphneir/FPGAOPENCLSupport.td" +include "mlir/Dialect/LLVMIR/LLVMTypes.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Interfaces/ControlFlowInterfaces.td" +include "mlir/Interfaces/CallInterfaces.td" +include "mlir/IR/AttrTypeBase.td" // **************************************************************************** // Custom constraints @@ -66,6 +69,28 @@ class TypesMatchOrOneIsMatrixOfOther : PredOpTrait< class Daphne_Op traits = []> : Op; +// **************************************************************************** +// DAPHNE Runtime Interoperability +// **************************************************************************** + +def Daphne_ConvertMemRefToDenseMatrix : Daphne_Op<"convertMemRefToDenseMatrix"> { + let summary = "Return a DenseMatrix."; + let description = [{ Constructs a DenseMatrix given a rank 2 StridedMemRefType. 
}]; + + /* let arguments = (ins AnyMemRef:$arg); */ + let hasCanonicalizeMethod = 1; + let arguments = (ins Size:$base, Size:$offset, Size:$size0, Size:$size1, Size:$stride0, Size:$stride1); + let results = (outs MatrixOrU:$res); +} + +def Daphne_ConvertDenseMatrixToMemRef : Daphne_Op<"convertDenseMatrixToMemRef", [Pure]> { + let summary = "Given a DenseMatrix, return a StridedMemRefType."; + let description = [{ Constructs a StridedMemRefType with rank 2 from a DenseMatrix* with already allocated memory. }]; + let hasCanonicalizeMethod = 1; + let arguments = (ins MatrixOrU:$arg); + let results = (outs AnyMemRef:$output); +} + // **************************************************************************** // Data generation // **************************************************************************** @@ -110,7 +135,6 @@ def Daphne_RandMatrixOp : Daphne_Op<"randMatrix", [ NumRowsFromIthScalar<0>, NumColsFromIthScalar<1>, DeclareOpInterfaceMethods, SparsityFromIthScalar<4>, CastArgsToResTypeRandMatrixOp ]> { - //let arguments = (ins Size:$numRows, Size:$numCols, AnyScalar:$min, AnyScalar:$max, F64:$sparsity, Seed:$seed, StrScalar:$pdf); let arguments = (ins Size:$numRows, Size:$numCols, AnyScalar:$min, AnyScalar:$max, F64:$sparsity, IntScalar:$seed); let results = (outs MatrixOrU:$res); } @@ -173,7 +197,8 @@ def Daphne_MatMulOp : Daphne_Op<"matMul", [ class Daphne_EwUnaryOp traits = []> : Daphne_Op { let arguments = (ins AnyTypeOf<[MatrixOf<[scalarType]>, scalarType, Unknown]>:$arg); let results = (outs AnyTypeOf<[MatrixOf<[scalarType]>, scalarType, Unknown]>:$res); @@ -228,7 +253,8 @@ class Daphne_EwBinaryOp traits = []> DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, ShapeEwBinary, - CastArgsToResType + CastArgsToResType, + NoMemoryEffect ])> { let arguments = (ins AnyTypeOf<[MatrixOf<[scalarType]>, scalarType, Unknown]>:$lhs, AnyTypeOf<[MatrixOf<[scalarType]>, scalarType, Unknown]>:$rhs); let results = (outs AnyTypeOf<[MatrixOf<[scalarType]>, scalarType, 
Unknown]>:$res); @@ -279,6 +305,12 @@ def Daphne_EwAndOp : Daphne_EwBinaryOp<"ewAnd", NumScalar, [Commutative, Valu def Daphne_EwOrOp : Daphne_EwBinaryOp<"ewOr" , NumScalar, [Commutative, ValueTypeFromArgsInt]>; def Daphne_EwXorOp : Daphne_EwBinaryOp<"ewXor", NumScalar, [Commutative, ValueTypeFromArgsInt]>; +// ---------------------------------------------------------------------------- +// Bitwise +// ---------------------------------------------------------------------------- + +def Daphne_EwBitwiseAndOp : Daphne_EwBinaryOp<"ewBitwiseAnd", NumScalar, [Commutative, ValueTypeFromArgsInt]>; + // ---------------------------------------------------------------------------- // Strings // ---------------------------------------------------------------------------- @@ -1288,7 +1320,7 @@ def Daphne_PrintOp : Daphne_Op<"print"> { // TODO We might change it to only accept scalars here and enforce toString // for matrices and frames. But currently, we need it like that for the // rest of the program. 
- let arguments = (ins AnyTypeOf<[AnyScalar, MatrixOrFrame, Unknown]>:$arg, BoolScalar:$newline, BoolScalar:$err); + let arguments = (ins AnyTypeOf<[AnyScalar, MatrixOrFrame, AnyMemRef, Unknown]>:$arg, BoolScalar:$newline, BoolScalar:$err); let results = (outs); // no results } diff --git a/src/ir/daphneir/DaphneVectorizableOpInterface.cpp b/src/ir/daphneir/DaphneVectorizableOpInterface.cpp index 0ab264dc6..9785c9ba3 100644 --- a/src/ir/daphneir/DaphneVectorizableOpInterface.cpp +++ b/src/ir/daphneir/DaphneVectorizableOpInterface.cpp @@ -189,6 +189,9 @@ IMPL_SPLIT_COMBINE_EWBINARYOP(EwAndOp) IMPL_SPLIT_COMBINE_EWBINARYOP(EwOrOp) IMPL_SPLIT_COMBINE_EWBINARYOP(EwXorOp) +// Bitwise +IMPL_SPLIT_COMBINE_EWBINARYOP(EwBitwiseAndOp); + // Strings IMPL_SPLIT_COMBINE_EWBINARYOP(EwConcatOp) diff --git a/src/ir/daphneir/Passes.h b/src/ir/daphneir/Passes.h index 3f7d7ef18..ec9c5f45a 100644 --- a/src/ir/daphneir/Passes.h +++ b/src/ir/daphneir/Passes.h @@ -43,8 +43,15 @@ namespace mlir::daphne { std::unique_ptr createAdaptTypesToKernelsPass(); std::unique_ptr createDistributeComputationsPass(); std::unique_ptr createDistributePipelinesPass(); + std::unique_ptr createMapOpLoweringPass(); + std::unique_ptr createEwOpLoweringPass(); + std::unique_ptr createModOpLoweringPass(); std::unique_ptr createInferencePass(InferenceConfig cfg = {false, true, true, true, true}); std::unique_ptr createInsertDaphneContextPass(const DaphneUserConfig& cfg); + std::unique_ptr createDaphneOptPass(); + std::unique_ptr createMatMulOpLoweringPass(); + std::unique_ptr createAggAllOpLoweringPass(); + std::unique_ptr createMemRefTestPass(); std::unique_ptr createProfilingPass(); std::unique_ptr createLowerToLLVMPass(const DaphneUserConfig& cfg); std::unique_ptr createManageObjRefsPass(); diff --git a/src/ir/daphneir/Passes.td b/src/ir/daphneir/Passes.td index 20fc2a5ee..39725a131 100644 --- a/src/ir/daphneir/Passes.td +++ b/src/ir/daphneir/Passes.td @@ -55,4 +55,25 @@ def WhileLoopInvariantCodeMotionPass : 
Pass<"while-loop-invariant-code-motion", let constructor = "mlir::daphne::createWhileLoopInvariantCodeMotionPass()"; } -#endif // SRC_IR_DAPHNEIR_PASSES_TD \ No newline at end of file +def AggAllLoweringPass : Pass<"lower-agg", "::mlir::func::FuncOp"> { + let constructor = "mlir::daphne::createAggAllOpLoweringPass()"; +} + +def MatMulOpLoweringPass : Pass<"lower-mm", "::mlir::func::FuncOp"> { + let constructor = "mlir::daphne::createMatMulOpLoweringPass()"; +} + +def DaphneOpsOptPass : Pass<"opt-daphne", "::mlir::func::FuncOp"> { + let constructor = "mlir::daphne::createDaphneOptPass()"; +} + +def MapOpLoweringPass: Pass<"lower-map", "::mlir::func::FuncOp"> { + let constructor = "mlir::daphne::createMapOpLoweringPass()"; +} + +def LowerEwOpPass: Pass<"lower-ew", "::mlir::func::FuncOp"> { + let constructor = "mlir::daphne::createEwOpLoweringPass()"; +} + + +#endif // SRC_IR_DAPHNEIR_PASSES_TD diff --git a/src/parser/config/ConfigParser.cpp b/src/parser/config/ConfigParser.cpp index d34e446a1..3debbab4d 100644 --- a/src/parser/config/ConfigParser.cpp +++ b/src/parser/config/ConfigParser.cpp @@ -51,6 +51,8 @@ void ConfigParser::readUserConfig(const std::string& filename, DaphneUserConfig& config.use_ipa_const_propa = jf.at(DaphneConfigJsonParams::USE_IPA_CONST_PROPA).get(); if (keyExists(jf, DaphneConfigJsonParams::USE_PHY_OP_SELECTION)) config.use_phy_op_selection = jf.at(DaphneConfigJsonParams::USE_PHY_OP_SELECTION).get(); + if (keyExists(jf, DaphneConfigJsonParams::USE_MLIR_CODEGEN)) + config.use_mlir_codegen = jf.at(DaphneConfigJsonParams::USE_MLIR_CODEGEN).get(); if (keyExists(jf, DaphneConfigJsonParams::CUDA_FUSE_ANY)) config.cuda_fuse_any = jf.at(DaphneConfigJsonParams::CUDA_FUSE_ANY).get(); if (keyExists(jf, DaphneConfigJsonParams::VECTORIZED_SINGLE_QUEUE)) @@ -79,6 +81,8 @@ void ConfigParser::readUserConfig(const std::string& filename, DaphneUserConfig& config.explain_vectorized = jf.at(DaphneConfigJsonParams::EXPLAIN_VECTORIZED).get(); if (keyExists(jf, 
DaphneConfigJsonParams::EXPLAIN_OBJ_REF_MGNT)) config.explain_obj_ref_mgnt = jf.at(DaphneConfigJsonParams::EXPLAIN_OBJ_REF_MGNT).get(); + if (keyExists(jf, DaphneConfigJsonParams::EXPLAIN_MLIR_CODEGEN)) + config.explain_mlir_codegen = jf.at(DaphneConfigJsonParams::EXPLAIN_MLIR_CODEGEN).get(); if (keyExists(jf, DaphneConfigJsonParams::TASK_PARTITIONING_SCHEME)) { config.taskPartitioningScheme = jf.at(DaphneConfigJsonParams::TASK_PARTITIONING_SCHEME).get(); if (config.taskPartitioningScheme == SelfSchedulingScheme::INVALID) { @@ -142,4 +146,4 @@ void ConfigParser::checkAnyUnexpectedKeys(const nlohmann::basic_json<>& j, const .append("' file")); } } -} \ No newline at end of file +} diff --git a/src/parser/config/JsonParams.h b/src/parser/config/JsonParams.h index 8e9c1c257..172143258 100644 --- a/src/parser/config/JsonParams.h +++ b/src/parser/config/JsonParams.h @@ -30,6 +30,7 @@ struct DaphneConfigJsonParams { inline static const std::string USE_OBJ_REF_MGNT = "use_obj_ref_mgnt"; inline static const std::string USE_IPA_CONST_PROPA = "use_ipa_const_propa"; inline static const std::string USE_PHY_OP_SELECTION = "use_phy_op_selection"; + inline static const std::string USE_MLIR_CODEGEN = "use_mlir_codegen"; inline static const std::string CUDA_FUSE_ANY = "cuda_fuse_any"; inline static const std::string VECTORIZED_SINGLE_QUEUE = "vectorized_single_queue"; @@ -45,6 +46,7 @@ struct DaphneConfigJsonParams { inline static const std::string EXPLAIN_TYPE_ADAPTATION = "explain_type_adaptation"; inline static const std::string EXPLAIN_VECTORIZED = "explain_vectorized"; inline static const std::string EXPLAIN_OBJ_REF_MGNT = "explain_obj_ref_mgnt"; + inline static const std::string EXPLAIN_MLIR_CODEGEN = "explain_mlir_codegen"; inline static const std::string TASK_PARTITIONING_SCHEME = "taskPartitioningScheme"; inline static const std::string NUMBER_OF_THREADS = "numberOfThreads"; inline static const std::string MINIMUM_TASK_SIZE = "minimumTaskSize"; @@ -53,13 +55,14 @@ struct 
DaphneConfigJsonParams { inline static const std::string LIBRARY_PATHS = "library_paths"; inline static const std::string DAPHNEDSL_IMPORT_PATHS = "daphnedsl_import_paths"; inline static const std::string LOGGING = "logging"; - + inline static const std::string JSON_PARAMS[] = { USE_CUDA_, USE_VECTORIZED_EXEC, USE_OBJ_REF_MGNT, USE_IPA_CONST_PROPA, USE_PHY_OP_SELECTION, + USE_MLIR_CODEGEN, CUDA_FUSE_ANY, VECTORIZED_SINGLE_QUEUE, DEBUG_LLVM, @@ -73,6 +76,7 @@ struct DaphneConfigJsonParams { EXPLAIN_PHY_OP_SELECTION, EXPLAIN_TYPE_ADAPTATION, EXPLAIN_VECTORIZED, + EXPLAIN_MLIR_CODEGEN, EXPLAIN_OBJ_REF_MGNT, TASK_PARTITIONING_SCHEME, NUMBER_OF_THREADS, diff --git a/src/runtime/local/kernels/BinaryOpCode.h b/src/runtime/local/kernels/BinaryOpCode.h index 7d61dc0eb..54d878b4c 100644 --- a/src/runtime/local/kernels/BinaryOpCode.h +++ b/src/runtime/local/kernels/BinaryOpCode.h @@ -18,29 +18,33 @@ enum class BinaryOpCode { // Arithmetic. - ADD, // addition - SUB, // subtraction - MUL, // multiplication - DIV, // division - POW, // to the power of - MOD, // modulus - LOG, // logarithm + ADD, // addition + SUB, // subtraction + MUL, // multiplication + DIV, // division + POW, // to the power of + MOD, // modulus + LOG, // logarithm + // Comparisons. - EQ, // equal - NEQ, // not equal - LT, // less than - LE, // less equal - GT, // greater than - GE, // greater equal - + EQ, // equal + NEQ, // not equal + LT, // less than + LE, // less equal + GT, // greater than + GE, // greater equal + // Min/max. MIN, MAX, - + // Logical. AND, OR, + + // Bitwise. 
+ BITWISE_AND, }; static std::string_view binary_op_codes[] = {"ADD", "SUB", "MUL", "DIV", "POW", "MOD", "LOG", "EQ", "NEQ", "LT", "LE", - "GT", "GE", "MIN", "MAX", "AND", "OR"}; \ No newline at end of file + "GT", "GE", "MIN", "MAX", "AND", "OR", "BITWISE_AND"}; diff --git a/src/runtime/local/kernels/ConvertDenseMatrixToMemRef.h b/src/runtime/local/kernels/ConvertDenseMatrixToMemRef.h new file mode 100644 index 000000000..c281db96c --- /dev/null +++ b/src/runtime/local/kernels/ConvertDenseMatrixToMemRef.h @@ -0,0 +1,38 @@ +/* + * Copyright 2023 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "mlir/ExecutionEngine/CRunnerUtils.h" +#include "runtime/local/datastructures/DenseMatrix.h" + +template +inline StridedMemRefType convertDenseMatrixToMemRef( + const DenseMatrix *input, DCTX(ctx)) { + StridedMemRefType memRef{}; + memRef.basePtr = input->getValuesSharedPtr().get(); + memRef.data = memRef.basePtr; + memRef.offset = 0; + memRef.sizes[0] = input->getNumRows(); + memRef.sizes[1] = input->getNumCols(); + + // TODO(phil): needs to be calculated for non row-major memory layouts + memRef.strides[0] = input->getNumCols(); + memRef.strides[1] = 1; + input->increaseRefCounter(); + + return memRef; +} diff --git a/src/runtime/local/kernels/ConvertMemRefToDenseMatrix.h b/src/runtime/local/kernels/ConvertMemRefToDenseMatrix.h new file mode 100644 index 000000000..96779ea70 --- /dev/null +++ b/src/runtime/local/kernels/ConvertMemRefToDenseMatrix.h @@ -0,0 +1,32 @@ +/* + * Copyright 2023 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "runtime/local/context/DaphneContext.h" +#include "runtime/local/datastructures/DenseMatrix.h" + +template +inline void convertMemRefToDenseMatrix(DenseMatrix*& result, size_t basePtr, + size_t offset, size_t size0, + size_t size1, size_t stride0, + size_t stride1, DCTX(ctx)) { + auto no_op_deleter = [](T*) {}; + T* valuePtr = reinterpret_cast(basePtr); + std::shared_ptr ptr(valuePtr, no_op_deleter); + result = DataObjectFactory::create>(size0, size1, ptr); +} + diff --git a/src/runtime/local/kernels/MatMul.h b/src/runtime/local/kernels/MatMul.h index d0cfefbeb..fd5ff9e19 100644 --- a/src/runtime/local/kernels/MatMul.h +++ b/src/runtime/local/kernels/MatMul.h @@ -51,4 +51,3 @@ void matMul(DTRes *& res, const DTLhs * lhs, const DTRhs * rhs, bool transa, boo - diff --git a/src/runtime/local/kernels/genKernelInst.py b/src/runtime/local/kernels/genKernelInst.py index fedbe87ee..d2b2c2a44 100755 --- a/src/runtime/local/kernels/genKernelInst.py +++ b/src/runtime/local/kernels/genKernelInst.py @@ -118,7 +118,10 @@ def generateKernelInstantiation(kernelTemplateInfo, templateValues, opCodes, out .replace(" **", "" if rp["isOutput"] else "_variadic") .replace(" *", "_variadic" if "isVariadic" in rp and rp["isVariadic"] else "") .replace("& ", "") - .replace("<", "_").replace(">", "") + .replace("<", "_") + .replace(">", "") + .replace(",", "_") + .replace(" ", "_") for rp in extendedRuntimeParams ]) if typesForName != "": diff --git a/src/runtime/local/kernels/kernels.json b/src/runtime/local/kernels/kernels.json index c7129ec5f..1ffd4d625 100644 --- a/src/runtime/local/kernels/kernels.json +++ b/src/runtime/local/kernels/kernels.json @@ -784,6 +784,80 @@ [] ] }, + { + "kernelTemplate": { + "header": "ConvertMemRefToDenseMatrix.h", + "opName": "convertMemRefToDenseMatrix", + "returnType": "void", + "templateParams": [ + { + "name": "VT", + "isDataType": false + } + ], + "runtimeParams": [ + { + "type": "DenseMatrix *&", + "name": "result" + 
}, + { + "type": "size_t", + "name": "basePtr" + }, + { + "type": "size_t", + "name": "offset" + }, + { + "type": "size_t", + "name": "size0" + }, + { + "type": "size_t", + "name": "size1" + }, + { + "type": "size_t", + "name": "stride0" + }, + { + "type": "size_t", + "name": "stride1" + } + ] + }, + "instantiations": [ + ["int64_t"], + ["uint64_t"], + ["float"], + ["double"] + ] + }, + { + "kernelTemplate": { + "header": "ConvertDenseMatrixToMemRef.h", + "opName": "convertDenseMatrixToMemRef", + "returnType": "StridedMemRefType", + "templateParams": [ + { + "name": "VT", + "isDataType": false + } + ], + "runtimeParams": [ + { + "type": "DenseMatrix *", + "name": "input" + } + ] + }, + "instantiations": [ + ["int64_t"], + ["uint64_t"], + ["float"], + ["double"] + ] + }, { "kernelTemplate": { "header": "CreateFrame.h", @@ -1086,7 +1160,7 @@ [["DenseMatrix", "double"], ["DenseMatrix", "double"], "double"], [["DenseMatrix", "int64_t"], ["DenseMatrix", "int64_t"], "int64_t"] ], - "opCodes": ["ADD", "SUB", "MUL", "DIV", "POW", "LOG", "MOD", "EQ", "NEQ", "LT", "LE", "GT", "GE", "MIN", "MAX", "AND", "OR"] + "opCodes": ["ADD", "SUB", "MUL", "DIV", "POW", "LOG", "MOD", "EQ", "NEQ", "LT", "LE", "GT", "GE", "MIN", "MAX", "AND", "OR", "BITWISE_AND"] }, { "name": ["CPP"], @@ -1097,7 +1171,7 @@ ["Frame", "Frame", "double"], ["Frame", "Frame", "int64_t"] ], - "opCodes": ["ADD", "SUB", "MUL", "DIV", "POW", "LOG", "MOD", "EQ", "NEQ", "LT", "LE", "GT", "GE", "MIN", "MAX", "AND", "OR"] + "opCodes": ["ADD", "SUB", "MUL", "DIV", "POW", "LOG", "MOD", "EQ", "NEQ", "LT", "LE", "GT", "GE", "MIN", "MAX", "AND", "OR", "BITWISE_AND"] } ] }, @@ -1143,7 +1217,7 @@ ["uint32_t", "uint32_t", "uint32_t"], ["size_t", "size_t", "size_t"] ], - "opCodes": ["ADD", "SUB", "MUL", "DIV", "POW", "LOG", "MOD", "EQ", "NEQ", "LT", "LE", "GT", "GE", "MIN", "MAX", "AND", "OR"] + "opCodes": ["ADD", "SUB", "MUL", "DIV", "POW", "LOG", "MOD", "EQ", "NEQ", "LT", "LE", "GT", "GE", "MIN", "MAX", "AND", "OR", 
"BITWISE_AND"] }, { "kernelTemplate": { @@ -1492,6 +1566,7 @@ [["DenseMatrix", "float"], "float"], [["DenseMatrix", "double"], "double"], [["DenseMatrix", "int64_t"], "int64_t"], + [["DenseMatrix", "uint64_t"], "uint64_t"], [["DenseMatrix", "uint8_t"], "uint8_t"]] } ] diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e266dc80c..7d63d5976 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -47,15 +47,22 @@ set(TEST_SOURCES api/cli/vectorized/MultiThreadedOpsTest.cpp api/cli/vectorized/VectorizedPipelineTest.cpp api/cli/Utils.cpp - + api/python/DaphneLibTest.cpp - + + api/cli/codegen/EwBinaryScalarTest.cpp + api/cli/codegen/MatMulTest.cpp + api/cli/codegen/EwOpLoopFusionTest.cpp + api/cli/codegen/AggAllTest.cpp + api/cli/codegen/MapOpTest.cpp + codegen/CodegenTest.cpp + ir/daphneir/InferTypesTest.cpp - + parser/config/ConfigParserTest.cpp - + runtime/distributed/worker/WorkerTest.cpp - + runtime/local/datastructures/CSRMatrixTest.cpp runtime/local/datastructures/DenseMatrixTest.cpp runtime/local/datastructures/FrameTest.cpp @@ -142,7 +149,7 @@ endif() add_executable(run_tests ${TEST_SOURCES}) set_target_properties(run_tests PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/bin) -add_dependencies(run_tests daphne daphnelib DistributedWorker) +add_dependencies(run_tests daphne daphnelib DistributedWorker daphne-opt) get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) set(LIBS AllKernels ${dialect_libs} DataStructures DaphneDSLParser MLIRDaphne WorkerImpl Proto DaphneConfigParser diff --git a/test/api/cli/Utils.h b/test/api/cli/Utils.h index 3768a18cd..1c2a29891 100644 --- a/test/api/cli/Utils.h +++ b/test/api/cli/Utils.h @@ -163,6 +163,31 @@ pid_t runProgramInBackground(int &out, int &err, const char * execPath, Args ... } } +/** + * @brief Executes the "run-lit.py" python script in a directory and + * captures `stdout`, `stderr`, and the status code. 
+ * + * "run-lit.py" is required to run the LLVM tool llvm-lit in order to + * test "*.mlir" files in the directoy using the llvm-lit command RUN: + * in each file. + * + * @param out The stream where to direct the program's standard output. + * @param err The stream where to direct the program's standard error. + * @param dirPath The path to the directory containing the "run-lit.py" script + * and the "*.mlir" test cases. + * @param args The arguments to pass in addition to the script's path. Despite + * the variadic template, each element should be of type `char *`. The last one + * does *not* need to be a null pointer. + * @return The status code returned by the process, or `-1` if it did not exit + * normally. + */ +template +int runLIT(std::stringstream &out, std::stringstream &err, std::string dirPath, + Args... args) { + return runProgram(out, err, "/bin/python3", "python3", + (dirPath + "run-lit.py").c_str(), "-v", dirPath.c_str(), + args...); +} /** * @brief Executes DAPHNE's command line interface with the given arguments and @@ -465,4 +490,4 @@ void compareDaphneToSomeRefSimple(const std::string & dirPath, const std::string */ std::string generalizeDataTypes(const std::string& str); -#endif //TEST_API_CLI_UTILS_H \ No newline at end of file +#endif //TEST_API_CLI_UTILS_H diff --git a/test/api/cli/codegen/AggAllTest.cpp b/test/api/cli/codegen/AggAllTest.cpp new file mode 100644 index 000000000..f0c383c00 --- /dev/null +++ b/test/api/cli/codegen/AggAllTest.cpp @@ -0,0 +1,33 @@ +/* + * Copyright 2023 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include "api/cli/StatusCode.h" + +const std::string dirPath = "test/api/cli/codegen/"; + +TEST_CASE("aggAll", TAG_CODEGEN) { + std::string result = "100\n"; + + compareDaphneToStr(result, dirPath + "sum_aggall.daphne"); + compareDaphneToStr(result, dirPath + "sum_aggall.daphne", "--mlir-codegen"); +} diff --git a/test/api/cli/codegen/EwBinaryScalarTest.cpp b/test/api/cli/codegen/EwBinaryScalarTest.cpp new file mode 100644 index 000000000..224d566c3 --- /dev/null +++ b/test/api/cli/codegen/EwBinaryScalarTest.cpp @@ -0,0 +1,75 @@ +/* + * Copyright 2023 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include + +#include "api/cli/StatusCode.h" + +const std::string dirPath = "test/api/cli/codegen/"; + +void test_binary_lowering(const std::string op, + const std::string kernel_call, + const std::string lowering, + const std::string result) { + std::stringstream out; + std::stringstream err; + + int status = runDaphne(out, err, "--explain", "llvm", (dirPath + op + ".daphne").c_str()); + CHECK(status == StatusCode::SUCCESS); + + CHECK_THAT(err.str(), Catch::Contains(kernel_call)); + CHECK_THAT(err.str(), !Catch::Contains(lowering)); + CHECK(out.str() == result); + + out.str(std::string()); + err.str(std::string()); + + status = runDaphne(out, err, "--explain", "llvm", "--mlir-codegen", (dirPath + op + ".daphne").c_str()); + CHECK(status == StatusCode::SUCCESS); + + CHECK_THAT(err.str(), !Catch::Contains(kernel_call)); + CHECK_THAT(err.str(), Catch::Contains(lowering)); + CHECK(out.str() == result); +} + +TEST_CASE("ewBinaryAddScalar", TAG_CODEGEN) { + test_binary_lowering("add", "llvm.call @_ewAdd__", "llvm.add", "3\n"); +} + +TEST_CASE("ewBinarySubScalar", TAG_CODEGEN) { + test_binary_lowering("sub", "llvm.call @_ewSub__", "llvm.sub", "-1\n"); +} + +TEST_CASE("ewBinaryMulScalar", TAG_CODEGEN) { + test_binary_lowering("mul", "llvm.call @_ewMul__", "llvm.mul", "2\n"); +} + +TEST_CASE("ewBinaryDivScalar", TAG_CODEGEN) { + test_binary_lowering("div", "llvm.call @_ewDiv__", "llvm.fdiv", "1.5\n"); +} + +TEST_CASE("ewBinaryPowScalar", TAG_CODEGEN) { + test_binary_lowering("pow", "llvm.call @_ewPow__", "llvm.intr.pow", "9\n"); +} + +TEST_CASE("ewBinaryAbsScalar", TAG_CODEGEN) { + test_binary_lowering("abs", "llvm.call @_ewAbs__", "llvm.intr.fabs", "4\n"); +} diff --git a/test/api/cli/codegen/EwOpLoopFusionTest.cpp b/test/api/cli/codegen/EwOpLoopFusionTest.cpp new file mode 100644 index 000000000..46f91b7cb --- /dev/null +++ b/test/api/cli/codegen/EwOpLoopFusionTest.cpp @@ -0,0 +1,42 @@ +/* + * Copyright 2023 The DAPHNE 
Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include "api/cli/StatusCode.h" + +const std::string dirPath = "test/api/cli/codegen/"; + +TEST_CASE("ewloopfusion", TAG_CODEGEN) { + std::string result = + "DenseMatrix(2x2, double)\n" + "8 8\n" + "8 8\n" + "DenseMatrix(2x2, double)\n" + "10 10\n" + "10 10\n" + "DenseMatrix(2x2, double)\n" + "9 9\n" + "9 9\n"; + + compareDaphneToStr(result, dirPath + "fusion.daphne"); + compareDaphneToStr(result, dirPath + "fusion.daphne", "--mlir-codegen"); +} diff --git a/test/api/cli/codegen/MapOpTest.cpp b/test/api/cli/codegen/MapOpTest.cpp new file mode 100644 index 000000000..a7ccf56d0 --- /dev/null +++ b/test/api/cli/codegen/MapOpTest.cpp @@ -0,0 +1,37 @@ +/* + * Copyright 2023 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include + +#include "api/cli/StatusCode.h" + +const std::string dirPath = "test/api/cli/codegen/"; + +TEST_CASE("mapOp", TAG_CODEGEN) { + std::string result = + "DenseMatrix(2x2, double)\n" + "2.1 1\n" + "6.5 -1.2\n"; + + compareDaphneToStr(result, dirPath + "map.daphne"); + compareDaphneToStr(result, dirPath + "map.daphne", "--mlir-codegen", "--no-obj-ref-mgnt"); +} + diff --git a/test/api/cli/codegen/MatMulTest.cpp b/test/api/cli/codegen/MatMulTest.cpp new file mode 100644 index 000000000..6ae2f324a --- /dev/null +++ b/test/api/cli/codegen/MatMulTest.cpp @@ -0,0 +1,49 @@ +/* + * Copyright 2023 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include + +#include "api/cli/StatusCode.h" + +const std::string dirPath = "test/api/cli/codegen/"; + +TEST_CASE("matmul", TAG_CODEGEN) { + std::string result = + "DenseMatrix(3x3, double)\n" + "45 45 45\n" + "45 45 45\n" + "45 45 45\n"; + + compareDaphneToStr(result, dirPath + "matmul.daphne"); + compareDaphneToStr(result, dirPath + "matmul.daphne", "--mlir-codegen"); +} + + +TEST_CASE("matvec", TAG_CODEGEN) { + std::string result = + "DenseMatrix(3x1, double)\n" + "45\n" + "45\n" + "45\n"; + + compareDaphneToStr(result, dirPath + "matvec.daphne"); + compareDaphneToStr(result, dirPath + "matvec.daphne", "--mlir-codegen"); +} diff --git a/test/api/cli/codegen/abs.daphne b/test/api/cli/codegen/abs.daphne new file mode 100644 index 000000000..1cf002ecc --- /dev/null +++ b/test/api/cli/codegen/abs.daphne @@ -0,0 +1,7 @@ +// Performs a Abs. Used to compare precompiled kernel with codegen. Value +// extracted as scalar to avoid it being optimized out of the calculation with +// constant folding or similar. + +X = [1.0, -2.0, -4.0]; +a = as.scalar(X[2:3, 0:1]); +print(abs(a)); diff --git a/test/api/cli/codegen/add.daphne b/test/api/cli/codegen/add.daphne new file mode 100644 index 000000000..fd33984cc --- /dev/null +++ b/test/api/cli/codegen/add.daphne @@ -0,0 +1,8 @@ +// Performs an AddOp. Used to compare precompiled kernel with codegen. +// Values extracted as scalar to avoid them being optimized out of +// the calculation with constant folding or similar. + +X = [1, 2, 3]; +a = as.scalar(X[0:1, 0:1]); +b = as.scalar(X[1:2, 0:1]); +print(a + b); diff --git a/test/api/cli/codegen/div.daphne b/test/api/cli/codegen/div.daphne new file mode 100644 index 000000000..a934b84b7 --- /dev/null +++ b/test/api/cli/codegen/div.daphne @@ -0,0 +1,8 @@ +// Performs a DivOp. Used to compare precompiled kernel with codegen. 
Values +// extracted as scalar to avoid them being optimized out of the calculation +// with constant folding or similar. + +X = [1.0, 2.0, 3.0]; +a = as.scalar(X[2:3, 0:1]); +b = as.scalar(X[1:2, 0:1]); +print(a / b); diff --git a/test/api/cli/codegen/fusion.daphne b/test/api/cli/codegen/fusion.daphne new file mode 100644 index 000000000..e4b81f68e --- /dev/null +++ b/test/api/cli/codegen/fusion.daphne @@ -0,0 +1,11 @@ +// Performs loop fusion on multiple EwBinaryOps. Used to compare precompiled +// kernel with codegen. + +X = fill(4.0, 2, 2); +X = X * 2.0; +Y = X + 2.0; +Z = X + 1.0; + +print(X); +print(Y); +print(Z); diff --git a/test/api/cli/codegen/log.daphne b/test/api/cli/codegen/log.daphne new file mode 100644 index 000000000..b9f86d44b --- /dev/null +++ b/test/api/cli/codegen/log.daphne @@ -0,0 +1,8 @@ +// Performs a LogOp. Used to compare precompiled kernel with codegen. Values +// extracted as scalar to avoid them being optimized out of the calculation +// with constant folding or similar. + +X = [1, 2, 3]; +a = as.scalar(X[0:1, 0:1]); +b = as.scalar(X[1:2, 0:1]); +print(ln(a, b)); diff --git a/test/api/cli/codegen/map.daphne b/test/api/cli/codegen/map.daphne new file mode 100644 index 000000000..6c9d203eb --- /dev/null +++ b/test/api/cli/codegen/map.daphne @@ -0,0 +1,10 @@ +// Performs a MapOp with the UDF `increment`. Used to compare precompiled +// kernel with codegen. + +def increment(x) { + return x + 1; +} + +X = reshape([1.1, 0.0, 5.5, -2.2], 2, 2); + +print(map(X, increment)); diff --git a/test/api/cli/codegen/matmul.daphne b/test/api/cli/codegen/matmul.daphne new file mode 100644 index 000000000..af5b46ae9 --- /dev/null +++ b/test/api/cli/codegen/matmul.daphne @@ -0,0 +1,9 @@ +// Performs a MatMulOp. Used to compare precompiled kernel with codegen. 
+ +N = 3; +A = fill(5.0, N, N); +B = fill(3.0, N, N); + +C = A@B; + +print(C); // for small matrices diff --git a/test/api/cli/codegen/matvec.daphne b/test/api/cli/codegen/matvec.daphne new file mode 100644 index 000000000..7aba59805 --- /dev/null +++ b/test/api/cli/codegen/matvec.daphne @@ -0,0 +1,9 @@ +// Performs a MatMulOp. Used to compare precompiled kernel with codegen. + +N = 3; +A = fill(5.0, N, N); +B = fill(3.0, N, 1); + +C = A@B; + +print(C); // for small matrices diff --git a/test/api/cli/codegen/mul.daphne b/test/api/cli/codegen/mul.daphne new file mode 100644 index 000000000..17ea31d5c --- /dev/null +++ b/test/api/cli/codegen/mul.daphne @@ -0,0 +1,8 @@ +// Performs a MulOp. Used to compare precompiled kernel with codegen. Values +// extracted as scalar to avoid them being optimized out of the calculation +// with constant folding or similar. + +X = [1, 2, 3]; +a = as.scalar(X[0:1, 0:1]); +b = as.scalar(X[1:2, 0:1]); +print(a * b); diff --git a/test/api/cli/codegen/pow.daphne b/test/api/cli/codegen/pow.daphne new file mode 100644 index 000000000..ff13b1b23 --- /dev/null +++ b/test/api/cli/codegen/pow.daphne @@ -0,0 +1,8 @@ +// Performs a PowOp. Used to compare precompiled kernel with codegen. Values +// extracted as scalar to avoid them being optimized out of the calculation +// with constant folding or similar. + +X = [1.0, 2.0, 3.0]; +a = as.scalar(X[2:3, 0:1]); +b = as.scalar(X[1:2, 0:1]); +print(pow(a, b)); diff --git a/test/api/cli/codegen/sub.daphne b/test/api/cli/codegen/sub.daphne new file mode 100644 index 000000000..a230024c1 --- /dev/null +++ b/test/api/cli/codegen/sub.daphne @@ -0,0 +1,8 @@ +// Compare precompiled kernel with codegen generated for the SubOp. Value +// extracted as scalar to avoid it being optimizedd out of the calculation with +// constant folding or similar. 
+ +X = [1, 2, 3]; +a = as.scalar(X[0:1, 0:1]); +b = as.scalar(X[1:2, 0:1]); +print(a - b); diff --git a/test/api/cli/codegen/sum_aggall.daphne b/test/api/cli/codegen/sum_aggall.daphne new file mode 100644 index 000000000..77578e7c6 --- /dev/null +++ b/test/api/cli/codegen/sum_aggall.daphne @@ -0,0 +1,5 @@ +// Compare precompiled kernel with codegen generated for the AggAllOp. + +X = fill(1.0, 10, 10); +a = sum(X); +print(a); diff --git a/test/codegen/.gitignore b/test/codegen/.gitignore new file mode 100644 index 000000000..a6e9662a3 --- /dev/null +++ b/test/codegen/.gitignore @@ -0,0 +1,2 @@ +Output/** +.lit_test_times.txt diff --git a/test/codegen/CodegenTest.cpp b/test/codegen/CodegenTest.cpp new file mode 100644 index 000000000..fce4d03c9 --- /dev/null +++ b/test/codegen/CodegenTest.cpp @@ -0,0 +1,40 @@ +/* + * Copyright 2023 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "run_tests.h" + +#include "api/cli/StatusCode.h" +#include "api/cli/Utils.h" + +#include + +const std::string dirPath = "test/codegen/"; + +// Place all test files with FileCheck directives in the dirPath. +// LIT will test all *.mlir files in the directory. 
+TEST_CASE("codegen", TAG_CODEGEN) { + std::stringstream out; + std::stringstream err; + + int status = runLIT(out, err, dirPath); + +#ifndef NDEBUG + spdlog::info("runLIT return status: " + std::to_string(status)); + spdlog::info("runLIT out:\n" + out.str()); + spdlog::info("runLIT err:\n" + err.str()); +#endif + CHECK(status == StatusCode::SUCCESS); +} diff --git a/test/codegen/daphne_opt.mlir b/test/codegen/daphne_opt.mlir new file mode 100644 index 000000000..25a02d813 --- /dev/null +++ b/test/codegen/daphne_opt.mlir @@ -0,0 +1,18 @@ +// RUN: daphne-opt --opt-daphne %s | FileCheck %s + +module { + func.func @main() { + %0 = "daphne.constant"() {value = 2 : ui64} : () -> ui64 + %1 = "daphne.constant"() {value = 2 : index} : () -> index + %2 = "daphne.constant"() {value = 4 : ui64} : () -> ui64 + %3 = "daphne.constant"() {value = false} : () -> i1 + %4 = "daphne.constant"() {value = true} : () -> i1 + %5 = "daphne.fill"(%2, %1, %1) : (ui64, index, index) -> !daphne.Matrix<2x2xui64> + // CHECK-NOT: daphne.ewMod + // CHECK: daphne.ewSub + // CHECK-NEXT: daphne.ewBitwiseAnd + %6 = "daphne.ewMod"(%5, %0) : (!daphne.Matrix<2x2xui64>, ui64) -> !daphne.Matrix<2x2xui64> + "daphne.print"(%6, %4, %3) : (!daphne.Matrix<2x2xui64>, i1, i1) -> () + "daphne.return"() : () -> () + } +} diff --git a/test/codegen/ew.mlir b/test/codegen/ew.mlir new file mode 100644 index 000000000..d94cf32cd --- /dev/null +++ b/test/codegen/ew.mlir @@ -0,0 +1,105 @@ +// RUN: daphne-opt --lower-ew %s | FileCheck %s + +func.func @add() { + %0 = "daphne.constant"() {value = 2 : index} : () -> index + %1 = "daphne.constant"() {value = false} : () -> i1 + %2 = "daphne.constant"() {value = true} : () -> i1 + %3 = "daphne.constant"() {value = 4.000000e+00 : f64} : () -> f64 + %4 = "daphne.fill"(%3, %0, %0) : (f64, index, index) -> !daphne.Matrix<2x2xf64> + // CHECK-NOT: daphne.ewAdd + // CHECK: arith.addf + %5 = "daphne.ewAdd"(%4, %4) : (!daphne.Matrix<2x2xf64>, !daphne.Matrix<2x2xf64>) -> 
!daphne.Matrix<2x2xf64> + "daphne.print"(%5, %2, %1) : (!daphne.Matrix<2x2xf64>, i1, i1) -> () + "daphne.return"() : () -> () +} + +func.func @sub() { + %0 = "daphne.constant"() {value = 2 : index} : () -> index + %1 = "daphne.constant"() {value = false} : () -> i1 + %2 = "daphne.constant"() {value = true} : () -> i1 + %3 = "daphne.constant"() {value = 4.000000e+00 : f64} : () -> f64 + %4 = "daphne.fill"(%3, %0, %0) : (f64, index, index) -> !daphne.Matrix<2x2xf64> + // CHECK-NOT: daphne.ewSub + // CHECK: arith.subf + %5 = "daphne.ewSub"(%4, %4) : (!daphne.Matrix<2x2xf64>, !daphne.Matrix<2x2xf64>) -> !daphne.Matrix<2x2xf64> + "daphne.print"(%5, %2, %1) : (!daphne.Matrix<2x2xf64>, i1, i1) -> () + "daphne.return"() : () -> () +} + +func.func @mul() { + %0 = "daphne.constant"() {value = 2 : index} : () -> index + %1 = "daphne.constant"() {value = false} : () -> i1 + %2 = "daphne.constant"() {value = true} : () -> i1 + %3 = "daphne.constant"() {value = 4.000000e+00 : f64} : () -> f64 + %4 = "daphne.fill"(%3, %0, %0) : (f64, index, index) -> !daphne.Matrix<2x2xf64> + // CHECK-NOT: daphne.ewMul + // CHECK: arith.mulf + %5 = "daphne.ewMul"(%4, %4) : (!daphne.Matrix<2x2xf64>, !daphne.Matrix<2x2xf64>) -> !daphne.Matrix<2x2xf64> + "daphne.print"(%5, %2, %1) : (!daphne.Matrix<2x2xf64>, i1, i1) -> () + "daphne.return"() : () -> () +} + +func.func @div() { + %0 = "daphne.constant"() {value = 2 : index} : () -> index + %1 = "daphne.constant"() {value = false} : () -> i1 + %2 = "daphne.constant"() {value = true} : () -> i1 + %3 = "daphne.constant"() {value = 4.000000e+00 : f64} : () -> f64 + %4 = "daphne.fill"(%3, %0, %0) : (f64, index, index) -> !daphne.Matrix<2x2xf64> + // CHECK-NOT: daphne.ewDiv + // CHECK: arith.divf + %5 = "daphne.ewDiv"(%4, %4) : (!daphne.Matrix<2x2xf64>, !daphne.Matrix<2x2xf64>) -> !daphne.Matrix<2x2xf64> + "daphne.print"(%5, %2, %1) : (!daphne.Matrix<2x2xf64>, i1, i1) -> () + "daphne.return"() : () -> () +} + +func.func @sqrt() { + %0 = "daphne.constant"() 
{value = 0 : index} : () -> index + %1 = "daphne.constant"() {value = 1 : index} : () -> index + %2 = "daphne.constant"() {value = 2 : index} : () -> index + %3 = "daphne.constant"() {value = false} : () -> i1 + %4 = "daphne.constant"() {value = true} : () -> i1 + %5 = "daphne.constant"() {value = 4 : si64} : () -> si64 + %6 = "daphne.fill"(%5, %2, %2) : (si64, index, index) -> !daphne.Matrix<2x2xsi64> + %7 = "daphne.sliceRow"(%6, %0, %1) : (!daphne.Matrix<2x2xsi64>, index, index) -> !daphne.Matrix + %8 = "daphne.sliceCol"(%7, %0, %1) : (!daphne.Matrix, index, index) -> !daphne.Matrix + %9 = "daphne.sliceRow"(%6, %0, %1) : (!daphne.Matrix<2x2xsi64>, index, index) -> !daphne.Matrix + %10 = "daphne.sliceCol"(%9, %0, %1) : (!daphne.Matrix, index, index) -> !daphne.Matrix + %11 = "daphne.cast"(%10) : (!daphne.Matrix) -> si64 + %12 = "daphne.cast"(%11) : (si64) -> f64 + // CHECK-NOT: daphne.ewSqrt + // CHECK: math.sqrt + %13 = "daphne.ewSqrt"(%12) : (f64) -> f64 + "daphne.print"(%13, %4, %3) : (f64, i1, i1) -> () + "daphne.return"() : () -> () +} + +func.func @abs() { + %0 = "daphne.constant"() {value = 4.000000e+00 : f64} : () -> f64 + %3 = "daphne.constant"() {value = false} : () -> i1 + %4 = "daphne.constant"() {value = true} : () -> i1 + // CHECK-NOT: daphne.ewAbs + // CHECK: math.absf + %12 = "daphne.ewAbs"(%0) : (f64) -> f64 + "daphne.print"(%12, %4, %3) : (f64, i1, i1) -> () + "daphne.return"() : () -> () +} + +func.func @pow() { + %0 = "daphne.constant"() {value = 0 : index} : () -> index + %1 = "daphne.constant"() {value = 1 : index} : () -> index + %2 = "daphne.constant"() {value = 2 : index} : () -> index + %3 = "daphne.constant"() {value = false} : () -> i1 + %4 = "daphne.constant"() {value = true} : () -> i1 + %5 = "daphne.constant"() {value = 4.000000e+00 : f64} : () -> f64 + %6 = "daphne.fill"(%5, %2, %2) : (f64, index, index) -> !daphne.Matrix<2x2xf64> + %7 = "daphne.sliceRow"(%6, %0, %1) : (!daphne.Matrix<2x2xf64>, index, index) -> !daphne.Matrix + %8 = 
"daphne.sliceCol"(%7, %0, %1) : (!daphne.Matrix, index, index) -> !daphne.Matrix + %9 = "daphne.sliceRow"(%6, %0, %1) : (!daphne.Matrix<2x2xf64>, index, index) -> !daphne.Matrix + %10 = "daphne.sliceCol"(%9, %0, %1) : (!daphne.Matrix, index, index) -> !daphne.Matrix + %11 = "daphne.cast"(%10) : (!daphne.Matrix) -> f64 + // CHECK-NOT: daphne.ewPow + // CHECK: math.powf + %12 = "daphne.ewPow"(%11, %11) : (f64, f64) -> f64 + "daphne.print"(%12, %4, %3) : (f64, i1, i1) -> () + "daphne.return"() : () -> () +} diff --git a/test/codegen/fusion.mlir b/test/codegen/fusion.mlir new file mode 100644 index 000000000..43107ad22 --- /dev/null +++ b/test/codegen/fusion.mlir @@ -0,0 +1,29 @@ +// RUN: daphne-opt -pass-pipeline="builtin.module(lower-ew, canonicalize, func.func(affine-loop-fusion))" %s | FileCheck %s"""" + +func.func @main() { + %0 = "daphne.constant"() {value = 2 : index} : () -> index + %1 = "daphne.constant"() {value = false} : () -> i1 + %2 = "daphne.constant"() {value = true} : () -> i1 + %3 = "daphne.constant"() {value = 1.000000e+00 : f64} : () -> f64 + %4 = "daphne.constant"() {value = 2.000000e+00 : f64} : () -> f64 + %5 = "daphne.constant"() {value = 4.000000e+00 : f64} : () -> f64 + %6 = "daphne.fill"(%5, %0, %0) : (f64, index, index) -> !daphne.Matrix<2x2xf64> + // CHECK: affine.for + // CHECK-NEXT: affine.for + // CHECK-NEXT: affine.load + // CHECK-NEXT: arith.mulf + // CHECK-NEXT: affine.store + // CHECK-NEXT: affine.load + // CHECK-NEXT: arith.addf + // CHECK-NEXT: affine.store + // CHECK-NEXT: affine.load + // CHECK-NEXT: arith.addf + // CHECK-NEXT: affine.store + %7 = "daphne.ewMul"(%6, %4) : (!daphne.Matrix<2x2xf64>, f64) -> !daphne.Matrix<2x2xf64> + %8 = "daphne.ewAdd"(%7, %4) : (!daphne.Matrix<2x2xf64>, f64) -> !daphne.Matrix<2x2xf64> + %9 = "daphne.ewAdd"(%7, %3) : (!daphne.Matrix<2x2xf64>, f64) -> !daphne.Matrix<2x2xf64> + "daphne.print"(%7, %2, %1) : (!daphne.Matrix<2x2xf64>, i1, i1) -> () + "daphne.print"(%8, %2, %1) : 
(!daphne.Matrix<2x2xf64>, i1, i1) -> () + "daphne.print"(%9, %2, %1) : (!daphne.Matrix<2x2xf64>, i1, i1) -> () + "daphne.return"() : () -> () +} diff --git a/test/codegen/lit.cfg b/test/codegen/lit.cfg new file mode 100644 index 000000000..fab502252 --- /dev/null +++ b/test/codegen/lit.cfg @@ -0,0 +1,17 @@ +import lit.formats +import os + +config.name = "DAPHNE LIT config" +config.test_format = lit.formats.ShTest(True) + +config.suffixes = [".mlir"] + +config.test_source_root = os.path.dirname(__file__) + +config.environment["PATH"] = os.path.pathsep.join( + ( + os.path.abspath("bin/"), + os.path.abspath("thirdparty/build/llvm-project/bin/"), + config.environment["PATH"], + ) +) diff --git a/test/codegen/mapop.mlir b/test/codegen/mapop.mlir new file mode 100644 index 000000000..ff8825989 --- /dev/null +++ b/test/codegen/mapop.mlir @@ -0,0 +1,26 @@ +// RUN: daphne-opt --lower-map --inline %s | FileCheck %s + +module { + func.func @"increment-1-1"(%arg0: f64) -> f64 { + %0 = "daphne.ewExp"(%arg0) : (f64) -> f64 + "daphne.return"(%0) : (f64) -> () + } + func.func @main() { + %0 = "daphne.constant"() {value = 2 : index} : () -> index + %1 = "daphne.constant"() {value = false} : () -> i1 + %2 = "daphne.constant"() {value = true} : () -> i1 + %3 = "daphne.constant"() {value = 93985655361872 : ui64} : () -> ui64 + %4 = "daphne.matrixConstant"(%3) : (ui64) -> !daphne.Matrix + %5 = "daphne.reshape"(%4, %0, %0) : (!daphne.Matrix, index, index) -> !daphne.Matrix<2x2xf64> + // CHECK-NOT: daphne.map + // CHECK: {{.*}}"daphne.convertDenseMatrixToMemRef"{{.*}} + // CHECK: affine.for + // CHECK-NEXT: affine.for + // CHECK-NOT: func.call + // CHECK: affine.load + // CHECK-NEXT: daphne.ewExp + %6 = "daphne.map"(%5) {func = "increment-1-1"} : (!daphne.Matrix<2x2xf64>) -> !daphne.Matrix<2x2xf64> + "daphne.print"(%6, %2, %1) : (!daphne.Matrix<2x2xf64>, i1, i1) -> () + "daphne.return"() : () -> () + } +} diff --git a/test/codegen/matmul.mlir b/test/codegen/matmul.mlir new file mode 
100644 index 000000000..6f3672be5 --- /dev/null +++ b/test/codegen/matmul.mlir @@ -0,0 +1,32 @@ +// RUN: daphne-opt --lower-mm %s | FileCheck %s + +module { + func.func @main() { + // CHECK: {{.*}}memref.alloc + %0 = "daphne.constant"() {value = 10 : index} : () -> index + %1 = "daphne.constant"() {value = false} : () -> i1 + %2 = "daphne.constant"() {value = 3.000000e+00 : f64} : () -> f64 + %3 = "daphne.constant"() {value = 5.000000e+00 : f64} : () -> f64 + %4 = "daphne.fill"(%3, %0, %0) : (f64, index, index) -> !daphne.Matrix<10x10xf64> + %5 = "daphne.fill"(%2, %0, %0) : (f64, index, index) -> !daphne.Matrix<10x10xf64> + // CHECK: {{.*}}"daphne.convertDenseMatrixToMemRef"{{.*}} + // CHECK-NEXT: {{.*}}"daphne.convertDenseMatrixToMemRef"{{.*}} + + // Initialize alloced memref to 0 + // CHECK: affine.for + // CHECK-NEXT: {{ *}}affine.for + // CHECK-NEXT: {{ *}}affine.store + + // MatMul + // CHECK: affine.for + // CHECK-NEXT: affine.for + // CHECK-NEXT: affine.for + // CHECK-NEXT: {{.*}}memref.load + // CHECK-NEXT: {{.*}}memref.load + // CHECK-NEXT: {{.*}}memref.load + // CHECK-NEXT: {{.*}}llvm.intr.fma + // CHECK-NEXT: {{.*}}memref.store + %6 = "daphne.matMul"(%4, %5, %1, %1) : (!daphne.Matrix<10x10xf64>, !daphne.Matrix<10x10xf64>, i1, i1) -> !daphne.Matrix<10x10xf64> + "daphne.return"() : () -> () + } +} diff --git a/test/codegen/run-lit.py b/test/codegen/run-lit.py new file mode 100644 index 000000000..39898435c --- /dev/null +++ b/test/codegen/run-lit.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python + +from lit.main import main +main() diff --git a/test/codegen/sum_agg.mlir b/test/codegen/sum_agg.mlir new file mode 100644 index 000000000..d0df6ea7e --- /dev/null +++ b/test/codegen/sum_agg.mlir @@ -0,0 +1,26 @@ +// RUN: daphne-opt --lower-agg %s | FileCheck %s + +module { + func.func @main() { + %0 = "daphne.constant"() {value = true} : () -> i1 + %1 = "daphne.constant"() {value = 10 : index} : () -> index + %2 = "daphne.constant"() {value = 1000000 : si64} : () -> 
si64 + %3 = "daphne.constant"() {value = false} : () -> i1 + %4 = "daphne.constant"() {value = 1.000000e+00 : f64} : () -> f64 + %5 = "daphne.fill"(%4, %1, %1) : (f64, index, index) -> !daphne.Matrix<10x10xf64> + %6 = "daphne.now"() : () -> si64 + // CHECK-NOT: sumAll + // CHECK: {{.*}}"daphne.convertDenseMatrixToMemRef"{{.*}} + // CHECK: affine.for + // CHECK-NEXT: arith.constant + // CHECK-NEXT: affine.for + // CHECK-NEXT: memref.load + %7 = "daphne.sumAll"(%5) : (!daphne.Matrix<10x10xf64>) -> f64 + %8 = "daphne.now"() : () -> si64 + "daphne.print"(%7, %0, %3) : (f64, i1, i1) -> () + %9 = "daphne.ewSub"(%8, %6) : (si64, si64) -> si64 + %10 = "daphne.ewDiv"(%9, %2) : (si64, si64) -> si64 + "daphne.print"(%10, %0, %3) : (si64, i1, i1) -> () + "daphne.return"() : () -> () + } +} diff --git a/test/tags.h b/test/tags.h index 9d977a61c..14c490cbc 100644 --- a/test/tags.h +++ b/test/tags.h @@ -24,6 +24,7 @@ #define TAG_ALGORITHMS "[algorithms]" #define TAG_CAST "[cast]" +#define TAG_CODEGEN "[codegen]" #define TAG_CONFIG "[config]" #define TAG_CONTROLFLOW "[controlflow]" #define TAG_DATASTRUCTURES "[datastructures]"