diff --git a/software/apps/ofdm/main.c b/software/apps/ofdm/main.c new file mode 100644 index 000000000..9969219fe --- /dev/null +++ b/software/apps/ofdm/main.c @@ -0,0 +1,123 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +/* Mempool runtime libraries */ +#include "dma.h" +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" +#include "xpulp/builtins_v2.h" + +#include "data/data_ofdm.h" + +// CFFT Parameters +#define SCHEDULED +#define FOLDED_TWIDDLES +#define BITREVERSETABLE +#define ASM +#define N_FFTs_COL 4 +#define N_FFTs_ROW (N_RX / N_FFTs_COL) +// CMATMUL Parameters +#define NUM_COPIES (N_BANKS / (N_BEAMS * N_RX)) + +#define ROUNDS 3 +dump(prova, 1); + +#include "kernel/mempool_radix4_cfft_butterfly_f16.h" +#include "kernel/mempool_radix4_cfft_f16p.h" +#include "kernel/mempool_radix4_cfft_q16_bitreversal.h" +#include "kernel/mempool_cmatmul_f16.h" + +uint32_t arrival_index __attribute__((section(".l1_prio"))); +__fp16 l1_pBF_Coef_folded[2 * N_BEAMS * N_RX * NUM_COPIES] + __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); + +__fp16 l1_pFFT_Src[N_FFTs_ROW * 8 * N_BANKS] + __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); +__fp16 l1_pFFT_Dst[N_FFTs_ROW * 8 * N_BANKS] + __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); +__fp16 l1_twiddleCoef_f16_src[6 * N_BANKS] + __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); +__fp16 l1_twiddleCoef_f16_dst[6 * N_BANKS] + __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); +uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH] + __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/* MAIN */ +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + mempool_barrier_init(core_id); + + + /* INITIALIZATION */ + mempool_start_benchmark(); + if (core_id == 0) { + // Each FFT is folded over 4 memory rows + // Each memory row is 2 * N_BANKS samples + __atomic_store_n(&arrival_index, 0, __ATOMIC_RELAXED); + dma_memcpy_blocking(l1_pFFT_Src, l2_pFFT_Src, (N_RX * N_SC) * sizeof(int32_t)); + dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable, BITREVINDEXTABLE_LENGTH * sizeof(int16_t)); + for (uint32_t i = 0; i < NUM_COPIES; i++) { + dma_memcpy_blocking(l1_pBF_Coef_folded + i * (2 * N_BEAMS * N_RX), l2_pBF_Coef, (N_BEAMS * N_RX) * sizeof(int32_t)); + } + for (uint32_t i = 0; i < N_FFTs_COL; i++) { + dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * N_BANKS), l2_twiddleCoef_f16, 3 * (N_SC / 4) * sizeof(int32_t)); + } + } + mempool_barrier(num_cores); + mempool_stop_benchmark(); + dump_prova(0); + +// // Start of the iterations +// for (uint32_t round = 0; round < ROUNDS; round++) { + + /* FFT */ + mempool_start_benchmark(); + uint32_t col_fftLen = N_SC / 4; + uint32_t col_id = core_id / (N_SC / 16); + // Distribute FFTs over columns + mempool_radix4_cfft_f16p_scheduler(l1_pFFT_Src, l1_pFFT_Dst, N_SC, + l1_twiddleCoef_f16_src + 2 * col_id * col_fftLen, + l1_twiddleCoef_f16_dst + 2 * col_id * col_fftLen, + l1_BitRevIndexTable, BITREVINDEXTABLE_LENGTH, 1, (N_SC / 16)); + mempool_log_barrier(2, core_id); + mempool_stop_benchmark(); + dump_prova(1); + + /* BEAMFORMING */ + mempool_start_benchmark(); + 
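+  // The (N_BEAMS x N_RX) beamforming matrix was replicated NUM_COPIES times
+  // across the L1 banks during initialization, so that different groups of
+  // cores can stream the coefficients from different banks instead of all
+  // contending for the same ones; the matmul below assumes this folded layout.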
cmatmul_2x4_folded_f16p(l1_pBF_Coef_folded, l1_pFFT_Src, l1_pFFT_Dst, N_BEAMS, N_RX, N_SC, core_id, num_cores); + mempool_stop_benchmark(); + dump_prova(2); + + mempool_start_benchmark(); + // Transfer and synchronization + if ((num_cores - 1) == __atomic_fetch_add(&arrival_index, 1, __ATOMIC_RELAXED)) { + dma_memcpy_blocking(l1_pFFT_Src, l2_pFFT_Src, (N_RX * N_SC) * sizeof(int32_t)); + dma_memcpy_blocking(l2_pBF_Dst, l1_pFFT_Dst, (N_BEAMS * N_SC) * sizeof(int32_t)); + for (uint32_t i = 0; i < N_FFTs_COL; i++) { + dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * N_BANKS), l2_twiddleCoef_f16, 3 * (N_SC / 4) * sizeof(int32_t)); + } + __atomic_store_n(&arrival_index, 0, __ATOMIC_RELAXED); + __sync_synchronize(); // Full memory barrier + wake_up_all(); + } + mempool_wfi(); + mempool_stop_benchmark(); + dump_prova(3); + +// } + + return 0; +} diff --git a/software/runtime/data/data_ofdm.h.tpl b/software/runtime/data/data_ofdm.h.tpl new file mode 100644 index 000000000..fe7ff7fc6 --- /dev/null +++ b/software/runtime/data/data_ofdm.h.tpl @@ -0,0 +1,48 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +\ +<% def array_to_cstr(array): + out = '{' + i = 0 + out += '\n' + for a in array: + out += '(__fp16){:0.5}f, '.format(a) + i += 1 + if i % 8 == 0: + out += '\n' + out = out[:-2] + '}' + return out +%> \ + +<% def array_to_str(array): + out = '{' + i = 0 + out += '\n' + for a in array: + out += '{}, '.format(a) + i += 1 + if i % 16 == 0: + out += '\n' + out = out[:-2] + '}' + return out +%> \ + +#define LOG2 (${Log2Len}) +#define N_RX (${N_rx}) +#define N_BEAMS (${N_bs}) +#define N_SC (${N_sc}) +#define N_BANKS (NUM_CORES * BANKING_FACTOR) +#define BITREVINDEXTABLE_LENGTH (${BitrevLen}) + + +__fp16 l2_pFFT_Src[${2 * N_sc * N_rx}] = ${array_to_cstr(pFFT_src)}; + +__fp16 l2_twiddleCoef_f16[${2 * N_sc}] = ${array_to_cstr(pTw_coef)}; + +__fp16 l2_pBF_Coef[${2 * N_bs * N_rx}] = ${array_to_cstr(pBF_coef)}; + +__fp16 l2_pBF_Dst[${2 * N_bs * N_sc}] = ${array_to_cstr(pBF_dst)}; + +// Bitreversal +uint16_t l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(bitrev)};
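A quick sanity check of the size relationships the template above encodes, using the generator's command-line defaults (a standalone sketch; the enum constants simply mirror data_ofdm.py's -rx/-bs/-sc defaults):

```c
#include <assert.h>

// Defaults of data_ofdm.py: -rx 64, -bs 32, -sc 4096, so LOG2 = 12.
enum { N_RX = 64, N_BEAMS = 32, N_SC = 4096, LOG2 = 12 };

int main(void) {
  // Complex samples are stored as interleaved re/im __fp16 halves, hence
  // the factor 2 in every array length.
  assert(2 * N_RX * N_SC == 524288);    // l2_pFFT_Src entries
  assert(2 * N_BEAMS * N_RX == 4096);   // l2_pBF_Coef entries
  assert(2 * N_BEAMS * N_SC == 262144); // l2_pBF_Dst entries
  assert((1 << LOG2) == N_SC);          // LOG2 = log2(N_SC)
  // Note: l2_twiddleCoef_f16 is declared with 2 * N_SC entries, but the
  // generator fills only 3 * N_SC / 4 of them; the tail stays zero.
  return 0;
}
```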
diff --git a/software/runtime/data/data_ofdm.py b/software/runtime/data/data_ofdm.py new file mode 100644 index 000000000..08cf091ce --- /dev/null +++ b/software/runtime/data/data_ofdm.py @@ -0,0 +1,132 @@ +# Copyright 2022 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# Author: Marco Bertuletti, ETH Zurich + +#!/usr/bin/env python3 + +import numpy as np +import math as M +import argparse +import pathlib +from mako.template import Template +from sympy.combinatorics import Permutation + +####################### +# compute_bitreversal # +####################### + +def compute_bitreversal(N, R): + # Decompose + logR2 = [] + idx = N + while (idx >= R): + logR2.append(int(M.log2(R))) + idx = idx // R + if (idx > 1): + logR2.append(int(M.log2(idx))) + # Bitreversal + indexes = [] + for x in range(N): + result = 0 + for bits in logR2: + mask = (0xffffffff >> (32 - bits)) + result = (result << bits) | (x & mask) + x = x >> bits + indexes.append(result) + + # Create transpositions table + tps = [] + for c in Permutation.from_sequence(indexes).cyclic_form: + for i in range(len(c) - 1): + tps.append([c[i] * 8, c[-1] * 8]) + return tps + +def gen_data_header_file(outdir: pathlib.Path, tpl: pathlib.Path, **kwargs): + + file = outdir / f"data_{kwargs['name']}.h" + + print(tpl, outdir, kwargs['name']) + + template = Template(filename=str(tpl)) + with file.open('w') as f: + f.write(template.render(**kwargs)) + +def main(): + + parser = argparse.ArgumentParser(description='Generate data for kernels') + parser.add_argument( + "-o", + "--outdir", + type=pathlib.Path, + default=pathlib.Path(__file__).parent.absolute(), + required=False, + help='Output directory for the generated data files' + ) + parser.add_argument( + "-t", + "--tpl", + type=pathlib.Path, + required=False, + default=pathlib.Path(__file__).parent.absolute() / "data_ofdm.h.tpl", + help='Path to mako template' + ) + parser.add_argument( + "-v", + "--verbose", + action='store_true', + help='Set verbose' + ) + parser.add_argument( + "-rx", + "--receivers", + type=int, + required=False, + default=64, + help='Number of receive antennas (N_RX).' + ) + parser.add_argument( + "-bs", + "--beams", + type=int, + required=False, + default=32, + help='Number of beams (N_BEAMS).' + ) + parser.add_argument( + "-sc", + "--subcarriers", + type=int, + required=False, + default=4096, + help='Number of subcarriers (N_SC).' + ) + + args = parser.parse_args() + N_rx = args.receivers + N_bs = args.beams + N_sc = args.subcarriers + + pFFT_src = np.random.rand(2 * N_rx * N_sc).astype(np.float16) + pTw_coef = np.random.rand(int(3 * N_sc / 4)).astype(np.float16) + pBF_coef = np.random.rand(2 * N_rx * N_bs).astype(np.float16) + pBF_dst = np.random.rand(2 * N_bs * N_sc).astype(np.float16) + + Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(N_sc, 2))) + + kwargs = {'name': 'ofdm', + 'pFFT_src': pFFT_src, + 'pTw_coef': pTw_coef, + 'pBF_coef': pBF_coef, + 'pBF_dst': pBF_dst, + 'bitrev': Bitreversal, + 'N_rx': N_rx, + 'N_bs': N_bs, + 'N_sc': N_sc, + 'Log2Len': int(np.log2(N_sc)), + 'BitrevLen': len(Bitreversal)} + gen_data_header_file(args.outdir, args.tpl, **kwargs) + +if __name__ == "__main__": + main() diff --git a/software/runtime/kernel/mempool_cfft_radix4_butterfly_f16.h b/software/runtime/kernel/mempool_cfft_radix4_butterfly_f16.h deleted file mode 100644 index 5196fc30d..000000000 --- a/software/runtime/kernel/mempool_cfft_radix4_butterfly_f16.h +++ /dev/null @@ -1,199 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -#include "xpulp/builtins_v2.h" - -/** - @brief First butterfly stage. 
- @param[in] pIn points to input buffer of 16b data, Re and Im parts are - interleaved - @param[out] pOut points to output buffer of 16b data, Re and Im parts are - interleaved - @param[in] i0 points to the first element to be processed - @param[in] n2 number of elements in the first wing of the butterfly - @param[in] CoSi1 packed cosine and sine first twiddle - @param[in] CoSi2 packed cosine and sine second twiddle - @param[in] CoSi3 packed cosine and sine third twiddle - @param[in] C1 packed sine and cosine first twiddle - @param[in] C2 packed sine and cosine second twiddle - @param[in] C3 packed sine and cosine third twiddle - @return none -*/ -static inline void radix4_butterfly(__fp16 *pIn, __fp16 *pOut, - uint32_t i0, uint32_t n2, v2h CoSi1, - v2h CoSi2, v2h CoSi3, v2h C1, v2h C2, - v2h C3) { - uint32_t i1, i2, i3; - __fp16 t0, t1, t2, t3, t4, t5; - v2h A, B, C, D, E, F, G, H; - -#if defined(FOLDED) || defined(SCHEDULED) - /* index calculation for the input as, */ - /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ - i1 = i0 + N_BANKS; - i2 = i1 + N_BANKS; - i3 = i2 + N_BANKS; - uint32_t n2_store = n2 >> 2U; - uint32_t i0_store = - (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * N_BANKS; - uint32_t i1_store = i0_store + n2_store; - uint32_t i2_store = i1_store + n2_store; - uint32_t i3_store = i2_store + n2_store; -#else - /* index calculation for the input as, */ - /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ - i1 = i0 + n2; - i2 = i1 + n2; - i3 = i2 + n2; -#endif - /* Read ya (real), xa (imag) input */ - A = *(v2h *)&pIn[i0 * 2U]; - /* Read yb (real), xb(imag) input */ - B = *(v2h *)&pIn[i1 * 2U]; - /* Read yc (real), xc(imag) input */ - C = *(v2h *)&pIn[i2 * 2U]; - /* Read yd (real), xd(imag) input */ - D = *(v2h *)&pIn[i3 * 2U]; - asm volatile( - // xa + xc, ya + yc - "vfadd.h %[E],%[A],%[C];" - // xa - xc, ya - yc - "vfsub.h %[F],%[A],%[C];" - // xb + xd, yd + yd - "vfadd.h %[G],%[B],%[D];" - // xb - xd, yb - yd - "vfsub.h %[H],%[B],%[D];" - "pv.extract.h %[t0],%[H],0;" - "pv.extract.h %[t1],%[H],1;" - "fsub.h %[t3],zero,%[t1];" - "fsub.h %[t4],zero,%[t0];" - // yd - yb, xb - xd - "pv.pack.h %[C],%[t0],%[t3];" - // yb - yd, xd - xb - "pv.pack.h %[D],%[t4],%[t1];" - // xa + xc + xb + xd, ya + yb + yc + yd - "vfadd.h %[A],%[E],%[G];" - // xa - xc + yb - yd, ya - yc + xd - xb - "vfadd.h %[D],%[F],%[D];" - // xa + xc - xb - xd, ya + yc - yb - yd - "vfsub.h %[B],%[E],%[G];" - // xa - xc - yb + yd, ya - yc + xb - xd - "vfadd.h %[C],%[F],%[C];" - "vfdotpex.s.h %[t0],%[CoSi1],%[D];" - "vfdotpex.s.h %[t2],%[CoSi2],%[B];" - "vfdotpex.s.h %[t4],%[CoSi3],%[C];" - "vfdotpex.s.h %[t1],%[C1],%[D];" - "vfdotpex.s.h %[t3],%[C1],%[B];" - "vfdotpex.s.h %[t5],%[C3],%[C];" - "fcvt.h.s %[t0],%[t0];" - "fcvt.h.s %[t1],%[t1];" - "fcvt.h.s %[t2],%[t2];" - "fcvt.h.s %[t3],%[t3];" - "fcvt.h.s %[t4],%[t4];" - "fcvt.h.s %[t5],%[t5];" - "pv.pack.h %[E],%[t1],%[t0];" - "pv.pack.h %[F],%[t3],%[t2];" - "pv.pack.h %[G],%[t5],%[t4];" - : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), - [E] "=&r"(E), [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), - [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), - [t4] "=&r"(t4), [t5] "=&r"(t5) - : [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1), - [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) - :); -#if defined(FOLDED) || defined(SCHEDULED) - *((v2h *)&pOut[i0_store * 2U]) = A; - *((v2h *)&pOut[i1_store * 2U]) = E; - *((v2h *)&pOut[i2_store * 2U]) = F; - *((v2h *)&pOut[i3_store * 
2U]) = G; -#else - *((v2h *)&pOut[i0 * 2U]) = A; - *((v2h *)&pOut[i1 * 2U]) = E; - *((v2h *)&pOut[i2 * 2U]) = F; - *((v2h *)&pOut[i3 * 2U]) = G; -#endif - -} - -/** - @brief Last butterfly stage. - @param[in] pIn points to input buffer of 16b data, Re and Im parts are - interleaved - @param[out] pOut points to output buffer of 16b data, Re and Im parts are - interleaved - @param[in] i0 points to the first element to be processed - @return none -*/ -static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut, - uint32_t i0) { - __fp16 t0, t1; - uint32_t i1, i2, i3; - v2h A, B, C, D, E, F, G, H; - -#if defined(FOLDED) || defined(SCHEDULED) - /* index calculation for the input as, */ - /* pIn[i0 + 0], pIn[i0 + fftLen/4], - pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ - i1 = i0 + N_BANKS; - i2 = i1 + N_BANKS; - i3 = i2 + N_BANKS; -#ifndef SCHEDULED - uint32_t i0_store = i0 * 4; - uint32_t i1_store = i0_store + 1; - uint32_t i2_store = i1_store + 1; - uint32_t i3_store = i2_store + 1; -#endif -#else - /* index calculation for the input as, */ - /* pIn[i0 + 0], pIn[i0 + fftLen/4], - pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ - i1 = i0 + 1U; - i2 = i1 + 1U; - i3 = i2 + 1U; -#endif - - /* Read ya (real), xa(imag) input */ - A = *(v2h *)&pIn[i0 * 2U]; - /* Read yb (real), xb(imag) input */ - B = *(v2h *)&pIn[i1 * 2U]; - /* Read yc (real), xc(imag) input */ - C = *(v2h *)&pIn[i2 * 2U]; - /* Read yd (real), xd(imag) input */ - D = *(v2h *)&pIn[i3 * 2U]; - __fp16 t2, t3; - asm volatile( - "vfsub.h %[H],%[B],%[D];" - "vfadd.h %[G],%[B],%[D];" - "vfadd.h %[E],%[A],%[C];" - "vfsub.h %[F],%[A],%[C];" - "pv.extract.h %[t0],%[H],0;" - "pv.extract.h %[t1],%[H],1;" - "fsub.h %[t2], zero, %[t0];" - "fsub.h %[t3], zero, %[t1];" - "pv.pack.h %[A],%[t2],%[t1];" - "pv.pack.h %[B],%[t0],%[t3];" - "vfadd.h %[H],%[E],%[G];" - "vfsub.h %[E],%[E],%[G];" - "vfadd.h %[A],%[F],%[A];" - "vfadd.h %[B],%[F],%[B];" - : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "=&r"(E), - [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), [t0] "=&r"(t0), - [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3) - : - :); -#if defined(FOLDED) - *((v2h *)&pOut[i0_store * 2U]) = H; - *((v2h *)&pOut[i1_store * 2U]) = E; - *((v2h *)&pOut[i2_store * 2U]) = A; - *((v2h *)&pOut[i3_store * 2U]) = B; -#else - *((v2h *)&pOut[i0 * 2U]) = H; - *((v2h *)&pOut[i1 * 2U]) = E; - *((v2h *)&pOut[i2 * 2U]) = A; - *((v2h *)&pOut[i3 * 2U]) = B; -#endif - -} diff --git a/software/runtime/kernel/mempool_cfft_radix4_f16p.h b/software/runtime/kernel/mempool_cfft_radix4_f16p.h deleted file mode 100644 index d2220d090..000000000 --- a/software/runtime/kernel/mempool_cfft_radix4_f16p.h +++ /dev/null @@ -1,526 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -#include "xpulp/builtins_v2.h" -#define MIN(x, y) (((x) < (y)) ? 
(x) : (y)) - -/** - @brief Folding in local memory function - @param[in] pSrc16 points to input buffer of 16b data, Re and Im parts are - interleaved - @param[in] fftLen Length of the complex input vector - @param[in] nPE Number of PE - @return none -*/ - -static inline void fold_radix4(__fp16 *pSrc16, uint32_t fftLen, - uint32_t core_id, uint32_t nPE) { - uint32_t n2, i0, i1, i2, i3; - uint32_t i1_store, i2_store, i3_store; - volatile v2h A, B, C; - n2 = fftLen >> 2U; - for (i0 = core_id * STEP; i0 < MIN(core_id * STEP + STEP, n2); i0++) { - i1 = i0 + n2; - i2 = i1 + n2; - i3 = i2 + n2; - A = *(v2h *)&pSrc16[i1 * 2U]; - B = *(v2h *)&pSrc16[i2 * 2U]; - C = *(v2h *)&pSrc16[i3 * 2U]; - i1_store = i0 + N_BANKS; - i2_store = i1_store + N_BANKS; - i3_store = i2_store + N_BANKS; - *(v2h *)&pSrc16[i1_store * 2U] = A; - *(v2h *)&pSrc16[i2_store * 2U] = B; - *(v2h *)&pSrc16[i3_store * 2U] = C; - } - mempool_log_partial_barrier(2 * WU_STRIDE, WU_STRIDE * core_id, - nPE * WU_STRIDE); -} - -#ifdef FOLDED_TWIDDLES -/** - @brief Full FFT butterfly - @param[in] pSrc16 points to input buffer of 16b data, Re and Im parts are - interleaved - @param[out] pDst16 points to output buffer of 16b data, Re and Im parts - are interleaved - @param[in] fftLen Length of the complex input vector - @param[in] pCoef_src Twiddle coefficients vector - @param[in] pCoef_dst Auxiliary twiddle coefficients vector - @param[in] nPE Number of PE - @return pointer to output vector -*/ -__fp16 *mempool_radix4_cfft_q16p_folded(__fp16 *pSrc16, __fp16 *pDst16, - uint32_t fftLen, __fp16 *pCoef_src, - __fp16 *pCoef_dst, uint32_t nPE) -#else -/** - Twiddles are not folded in memory - @brief Full FFT butterfly - @param[in] pSrc16 points to input buffer of 16b data, Re and Im parts are - interleaved - @param[out] pDst16 points to output buffer of 16b data, Re and Im parts - are interleaved - @param[in] fftLen Length of the complex input vector - @param[in] pCoef_src Twiddle coefficients vector - @param[in] nPE Number of PE - @return pointer to output vector -*/ -__fp16 *mempool_radix4_cfft_q16p_folded(__fp16 *pSrc16, __fp16 *pDst16, - uint32_t fftLen, __fp16 *pCoef_src, - uint32_t nPE) -#endif -{ - -#ifdef FOLDED_TWIDDLES - uint32_t absolute_core_id = mempool_get_core_id(); - uint32_t core_id = absolute_core_id / WU_STRIDE; - __fp16 t0, t1, t2, t3, t4, t5; - v2h CoSi1, CoSi2, CoSi3; - v2h C1, C2, C3; - uint32_t n1, n2, n2_store, i0, j, k; - uint32_t ic, offset, wing_idx; - __fp16 *pTmp; -#else - uint32_t absolute_core_id = mempool_get_core_id(); - uint32_t core_id = absolute_core_id / WU_STRIDE; - __fp16 t0, t1, t2, t3, t4, t5; - v2h CoSi1, CoSi2, CoSi3; - v2h C1, C2, C3; - uint32_t n1, n2, n2_store, i0, j, k; - uint32_t ic, offset, wing_id, bank_id; - __fp16 *pTmp; - uint32_t twidCoefModifier = 1U; -#endif - - if (fftLen <= N_BANKS) - fold_radix4(pSrc16, fftLen, core_id, nPE); - - /* START OF FIRST STAGE PROCESS */ - n1 = fftLen; - n2 = n1 >> 2U; - n2_store = n2 >> 2U; - for (i0 = core_id * STEP; i0 < MIN(core_id * STEP + STEP, n2); i0++) { - -#ifdef FOLDED_TWIDDLES - CoSi1 = *(v2h *)&pCoef_src[2U * i0]; - CoSi2 = *(v2h *)&pCoef_src[2U * (i0 + 1 * N_BANKS)]; - CoSi3 = *(v2h *)&pCoef_src[2U * (i0 + 2 * N_BANKS)]; - if (i0 % 4 == 0) { - ic = i0 >> 2U; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi1; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi2; - *((v2h 
*)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi2; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi3; - } -#else - CoSi1 = *(v2h *)&pCoef_src[2U * i0]; - CoSi2 = *(v2h *)&pCoef_src[2U * (i0 * 2U)]; - CoSi3 = *(v2h *)&pCoef_src[2U * (i0 * 3U)]; -#endif - asm volatile("pv.extract.h %[t1],%[CoSi1],0;" - "pv.extract.h %[t3],%[CoSi2],0;" - "pv.extract.h %[t5],%[CoSi3],0;" - "pv.extract.h %[t0],%[CoSi1],1;" - "pv.extract.h %[t2],%[CoSi2],1;" - "pv.extract.h %[t4],%[CoSi3],1;" - "fsub.h %[t0],zero,%[t0];" - "fsub.h %[t2],zero,%[t2];" - "fsub.h %[t4],zero,%[t4];" - "pv.pack.h %[C1],%[t1],%[t0];" - "pv.pack.h %[C2],%[t3],%[t2];" - "pv.pack.h %[C3],%[t5],%[t4];" - : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3), [t0] "=&r"(t0), - [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), - [t4] "=&r"(t4), [t5] "=&r"(t5) - : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) - :); - radix4_butterfly(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, - C3); - } - pTmp = pSrc16; - pSrc16 = pDst16; - pDst16 = pTmp; -#ifdef FOLDED_TWIDDLES - pTmp = pCoef_src; - pCoef_src = pCoef_dst; - pCoef_dst = pTmp; -#else - twidCoefModifier <<= 2U; -#endif - mempool_log_partial_barrier(2 * WU_STRIDE, absolute_core_id, nPE * WU_STRIDE); - /* END OF FIRST STAGE PROCESSING */ - - /* START OF MIDDLE STAGE PROCESS */ - for (k = fftLen / 4U; k > 4U; k >>= 2U) { - n1 = n2; - n2 >>= 2U; - n2_store = n2 >> 2U; - -#ifdef FOLDED_TWIDDLES - for (j = core_id * STEP; j < core_id * STEP + STEP; j++) { - CoSi1 = *(v2h *)&pCoef_src[2U * j]; - CoSi2 = *(v2h *)&pCoef_src[2U * (j + 1 * N_BANKS)]; - CoSi3 = *(v2h *)&pCoef_src[2U * (j + 2 * N_BANKS)]; - if (j % 4 == 0) { - wing_idx = j % n2; - offset = (j / n2); - ic = wing_idx >> 2U; - ic += offset * n2; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi1; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi2; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi3; - } -#else - bank_id = core_id / n2_store; - wing_id = core_id % n2_store; - offset = bank_id * n2; - for (j = wing_id * 4; j < MIN(wing_id * 4 + 4, n2); j++) { - ic = j * twidCoefModifier; - CoSi1 = *(v2h *)&pCoef_src[2U * ic]; - CoSi2 = *(v2h *)&pCoef_src[2U * (ic * 2U)]; - CoSi3 = *(v2h *)&pCoef_src[2U * (ic * 3U)]; -#endif - asm volatile("pv.extract.h %[t1],%[CoSi1],0;" - "pv.extract.h %[t3],%[CoSi2],0;" - "pv.extract.h %[t5],%[CoSi3],0;" - "pv.extract.h %[t0],%[CoSi1],1;" - "pv.extract.h %[t2],%[CoSi2],1;" - "pv.extract.h %[t4],%[CoSi3],1;" - "fsub.h %[t0],zero,%[t0];" - "fsub.h %[t2],zero,%[t2];" - "fsub.h %[t4],zero,%[t4];" - "pv.pack %[C1],%[t1],%[t0];" - "pv.pack %[C2],%[t3],%[t2];" - "pv.pack %[C3],%[t5],%[t4];" - : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3), - [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2), - [t3] 
"=&r"(t3), [t4] "=&r"(t4), [t5] "=&r"(t5) - : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) - :); -#ifdef FOLDED_TWIDDLES - i0 = j; - radix4_butterfly(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3, C1, - C2, C3); - } -#else - i0 = offset + j; - radix4_butterfly(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3, C1, - C2, C3); - } -#endif - pTmp = pSrc16; - pSrc16 = pDst16; - pDst16 = pTmp; -#ifdef FOLDED_TWIDDLES - pTmp = pCoef_src; - pCoef_src = pCoef_dst; - pCoef_dst = pTmp; -#else - twidCoefModifier <<= 2U; -#endif - mempool_log_partial_barrier(2 * WU_STRIDE, absolute_core_id, - nPE * WU_STRIDE); - } - /* END OF MIDDLE STAGE PROCESSING */ - - /* START OF LAST STAGE PROCESSING */ - n1 = n2; - n2 >>= 2U; - for (i0 = core_id * STEP; i0 < MIN(core_id * STEP + STEP, fftLen >> 2U); - i0++) { - radix4_butterfly_last(pSrc16, pDst16, i0); - } - mempool_log_partial_barrier(2 * WU_STRIDE, absolute_core_id, nPE * WU_STRIDE); - /* END OF LAST STAGE PROCESSING */ - - return pDst16; -} - -/** - SCHEDULER OF MULTIPLE FOLDED FFTS - Memory: - - 1st row of FFTS - - col_idx1 col_idx2 col_idx3 - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - - 2nd row of FFTS - - col_idx1 col_idx2 col_idx3 - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - - ... - - @brief Scheduler of folded FFTs - @param[in] column index of the current FFT - @param[in] pSrc16 input buffer of 16b data, Re and Im are interleaved - @param[out] pDst16 output buffer of 16b data, Re and Im are interleaved - @param[in] fftLen Length of the complex input vector - @param[in] pCoef_src Twiddle coefficients vector - @param[in] pCoef_dst Twiddle coefficients vector - @param[in] pBitRevTable Bitreversal table - @param[in] bitReverseLen Length of bitreversal table - @param[in] bitReverseFlag Flag for bitreversal - @param[in] nPE Number of PE - @return void -*/ - -void mempool_radix4_cfft_q16p_scheduler(uint32_t col_id, __fp16 *pSrc16, - __fp16 *pDst16, uint32_t fftLen, - __fp16 *pCoef_src, __fp16 *pCoef_dst, - __attribute__((unused)) - uint16_t *pBitRevTable, - __attribute__((unused)) - uint16_t bitReverseLen, - uint8_t bitReverseFlag, uint32_t nPE) { - - uint32_t absolute_core_id = mempool_get_core_id(); - uint32_t core_id = absolute_core_id % (fftLen >> 4U); - - uint32_t n1, n2, i0, ic, j, k; - uint32_t n2_store; - uint32_t offset, wing_idx; - __fp16 *pTmp; - int32_t t0, t1, t2, t3, t4, t5; - v2h CoSi1, CoSi2, CoSi3; - v2h C1, C2, C3; - - /* FIRST STAGE */ - n1 = fftLen; - n2 = n1 >> 2U; - n2_store = n2 >> 2U; - for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, n2); i0++) { - CoSi1 = *(v2h *)&pCoef_src[2U * i0]; - CoSi2 = *(v2h *)&pCoef_src[2U * (i0 + 1 * N_BANKS)]; - CoSi3 = *(v2h *)&pCoef_src[2U * (i0 + 2 * N_BANKS)]; - if (i0 % 4 == 0) { - ic = i0 / 4; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi1; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi2; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * 
(n2_store * 1 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi3; - } - asm volatile("pv.extract.h %[t1],%[CoSi1],0;" - "pv.extract.h %[t3],%[CoSi2],0;" - "pv.extract.h %[t5],%[CoSi3],0;" - "pv.extract.h %[t0],%[CoSi1],1;" - "pv.extract.h %[t2],%[CoSi2],1;" - "pv.extract.h %[t4],%[CoSi3],1;" - "fsub.h %[t0],zero,%[t0];" - "fsub.h %[t2],zero,%[t2];" - "fsub.h %[t4],zero,%[t4];" - "pv.pack.h %[C1],%[t1],%[t0];" - "pv.pack.h %[C2],%[t3],%[t2];" - "pv.pack.h %[C3],%[t5],%[t4];" - : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3), [t0] "=&r"(t0), - [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), - [t4] "=&r"(t4), [t5] "=&r"(t5) - : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) - :); - for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - __fp16 *pIn = pSrc16 + idx_row * (N_BANKS * 8); - __fp16 *pOut = pDst16 + idx_row * (N_BANKS * 8); - radix4_butterfly(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, - C3); - } - } - pTmp = pSrc16; - pSrc16 = pDst16; - pDst16 = pTmp; - pTmp = pCoef_src; - pCoef_src = pCoef_dst; - pCoef_dst = pTmp; - mempool_log_partial_barrier(2, absolute_core_id, nPE); - - /* MIDDLE STAGE */ - for (k = fftLen / 4U; k > 4U; k >>= 2U) { - n1 = n2; - n2 >>= 2U; - n2_store = n2 >> 2U; - - for (j = core_id * 4; j < core_id * 4 + 4; j++) { - CoSi1 = *(v2h *)&pCoef_src[2U * (j)]; - CoSi2 = *(v2h *)&pCoef_src[2U * (j + 1 * N_BANKS)]; - CoSi3 = *(v2h *)&pCoef_src[2U * (j + 2 * N_BANKS)]; - if (j % 4 == 0) { - - wing_idx = j % n2; - offset = (j / n2); - ic = wing_idx >> 2U; - ic += offset * n2; - - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi1; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi2; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi3; - } - asm volatile("pv.extract.h %[t1],%[CoSi1],0;" - "pv.extract.h %[t3],%[CoSi2],0;" - "pv.extract.h %[t5],%[CoSi3],0;" - "pv.extract.h %[t0],%[CoSi1],1;" - "pv.extract.h %[t2],%[CoSi2],1;" - "pv.extract.h %[t4],%[CoSi3],1;" - "fsub.h %[t0],zero,%[t0];" - "fsub.h %[t2],zero,%[t2];" - "fsub.h %[t4],zero,%[t4];" - "pv.pack.h %[C1],%[t1],%[t0];" - "pv.pack.h %[C2],%[t3],%[t2];" - "pv.pack.h %[C3],%[t5],%[t4];" - : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3), - [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2), - [t3] "=&r"(t3), [t4] "=&r"(t4), [t5] "=&r"(t5) - : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) - :); - for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - __fp16 *pIn = pSrc16 + idx_row * (N_BANKS * 8); - __fp16 *pOut = pDst16 + idx_row * (N_BANKS * 8); - radix4_butterfly(pIn, pOut, j, n2, CoSi1, CoSi2, CoSi3, C1, C2, - C3); - } - } - pTmp = pSrc16; - pSrc16 = pDst16; - pDst16 = pTmp; - pTmp = pCoef_src; - pCoef_src = pCoef_dst; - pCoef_dst = pTmp; - mempool_log_partial_barrier(2, absolute_core_id, nPE); - } - - /* LAST STAGE */ - n1 = n2; - n2 >>= 2U; - for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, fftLen >> 2U); i0++) { - for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - 
__fp16 *pIn = pSrc16 + idx_row * (N_BANKS * 8); - __fp16 *pOut = pDst16 + idx_row * (N_BANKS * 8); - radix4_butterfly_last(pIn, pOut, i0); - } - } - pTmp = pSrc16; - pSrc16 = pDst16; - pDst16 = pTmp; - mempool_log_partial_barrier(2, absolute_core_id, nPE); - - mempool_stop_benchmark(); - mempool_start_benchmark(); - - /* BITREVERSAL */ - // Bitreversal stage stores in the sequential addresses - if (bitReverseFlag) { -#ifdef BITREVERSETABLE - uint16_t *ptr1 = (uint16_t *)(pSrc16 + 2 * col_id * (fftLen >> 2U)); - uint16_t *ptr2 = (uint16_t *)(pDst16 + 2 * col_id * (3 * (fftLen >> 2))); - for (j = 2 * core_id; j < bitReverseLen; j += 2 * nPE) { - v2h addr, tmpa, tmpb; - addr = __SRA2(*(v2h *)&pBitRevTable[j], ((v2h){2, 2})); - for (int32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - int32_t a0 = addr[0] / 4 + (addr[0] % 4) * N_BANKS; - int32_t a1 = addr[1] / 4 + (addr[0] % 4) * N_BANKS; - tmpa = *(v2h *)&ptr1[a0 + idx_row * (N_BANKS * 8)]; - tmpb = *(v2h *)&ptr1[a1 + idx_row * (N_BANKS * 8)]; - *((v2h *)&ptr2[addr[0] + idx_row * (N_BANKS * 8)]) = tmpb; - *((v2h *)&ptr2[addr[1] + idx_row * (N_BANKS * 8)]) = tmpa; - } - } -#else - uint16_t *ptr1 = (uint16_t *)(pSrc16 + 2 * col_id * (fftLen >> 2U)); - uint16_t *ptr2 = (uint16_t *)(pDst16 + 2 * col_id * (3 * (fftLen >> 2))); - for (j = core_id * 16; j < MIN(core_id * 16 + 16, fftLen >> 2U); j += 4) { - uint32_t idx0 = j; - uint32_t idx1 = j + 1; - uint32_t idx2 = j + 2; - uint32_t idx3 = j + 3; - uint32_t idx_result0 = 0; - uint32_t idx_result1 = 0; - uint32_t idx_result2 = 0; - uint32_t idx_result3 = 0; - for (k = 0; k < LOG2; k++) { - idx_result0 = (idx_result0 << 1U) | (idx0 & 1U); - idx_result1 = (idx_result1 << 1U) | (idx1 & 1U); - idx_result2 = (idx_result2 << 1U) | (idx2 & 1U); - idx_result3 = (idx_result3 << 1U) | (idx3 & 1U); - idx0 = idx0 >> 1U; - idx1 = idx1 >> 1U; - idx2 = idx2 >> 1U; - idx3 = idx3 >> 1U; - } - for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - uint32_t addr_src0 = (idx0 / 4) + (idx0 % 4) * N_BANKS; - uint32_t addr_src1 = (idx1 / 4) + (idx1 % 4) * N_BANKS; - uint32_t addr_src2 = (idx2 / 4) + (idx2 % 4) * N_BANKS; - uint32_t addr_src3 = (idx3 / 4) + (idx3 % 4) * N_BANKS; - uint32_t addr_dst0 = idx_result0; - uint32_t addr_dst1 = idx_result1; - uint32_t addr_dst2 = idx_result2; - uint32_t addr_dst3 = idx_result3; - addr_src0 += idx_row * (N_BANKS * 8); - addr_src1 += idx_row * (N_BANKS * 8); - addr_src2 += idx_row * (N_BANKS * 8); - addr_src3 += idx_row * (N_BANKS * 8); - addr_dst0 += idx_row * (N_BANKS * 8); - addr_dst1 += idx_row * (N_BANKS * 8); - addr_dst2 += idx_row * (N_BANKS * 8); - addr_dst3 += idx_row * (N_BANKS * 8); - *((uint32_t *)&ptr2[addr_dst0]) = (uint32_t)ptr1[addr_src0]; - *((uint32_t *)&ptr2[addr_dst1]) = (uint32_t)ptr1[addr_src1]; - *((uint32_t *)&ptr2[addr_dst2]) = (uint32_t)ptr1[addr_src2]; - *((uint32_t *)&ptr2[addr_dst3]) = (uint32_t)ptr1[addr_src3]; - } - } -#endif - } - mempool_log_partial_barrier(2, absolute_core_id, nPE); -} diff --git a/software/runtime/kernel/mempool_cfft_radix4_q16_bitreversal.h b/software/runtime/kernel/mempool_cfft_radix4_q16_bitreversal.h deleted file mode 100644 index 32f7a5265..000000000 --- a/software/runtime/kernel/mempool_cfft_radix4_q16_bitreversal.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -void mempool_bitrev_q16p_xpulpimg(uint16_t *pSrc, uint16_t *pDst, - const uint16_t fftLen, const uint32_t nPE) { - uint32_t absolute_core_id = mempool_get_core_id(); - uint32_t core_id = absolute_core_id / WU_STRIDE; - uint32_t idx_result, idx, i, j; - for (i = core_id; i < fftLen; i += nPE) { - idx_result = 0; - idx = i; - for (j = 0; j < LOG2; j++) { - idx_result = (idx_result << 1U) | (idx & 1U); - idx = idx >> 1U; - } - pDst[2 * idx_result] = pSrc[2 * i]; - pDst[2 * idx_result + 1] = pSrc[2 * i + 1]; - } - mempool_log_partial_barrier(2 * WU_STRIDE, absolute_core_id, nPE * WU_STRIDE); -} diff --git a/software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h b/software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h new file mode 100644 index 000000000..fbb6964ac --- /dev/null +++ b/software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h @@ -0,0 +1,337 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#include "xpulp/builtins_v2.h" + +/** + @brief First butterfly stage. + @param[in] pIn points to input buffer of 16b data, Re and Im parts are + interleaved + @param[out] pOut points to output buffer of 16b data, Re and Im parts are + interleaved + @param[in] i0 points to the first element to be processed + @param[in] n2 number of elements in the first wing of the butterfly + @param[in] CoSi1 packed cosine and sine first twiddle + @param[in] CoSi2 packed cosine and sine second twiddle + @param[in] CoSi3 packed cosine and sine third twiddle + @param[in] C1 packed sine and cosine first twiddle + @param[in] C2 packed sine and cosine second twiddle + @param[in] C3 packed sine and cosine third twiddle + @return none +*/ +static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut, + uint32_t i0, uint32_t n2, v2h CoSi1, + v2h CoSi2, v2h CoSi3, v2h C1, v2h C2, + v2h C3) { + __fp16 t0, t1, t2, t3; + uint32_t i1, i2, i3; + uint32_t i0_store, i1_store, i2_store, i3_store; + float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f, s4 = 0.0f, s5 = 0.0f; + v2h A, B, C, D, E, F, G, H; + +// LOAD INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; +#else + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; +#endif +// STORE INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + uint32_t n2_store = n2 >> 2U; + i0_store = (i0 % n2_store) + (i0 / n2_store) * N_BANKS; + i1_store = i0_store + n2_store; + i2_store = i1_store + n2_store; + i3_store = i2_store + n2_store; +#else + i0_store = i0; + i1_store = i1; + i2_store = i2; + i3_store = i3; +#endif + + /* Read yb (real), xb(imag) input */ + B = *(v2h *)&pIn[i1 * 2U]; + /* Read yd (real), xd(imag) input */ + D = *(v2h *)&pIn[i3 * 2U]; + /* Read ya (real), xa (imag) input */ + A = *(v2h *)&pIn[i0 * 2U]; + /* Read yc (real), xc(imag) input */ + C = *(v2h *)&pIn[i2 * 2U]; + asm volatile( + // xb - xd, yb - yd + "vfsub.h %[H],%[B],%[D];" + // xb + xd, yd + yd + "vfadd.h %[G],%[B],%[D];" + // xa + xc, ya + yc + "vfadd.h %[E],%[A],%[C];" + "pv.extract.h %[t0],%[H],0;" // yb - yd + "pv.extract.h %[t1],%[H],1;" // xb - 
xd + // xa - xc, ya - yc + "vfsub.h %[F],%[A],%[C];" + + "xor %[t2],%[t0],%[neg_mask];" // yd - yb + "xor %[t3],%[t1],%[neg_mask];" // xd - xb + "pv.pack.h %[D],%[t2],%[t1];" // yd - yb, xb - xd + "pv.pack.h %[C],%[t0],%[t3];" // yb - yd, xd - xb + + // xa + xc + xb + xd, ya + yb + yc + yd + "vfadd.h %[A],%[E],%[G];" + // xa + xc - xb - xd, ya + yc - yb - yd + "vfsub.h %[B],%[E],%[G];" + // xa - xc + yb - yd, ya - yc + xd - xb + "vfadd.h %[C],%[F],%[C];" + // xa - xc + yd - yb, ya - yc + xb - xd + "vfadd.h %[D],%[F],%[D];" + + // Co2(xa + xc - xb - xd), Si2(ya + yc - yb - yd) + "vfdotpex.s.h %[s0],%[CoSi2],%[B];" + //-Si2(xa + xc - xb - xd), Co2(ya + yc - yb - yd) + "vfdotpex.s.h %[s1],%[C2],%[B];" + + // Co1(xa - xc + yd - yb), Si1(ya - yc + xb - xd) + "vfdotpex.s.h %[s2],%[CoSi1],%[D];" + //-Si1(xa - xc + yd - yb), Co1(ya - yc + xb - xd) + "vfdotpex.s.h %[s3],%[C1],%[D];" + + // Co3(xa - xc + yb - yd), Si3(ya - yc + xd - xb) + "vfdotpex.s.h %[s4],%[CoSi3],%[C];" + //-Si3(xa - xc + yb - yd), Co3(ya - yc + xd - xb) + "vfdotpex.s.h %[s5],%[C3],%[C];" + + // xb', yb' + "vfcpka.h.s %[B], %[s0], %[s1];" + // xc', yc' + "vfcpka.h.s %[C], %[s2], %[s3];" + // xd', yd' + "vfcpka.h.s %[D], %[s4], %[s5];" + : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "+&r"(E), + [F] "+&r"(F), [G] "+&r"(G), [H] "+&r"(H), [t0] "=&r"(t0), + [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [s0] "=&r"(s0), + [s1] "=&r"(s1), [s2] "=&r"(s2), [s3] "=&r"(s3), [s4] "=&r"(s4), + [s5] "=&r"(s5) + : [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1), + [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3), [neg_mask] "r"(0x00008000) + :); + *((v2h *)&pOut[i0_store * 2U]) = A; + *((v2h *)&pOut[i1_store * 2U]) = B; + *((v2h *)&pOut[i2_store * 2U]) = D; + *((v2h *)&pOut[i3_store * 2U]) = C; +} + +/** + @brief Middle butterfly stage. 
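+  In standard radix-4 decimation-in-frequency form, for complex inputs
+  a, b, c, d read fftLen/4 apart and twiddle factors W1, W2, W3, the stage
+  computes:
+    out0 = (a + c) + (b + d)
+    out1 = W2 * [(a + c) - (b + d)]
+    out2 = W1 * [(a - c) - j(b - d)]
+    out3 = W3 * [(a - c) + j(b - d)]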
+ @param[in] pIn points to input buffer of 16b data, Re and Im parts are + interleaved + @param[out] pOut points to output buffer of 16b data, Re and Im parts are + interleaved + @param[in] i0 points to the first element to be processed + @param[in] n2 number of elements in the first wing of the butterfly + @param[in] CoSi1 packed cosine and sine first twiddle + @param[in] CoSi2 packed cosine and sine second twiddle + @param[in] CoSi3 packed cosine and sine third twiddle + @param[in] C1 packed sine and cosine first twiddle + @param[in] C2 packed sine and cosine second twiddle + @param[in] C3 packed sine and cosine third twiddle + @return none +*/ +static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut, + uint32_t i0, uint32_t n2, v2h CoSi1, + v2h CoSi2, v2h CoSi3, v2h C1, v2h C2, + v2h C3) { + __fp16 t0, t1, t2, t3; + uint32_t i1, i2, i3; + uint32_t i0_store, i1_store, i2_store, i3_store; + float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f, s4 = 0.0f, s5 = 0.0f; + v2h A, B, C, D, E, F, G, H; + +// LOAD INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + + * 3fftLen/4] */ + i1 = i0 + N_BANKS; + i2 = i1 + N_BANKS; + i3 = i2 + N_BANKS; +#else + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + + * 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; +#endif +// STORE INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + uint32_t n2_store = n2 >> 2U; + i0_store = + (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * N_BANKS; + i1_store = i0_store + n2_store; + i2_store = i1_store + n2_store; + i3_store = i2_store + n2_store; +#else + i0_store = i0; + i1_store = i1; + i2_store = i2; + i3_store = i3; +#endif + + /* Read yb (real), xb(imag) input */ + B = *(v2h *)&pIn[i1 * 2U]; + /* Read yd (real), xd(imag) input */ + D = *(v2h *)&pIn[i3 * 2U]; + /* Read ya (real), xa (imag) input */ + A = *(v2h *)&pIn[i0 * 2U]; + /* Read yc (real), xc(imag) input */ + C = *(v2h *)&pIn[i2 * 2U]; + asm volatile( + // xb - xd, yb - yd + "vfsub.h %[H],%[B],%[D];" + // xb + xd, yd + yd + "vfadd.h %[G],%[B],%[D];" + // xa + xc, ya + yc + "vfadd.h %[E],%[A],%[C];" + "pv.extract.h %[t0],%[H],1;" // yb - yd + "pv.extract.h %[t1],%[H],0;" // xb - xd + // xa - xc, ya - yc + "vfsub.h %[F],%[A],%[C];" + + "xor %[t2],%[t0],%[neg_mask];" // yd - yb + "xor %[t3],%[t1],%[neg_mask];" // xd - xb + "pv.pack.h %[D],%[t2],%[t1];" // yd - yb, xb - xd + "pv.pack.h %[C],%[t0],%[t3];" // yb - yd, xd - xb + + // xa + xc + xb + xd, ya + yb + yc + yd + "vfadd.h %[A],%[E],%[G];" + // xa + xc - xb - xd, ya + yc - yb - yd + "vfsub.h %[B],%[E],%[G];" + // xa - xc + yb - yd, ya - yc + xd - xb + "vfadd.h %[C],%[F],%[C];" + // xa - xc + yd - yb, ya - yc + xb - xd + "vfadd.h %[D],%[F],%[D];" + + // Co2(xa + xc - xb - xd), Si2(ya + yc - yb - yd) + "vfdotpex.s.h %[s0],%[CoSi2],%[B];" + //-Si2(xa + xc - xb - xd), Co2(ya + yc - yb - yd) + "vfdotpex.s.h %[s1],%[C2],%[B];" + + // Co1(xa - xc + yd - yb), Si1(ya - yc + xb - xd) + "vfdotpex.s.h %[s2],%[CoSi1],%[D];" + //-Si1(xa - xc + yd - yb), Co1(ya - yc + xb - xd) + "vfdotpex.s.h %[s3],%[C1],%[D];" + + // Co3(xa - xc + yb - yd), Si3(ya - yc + xd - xb) + "vfdotpex.s.h %[s4],%[CoSi3],%[C];" + //-Si3(xa - xc + yb - yd), Co3(ya - yc + xd - xb) + "vfdotpex.s.h %[s5],%[C3],%[C];" + + // xb', yb' + "vfcpka.h.s %[B], %[s0], %[s1];" + // xc', yc' + "vfcpka.h.s %[C], %[s2], %[s3];" + // xd', yd' + "vfcpka.h.s %[D], 
%[s4], %[s5];" + : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "+&r"(E), + [F] "+&r"(F), [G] "+&r"(G), [H] "+&r"(H), [t0] "=&r"(t0), + [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [s0] "=&r"(s0), + [s1] "=&r"(s1), [s2] "=&r"(s2), [s3] "=&r"(s3), [s4] "=&r"(s4), + [s5] "=&r"(s5) + : [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1), + [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3), [neg_mask] "r"(0x00008000) + :); + + *((v2h *)&pOut[i0_store * 2U]) = A; + *((v2h *)&pOut[i1_store * 2U]) = B; + *((v2h *)&pOut[i2_store * 2U]) = D; + *((v2h *)&pOut[i3_store * 2U]) = C; +} + +/** + @brief Last butterfly stage. + @param[in] pIn points to input buffer of 16b data, Re and Im parts are + interleaved + @param[out] pOut points to output buffer of 16b data, Re and Im parts are + interleaved + @param[in] i0 points to the first element to be processed + @return none +*/ +static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut, + uint32_t i0) { + __fp16 t0, t1; + uint32_t i1, i2, i3; + uint32_t i0_store, i1_store, i2_store, i3_store; + v2h A, B, C, D, E, F, G, H; + +// LOAD INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], + pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ + i1 = i0 + N_BANKS; + i2 = i1 + N_BANKS; + i3 = i2 + N_BANKS; +#else + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], + pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ + i1 = i0 + 1U; + i2 = i1 + 1U; + i3 = i2 + 1U; +#endif +// STORE INDEXES +#if defined(FOLDED) + i0_store = i0 * 4; + i1_store = i0_store + 1; + i2_store = i1_store + 1; + i3_store = i2_store + 1; +#else + i0_store = i0; + i1_store = i1; + i2_store = i2; + i3_store = i3; +#endif + + /* Read yb (real), xb(imag) input */ + B = *(v2h *)&pIn[i1 * 2U]; + /* Read yd (real), xd(imag) input */ + D = *(v2h *)&pIn[i3 * 2U]; + /* Read ya (real), xa(imag) input */ + A = *(v2h *)&pIn[i0 * 2U]; + /* Read yc (real), xc(imag) input */ + C = *(v2h *)&pIn[i2 * 2U]; + __fp16 t2, t3; + asm volatile("vfsub.h %[H],%[B],%[D];" + "vfadd.h %[G],%[B],%[D];" + "vfadd.h %[E],%[A],%[C];" + "vfsub.h %[F],%[A],%[C];" + "pv.extract.h %[t0],%[H],1;" + "pv.extract.h %[t1],%[H],0;" + "xor %[t2],%[t0],%[neg_mask];" + "xor %[t3],%[t1],%[neg_mask];" + "pv.pack.h %[A],%[t2],%[t1];" + "pv.pack.h %[B],%[t0],%[t3];" + "vfadd.h %[H],%[E],%[G];" + "vfsub.h %[E],%[E],%[G];" + "vfadd.h %[A],%[F],%[A];" + "vfadd.h %[B],%[F],%[B];" + : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), + [E] "=&r"(E), [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), + [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3) + : [neg_mask] "r"(0x00008000) + :); + + *((v2h *)&pOut[i0_store * 2U]) = H; + *((v2h *)&pOut[i1_store * 2U]) = E; + *((v2h *)&pOut[i2_store * 2U]) = A; + *((v2h *)&pOut[i3_store * 2U]) = B; +} diff --git a/software/runtime/kernel/mempool_radix4_cfft_f16p.h b/software/runtime/kernel/mempool_radix4_cfft_f16p.h new file mode 100644 index 000000000..2076a108a --- /dev/null +++ b/software/runtime/kernel/mempool_radix4_cfft_f16p.h @@ -0,0 +1,441 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#include "xpulp/builtins_v2.h" +#define MIN(x, y) (((x) < (y)) ? 
(x) : (y)) + +#define SHUFFLE_TWIDDLEFACT \ + asm volatile("pv.extract.h %[t1],%[CoSi1],0;" \ + "pv.extract.h %[t3],%[CoSi2],0;" \ + "pv.extract.h %[t5],%[CoSi3],0;" \ + "pv.extract.h %[t0],%[CoSi1],1;" \ + "pv.extract.h %[t2],%[CoSi2],1;" \ + "pv.extract.h %[t4],%[CoSi3],1;" \ + "xor %[t1],%[t1],%[neg_mask];" \ + "xor %[t3],%[t3],%[neg_mask];" \ + "xor %[t5],%[t5],%[neg_mask];" \ + "pv.pack.h %[C1],%[t0],%[t1];" \ + "pv.pack.h %[C2],%[t2],%[t3];" \ + "pv.pack.h %[C3],%[t4],%[t5];" \ + : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3), [t0] "=&r"(t0), \ + [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), \ + [t4] "=&r"(t4), [t5] "=&r"(t5) \ + : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3), \ + [neg_mask] "r"(0x00008000) \ + :); + +#ifdef FOLDED_TWIDDLES + +#define LOAD_STORE_TWIDDLEFACT \ + CoSi1 = *(v2h *)&pCoef_src[2U * ic]; \ + CoSi2 = *(v2h *)&pCoef_src[2U * (ic + 1 * N_BANKS)]; \ + CoSi3 = *(v2h *)&pCoef_src[2U * (ic + 2 * N_BANKS)]; \ + if (ic % 4 == 0) { \ + *((v2h *)&pCoef_dst[2U * (ic_store)]) = CoSi1; \ + *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi1; \ + *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi1; \ + *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi1; \ + ic_store += N_BANKS; \ + *((v2h *)&pCoef_dst[2U * (ic_store)]) = CoSi2; \ + *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi2; \ + *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi2; \ + *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi2; \ + ic_store += N_BANKS; \ + *((v2h *)&pCoef_dst[2U * (ic_store)]) = CoSi3; \ + *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi3; \ + *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi3; \ + *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi3; \ + } + +#else +#define LOAD_STORE_TWIDDLEFACT \ + CoSi1 = *(v2h *)&pCoef_src[2U * ic]; \ + CoSi2 = *(v2h *)&pCoef_src[2U * (ic * 2U)]; \ + CoSi3 = *(v2h *)&pCoef_src[2U * (ic * 3U)]; +#endif + + + +#ifdef FOLDED_TWIDDLES +/** + @brief Full FFT butterfly + @param[in] pSrc16 points to input buffer of 16b data, Re and Im parts are + interleaved + @param[out] pDst16 points to output buffer of 16b data, Re and Im parts + are interleaved + @param[in] fftLen Length of the complex input vector + @param[in] pCoef_src Twiddle coefficients vector + @param[in] pCoef_dst Auxiliary twiddle coefficients vector + @param[in] nPE Number of PE + @return pointer to output vector +*/ +void mempool_radix4_cfft_f16p_folded(__fp16 *pSrc16, __fp16 *pDst16, + uint32_t fftLen, __fp16 *pCoef_src, + __fp16 *pCoef_dst, uint32_t nPE) +#else +/** + Twiddles are not folded in memory + @brief Full FFT butterfly + @param[in] pSrc16 points to input buffer of 16b data, Re and Im parts are + interleaved + @param[out] pDst16 points to output buffer of 16b data, Re and Im parts + are interleaved + @param[in] fftLen Length of the complex input vector + @param[in] pCoef_src Twiddle coefficients vector + @param[in] nPE Number of PE + @return pointer to output vector +*/ +void mempool_radix4_cfft_f16p_folded(__fp16 *pSrc16, __fp16 *pDst16, + uint32_t fftLen, __fp16 *pCoef_src, + uint32_t nPE) +#endif +{ + + uint32_t absolute_core_id = mempool_get_core_id(); + uint32_t core_id = absolute_core_id; + __fp16 t0, t1, t2, t3, t4, t5; + v2h CoSi1, CoSi2, CoSi3; + v2h C1, C2, C3; +#ifdef FOLDED_TWIDDLES + uint32_t n1, n2, n2_store; + uint32_t i0, k, ic, ic_store; + __fp16 *pTmp; +#else + uint32_t n1, n2; + uint32_t i0, k, ic; + __fp16 *pTmp; + uint32_t twidCoefModifier = 1U; +#endif + + /* 
START OF FIRST STAGE PROCESSING */ + n1 = fftLen; + n2 = n1 >> 2U; + for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, n2); i0++) { + +#ifdef FOLDED_TWIDDLES + ic = i0; + ic_store = ic >> 2U; + n2_store = n2 >> 2U; +#else + ic = i0; +#endif + LOAD_STORE_TWIDDLEFACT; + SHUFFLE_TWIDDLEFACT; + radix4_butterfly_first(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, + C3); + } + pTmp = pSrc16; + pSrc16 = pDst16; + pDst16 = pTmp; +#ifdef FOLDED_TWIDDLES + pTmp = pCoef_src; + pCoef_src = pCoef_dst; + pCoef_dst = pTmp; +#else + twidCoefModifier <<= 2U; +#endif + mempool_log_partial_barrier(2, absolute_core_id, nPE); + /* END OF FIRST STAGE PROCESSING */ + + /* START OF MIDDLE STAGE PROCESSING */ + for (k = fftLen / 4U; k > 4U; k >>= 2U) { + n1 = n2; + n2 >>= 2U; + for (i0 = core_id * 4; i0 < core_id * 4 + 4; i0++) { +#ifdef FOLDED_TWIDDLES + ic = i0; + // (ic % n2) / 4 take only every 4th index in the wing + // (ic / n2) * n2 shift of the wing size + ic_store = ((ic % n2) >> 2) + (ic / n2) * n2; + n2_store = n2 >> 2U; +#else + ic = (i0 % n2) * twidCoefModifier; +#endif + LOAD_STORE_TWIDDLEFACT; + SHUFFLE_TWIDDLEFACT; + radix4_butterfly_middle(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3, C1, + C2, C3); + } + pTmp = pSrc16; + pSrc16 = pDst16; + pDst16 = pTmp; +#ifdef FOLDED_TWIDDLES + pTmp = pCoef_src; + pCoef_src = pCoef_dst; + pCoef_dst = pTmp; +#else + twidCoefModifier <<= 2U; +#endif + mempool_log_partial_barrier(2, absolute_core_id, nPE); + } + /* END OF MIDDLE STAGE PROCESSING */ + + /* START OF LAST STAGE PROCESSING */ + for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, fftLen >> 2U); i0++) { + radix4_butterfly_last(pSrc16, pDst16, i0); + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); + /* END OF LAST STAGE PROCESSING */ + return; +} + +/** + SCHEDULER OF MULTIPLE FOLDED FFTS + Memory: + + 1st row of FFTS + + col_idx1 col_idx2 col_idx3 + xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... + xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... + xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... + xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... + + 2nd row of FFTS + + col_idx1 col_idx2 col_idx3 + xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... + xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... + xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... + xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... + + ... 
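+
+  Each block of x's above is one folded FFT: the N_FFTs_COL FFTs of a row lie
+  side by side in the same memory rows (one column of banks per FFT), and the
+  N_FFTs_ROW rows, spaced 8 * N_BANKS __fp16 elements apart, are traversed by
+  the idx_row loops below. Twiddle factors are loaded once per column and
+  reused by every row.
+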
+ + @brief Scheduler of folded FFTs + @param[in] column index of the current FFT + @param[in] pSrc16 input buffer of 16b data, Re and Im are interleaved + @param[out] pDst16 output buffer of 16b data, Re and Im are interleaved + @param[in] fftLen Length of the complex input vector + @param[in] pCoef_src Twiddle coefficients vector + @param[in] pCoef_dst Twiddle coefficients vector + @param[in] pBitRevTable Bitreversal table + @param[in] bitReverseLen Length of bitreversal table + @param[in] bitReverseFlag Flag for bitreversal + @param[in] nPE Number of PE + @return void +*/ + +void mempool_radix4_cfft_f16p_scheduler( + __fp16 *pSrc16, __fp16 *pDst16, uint32_t fftLen, + __fp16 *pCoef_src, __fp16 *pCoef_dst, __attribute__((unused)) + uint16_t *pBitRevTable, __attribute__((unused)) uint16_t bitReverseLen, + uint8_t bitReverseFlag, uint32_t nPE) { + + uint32_t absolute_core_id = mempool_get_core_id(); + uint32_t core_id = absolute_core_id % (fftLen >> 4U); + uint32_t col_id = absolute_core_id / (fftLen >> 4U); + + __fp16 t0, t1, t2, t3, t4, t5; + v2h CoSi1, CoSi2, CoSi3; + v2h C1, C2, C3; +#ifdef FOLDED_TWIDDLES + uint32_t n1, n2, n2_store; + uint32_t i0, k, ic, ic_store; +#else + uint32_t n1, n2; + uint32_t i0, k, ic; + uint32_t twidCoefModifier = 1U; +#endif + __fp16 *pTmp; + + /* FIRST STAGE */ + n1 = fftLen; + n2 = n1 >> 2U; + for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, n2); i0++) { + ic = i0; +#ifdef FOLDED_TWIDDLES + ic_store = ic >> 2U; + n2_store = n2 >> 2U; +#endif + LOAD_STORE_TWIDDLEFACT; + SHUFFLE_TWIDDLEFACT; + for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { + __fp16 *pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * fftLen; + __fp16 *pOut = + pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + radix4_butterfly_first(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, + C3); + } + } + pTmp = pSrc16; + pSrc16 = pDst16; + pDst16 = pTmp; + pTmp = pCoef_src; + pCoef_src = pCoef_dst; + pCoef_dst = pTmp; + mempool_log_partial_barrier(2, absolute_core_id, nPE); + + /* MIDDLE STAGE */ + for (k = fftLen / 4U; k > 4U; k >>= 2U) { + n1 = n2; + n2 >>= 2U; + for (i0 = core_id * 4; i0 < core_id * 4 + 4; i0++) { +#ifdef FOLDED_TWIDDLES + ic = i0; + ic_store = ((ic % n2) >> 2) + (ic / n2) * n2; + n2_store = n2 >> 2U; +#else + ic = (i0 % n2) * twidCoefModifier; +#endif + LOAD_STORE_TWIDDLEFACT; + SHUFFLE_TWIDDLEFACT; + + for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { + __fp16 *pIn = + pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + __fp16 *pOut = + pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + radix4_butterfly_middle(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, + C3); + } + } + pTmp = pSrc16; + pSrc16 = pDst16; + pDst16 = pTmp; + pTmp = pCoef_src; + pCoef_src = pCoef_dst; + pCoef_dst = pTmp; + mempool_log_partial_barrier(2, absolute_core_id, N_FFTs_COL * nPE); + } + + /* LAST STAGE */ + for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, fftLen >> 2U); i0++) { + for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { + __fp16 *pIn = + pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + __fp16 *pOut = + pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + radix4_butterfly_last(pIn, pOut, i0); + } + } + pTmp = pSrc16; + pSrc16 = pDst16; + pDst16 = pTmp; + mempool_log_partial_barrier(2, absolute_core_id, N_FFTs_COL * nPE); + mempool_stop_benchmark(); + mempool_start_benchmark(); + /* BITREVERSAL */ + // Bitreversal stage stores in the sequential addresses + if (bitReverseFlag) { +#ifdef 
BITREVERSETABLE + pSrc16 = pSrc16 + 2 * col_id * (fftLen / 4); + pDst16 = pDst16 + 2 * col_id * fftLen; + for (ic = 8 * core_id; ic < bitReverseLen; ic += 8 * nPE) { + uint32_t addr1, addr2, addr3, addr4; + uint32_t tmpa1, tmpa2, tmpa3, tmpa4; + uint32_t tmpb1, tmpb2, tmpb3, tmpb4; + uint32_t a1, a2, a3, a4; + uint32_t b1, b2, b3, b4; + uint32_t a1_load, a2_load, a3_load, a4_load; + uint32_t b1_load, b2_load, b3_load, b4_load; + uint32_t s2 = 0x00020002; + addr1 = *(uint32_t *)&pBitRevTable[ic]; + addr2 = *(uint32_t *)&pBitRevTable[ic + 2]; + addr3 = *(uint32_t *)&pBitRevTable[ic + 4]; + addr4 = *(uint32_t *)&pBitRevTable[ic + 6]; + asm volatile("pv.sra.h %[addr1],%[addr1],%[s2];" + "pv.sra.h %[addr2],%[addr2],%[s2];" + "pv.sra.h %[addr3],%[addr3],%[s2];" + "pv.sra.h %[addr4],%[addr4],%[s2];" + "pv.extract.h %[a1],%[addr1],0;" + "pv.extract.h %[a2],%[addr2],0;" + "pv.extract.h %[a3],%[addr3],0;" + "pv.extract.h %[a4],%[addr4],0;" + "pv.extract.h %[b1],%[addr1],1;" + "pv.extract.h %[b2],%[addr2],1;" + "pv.extract.h %[b3],%[addr3],1;" + "pv.extract.h %[b4],%[addr4],1;" + : [a1] "=r"(a1), [a2] "=r"(a2), [a3] "=r"(a3), [a4] "=r"(a4), + [b1] "=r"(b1), [b2] "=r"(b2), [b3] "=r"(b3), [b4] "=r"(b4), + [addr1] "+&r"(addr1), [addr2] "+&r"(addr2), + [addr3] "+&r"(addr3), [addr4] "+&r"(addr4) + : [s2] "r"(s2) + :); + // Compute the local addresses from the natural order ones + a1_load = (a1 % 4) * 2 * N_BANKS + 2 * (a1 / 4); + a2_load = (a2 % 4) * 2 * N_BANKS + 2 * (a2 / 4); + a3_load = (a3 % 4) * 2 * N_BANKS + 2 * (a3 / 4); + a4_load = (a4 % 4) * 2 * N_BANKS + 2 * (a4 / 4); + b1_load = (b1 % 4) * 2 * N_BANKS + 2 * (b1 / 4); + b2_load = (b2 % 4) * 2 * N_BANKS + 2 * (b2 / 4); + b3_load = (b3 % 4) * 2 * N_BANKS + 2 * (b3 / 4); + b4_load = (b4 % 4) * 2 * N_BANKS + 2 * (b4 / 4); + for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { + uint16_t *ptr1 = (uint16_t *)(pSrc16 + idx_row * (N_BANKS * 8)); + uint16_t *ptr2 = (uint16_t *)(pDst16 + idx_row * (N_BANKS * 8)); + // Load at address a + tmpa1 = *(uint32_t *)&ptr1[a1_load]; + tmpa2 = *(uint32_t *)&ptr1[a2_load]; + tmpa3 = *(uint32_t *)&ptr1[a3_load]; + tmpa4 = *(uint32_t *)&ptr1[a4_load]; + // Load at address b + tmpb1 = *(uint32_t *)&ptr1[b1_load]; + tmpb2 = *(uint32_t *)&ptr1[b2_load]; + tmpb3 = *(uint32_t *)&ptr1[b3_load]; + tmpb4 = *(uint32_t *)&ptr1[b4_load]; + // Swap a with b + *((uint32_t *)&ptr2[b1]) = tmpa1; + *((uint32_t *)&ptr2[b2]) = tmpa2; + *((uint32_t *)&ptr2[b3]) = tmpa3; + *((uint32_t *)&ptr2[b4]) = tmpa4; + // Swap b with a + *((uint32_t *)&ptr2[a1]) = tmpb1; + *((uint32_t *)&ptr2[a2]) = tmpb2; + *((uint32_t *)&ptr2[a3]) = tmpb3; + *((uint32_t *)&ptr2[a4]) = tmpb4; + } + } +#else + uint16_t *ptr1 = (uint16_t *)(pSrc16 + 2 * col_id * (fftLen / 4)); + uint16_t *ptr2 = (uint16_t *)(pDst16 + 2 * col_id * fftLen); + for (ic = core_id * 16; ic < MIN(core_id * 16 + 16, fftLen >> 2U); + ic += 4) { + uint32_t idx0 = ic; + uint32_t idx1 = ic + 1; + uint32_t idx2 = ic + 2; + uint32_t idx3 = ic + 3; + uint32_t idx_result0 = 0; + uint32_t idx_result1 = 0; + uint32_t idx_result2 = 0; + uint32_t idx_result3 = 0; + for (k = 0; k < LOG2; k++) { + idx_result0 = (idx_result0 << 1U) | (idx0 & 1U); + idx_result1 = (idx_result1 << 1U) | (idx1 & 1U); + idx_result2 = (idx_result2 << 1U) | (idx2 & 1U); + idx_result3 = (idx_result3 << 1U) | (idx3 & 1U); + idx0 = idx0 >> 1U; + idx1 = idx1 >> 1U; + idx2 = idx2 >> 1U; + idx3 = idx3 >> 1U; + } + for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { + uint32_t addr_src0 = (idx0 / 4) + (idx0 % 4) * 
+      for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+        uint16_t *ptr1 = (uint16_t *)(pSrc16 + idx_row * (N_BANKS * 8));
+        uint16_t *ptr2 = (uint16_t *)(pDst16 + idx_row * (N_BANKS * 8));
+        // Load the complex pairs at the a addresses
+        tmpa1 = *(uint32_t *)&ptr1[a1_load];
+        tmpa2 = *(uint32_t *)&ptr1[a2_load];
+        tmpa3 = *(uint32_t *)&ptr1[a3_load];
+        tmpa4 = *(uint32_t *)&ptr1[a4_load];
+        // Load the complex pairs at the b addresses
+        tmpb1 = *(uint32_t *)&ptr1[b1_load];
+        tmpb2 = *(uint32_t *)&ptr1[b2_load];
+        tmpb3 = *(uint32_t *)&ptr1[b3_load];
+        tmpb4 = *(uint32_t *)&ptr1[b4_load];
+        // Store the values loaded at a to the b positions
+        *((uint32_t *)&ptr2[b1]) = tmpa1;
+        *((uint32_t *)&ptr2[b2]) = tmpa2;
+        *((uint32_t *)&ptr2[b3]) = tmpa3;
+        *((uint32_t *)&ptr2[b4]) = tmpa4;
+        // Store the values loaded at b to the a positions
+        *((uint32_t *)&ptr2[a1]) = tmpb1;
+        *((uint32_t *)&ptr2[a2]) = tmpb2;
+        *((uint32_t *)&ptr2[a3]) = tmpb3;
+        *((uint32_t *)&ptr2[a4]) = tmpb4;
+      }
+    }
+#else
+    uint16_t *ptr1 = (uint16_t *)(pSrc16 + 2 * col_id * (fftLen / 4));
+    uint16_t *ptr2 = (uint16_t *)(pDst16 + 2 * col_id * fftLen);
+    for (ic = core_id * 16; ic < MIN(core_id * 16 + 16, fftLen >> 2U);
+         ic += 4) {
+      uint32_t idx0 = ic;
+      uint32_t idx1 = ic + 1;
+      uint32_t idx2 = ic + 2;
+      uint32_t idx3 = ic + 3;
+      uint32_t idx_result0 = 0;
+      uint32_t idx_result1 = 0;
+      uint32_t idx_result2 = 0;
+      uint32_t idx_result3 = 0;
+      // Reverse the LOG2 address bits on working copies, so that the
+      // natural-order indices stay intact for the source addresses below
+      uint32_t tmp0 = idx0, tmp1 = idx1, tmp2 = idx2, tmp3 = idx3;
+      for (k = 0; k < LOG2; k++) {
+        idx_result0 = (idx_result0 << 1U) | (tmp0 & 1U);
+        idx_result1 = (idx_result1 << 1U) | (tmp1 & 1U);
+        idx_result2 = (idx_result2 << 1U) | (tmp2 & 1U);
+        idx_result3 = (idx_result3 << 1U) | (tmp3 & 1U);
+        tmp0 = tmp0 >> 1U;
+        tmp1 = tmp1 >> 1U;
+        tmp2 = tmp2 >> 1U;
+        tmp3 = tmp3 >> 1U;
+      }
+      for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+        uint32_t addr_src0 = (idx0 / 4) + (idx0 % 4) * N_BANKS;
+        uint32_t addr_src1 = (idx1 / 4) + (idx1 % 4) * N_BANKS;
+        uint32_t addr_src2 = (idx2 / 4) + (idx2 % 4) * N_BANKS;
+        uint32_t addr_src3 = (idx3 / 4) + (idx3 % 4) * N_BANKS;
+        uint32_t addr_dst0 = idx_result0;
+        uint32_t addr_dst1 = idx_result1;
+        uint32_t addr_dst2 = idx_result2;
+        uint32_t addr_dst3 = idx_result3;
+        addr_src0 += idx_row * (N_BANKS * 8);
+        addr_src1 += idx_row * (N_BANKS * 8);
+        addr_src2 += idx_row * (N_BANKS * 8);
+        addr_src3 += idx_row * (N_BANKS * 8);
+        addr_dst0 += idx_row * (N_BANKS * 8);
+        addr_dst1 += idx_row * (N_BANKS * 8);
+        addr_dst2 += idx_row * (N_BANKS * 8);
+        addr_dst3 += idx_row * (N_BANKS * 8);
+        // Copy the full complex pair (Re, Im) from the folded source
+        // location to the sequential bit-reversed destination
+        *((uint32_t *)&ptr2[addr_dst0]) = *((uint32_t *)&ptr1[addr_src0]);
+        *((uint32_t *)&ptr2[addr_dst1]) = *((uint32_t *)&ptr1[addr_src1]);
+        *((uint32_t *)&ptr2[addr_dst2]) = *((uint32_t *)&ptr1[addr_src2]);
+        *((uint32_t *)&ptr2[addr_dst3]) = *((uint32_t *)&ptr1[addr_src3]);
+      }
+    }
+#endif
+  }
+  mempool_log_partial_barrier(2, absolute_core_id, nPE);
+  return;
+}
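The software fallback above reverses the LOG2 address bits of each index with a simple shift-and-or loop. As a reference, a minimal standalone sketch of that computation (the helper name bit_reverse and the stdint.h include are illustrative, not part of the patch):

#include <stdint.h>

// Reverse the lowest log2len bits of idx, as in the #else branch of the
// scheduler's bit-reversal stage
static inline uint32_t bit_reverse(uint32_t idx, uint32_t log2len) {
  uint32_t result = 0;
  for (uint32_t k = 0; k < log2len; k++) {
    result = (result << 1U) | (idx & 1U); // shift in the LSB of idx
    idx >>= 1U;                           // consume one bit per iteration
  }
  return result;
}

// e.g. bit_reverse(1, 4) == 8 (0b0001 -> 0b1000), while
//      bit_reverse(6, 4) == 6 (0b0110 reads the same in both directions)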
diff --git a/software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h b/software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h
index 56f4f478b..e5380444c 100644
--- a/software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h
+++ b/software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h
@@ -26,14 +26,14 @@ void mempool_bitrevtable_q16s_riscv32(uint16_t *pSrc, const uint16_t bitRevLen,
 
 #ifndef ASM
 #define SWAP_ITEMS \
-  addr1 = *(v2s *)&pBitRevTab[i]; \
-  addr2 = *(v2s *)&pBitRevTab[i + 2]; \
-  addr3 = *(v2s *)&pBitRevTab[i + 4]; \
-  addr4 = *(v2s *)&pBitRevTab[i + 6]; \
-  addr1 = __SRA2(addr1, s2); \
-  addr2 = __SRA2(addr2, s2); \
-  addr3 = __SRA2(addr3, s2); \
-  addr4 = __SRA2(addr4, s2); \
+  addr1 = *(uint32_t *)&pBitRevTab[i]; \
+  addr2 = *(uint32_t *)&pBitRevTab[i + 2]; \
+  addr3 = *(uint32_t *)&pBitRevTab[i + 4]; \
+  addr4 = *(uint32_t *)&pBitRevTab[i + 6]; \
+  addr1 = __SRA2(*(v2s *)&addr1, *(v2s *)&s2); \
+  addr2 = __SRA2(*(v2s *)&addr2, *(v2s *)&s2); \
+  addr3 = __SRA2(*(v2s *)&addr3, *(v2s *)&s2); \
+  addr4 = __SRA2(*(v2s *)&addr4, *(v2s *)&s2); \
   a1 = addr1[1]; \
   a2 = addr2[1]; \
   a3 = addr3[1]; \
@@ -42,28 +42,28 @@ void mempool_bitrevtable_q16s_riscv32(uint16_t *pSrc, const uint16_t bitRevLen,
   b2 = addr2[0]; \
   b3 = addr3[0]; \
   b4 = addr4[0]; \
-  tmpa1 = *(v2s *)&pSrc[a1]; \
-  tmpa2 = *(v2s *)&pSrc[a2]; \
-  tmpa3 = *(v2s *)&pSrc[a3]; \
-  tmpa4 = *(v2s *)&pSrc[a4]; \
-  tmpb1 = *(v2s *)&pSrc[b1]; \
-  tmpb2 = *(v2s *)&pSrc[b2]; \
-  tmpb3 = *(v2s *)&pSrc[b3]; \
-  tmpb4 = *(v2s *)&pSrc[b4]; \
-  *((v2s *)&pSrc[a1]) = tmpb1; \
-  *((v2s *)&pSrc[a2]) = tmpb2; \
-  *((v2s *)&pSrc[a3]) = tmpb3; \
-  *((v2s *)&pSrc[a4]) = tmpb4; \
-  *((v2s *)&pSrc[b1]) = tmpa1; \
-  *((v2s *)&pSrc[b2]) = tmpa2; \
-  *((v2s *)&pSrc[b3]) = tmpa3; \
-  *((v2s *)&pSrc[b4]) = tmpa4;
+  tmpa1 = *(uint32_t *)&pSrc[a1]; \
+  tmpa2 = *(uint32_t *)&pSrc[a2]; \
+  tmpa3 = *(uint32_t *)&pSrc[a3]; \
+  tmpa4 = *(uint32_t *)&pSrc[a4]; \
+  tmpb1 = *(uint32_t *)&pSrc[b1]; \
+  tmpb2 = *(uint32_t *)&pSrc[b2]; \
+  tmpb3 = *(uint32_t *)&pSrc[b3]; \
+  tmpb4 = *(uint32_t *)&pSrc[b4]; \
+  *((uint32_t *)&pSrc[a1]) = tmpb1; \
+  *((uint32_t *)&pSrc[a2]) = tmpb2; \
+  *((uint32_t *)&pSrc[a3]) = tmpb3; \
+  *((uint32_t *)&pSrc[a4]) = tmpb4; \
+  *((uint32_t *)&pSrc[b1]) = tmpa1; \
+  *((uint32_t *)&pSrc[b2]) = tmpa2; \
+  *((uint32_t *)&pSrc[b3]) = tmpa3; \
+  *((uint32_t *)&pSrc[b4]) = tmpa4;
 #else
 #define SWAP_ITEMS \
-  addr1 = *(v2s *)&pBitRevTab[i]; \
-  addr2 = *(v2s *)&pBitRevTab[i + 2]; \
-  addr3 = *(v2s *)&pBitRevTab[i + 4]; \
-  addr4 = *(v2s *)&pBitRevTab[i + 6]; \
+  addr1 = *(uint32_t *)&pBitRevTab[i]; \
+  addr2 = *(uint32_t *)&pBitRevTab[i + 2]; \
+  addr3 = *(uint32_t *)&pBitRevTab[i + 4]; \
+  addr4 = *(uint32_t *)&pBitRevTab[i + 6]; \
   asm volatile("pv.sra.h %[addr1],%[addr1],%[s2];" \
                "pv.sra.h %[addr2],%[addr2],%[s2];" \
                "pv.sra.h %[addr3],%[addr3],%[s2];" \
@@ -82,30 +82,30 @@ void mempool_bitrevtable_q16s_riscv32(uint16_t *pSrc, const uint16_t bitRevLen,
                [addr3] "+&r"(addr3), [addr4] "+&r"(addr4) \
                : [s2] "r"(s2) \
                :); \
-  tmpa1 = *(v2s *)&pSrc[a1]; \
-  tmpa2 = *(v2s *)&pSrc[a2]; \
-  tmpa3 = *(v2s *)&pSrc[a3]; \
-  tmpa4 = *(v2s *)&pSrc[a4]; \
-  tmpb1 = *(v2s *)&pSrc[b1]; \
-  tmpb2 = *(v2s *)&pSrc[b2]; \
-  tmpb3 = *(v2s *)&pSrc[b3]; \
-  tmpb4 = *(v2s *)&pSrc[b4]; \
-  *((v2s *)&pSrc[a1]) = tmpb1; \
-  *((v2s *)&pSrc[a2]) = tmpb2; \
-  *((v2s *)&pSrc[a3]) = tmpb3; \
-  *((v2s *)&pSrc[a4]) = tmpb4; \
-  *((v2s *)&pSrc[b1]) = tmpa1; \
-  *((v2s *)&pSrc[b2]) = tmpa2; \
-  *((v2s *)&pSrc[b3]) = tmpa3; \
-  *((v2s *)&pSrc[b4]) = tmpa4;
+  tmpa1 = *(uint32_t *)&pSrc[a1]; \
+  tmpa2 = *(uint32_t *)&pSrc[a2]; \
+  tmpa3 = *(uint32_t *)&pSrc[a3]; \
+  tmpa4 = *(uint32_t *)&pSrc[a4]; \
+  tmpb1 = *(uint32_t *)&pSrc[b1]; \
+  tmpb2 = *(uint32_t *)&pSrc[b2]; \
+  tmpb3 = *(uint32_t *)&pSrc[b3]; \
+  tmpb4 = *(uint32_t *)&pSrc[b4]; \
+  *((uint32_t *)&pSrc[a1]) = tmpb1; \
+  *((uint32_t *)&pSrc[a2]) = tmpb2; \
+  *((uint32_t *)&pSrc[a3]) = tmpb3; \
+  *((uint32_t *)&pSrc[a4]) = tmpb4; \
+  *((uint32_t *)&pSrc[b1]) = tmpa1; \
+  *((uint32_t *)&pSrc[b2]) = tmpa2; \
+  *((uint32_t *)&pSrc[b3]) = tmpa3; \
+  *((uint32_t *)&pSrc[b4]) = tmpa4;
 #endif
 
 void mempool_bitrevtable_q16s_xpulpimg(uint16_t *pSrc, const uint16_t bitRevLen,
                                        const uint16_t *pBitRevTab) {
-  v2s addr1, addr2, addr3, addr4;
-  v2s s2 = (v2s){2, 2};
-  v2s tmpa1, tmpa2, tmpa3, tmpa4;
-  v2s tmpb1, tmpb2, tmpb3, tmpb4;
+  uint32_t addr1, addr2, addr3, addr4;
+  uint32_t s2 = 0x00020002;
+  uint32_t tmpa1, tmpa2, tmpa3, tmpa4;
+  uint32_t tmpb1, tmpb2, tmpb3, tmpb4;
   int32_t a1, a2, a3, a4;
   int32_t b1, b2, b3, b4;
   for (uint32_t i = 0; i < bitRevLen; i += 8) {
@@ -117,10 +117,10 @@ void mempool_bitrevtable_q16p_xpulpimg(uint16_t *pSrc, const uint16_t bitRevLen,
                                        const uint16_t *pBitRevTab,
                                        const uint32_t nPE) {
   uint32_t core_id = mempool_get_core_id();
-  v2s addr1, addr2, addr3, addr4;
-  v2s s2 = (v2s){2, 2};
-  v2s tmpa1, tmpa2, tmpa3, tmpa4;
-  v2s tmpb1, tmpb2, tmpb3, tmpb4;
+  uint32_t addr1, addr2, addr3, addr4;
+  uint32_t s2 = 0x00020002;
+  uint32_t tmpa1, tmpa2, tmpa3, tmpa4;
+  uint32_t tmpb1, tmpb2, tmpb3, tmpb4;
   int32_t a1, a2, a3, a4;
   int32_t b1, b2, b3, b4;
   for (uint32_t i = 8 * core_id; i < bitRevLen; i += (8 * nPE)) {
diff --git a/software/runtime/kernel/mempool_radix4_cfft_q16p.h b/software/runtime/kernel/mempool_radix4_cfft_q16p.h
index 34f338ce8..ce928be00 100644
--- a/software/runtime/kernel/mempool_radix4_cfft_q16p.h
+++ b/software/runtime/kernel/mempool_radix4_cfft_q16p.h
@@ -81,7 +81,7 @@ void mempool_radix4_cfft_q16p_xpulpimg(int16_t *pSrc16, uint32_t fftLen,
   uint32_t n1, n2, ic, i0, j, k;
   uint32_t step, steps;
 
-  /* START OF FIRST STAGE PROCESS */
+  /* START OF FIRST STAGE PROCESSING */
   n1 = fftLen;
   n2 = n1 >> 2U;
   step = (n2 + nPE - 1) / nPE;
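Throughout the q16 bitreversal rewrite above, packed index pairs are kept in plain uint32_t registers and reinterpreted as v2s vectors only where a SIMD operation is needed. A minimal sketch of that type-punning pattern (the helper name sra2_u32 and the v2s typedef are illustrative; __SRA2 is assumed to lower to pv.sra.h, as in the kernel headers):

#include <stdint.h>

typedef int16_t v2s __attribute__((vector_size(4))); // 2 x int16 SIMD vector

// Shift both halfword lanes of a packed 32-bit word right (arithmetic),
// mirroring __SRA2(*(v2s *)&addr, *(v2s *)&s2) in the rewritten SWAP_ITEMS
static inline uint32_t sra2_u32(uint32_t packed, uint32_t shift) {
  v2s v = *(v2s *)&packed; // reinterpret the word as two int16 lanes
  v2s s = *(v2s *)&shift;  // per-lane shift amounts, e.g. 0x00020002
  v2s r = v >> s;          // element-wise arithmetic shift right
  return *(uint32_t *)&r;  // move the result back to a scalar register
}

// e.g. sra2_u32(0x00080004, 0x00020002) == 0x00020001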