diff --git a/software/apps/ofdm/main.c b/software/apps/ofdm/main.c new file mode 100644 index 000000000..9969219fe --- /dev/null +++ b/software/apps/ofdm/main.c @@ -0,0 +1,123 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +/* Mempool runtime libraries */ +#include "dma.h" +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" +#include "xpulp/builtins_v2.h" + +#include "data/data_ofdm.h" + +// CFFT Parameters +#define SCHEDULED +#define FOLDED_TWIDDLES +#define BITREVERSETABLE +#define ASM +#define N_FFTs_COL 4 +#define N_FFTs_ROW (N_RX / N_FFTs_COL) +// CMATMUL Parameters +#define NUM_COPIES (N_BANKS / (N_BEAMS * N_RX)) + +#define ROUNDS 3 +dump(prova, 1); + +#include "kernel/mempool_radix4_cfft_butterfly_f16.h" +#include "kernel/mempool_radix4_cfft_f16p.h" +#include "kernel/mempool_radix4_cfft_q16_bitreversal.h" +#include "kernel/mempool_cmatmul_f16.h" + +uint32_t arrival_index __attribute__((section(".l1_prio"))); +__fp16 l1_pBF_Coef_folded[2 * N_BEAMS * N_RX * NUM_COPIES] + __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); + +__fp16 l1_pFFT_Src[N_FFTs_ROW * 8 * N_BANKS] + __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); +__fp16 l1_pFFT_Dst[N_FFTs_ROW * 8 * N_BANKS] + __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); +__fp16 l1_twiddleCoef_f16_src[6 * N_BANKS] + __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); +__fp16 l1_twiddleCoef_f16_dst[6 * N_BANKS] + __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); +uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH] + __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/* MAIN */ +int main() { + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + mempool_barrier_init(core_id); + + + /* INITIALIZATION */ + mempool_start_benchmark(); + if (core_id == 0) { + // Each FFT is folded over 4 memory rows + // Each memory row is 2 * N_BANKS samples + __atomic_store_n(&arrival_index, 0, __ATOMIC_RELAXED); + dma_memcpy_blocking(l1_pFFT_Src, l2_pFFT_Src, (N_RX * N_SC) * sizeof(int32_t)); + dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable, BITREVINDEXTABLE_LENGTH * sizeof(int16_t)); + for (uint32_t i = 0; i < NUM_COPIES; i++) { + dma_memcpy_blocking(l1_pBF_Coef_folded + i * (2 * N_BEAMS * N_RX), l2_pBF_Coef, (N_BEAMS * N_RX) * sizeof(int32_t)); + } + for (uint32_t i = 0; i < N_FFTs_COL; i++) { + dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * N_BANKS), l2_twiddleCoef_f16, 3 * (N_SC / 4) * sizeof(int32_t)); + } + } + mempool_barrier(num_cores); + mempool_stop_benchmark(); + dump_prova(0); + +// // Start of the iterations +// for (uint32_t round = 0; round < ROUNDS; round++) { + + /* FFT */ + mempool_start_benchmark(); + uint32_t col_fftLen = N_SC / 4; + uint32_t col_id = core_id / (N_SC / 16); + // Distribute FFTs over columns + mempool_radix4_cfft_f16p_scheduler(l1_pFFT_Src, l1_pFFT_Dst, N_SC, + l1_twiddleCoef_f16_src + 2 * col_id * col_fftLen, + l1_twiddleCoef_f16_dst + 2 * col_id * col_fftLen, + l1_BitRevIndexTable, BITREVINDEXTABLE_LENGTH, 1, (N_SC / 16)); + mempool_log_barrier(2, core_id); + mempool_stop_benchmark(); + dump_prova(1); + + /* BEAMFORMING */ + mempool_start_benchmark(); + 
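+  // The (N_BEAMS x N_RX) beamforming matrix was replicated NUM_COPIES times
+  // across the L1 banks during initialization, so that different groups of
+  // cores can stream the coefficients from different banks instead of all
+  // contending for the same ones; the matmul below assumes this folded layout.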
cmatmul_2x4_folded_f16p(l1_pBF_Coef_folded, l1_pFFT_Src, l1_pFFT_Dst, N_BEAMS, N_RX, N_SC, core_id, num_cores); + mempool_stop_benchmark(); + dump_prova(2); + + mempool_start_benchmark(); + // Transfer and synchronization + if ((num_cores - 1) == __atomic_fetch_add(&arrival_index, 1, __ATOMIC_RELAXED)) { + dma_memcpy_blocking(l1_pFFT_Src, l2_pFFT_Src, (N_RX * N_SC) * sizeof(int32_t)); + dma_memcpy_blocking(l2_pBF_Dst, l1_pFFT_Dst, (N_BEAMS * N_SC) * sizeof(int32_t)); + for (uint32_t i = 0; i < N_FFTs_COL; i++) { + dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * N_BANKS), l2_twiddleCoef_f16, 3 * (N_SC / 4) * sizeof(int32_t)); + } + __atomic_store_n(&arrival_index, 0, __ATOMIC_RELAXED); + __sync_synchronize(); // Full memory barrier + wake_up_all(); + } + mempool_wfi(); + mempool_stop_benchmark(); + dump_prova(3); + +// } + + return 0; +} diff --git a/software/runtime/data/data_ofdm.h.tpl b/software/runtime/data/data_ofdm.h.tpl new file mode 100644 index 000000000..fe7ff7fc6 --- /dev/null +++ b/software/runtime/data/data_ofdm.h.tpl @@ -0,0 +1,48 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +\ +<% def array_to_cstr(array): + out = '{' + i = 0 + out += '\n' + for a in array: + out += '(__fp16){:0.5}f, '.format(a) + i += 1 + if i % 8 == 0: + out += '\n' + out = out[:-2] + '}' + return out +%> \ + +<% def array_to_str(array): + out = '{' + i = 0 + out += '\n' + for a in array: + out += '{}, '.format(a) + i += 1 + if i % 16 == 0: + out += '\n' + out = out[:-2] + '}' + return out +%> \ + +#define LOG2 (${Log2Len}) +#define N_RX (${N_rx}) +#define N_BEAMS (${N_bs}) +#define N_SC (${N_sc}) +#define N_BANKS (NUM_CORES * BANKING_FACTOR) +#define BITREVINDEXTABLE_LENGTH (${BitrevLen}) + + +__fp16 l2_pFFT_Src[${2 * N_sc * N_rx}] = ${array_to_cstr(pFFT_src)}; + +__fp16 l2_twiddleCoef_f16[${2 * N_sc}] = ${array_to_cstr(pTw_coef)}; + +__fp16 l2_pBF_Coef[${2 * N_bs * N_rx}] = ${array_to_cstr(pBF_coef)}; + +__fp16 l2_pBF_Dst[${2 * N_bs * N_sc}] = ${array_to_cstr(pBF_dst)}; + +// Bitreversal +uint16_t l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(bitrev)};
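A quick sanity check of the size relationships the template above encodes, using the generator's command-line defaults (a standalone sketch; the enum constants simply mirror data_ofdm.py's -rx/-bs/-sc defaults):

```c
#include <assert.h>

// Defaults of data_ofdm.py: -rx 64, -bs 32, -sc 4096, so LOG2 = 12.
enum { N_RX = 64, N_BEAMS = 32, N_SC = 4096, LOG2 = 12 };

int main(void) {
  // Complex samples are stored as interleaved re/im __fp16 halves, hence
  // the factor 2 in every array length.
  assert(2 * N_RX * N_SC == 524288);    // l2_pFFT_Src entries
  assert(2 * N_BEAMS * N_RX == 4096);   // l2_pBF_Coef entries
  assert(2 * N_BEAMS * N_SC == 262144); // l2_pBF_Dst entries
  assert((1 << LOG2) == N_SC);          // LOG2 = log2(N_SC)
  // Note: l2_twiddleCoef_f16 is declared with 2 * N_SC entries, but the
  // generator fills only 3 * N_SC / 4 of them; the tail stays zero.
  return 0;
}
```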
diff --git a/software/runtime/data/data_ofdm.py b/software/runtime/data/data_ofdm.py new file mode 100644 index 000000000..08cf091ce --- /dev/null +++ b/software/runtime/data/data_ofdm.py @@ -0,0 +1,132 @@ +# Copyright 2022 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# Author: Marco Bertuletti, ETH Zurich + +#!/usr/bin/env python3 + +import numpy as np +import math as M +import argparse +import pathlib +from mako.template import Template +from sympy.combinatorics import Permutation + +####################### +# compute_bitreversal # +####################### + +def compute_bitreversal(N, R): + # Decompose + logR2 = [] + idx = N + while (idx >= R): + logR2.append(int(M.log2(R))) + idx = idx // R + if (idx > 1): + logR2.append(int(M.log2(idx))) + # Bitreversal + indexes = [] + for x in range(N): + result = 0 + for bits in logR2: + mask = (0xffffffff >> (32 - bits)) + result = (result << bits) | (x & mask) + x = x >> bits + indexes.append(result) + + # Create transpositions table + tps = [] + for c in Permutation.from_sequence(indexes).cyclic_form: + for i in range(len(c) - 1): + tps.append([c[i] * 8, c[-1] * 8]) + return tps + +def gen_data_header_file(outdir: pathlib.Path, tpl: pathlib.Path, **kwargs): + + file = outdir / f"data_{kwargs['name']}.h" + + print(tpl, outdir, kwargs['name']) + + template = Template(filename=str(tpl)) + with file.open('w') as f: + f.write(template.render(**kwargs)) + +def main(): + + parser = argparse.ArgumentParser(description='Generate data for kernels') + parser.add_argument( + "-o", + "--outdir", + type=pathlib.Path, + default=pathlib.Path(__file__).parent.absolute(), + required=False, + help='Output directory for the generated data files' + ) + parser.add_argument( + "-t", + "--tpl", + type=pathlib.Path, + required=False, + default=pathlib.Path(__file__).parent.absolute() / "data_ofdm.h.tpl", + help='Path to mako template' + ) + parser.add_argument( + "-v", + "--verbose", + action='store_true', + help='Set verbose' + ) + parser.add_argument( + "-rx", + "--receivers", + type=int, + required=False, + default=64, + help='Number of receive antennas (N_RX).' + ) + parser.add_argument( + "-bs", + "--beams", + type=int, + required=False, + default=32, + help='Number of beams (N_BEAMS).' + ) + parser.add_argument( + "-sc", + "--subcarriers", + type=int, + required=False, + default=4096, + help='Number of subcarriers (N_SC).' + ) + + args = parser.parse_args() + N_rx = args.receivers + N_bs = args.beams + N_sc = args.subcarriers + + pFFT_src = np.random.rand(2 * N_rx * N_sc).astype(np.float16) + pTw_coef = np.random.rand(int(3 * N_sc / 4)).astype(np.float16) + pBF_coef = np.random.rand(2 * N_rx * N_bs).astype(np.float16) + pBF_dst = np.random.rand(2 * N_bs * N_sc).astype(np.float16) + + Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(N_sc, 2))) + + kwargs = {'name': 'ofdm', + 'pFFT_src': pFFT_src, + 'pTw_coef': pTw_coef, + 'pBF_coef': pBF_coef, + 'pBF_dst': pBF_dst, + 'bitrev': Bitreversal, + 'N_rx': N_rx, + 'N_bs': N_bs, + 'N_sc': N_sc, + 'Log2Len': int(np.log2(N_sc)), + 'BitrevLen': len(Bitreversal)} + gen_data_header_file(args.outdir, args.tpl, **kwargs) + +if __name__ == "__main__": + main() diff --git a/software/runtime/kernel/mempool_cfft_radix4_butterfly_f16.h b/software/runtime/kernel/mempool_cfft_radix4_butterfly_f16.h deleted file mode 100644 index 5196fc30d..000000000 --- a/software/runtime/kernel/mempool_cfft_radix4_butterfly_f16.h +++ /dev/null @@ -1,199 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -#include "xpulp/builtins_v2.h" - -/** - @brief First butterfly stage. 
- @param[in] pIn points to input buffer of 16b data, Re and Im parts are - interleaved - @param[out] pOut points to output buffer of 16b data, Re and Im parts are - interleaved - @param[in] i0 points to the first element to be processed - @param[in] n2 number of elements in the first wing of the butterfly - @param[in] CoSi1 packed cosine and sine first twiddle - @param[in] CoSi2 packed cosine and sine second twiddle - @param[in] CoSi3 packed cosine and sine third twiddle - @param[in] C1 packed sine and cosine first twiddle - @param[in] C2 packed sine and cosine second twiddle - @param[in] C3 packed sine and cosine third twiddle - @return none -*/ -static inline void radix4_butterfly(__fp16 *pIn, __fp16 *pOut, - uint32_t i0, uint32_t n2, v2h CoSi1, - v2h CoSi2, v2h CoSi3, v2h C1, v2h C2, - v2h C3) { - uint32_t i1, i2, i3; - __fp16 t0, t1, t2, t3, t4, t5; - v2h A, B, C, D, E, F, G, H; - -#if defined(FOLDED) || defined(SCHEDULED) - /* index calculation for the input as, */ - /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ - i1 = i0 + N_BANKS; - i2 = i1 + N_BANKS; - i3 = i2 + N_BANKS; - uint32_t n2_store = n2 >> 2U; - uint32_t i0_store = - (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * N_BANKS; - uint32_t i1_store = i0_store + n2_store; - uint32_t i2_store = i1_store + n2_store; - uint32_t i3_store = i2_store + n2_store; -#else - /* index calculation for the input as, */ - /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ - i1 = i0 + n2; - i2 = i1 + n2; - i3 = i2 + n2; -#endif - /* Read ya (real), xa (imag) input */ - A = *(v2h *)&pIn[i0 * 2U]; - /* Read yb (real), xb(imag) input */ - B = *(v2h *)&pIn[i1 * 2U]; - /* Read yc (real), xc(imag) input */ - C = *(v2h *)&pIn[i2 * 2U]; - /* Read yd (real), xd(imag) input */ - D = *(v2h *)&pIn[i3 * 2U]; - asm volatile( - // xa + xc, ya + yc - "vfadd.h %[E],%[A],%[C];" - // xa - xc, ya - yc - "vfsub.h %[F],%[A],%[C];" - // xb + xd, yd + yd - "vfadd.h %[G],%[B],%[D];" - // xb - xd, yb - yd - "vfsub.h %[H],%[B],%[D];" - "pv.extract.h %[t0],%[H],0;" - "pv.extract.h %[t1],%[H],1;" - "fsub.h %[t3],zero,%[t1];" - "fsub.h %[t4],zero,%[t0];" - // yd - yb, xb - xd - "pv.pack.h %[C],%[t0],%[t3];" - // yb - yd, xd - xb - "pv.pack.h %[D],%[t4],%[t1];" - // xa + xc + xb + xd, ya + yb + yc + yd - "vfadd.h %[A],%[E],%[G];" - // xa - xc + yb - yd, ya - yc + xd - xb - "vfadd.h %[D],%[F],%[D];" - // xa + xc - xb - xd, ya + yc - yb - yd - "vfsub.h %[B],%[E],%[G];" - // xa - xc - yb + yd, ya - yc + xb - xd - "vfadd.h %[C],%[F],%[C];" - "vfdotpex.s.h %[t0],%[CoSi1],%[D];" - "vfdotpex.s.h %[t2],%[CoSi2],%[B];" - "vfdotpex.s.h %[t4],%[CoSi3],%[C];" - "vfdotpex.s.h %[t1],%[C1],%[D];" - "vfdotpex.s.h %[t3],%[C1],%[B];" - "vfdotpex.s.h %[t5],%[C3],%[C];" - "fcvt.h.s %[t0],%[t0];" - "fcvt.h.s %[t1],%[t1];" - "fcvt.h.s %[t2],%[t2];" - "fcvt.h.s %[t3],%[t3];" - "fcvt.h.s %[t4],%[t4];" - "fcvt.h.s %[t5],%[t5];" - "pv.pack.h %[E],%[t1],%[t0];" - "pv.pack.h %[F],%[t3],%[t2];" - "pv.pack.h %[G],%[t5],%[t4];" - : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), - [E] "=&r"(E), [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), - [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), - [t4] "=&r"(t4), [t5] "=&r"(t5) - : [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1), - [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) - :); -#if defined(FOLDED) || defined(SCHEDULED) - *((v2h *)&pOut[i0_store * 2U]) = A; - *((v2h *)&pOut[i1_store * 2U]) = E; - *((v2h *)&pOut[i2_store * 2U]) = F; - *((v2h *)&pOut[i3_store * 
2U]) = G; -#else - *((v2h *)&pOut[i0 * 2U]) = A; - *((v2h *)&pOut[i1 * 2U]) = E; - *((v2h *)&pOut[i2 * 2U]) = F; - *((v2h *)&pOut[i3 * 2U]) = G; -#endif - -} - -/** - @brief Last butterfly stage. - @param[in] pIn points to input buffer of 16b data, Re and Im parts are - interleaved - @param[out] pOut points to output buffer of 16b data, Re and Im parts are - interleaved - @param[in] i0 points to the first element to be processed - @return none -*/ -static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut, - uint32_t i0) { - __fp16 t0, t1; - uint32_t i1, i2, i3; - v2h A, B, C, D, E, F, G, H; - -#if defined(FOLDED) || defined(SCHEDULED) - /* index calculation for the input as, */ - /* pIn[i0 + 0], pIn[i0 + fftLen/4], - pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ - i1 = i0 + N_BANKS; - i2 = i1 + N_BANKS; - i3 = i2 + N_BANKS; -#ifndef SCHEDULED - uint32_t i0_store = i0 * 4; - uint32_t i1_store = i0_store + 1; - uint32_t i2_store = i1_store + 1; - uint32_t i3_store = i2_store + 1; -#endif -#else - /* index calculation for the input as, */ - /* pIn[i0 + 0], pIn[i0 + fftLen/4], - pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ - i1 = i0 + 1U; - i2 = i1 + 1U; - i3 = i2 + 1U; -#endif - - /* Read ya (real), xa(imag) input */ - A = *(v2h *)&pIn[i0 * 2U]; - /* Read yb (real), xb(imag) input */ - B = *(v2h *)&pIn[i1 * 2U]; - /* Read yc (real), xc(imag) input */ - C = *(v2h *)&pIn[i2 * 2U]; - /* Read yd (real), xd(imag) input */ - D = *(v2h *)&pIn[i3 * 2U]; - __fp16 t2, t3; - asm volatile( - "vfsub.h %[H],%[B],%[D];" - "vfadd.h %[G],%[B],%[D];" - "vfadd.h %[E],%[A],%[C];" - "vfsub.h %[F],%[A],%[C];" - "pv.extract.h %[t0],%[H],0;" - "pv.extract.h %[t1],%[H],1;" - "fsub.h %[t2], zero, %[t0];" - "fsub.h %[t3], zero, %[t1];" - "pv.pack.h %[A],%[t2],%[t1];" - "pv.pack.h %[B],%[t0],%[t3];" - "vfadd.h %[H],%[E],%[G];" - "vfsub.h %[E],%[E],%[G];" - "vfadd.h %[A],%[F],%[A];" - "vfadd.h %[B],%[F],%[B];" - : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "=&r"(E), - [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), [t0] "=&r"(t0), - [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3) - : - :); -#if defined(FOLDED) - *((v2h *)&pOut[i0_store * 2U]) = H; - *((v2h *)&pOut[i1_store * 2U]) = E; - *((v2h *)&pOut[i2_store * 2U]) = A; - *((v2h *)&pOut[i3_store * 2U]) = B; -#else - *((v2h *)&pOut[i0 * 2U]) = H; - *((v2h *)&pOut[i1 * 2U]) = E; - *((v2h *)&pOut[i2 * 2U]) = A; - *((v2h *)&pOut[i3 * 2U]) = B; -#endif - -} diff --git a/software/runtime/kernel/mempool_cfft_radix4_f16p.h b/software/runtime/kernel/mempool_cfft_radix4_f16p.h deleted file mode 100644 index d2220d090..000000000 --- a/software/runtime/kernel/mempool_cfft_radix4_f16p.h +++ /dev/null @@ -1,526 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -#include "xpulp/builtins_v2.h" -#define MIN(x, y) (((x) < (y)) ? 
(x) : (y)) - -/** - @brief Folding in local memory function - @param[in] pSrc16 points to input buffer of 16b data, Re and Im parts are - interleaved - @param[in] fftLen Length of the complex input vector - @param[in] nPE Number of PE - @return none -*/ - -static inline void fold_radix4(__fp16 *pSrc16, uint32_t fftLen, - uint32_t core_id, uint32_t nPE) { - uint32_t n2, i0, i1, i2, i3; - uint32_t i1_store, i2_store, i3_store; - volatile v2h A, B, C; - n2 = fftLen >> 2U; - for (i0 = core_id * STEP; i0 < MIN(core_id * STEP + STEP, n2); i0++) { - i1 = i0 + n2; - i2 = i1 + n2; - i3 = i2 + n2; - A = *(v2h *)&pSrc16[i1 * 2U]; - B = *(v2h *)&pSrc16[i2 * 2U]; - C = *(v2h *)&pSrc16[i3 * 2U]; - i1_store = i0 + N_BANKS; - i2_store = i1_store + N_BANKS; - i3_store = i2_store + N_BANKS; - *(v2h *)&pSrc16[i1_store * 2U] = A; - *(v2h *)&pSrc16[i2_store * 2U] = B; - *(v2h *)&pSrc16[i3_store * 2U] = C; - } - mempool_log_partial_barrier(2 * WU_STRIDE, WU_STRIDE * core_id, - nPE * WU_STRIDE); -} - -#ifdef FOLDED_TWIDDLES -/** - @brief Full FFT butterfly - @param[in] pSrc16 points to input buffer of 16b data, Re and Im parts are - interleaved - @param[out] pDst16 points to output buffer of 16b data, Re and Im parts - are interleaved - @param[in] fftLen Length of the complex input vector - @param[in] pCoef_src Twiddle coefficients vector - @param[in] pCoef_dst Auxiliary twiddle coefficients vector - @param[in] nPE Number of PE - @return pointer to output vector -*/ -__fp16 *mempool_radix4_cfft_q16p_folded(__fp16 *pSrc16, __fp16 *pDst16, - uint32_t fftLen, __fp16 *pCoef_src, - __fp16 *pCoef_dst, uint32_t nPE) -#else -/** - Twiddles are not folded in memory - @brief Full FFT butterfly - @param[in] pSrc16 points to input buffer of 16b data, Re and Im parts are - interleaved - @param[out] pDst16 points to output buffer of 16b data, Re and Im parts - are interleaved - @param[in] fftLen Length of the complex input vector - @param[in] pCoef_src Twiddle coefficients vector - @param[in] nPE Number of PE - @return pointer to output vector -*/ -__fp16 *mempool_radix4_cfft_q16p_folded(__fp16 *pSrc16, __fp16 *pDst16, - uint32_t fftLen, __fp16 *pCoef_src, - uint32_t nPE) -#endif -{ - -#ifdef FOLDED_TWIDDLES - uint32_t absolute_core_id = mempool_get_core_id(); - uint32_t core_id = absolute_core_id / WU_STRIDE; - __fp16 t0, t1, t2, t3, t4, t5; - v2h CoSi1, CoSi2, CoSi3; - v2h C1, C2, C3; - uint32_t n1, n2, n2_store, i0, j, k; - uint32_t ic, offset, wing_idx; - __fp16 *pTmp; -#else - uint32_t absolute_core_id = mempool_get_core_id(); - uint32_t core_id = absolute_core_id / WU_STRIDE; - __fp16 t0, t1, t2, t3, t4, t5; - v2h CoSi1, CoSi2, CoSi3; - v2h C1, C2, C3; - uint32_t n1, n2, n2_store, i0, j, k; - uint32_t ic, offset, wing_id, bank_id; - __fp16 *pTmp; - uint32_t twidCoefModifier = 1U; -#endif - - if (fftLen <= N_BANKS) - fold_radix4(pSrc16, fftLen, core_id, nPE); - - /* START OF FIRST STAGE PROCESS */ - n1 = fftLen; - n2 = n1 >> 2U; - n2_store = n2 >> 2U; - for (i0 = core_id * STEP; i0 < MIN(core_id * STEP + STEP, n2); i0++) { - -#ifdef FOLDED_TWIDDLES - CoSi1 = *(v2h *)&pCoef_src[2U * i0]; - CoSi2 = *(v2h *)&pCoef_src[2U * (i0 + 1 * N_BANKS)]; - CoSi3 = *(v2h *)&pCoef_src[2U * (i0 + 2 * N_BANKS)]; - if (i0 % 4 == 0) { - ic = i0 >> 2U; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi1; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi2; - *((v2h 
*)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi2; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi3; - } -#else - CoSi1 = *(v2h *)&pCoef_src[2U * i0]; - CoSi2 = *(v2h *)&pCoef_src[2U * (i0 * 2U)]; - CoSi3 = *(v2h *)&pCoef_src[2U * (i0 * 3U)]; -#endif - asm volatile("pv.extract.h %[t1],%[CoSi1],0;" - "pv.extract.h %[t3],%[CoSi2],0;" - "pv.extract.h %[t5],%[CoSi3],0;" - "pv.extract.h %[t0],%[CoSi1],1;" - "pv.extract.h %[t2],%[CoSi2],1;" - "pv.extract.h %[t4],%[CoSi3],1;" - "fsub.h %[t0],zero,%[t0];" - "fsub.h %[t2],zero,%[t2];" - "fsub.h %[t4],zero,%[t4];" - "pv.pack.h %[C1],%[t1],%[t0];" - "pv.pack.h %[C2],%[t3],%[t2];" - "pv.pack.h %[C3],%[t5],%[t4];" - : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3), [t0] "=&r"(t0), - [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), - [t4] "=&r"(t4), [t5] "=&r"(t5) - : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) - :); - radix4_butterfly(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, - C3); - } - pTmp = pSrc16; - pSrc16 = pDst16; - pDst16 = pTmp; -#ifdef FOLDED_TWIDDLES - pTmp = pCoef_src; - pCoef_src = pCoef_dst; - pCoef_dst = pTmp; -#else - twidCoefModifier <<= 2U; -#endif - mempool_log_partial_barrier(2 * WU_STRIDE, absolute_core_id, nPE * WU_STRIDE); - /* END OF FIRST STAGE PROCESSING */ - - /* START OF MIDDLE STAGE PROCESS */ - for (k = fftLen / 4U; k > 4U; k >>= 2U) { - n1 = n2; - n2 >>= 2U; - n2_store = n2 >> 2U; - -#ifdef FOLDED_TWIDDLES - for (j = core_id * STEP; j < core_id * STEP + STEP; j++) { - CoSi1 = *(v2h *)&pCoef_src[2U * j]; - CoSi2 = *(v2h *)&pCoef_src[2U * (j + 1 * N_BANKS)]; - CoSi3 = *(v2h *)&pCoef_src[2U * (j + 2 * N_BANKS)]; - if (j % 4 == 0) { - wing_idx = j % n2; - offset = (j / n2); - ic = wing_idx >> 2U; - ic += offset * n2; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi1; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi2; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi3; - } -#else - bank_id = core_id / n2_store; - wing_id = core_id % n2_store; - offset = bank_id * n2; - for (j = wing_id * 4; j < MIN(wing_id * 4 + 4, n2); j++) { - ic = j * twidCoefModifier; - CoSi1 = *(v2h *)&pCoef_src[2U * ic]; - CoSi2 = *(v2h *)&pCoef_src[2U * (ic * 2U)]; - CoSi3 = *(v2h *)&pCoef_src[2U * (ic * 3U)]; -#endif - asm volatile("pv.extract.h %[t1],%[CoSi1],0;" - "pv.extract.h %[t3],%[CoSi2],0;" - "pv.extract.h %[t5],%[CoSi3],0;" - "pv.extract.h %[t0],%[CoSi1],1;" - "pv.extract.h %[t2],%[CoSi2],1;" - "pv.extract.h %[t4],%[CoSi3],1;" - "fsub.h %[t0],zero,%[t0];" - "fsub.h %[t2],zero,%[t2];" - "fsub.h %[t4],zero,%[t4];" - "pv.pack %[C1],%[t1],%[t0];" - "pv.pack %[C2],%[t3],%[t2];" - "pv.pack %[C3],%[t5],%[t4];" - : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3), - [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2), - [t3] 
"=&r"(t3), [t4] "=&r"(t4), [t5] "=&r"(t5) - : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) - :); -#ifdef FOLDED_TWIDDLES - i0 = j; - radix4_butterfly(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3, C1, - C2, C3); - } -#else - i0 = offset + j; - radix4_butterfly(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3, C1, - C2, C3); - } -#endif - pTmp = pSrc16; - pSrc16 = pDst16; - pDst16 = pTmp; -#ifdef FOLDED_TWIDDLES - pTmp = pCoef_src; - pCoef_src = pCoef_dst; - pCoef_dst = pTmp; -#else - twidCoefModifier <<= 2U; -#endif - mempool_log_partial_barrier(2 * WU_STRIDE, absolute_core_id, - nPE * WU_STRIDE); - } - /* END OF MIDDLE STAGE PROCESSING */ - - /* START OF LAST STAGE PROCESSING */ - n1 = n2; - n2 >>= 2U; - for (i0 = core_id * STEP; i0 < MIN(core_id * STEP + STEP, fftLen >> 2U); - i0++) { - radix4_butterfly_last(pSrc16, pDst16, i0); - } - mempool_log_partial_barrier(2 * WU_STRIDE, absolute_core_id, nPE * WU_STRIDE); - /* END OF LAST STAGE PROCESSING */ - - return pDst16; -} - -/** - SCHEDULER OF MULTIPLE FOLDED FFTS - Memory: - - 1st row of FFTS - - col_idx1 col_idx2 col_idx3 - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - - 2nd row of FFTS - - col_idx1 col_idx2 col_idx3 - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - - ... - - @brief Scheduler of folded FFTs - @param[in] column index of the current FFT - @param[in] pSrc16 input buffer of 16b data, Re and Im are interleaved - @param[out] pDst16 output buffer of 16b data, Re and Im are interleaved - @param[in] fftLen Length of the complex input vector - @param[in] pCoef_src Twiddle coefficients vector - @param[in] pCoef_dst Twiddle coefficients vector - @param[in] pBitRevTable Bitreversal table - @param[in] bitReverseLen Length of bitreversal table - @param[in] bitReverseFlag Flag for bitreversal - @param[in] nPE Number of PE - @return void -*/ - -void mempool_radix4_cfft_q16p_scheduler(uint32_t col_id, __fp16 *pSrc16, - __fp16 *pDst16, uint32_t fftLen, - __fp16 *pCoef_src, __fp16 *pCoef_dst, - __attribute__((unused)) - uint16_t *pBitRevTable, - __attribute__((unused)) - uint16_t bitReverseLen, - uint8_t bitReverseFlag, uint32_t nPE) { - - uint32_t absolute_core_id = mempool_get_core_id(); - uint32_t core_id = absolute_core_id % (fftLen >> 4U); - - uint32_t n1, n2, i0, ic, j, k; - uint32_t n2_store; - uint32_t offset, wing_idx; - __fp16 *pTmp; - int32_t t0, t1, t2, t3, t4, t5; - v2h CoSi1, CoSi2, CoSi3; - v2h C1, C2, C3; - - /* FIRST STAGE */ - n1 = fftLen; - n2 = n1 >> 2U; - n2_store = n2 >> 2U; - for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, n2); i0++) { - CoSi1 = *(v2h *)&pCoef_src[2U * i0]; - CoSi2 = *(v2h *)&pCoef_src[2U * (i0 + 1 * N_BANKS)]; - CoSi3 = *(v2h *)&pCoef_src[2U * (i0 + 2 * N_BANKS)]; - if (i0 % 4 == 0) { - ic = i0 / 4; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi1; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi2; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * 
(n2_store * 1 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi3; - } - asm volatile("pv.extract.h %[t1],%[CoSi1],0;" - "pv.extract.h %[t3],%[CoSi2],0;" - "pv.extract.h %[t5],%[CoSi3],0;" - "pv.extract.h %[t0],%[CoSi1],1;" - "pv.extract.h %[t2],%[CoSi2],1;" - "pv.extract.h %[t4],%[CoSi3],1;" - "fsub.h %[t0],zero,%[t0];" - "fsub.h %[t2],zero,%[t2];" - "fsub.h %[t4],zero,%[t4];" - "pv.pack.h %[C1],%[t1],%[t0];" - "pv.pack.h %[C2],%[t3],%[t2];" - "pv.pack.h %[C3],%[t5],%[t4];" - : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3), [t0] "=&r"(t0), - [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), - [t4] "=&r"(t4), [t5] "=&r"(t5) - : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) - :); - for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - __fp16 *pIn = pSrc16 + idx_row * (N_BANKS * 8); - __fp16 *pOut = pDst16 + idx_row * (N_BANKS * 8); - radix4_butterfly(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, - C3); - } - } - pTmp = pSrc16; - pSrc16 = pDst16; - pDst16 = pTmp; - pTmp = pCoef_src; - pCoef_src = pCoef_dst; - pCoef_dst = pTmp; - mempool_log_partial_barrier(2, absolute_core_id, nPE); - - /* MIDDLE STAGE */ - for (k = fftLen / 4U; k > 4U; k >>= 2U) { - n1 = n2; - n2 >>= 2U; - n2_store = n2 >> 2U; - - for (j = core_id * 4; j < core_id * 4 + 4; j++) { - CoSi1 = *(v2h *)&pCoef_src[2U * (j)]; - CoSi2 = *(v2h *)&pCoef_src[2U * (j + 1 * N_BANKS)]; - CoSi3 = *(v2h *)&pCoef_src[2U * (j + 2 * N_BANKS)]; - if (j % 4 == 0) { - - wing_idx = j % n2; - offset = (j / n2); - ic = wing_idx >> 2U; - ic += offset * n2; - - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi1; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi2; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi3; - } - asm volatile("pv.extract.h %[t1],%[CoSi1],0;" - "pv.extract.h %[t3],%[CoSi2],0;" - "pv.extract.h %[t5],%[CoSi3],0;" - "pv.extract.h %[t0],%[CoSi1],1;" - "pv.extract.h %[t2],%[CoSi2],1;" - "pv.extract.h %[t4],%[CoSi3],1;" - "fsub.h %[t0],zero,%[t0];" - "fsub.h %[t2],zero,%[t2];" - "fsub.h %[t4],zero,%[t4];" - "pv.pack.h %[C1],%[t1],%[t0];" - "pv.pack.h %[C2],%[t3],%[t2];" - "pv.pack.h %[C3],%[t5],%[t4];" - : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3), - [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2), - [t3] "=&r"(t3), [t4] "=&r"(t4), [t5] "=&r"(t5) - : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) - :); - for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - __fp16 *pIn = pSrc16 + idx_row * (N_BANKS * 8); - __fp16 *pOut = pDst16 + idx_row * (N_BANKS * 8); - radix4_butterfly(pIn, pOut, j, n2, CoSi1, CoSi2, CoSi3, C1, C2, - C3); - } - } - pTmp = pSrc16; - pSrc16 = pDst16; - pDst16 = pTmp; - pTmp = pCoef_src; - pCoef_src = pCoef_dst; - pCoef_dst = pTmp; - mempool_log_partial_barrier(2, absolute_core_id, nPE); - } - - /* LAST STAGE */ - n1 = n2; - n2 >>= 2U; - for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, fftLen >> 2U); i0++) { - for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - 
__fp16 *pIn = pSrc16 + idx_row * (N_BANKS * 8); - __fp16 *pOut = pDst16 + idx_row * (N_BANKS * 8); - radix4_butterfly_last(pIn, pOut, i0); - } - } - pTmp = pSrc16; - pSrc16 = pDst16; - pDst16 = pTmp; - mempool_log_partial_barrier(2, absolute_core_id, nPE); - - mempool_stop_benchmark(); - mempool_start_benchmark(); - - /* BITREVERSAL */ - // Bitreversal stage stores in the sequential addresses - if (bitReverseFlag) { -#ifdef BITREVERSETABLE - uint16_t *ptr1 = (uint16_t *)(pSrc16 + 2 * col_id * (fftLen >> 2U)); - uint16_t *ptr2 = (uint16_t *)(pDst16 + 2 * col_id * (3 * (fftLen >> 2))); - for (j = 2 * core_id; j < bitReverseLen; j += 2 * nPE) { - v2h addr, tmpa, tmpb; - addr = __SRA2(*(v2h *)&pBitRevTable[j], ((v2h){2, 2})); - for (int32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - int32_t a0 = addr[0] / 4 + (addr[0] % 4) * N_BANKS; - int32_t a1 = addr[1] / 4 + (addr[0] % 4) * N_BANKS; - tmpa = *(v2h *)&ptr1[a0 + idx_row * (N_BANKS * 8)]; - tmpb = *(v2h *)&ptr1[a1 + idx_row * (N_BANKS * 8)]; - *((v2h *)&ptr2[addr[0] + idx_row * (N_BANKS * 8)]) = tmpb; - *((v2h *)&ptr2[addr[1] + idx_row * (N_BANKS * 8)]) = tmpa; - } - } -#else - uint16_t *ptr1 = (uint16_t *)(pSrc16 + 2 * col_id * (fftLen >> 2U)); - uint16_t *ptr2 = (uint16_t *)(pDst16 + 2 * col_id * (3 * (fftLen >> 2))); - for (j = core_id * 16; j < MIN(core_id * 16 + 16, fftLen >> 2U); j += 4) { - uint32_t idx0 = j; - uint32_t idx1 = j + 1; - uint32_t idx2 = j + 2; - uint32_t idx3 = j + 3; - uint32_t idx_result0 = 0; - uint32_t idx_result1 = 0; - uint32_t idx_result2 = 0; - uint32_t idx_result3 = 0; - for (k = 0; k < LOG2; k++) { - idx_result0 = (idx_result0 << 1U) | (idx0 & 1U); - idx_result1 = (idx_result1 << 1U) | (idx1 & 1U); - idx_result2 = (idx_result2 << 1U) | (idx2 & 1U); - idx_result3 = (idx_result3 << 1U) | (idx3 & 1U); - idx0 = idx0 >> 1U; - idx1 = idx1 >> 1U; - idx2 = idx2 >> 1U; - idx3 = idx3 >> 1U; - } - for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - uint32_t addr_src0 = (idx0 / 4) + (idx0 % 4) * N_BANKS; - uint32_t addr_src1 = (idx1 / 4) + (idx1 % 4) * N_BANKS; - uint32_t addr_src2 = (idx2 / 4) + (idx2 % 4) * N_BANKS; - uint32_t addr_src3 = (idx3 / 4) + (idx3 % 4) * N_BANKS; - uint32_t addr_dst0 = idx_result0; - uint32_t addr_dst1 = idx_result1; - uint32_t addr_dst2 = idx_result2; - uint32_t addr_dst3 = idx_result3; - addr_src0 += idx_row * (N_BANKS * 8); - addr_src1 += idx_row * (N_BANKS * 8); - addr_src2 += idx_row * (N_BANKS * 8); - addr_src3 += idx_row * (N_BANKS * 8); - addr_dst0 += idx_row * (N_BANKS * 8); - addr_dst1 += idx_row * (N_BANKS * 8); - addr_dst2 += idx_row * (N_BANKS * 8); - addr_dst3 += idx_row * (N_BANKS * 8); - *((uint32_t *)&ptr2[addr_dst0]) = (uint32_t)ptr1[addr_src0]; - *((uint32_t *)&ptr2[addr_dst1]) = (uint32_t)ptr1[addr_src1]; - *((uint32_t *)&ptr2[addr_dst2]) = (uint32_t)ptr1[addr_src2]; - *((uint32_t *)&ptr2[addr_dst3]) = (uint32_t)ptr1[addr_src3]; - } - } -#endif - } - mempool_log_partial_barrier(2, absolute_core_id, nPE); -} diff --git a/software/runtime/kernel/mempool_cfft_radix4_q16_bitreversal.h b/software/runtime/kernel/mempool_cfft_radix4_q16_bitreversal.h deleted file mode 100644 index 32f7a5265..000000000 --- a/software/runtime/kernel/mempool_cfft_radix4_q16_bitreversal.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -void mempool_bitrev_q16p_xpulpimg(uint16_t *pSrc, uint16_t *pDst, - const uint16_t fftLen, const uint32_t nPE) { - uint32_t absolute_core_id = mempool_get_core_id(); - uint32_t core_id = absolute_core_id / WU_STRIDE; - uint32_t idx_result, idx, i, j; - for (i = core_id; i < fftLen; i += nPE) { - idx_result = 0; - idx = i; - for (j = 0; j < LOG2; j++) { - idx_result = (idx_result << 1U) | (idx & 1U); - idx = idx >> 1U; - } - pDst[2 * idx_result] = pSrc[2 * i]; - pDst[2 * idx_result + 1] = pSrc[2 * i + 1]; - } - mempool_log_partial_barrier(2 * WU_STRIDE, absolute_core_id, nPE * WU_STRIDE); -} diff --git a/software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h b/software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h new file mode 100644 index 000000000..fbb6964ac --- /dev/null +++ b/software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h @@ -0,0 +1,337 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#include "xpulp/builtins_v2.h" + +/** + @brief First butterfly stage. + @param[in] pIn points to input buffer of 16b data, Re and Im parts are + interleaved + @param[out] pOut points to output buffer of 16b data, Re and Im parts are + interleaved + @param[in] i0 points to the first element to be processed + @param[in] n2 number of elements in the first wing of the butterfly + @param[in] CoSi1 packed cosine and sine first twiddle + @param[in] CoSi2 packed cosine and sine second twiddle + @param[in] CoSi3 packed cosine and sine third twiddle + @param[in] C1 packed sine and cosine first twiddle + @param[in] C2 packed sine and cosine second twiddle + @param[in] C3 packed sine and cosine third twiddle + @return none +*/ +static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut, + uint32_t i0, uint32_t n2, v2h CoSi1, + v2h CoSi2, v2h CoSi3, v2h C1, v2h C2, + v2h C3) { + __fp16 t0, t1, t2, t3; + uint32_t i1, i2, i3; + uint32_t i0_store, i1_store, i2_store, i3_store; + float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f, s4 = 0.0f, s5 = 0.0f; + v2h A, B, C, D, E, F, G, H; + +// LOAD INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; +#else + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; +#endif +// STORE INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + uint32_t n2_store = n2 >> 2U; + i0_store = (i0 % n2_store) + (i0 / n2_store) * N_BANKS; + i1_store = i0_store + n2_store; + i2_store = i1_store + n2_store; + i3_store = i2_store + n2_store; +#else + i0_store = i0; + i1_store = i1; + i2_store = i2; + i3_store = i3; +#endif + + /* Read yb (real), xb(imag) input */ + B = *(v2h *)&pIn[i1 * 2U]; + /* Read yd (real), xd(imag) input */ + D = *(v2h *)&pIn[i3 * 2U]; + /* Read ya (real), xa (imag) input */ + A = *(v2h *)&pIn[i0 * 2U]; + /* Read yc (real), xc(imag) input */ + C = *(v2h *)&pIn[i2 * 2U]; + asm volatile( + // xb - xd, yb - yd + "vfsub.h %[H],%[B],%[D];" + // xb + xd, yd + yd + "vfadd.h %[G],%[B],%[D];" + // xa + xc, ya + yc + "vfadd.h %[E],%[A],%[C];" + "pv.extract.h %[t0],%[H],0;" // yb - yd + "pv.extract.h %[t1],%[H],1;" // xb - 
xd + // xa - xc, ya - yc + "vfsub.h %[F],%[A],%[C];" + + "xor %[t2],%[t0],%[neg_mask];" // yd - yb + "xor %[t3],%[t1],%[neg_mask];" // xd - xb + "pv.pack.h %[D],%[t2],%[t1];" // yd - yb, xb - xd + "pv.pack.h %[C],%[t0],%[t3];" // yb - yd, xd - xb + + // xa + xc + xb + xd, ya + yb + yc + yd + "vfadd.h %[A],%[E],%[G];" + // xa + xc - xb - xd, ya + yc - yb - yd + "vfsub.h %[B],%[E],%[G];" + // xa - xc + yb - yd, ya - yc + xd - xb + "vfadd.h %[C],%[F],%[C];" + // xa - xc + yd - yb, ya - yc + xb - xd + "vfadd.h %[D],%[F],%[D];" + + // Co2(xa + xc - xb - xd), Si2(ya + yc - yb - yd) + "vfdotpex.s.h %[s0],%[CoSi2],%[B];" + //-Si2(xa + xc - xb - xd), Co2(ya + yc - yb - yd) + "vfdotpex.s.h %[s1],%[C2],%[B];" + + // Co1(xa - xc + yd - yb), Si1(ya - yc + xb - xd) + "vfdotpex.s.h %[s2],%[CoSi1],%[D];" + //-Si1(xa - xc + yd - yb), Co1(ya - yc + xb - xd) + "vfdotpex.s.h %[s3],%[C1],%[D];" + + // Co3(xa - xc + yb - yd), Si3(ya - yc + xd - xb) + "vfdotpex.s.h %[s4],%[CoSi3],%[C];" + //-Si3(xa - xc + yb - yd), Co3(ya - yc + xd - xb) + "vfdotpex.s.h %[s5],%[C3],%[C];" + + // xb', yb' + "vfcpka.h.s %[B], %[s0], %[s1];" + // xc', yc' + "vfcpka.h.s %[C], %[s2], %[s3];" + // xd', yd' + "vfcpka.h.s %[D], %[s4], %[s5];" + : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "+&r"(E), + [F] "+&r"(F), [G] "+&r"(G), [H] "+&r"(H), [t0] "=&r"(t0), + [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [s0] "=&r"(s0), + [s1] "=&r"(s1), [s2] "=&r"(s2), [s3] "=&r"(s3), [s4] "=&r"(s4), + [s5] "=&r"(s5) + : [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1), + [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3), [neg_mask] "r"(0x00008000) + :); + *((v2h *)&pOut[i0_store * 2U]) = A; + *((v2h *)&pOut[i1_store * 2U]) = B; + *((v2h *)&pOut[i2_store * 2U]) = D; + *((v2h *)&pOut[i3_store * 2U]) = C; +} + +/** + @brief Middle butterfly stage. 
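+  In standard radix-4 decimation-in-frequency form, for complex inputs
+  a, b, c, d read fftLen/4 apart and twiddle factors W1, W2, W3, the stage
+  computes:
+    out0 = (a + c) + (b + d)
+    out1 = W2 * [(a + c) - (b + d)]
+    out2 = W1 * [(a - c) - j(b - d)]
+    out3 = W3 * [(a - c) + j(b - d)]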
+ @param[in] pIn points to input buffer of 16b data, Re and Im parts are + interleaved + @param[out] pOut points to output buffer of 16b data, Re and Im parts are + interleaved + @param[in] i0 points to the first element to be processed + @param[in] n2 number of elements in the first wing of the butterfly + @param[in] CoSi1 packed cosine and sine first twiddle + @param[in] CoSi2 packed cosine and sine second twiddle + @param[in] CoSi3 packed cosine and sine third twiddle + @param[in] C1 packed sine and cosine first twiddle + @param[in] C2 packed sine and cosine second twiddle + @param[in] C3 packed sine and cosine third twiddle + @return none +*/ +static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut, + uint32_t i0, uint32_t n2, v2h CoSi1, + v2h CoSi2, v2h CoSi3, v2h C1, v2h C2, + v2h C3) { + __fp16 t0, t1, t2, t3; + uint32_t i1, i2, i3; + uint32_t i0_store, i1_store, i2_store, i3_store; + float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f, s4 = 0.0f, s5 = 0.0f; + v2h A, B, C, D, E, F, G, H; + +// LOAD INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + + * 3fftLen/4] */ + i1 = i0 + N_BANKS; + i2 = i1 + N_BANKS; + i3 = i2 + N_BANKS; +#else + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + + * 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; +#endif +// STORE INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + uint32_t n2_store = n2 >> 2U; + i0_store = + (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * N_BANKS; + i1_store = i0_store + n2_store; + i2_store = i1_store + n2_store; + i3_store = i2_store + n2_store; +#else + i0_store = i0; + i1_store = i1; + i2_store = i2; + i3_store = i3; +#endif + + /* Read yb (real), xb(imag) input */ + B = *(v2h *)&pIn[i1 * 2U]; + /* Read yd (real), xd(imag) input */ + D = *(v2h *)&pIn[i3 * 2U]; + /* Read ya (real), xa (imag) input */ + A = *(v2h *)&pIn[i0 * 2U]; + /* Read yc (real), xc(imag) input */ + C = *(v2h *)&pIn[i2 * 2U]; + asm volatile( + // xb - xd, yb - yd + "vfsub.h %[H],%[B],%[D];" + // xb + xd, yd + yd + "vfadd.h %[G],%[B],%[D];" + // xa + xc, ya + yc + "vfadd.h %[E],%[A],%[C];" + "pv.extract.h %[t0],%[H],1;" // yb - yd + "pv.extract.h %[t1],%[H],0;" // xb - xd + // xa - xc, ya - yc + "vfsub.h %[F],%[A],%[C];" + + "xor %[t2],%[t0],%[neg_mask];" // yd - yb + "xor %[t3],%[t1],%[neg_mask];" // xd - xb + "pv.pack.h %[D],%[t2],%[t1];" // yd - yb, xb - xd + "pv.pack.h %[C],%[t0],%[t3];" // yb - yd, xd - xb + + // xa + xc + xb + xd, ya + yb + yc + yd + "vfadd.h %[A],%[E],%[G];" + // xa + xc - xb - xd, ya + yc - yb - yd + "vfsub.h %[B],%[E],%[G];" + // xa - xc + yb - yd, ya - yc + xd - xb + "vfadd.h %[C],%[F],%[C];" + // xa - xc + yd - yb, ya - yc + xb - xd + "vfadd.h %[D],%[F],%[D];" + + // Co2(xa + xc - xb - xd), Si2(ya + yc - yb - yd) + "vfdotpex.s.h %[s0],%[CoSi2],%[B];" + //-Si2(xa + xc - xb - xd), Co2(ya + yc - yb - yd) + "vfdotpex.s.h %[s1],%[C2],%[B];" + + // Co1(xa - xc + yd - yb), Si1(ya - yc + xb - xd) + "vfdotpex.s.h %[s2],%[CoSi1],%[D];" + //-Si1(xa - xc + yd - yb), Co1(ya - yc + xb - xd) + "vfdotpex.s.h %[s3],%[C1],%[D];" + + // Co3(xa - xc + yb - yd), Si3(ya - yc + xd - xb) + "vfdotpex.s.h %[s4],%[CoSi3],%[C];" + //-Si3(xa - xc + yb - yd), Co3(ya - yc + xd - xb) + "vfdotpex.s.h %[s5],%[C3],%[C];" + + // xb', yb' + "vfcpka.h.s %[B], %[s0], %[s1];" + // xc', yc' + "vfcpka.h.s %[C], %[s2], %[s3];" + // xd', yd' + "vfcpka.h.s %[D], 
%[s4], %[s5];" + : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "+&r"(E), + [F] "+&r"(F), [G] "+&r"(G), [H] "+&r"(H), [t0] "=&r"(t0), + [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [s0] "=&r"(s0), + [s1] "=&r"(s1), [s2] "=&r"(s2), [s3] "=&r"(s3), [s4] "=&r"(s4), + [s5] "=&r"(s5) + : [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1), + [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3), [neg_mask] "r"(0x00008000) + :); + + *((v2h *)&pOut[i0_store * 2U]) = A; + *((v2h *)&pOut[i1_store * 2U]) = B; + *((v2h *)&pOut[i2_store * 2U]) = D; + *((v2h *)&pOut[i3_store * 2U]) = C; +} + +/** + @brief Last butterfly stage. + @param[in] pIn points to input buffer of 16b data, Re and Im parts are + interleaved + @param[out] pOut points to output buffer of 16b data, Re and Im parts are + interleaved + @param[in] i0 points to the first element to be processed + @return none +*/ +static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut, + uint32_t i0) { + __fp16 t0, t1; + uint32_t i1, i2, i3; + uint32_t i0_store, i1_store, i2_store, i3_store; + v2h A, B, C, D, E, F, G, H; + +// LOAD INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], + pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ + i1 = i0 + N_BANKS; + i2 = i1 + N_BANKS; + i3 = i2 + N_BANKS; +#else + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], + pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ + i1 = i0 + 1U; + i2 = i1 + 1U; + i3 = i2 + 1U; +#endif +// STORE INDEXES +#if defined(FOLDED) + i0_store = i0 * 4; + i1_store = i0_store + 1; + i2_store = i1_store + 1; + i3_store = i2_store + 1; +#else + i0_store = i0; + i1_store = i1; + i2_store = i2; + i3_store = i3; +#endif + + /* Read yb (real), xb(imag) input */ + B = *(v2h *)&pIn[i1 * 2U]; + /* Read yd (real), xd(imag) input */ + D = *(v2h *)&pIn[i3 * 2U]; + /* Read ya (real), xa(imag) input */ + A = *(v2h *)&pIn[i0 * 2U]; + /* Read yc (real), xc(imag) input */ + C = *(v2h *)&pIn[i2 * 2U]; + __fp16 t2, t3; + asm volatile("vfsub.h %[H],%[B],%[D];" + "vfadd.h %[G],%[B],%[D];" + "vfadd.h %[E],%[A],%[C];" + "vfsub.h %[F],%[A],%[C];" + "pv.extract.h %[t0],%[H],1;" + "pv.extract.h %[t1],%[H],0;" + "xor %[t2],%[t0],%[neg_mask];" + "xor %[t3],%[t1],%[neg_mask];" + "pv.pack.h %[A],%[t2],%[t1];" + "pv.pack.h %[B],%[t0],%[t3];" + "vfadd.h %[H],%[E],%[G];" + "vfsub.h %[E],%[E],%[G];" + "vfadd.h %[A],%[F],%[A];" + "vfadd.h %[B],%[F],%[B];" + : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), + [E] "=&r"(E), [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), + [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3) + : [neg_mask] "r"(0x00008000) + :); + + *((v2h *)&pOut[i0_store * 2U]) = H; + *((v2h *)&pOut[i1_store * 2U]) = E; + *((v2h *)&pOut[i2_store * 2U]) = A; + *((v2h *)&pOut[i3_store * 2U]) = B; +} diff --git a/software/runtime/kernel/mempool_radix4_cfft_f16p.h b/software/runtime/kernel/mempool_radix4_cfft_f16p.h new file mode 100644 index 000000000..2076a108a --- /dev/null +++ b/software/runtime/kernel/mempool_radix4_cfft_f16p.h @@ -0,0 +1,441 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#include "xpulp/builtins_v2.h" +#define MIN(x, y) (((x) < (y)) ? 
(x) : (y)) + +#define SHUFFLE_TWIDDLEFACT \ + asm volatile("pv.extract.h %[t1],%[CoSi1],0;" \ + "pv.extract.h %[t3],%[CoSi2],0;" \ + "pv.extract.h %[t5],%[CoSi3],0;" \ + "pv.extract.h %[t0],%[CoSi1],1;" \ + "pv.extract.h %[t2],%[CoSi2],1;" \ + "pv.extract.h %[t4],%[CoSi3],1;" \ + "xor %[t1],%[t1],%[neg_mask];" \ + "xor %[t3],%[t3],%[neg_mask];" \ + "xor %[t5],%[t5],%[neg_mask];" \ + "pv.pack.h %[C1],%[t0],%[t1];" \ + "pv.pack.h %[C2],%[t2],%[t3];" \ + "pv.pack.h %[C3],%[t4],%[t5];" \ + : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3), [t0] "=&r"(t0), \ + [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), \ + [t4] "=&r"(t4), [t5] "=&r"(t5) \ + : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3), \ + [neg_mask] "r"(0x00008000) \ + :); + +#ifdef FOLDED_TWIDDLES + +#define LOAD_STORE_TWIDDLEFACT \ + CoSi1 = *(v2h *)&pCoef_src[2U * ic]; \ + CoSi2 = *(v2h *)&pCoef_src[2U * (ic + 1 * N_BANKS)]; \ + CoSi3 = *(v2h *)&pCoef_src[2U * (ic + 2 * N_BANKS)]; \ + if (ic % 4 == 0) { \ + *((v2h *)&pCoef_dst[2U * (ic_store)]) = CoSi1; \ + *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi1; \ + *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi1; \ + *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi1; \ + ic_store += N_BANKS; \ + *((v2h *)&pCoef_dst[2U * (ic_store)]) = CoSi2; \ + *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi2; \ + *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi2; \ + *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi2; \ + ic_store += N_BANKS; \ + *((v2h *)&pCoef_dst[2U * (ic_store)]) = CoSi3; \ + *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi3; \ + *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi3; \ + *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi3; \ + } + +#else +#define LOAD_STORE_TWIDDLEFACT \ + CoSi1 = *(v2h *)&pCoef_src[2U * ic]; \ + CoSi2 = *(v2h *)&pCoef_src[2U * (ic * 2U)]; \ + CoSi3 = *(v2h *)&pCoef_src[2U * (ic * 3U)]; +#endif + + + +#ifdef FOLDED_TWIDDLES +/** + @brief Full FFT butterfly + @param[in] pSrc16 points to input buffer of 16b data, Re and Im parts are + interleaved + @param[out] pDst16 points to output buffer of 16b data, Re and Im parts + are interleaved + @param[in] fftLen Length of the complex input vector + @param[in] pCoef_src Twiddle coefficients vector + @param[in] pCoef_dst Auxiliary twiddle coefficients vector + @param[in] nPE Number of PE + @return pointer to output vector +*/ +void mempool_radix4_cfft_f16p_folded(__fp16 *pSrc16, __fp16 *pDst16, + uint32_t fftLen, __fp16 *pCoef_src, + __fp16 *pCoef_dst, uint32_t nPE) +#else +/** + Twiddles are not folded in memory + @brief Full FFT butterfly + @param[in] pSrc16 points to input buffer of 16b data, Re and Im parts are + interleaved + @param[out] pDst16 points to output buffer of 16b data, Re and Im parts + are interleaved + @param[in] fftLen Length of the complex input vector + @param[in] pCoef_src Twiddle coefficients vector + @param[in] nPE Number of PE + @return pointer to output vector +*/ +void mempool_radix4_cfft_f16p_folded(__fp16 *pSrc16, __fp16 *pDst16, + uint32_t fftLen, __fp16 *pCoef_src, + uint32_t nPE) +#endif +{ + + uint32_t absolute_core_id = mempool_get_core_id(); + uint32_t core_id = absolute_core_id; + __fp16 t0, t1, t2, t3, t4, t5; + v2h CoSi1, CoSi2, CoSi3; + v2h C1, C2, C3; +#ifdef FOLDED_TWIDDLES + uint32_t n1, n2, n2_store; + uint32_t i0, k, ic, ic_store; + __fp16 *pTmp; +#else + uint32_t n1, n2; + uint32_t i0, k, ic; + __fp16 *pTmp; + uint32_t twidCoefModifier = 1U; +#endif + + /* 
START OF FIRST STAGE PROCESSING */ + n1 = fftLen; + n2 = n1 >> 2U; + for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, n2); i0++) { + +#ifdef FOLDED_TWIDDLES + ic = i0; + ic_store = ic >> 2U; + n2_store = n2 >> 2U; +#else + ic = i0; +#endif + LOAD_STORE_TWIDDLEFACT; + SHUFFLE_TWIDDLEFACT; + radix4_butterfly_first(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, + C3); + } + pTmp = pSrc16; + pSrc16 = pDst16; + pDst16 = pTmp; +#ifdef FOLDED_TWIDDLES + pTmp = pCoef_src; + pCoef_src = pCoef_dst; + pCoef_dst = pTmp; +#else + twidCoefModifier <<= 2U; +#endif + mempool_log_partial_barrier(2, absolute_core_id, nPE); + /* END OF FIRST STAGE PROCESSING */ + + /* START OF MIDDLE STAGE PROCESSING */ + for (k = fftLen / 4U; k > 4U; k >>= 2U) { + n1 = n2; + n2 >>= 2U; + for (i0 = core_id * 4; i0 < core_id * 4 + 4; i0++) { +#ifdef FOLDED_TWIDDLES + ic = i0; + // (ic % n2) / 4 take only every 4th index in the wing + // (ic / n2) * n2 shift of the wing size + ic_store = ((ic % n2) >> 2) + (ic / n2) * n2; + n2_store = n2 >> 2U; +#else + ic = (i0 % n2) * twidCoefModifier; +#endif + LOAD_STORE_TWIDDLEFACT; + SHUFFLE_TWIDDLEFACT; + radix4_butterfly_middle(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3, C1, + C2, C3); + } + pTmp = pSrc16; + pSrc16 = pDst16; + pDst16 = pTmp; +#ifdef FOLDED_TWIDDLES + pTmp = pCoef_src; + pCoef_src = pCoef_dst; + pCoef_dst = pTmp; +#else + twidCoefModifier <<= 2U; +#endif + mempool_log_partial_barrier(2, absolute_core_id, nPE); + } + /* END OF MIDDLE STAGE PROCESSING */ + + /* START OF LAST STAGE PROCESSING */ + for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, fftLen >> 2U); i0++) { + radix4_butterfly_last(pSrc16, pDst16, i0); + } + mempool_log_partial_barrier(2, absolute_core_id, nPE); + /* END OF LAST STAGE PROCESSING */ + return; +} + +/** + SCHEDULER OF MULTIPLE FOLDED FFTS + Memory: + + 1st row of FFTS + + col_idx1 col_idx2 col_idx3 + xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... + xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... + xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... + xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... + + 2nd row of FFTS + + col_idx1 col_idx2 col_idx3 + xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... + xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... + xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... + xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... + + ... 
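+
+  Each block of x's above is one folded FFT: the N_FFTs_COL FFTs of a row lie
+  side by side in the same memory rows (one column of banks per FFT), and the
+  N_FFTs_ROW rows, spaced 8 * N_BANKS __fp16 elements apart, are traversed by
+  the idx_row loops below. Twiddle factors are loaded once per column and
+  reused by every row.
+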
+ + @brief Scheduler of folded FFTs + @param[in] column index of the current FFT + @param[in] pSrc16 input buffer of 16b data, Re and Im are interleaved + @param[out] pDst16 output buffer of 16b data, Re and Im are interleaved + @param[in] fftLen Length of the complex input vector + @param[in] pCoef_src Twiddle coefficients vector + @param[in] pCoef_dst Twiddle coefficients vector + @param[in] pBitRevTable Bitreversal table + @param[in] bitReverseLen Length of bitreversal table + @param[in] bitReverseFlag Flag for bitreversal + @param[in] nPE Number of PE + @return void +*/ + +void mempool_radix4_cfft_f16p_scheduler( + __fp16 *pSrc16, __fp16 *pDst16, uint32_t fftLen, + __fp16 *pCoef_src, __fp16 *pCoef_dst, __attribute__((unused)) + uint16_t *pBitRevTable, __attribute__((unused)) uint16_t bitReverseLen, + uint8_t bitReverseFlag, uint32_t nPE) { + + uint32_t absolute_core_id = mempool_get_core_id(); + uint32_t core_id = absolute_core_id % (fftLen >> 4U); + uint32_t col_id = absolute_core_id / (fftLen >> 4U); + + __fp16 t0, t1, t2, t3, t4, t5; + v2h CoSi1, CoSi2, CoSi3; + v2h C1, C2, C3; +#ifdef FOLDED_TWIDDLES + uint32_t n1, n2, n2_store; + uint32_t i0, k, ic, ic_store; +#else + uint32_t n1, n2; + uint32_t i0, k, ic; + uint32_t twidCoefModifier = 1U; +#endif + __fp16 *pTmp; + + /* FIRST STAGE */ + n1 = fftLen; + n2 = n1 >> 2U; + for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, n2); i0++) { + ic = i0; +#ifdef FOLDED_TWIDDLES + ic_store = ic >> 2U; + n2_store = n2 >> 2U; +#endif + LOAD_STORE_TWIDDLEFACT; + SHUFFLE_TWIDDLEFACT; + for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { + __fp16 *pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * fftLen; + __fp16 *pOut = + pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + radix4_butterfly_first(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, + C3); + } + } + pTmp = pSrc16; + pSrc16 = pDst16; + pDst16 = pTmp; + pTmp = pCoef_src; + pCoef_src = pCoef_dst; + pCoef_dst = pTmp; + mempool_log_partial_barrier(2, absolute_core_id, nPE); + + /* MIDDLE STAGE */ + for (k = fftLen / 4U; k > 4U; k >>= 2U) { + n1 = n2; + n2 >>= 2U; + for (i0 = core_id * 4; i0 < core_id * 4 + 4; i0++) { +#ifdef FOLDED_TWIDDLES + ic = i0; + ic_store = ((ic % n2) >> 2) + (ic / n2) * n2; + n2_store = n2 >> 2U; +#else + ic = (i0 % n2) * twidCoefModifier; +#endif + LOAD_STORE_TWIDDLEFACT; + SHUFFLE_TWIDDLEFACT; + + for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { + __fp16 *pIn = + pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + __fp16 *pOut = + pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + radix4_butterfly_middle(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, + C3); + } + } + pTmp = pSrc16; + pSrc16 = pDst16; + pDst16 = pTmp; + pTmp = pCoef_src; + pCoef_src = pCoef_dst; + pCoef_dst = pTmp; + mempool_log_partial_barrier(2, absolute_core_id, N_FFTs_COL * nPE); + } + + /* LAST STAGE */ + for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, fftLen >> 2U); i0++) { + for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { + __fp16 *pIn = + pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + __fp16 *pOut = + pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + radix4_butterfly_last(pIn, pOut, i0); + } + } + pTmp = pSrc16; + pSrc16 = pDst16; + pDst16 = pTmp; + mempool_log_partial_barrier(2, absolute_core_id, N_FFTs_COL * nPE); + mempool_stop_benchmark(); + mempool_start_benchmark(); + /* BITREVERSAL */ + // Bitreversal stage stores in the sequential addresses + if (bitReverseFlag) { +#ifdef 
BITREVERSETABLE + pSrc16 = pSrc16 + 2 * col_id * (fftLen / 4); + pDst16 = pDst16 + 2 * col_id * fftLen; + for (ic = 8 * core_id; ic < bitReverseLen; ic += 8 * nPE) { + uint32_t addr1, addr2, addr3, addr4; + uint32_t tmpa1, tmpa2, tmpa3, tmpa4; + uint32_t tmpb1, tmpb2, tmpb3, tmpb4; + uint32_t a1, a2, a3, a4; + uint32_t b1, b2, b3, b4; + uint32_t a1_load, a2_load, a3_load, a4_load; + uint32_t b1_load, b2_load, b3_load, b4_load; + uint32_t s2 = 0x00020002; + addr1 = *(uint32_t *)&pBitRevTable[ic]; + addr2 = *(uint32_t *)&pBitRevTable[ic + 2]; + addr3 = *(uint32_t *)&pBitRevTable[ic + 4]; + addr4 = *(uint32_t *)&pBitRevTable[ic + 6]; + asm volatile("pv.sra.h %[addr1],%[addr1],%[s2];" + "pv.sra.h %[addr2],%[addr2],%[s2];" + "pv.sra.h %[addr3],%[addr3],%[s2];" + "pv.sra.h %[addr4],%[addr4],%[s2];" + "pv.extract.h %[a1],%[addr1],0;" + "pv.extract.h %[a2],%[addr2],0;" + "pv.extract.h %[a3],%[addr3],0;" + "pv.extract.h %[a4],%[addr4],0;" + "pv.extract.h %[b1],%[addr1],1;" + "pv.extract.h %[b2],%[addr2],1;" + "pv.extract.h %[b3],%[addr3],1;" + "pv.extract.h %[b4],%[addr4],1;" + : [a1] "=r"(a1), [a2] "=r"(a2), [a3] "=r"(a3), [a4] "=r"(a4), + [b1] "=r"(b1), [b2] "=r"(b2), [b3] "=r"(b3), [b4] "=r"(b4), + [addr1] "+&r"(addr1), [addr2] "+&r"(addr2), + [addr3] "+&r"(addr3), [addr4] "+&r"(addr4) + : [s2] "r"(s2) + :); + // Compute the local addresses from the natural order ones + a1_load = (a1 % 4) * 2 * N_BANKS + 2 * (a1 / 4); + a2_load = (a2 % 4) * 2 * N_BANKS + 2 * (a2 / 4); + a3_load = (a3 % 4) * 2 * N_BANKS + 2 * (a3 / 4); + a4_load = (a4 % 4) * 2 * N_BANKS + 2 * (a4 / 4); + b1_load = (b1 % 4) * 2 * N_BANKS + 2 * (b1 / 4); + b2_load = (b2 % 4) * 2 * N_BANKS + 2 * (b2 / 4); + b3_load = (b3 % 4) * 2 * N_BANKS + 2 * (b3 / 4); + b4_load = (b4 % 4) * 2 * N_BANKS + 2 * (b4 / 4); + for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { + uint16_t *ptr1 = (uint16_t *)(pSrc16 + idx_row * (N_BANKS * 8)); + uint16_t *ptr2 = (uint16_t *)(pDst16 + idx_row * (N_BANKS * 8)); + // Load at address a + tmpa1 = *(uint32_t *)&ptr1[a1_load]; + tmpa2 = *(uint32_t *)&ptr1[a2_load]; + tmpa3 = *(uint32_t *)&ptr1[a3_load]; + tmpa4 = *(uint32_t *)&ptr1[a4_load]; + // Load at address b + tmpb1 = *(uint32_t *)&ptr1[b1_load]; + tmpb2 = *(uint32_t *)&ptr1[b2_load]; + tmpb3 = *(uint32_t *)&ptr1[b3_load]; + tmpb4 = *(uint32_t *)&ptr1[b4_load]; + // Swap a with b + *((uint32_t *)&ptr2[b1]) = tmpa1; + *((uint32_t *)&ptr2[b2]) = tmpa2; + *((uint32_t *)&ptr2[b3]) = tmpa3; + *((uint32_t *)&ptr2[b4]) = tmpa4; + // Swap b with a + *((uint32_t *)&ptr2[a1]) = tmpb1; + *((uint32_t *)&ptr2[a2]) = tmpb2; + *((uint32_t *)&ptr2[a3]) = tmpb3; + *((uint32_t *)&ptr2[a4]) = tmpb4; + } + } +#else + uint16_t *ptr1 = (uint16_t *)(pSrc16 + 2 * col_id * (fftLen / 4)); + uint16_t *ptr2 = (uint16_t *)(pDst16 + 2 * col_id * fftLen); + for (ic = core_id * 16; ic < MIN(core_id * 16 + 16, fftLen >> 2U); + ic += 4) { + uint32_t idx0 = ic; + uint32_t idx1 = ic + 1; + uint32_t idx2 = ic + 2; + uint32_t idx3 = ic + 3; + uint32_t idx_result0 = 0; + uint32_t idx_result1 = 0; + uint32_t idx_result2 = 0; + uint32_t idx_result3 = 0; + for (k = 0; k < LOG2; k++) { + idx_result0 = (idx_result0 << 1U) | (idx0 & 1U); + idx_result1 = (idx_result1 << 1U) | (idx1 & 1U); + idx_result2 = (idx_result2 << 1U) | (idx2 & 1U); + idx_result3 = (idx_result3 << 1U) | (idx3 & 1U); + idx0 = idx0 >> 1U; + idx1 = idx1 >> 1U; + idx2 = idx2 >> 1U; + idx3 = idx3 >> 1U; + } + for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { + uint32_t addr_src0 = (idx0 / 4) + (idx0 % 4) * 
+      for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+        uint16_t *ptr1 = (uint16_t *)(pSrc16 + idx_row * (N_BANKS * 8));
+        uint16_t *ptr2 = (uint16_t *)(pDst16 + idx_row * (N_BANKS * 8));
+        // Load the complex pairs at the a addresses
+        tmpa1 = *(uint32_t *)&ptr1[a1_load];
+        tmpa2 = *(uint32_t *)&ptr1[a2_load];
+        tmpa3 = *(uint32_t *)&ptr1[a3_load];
+        tmpa4 = *(uint32_t *)&ptr1[a4_load];
+        // Load the complex pairs at the b addresses
+        tmpb1 = *(uint32_t *)&ptr1[b1_load];
+        tmpb2 = *(uint32_t *)&ptr1[b2_load];
+        tmpb3 = *(uint32_t *)&ptr1[b3_load];
+        tmpb4 = *(uint32_t *)&ptr1[b4_load];
+        // Store the values loaded at a to the b positions
+        *((uint32_t *)&ptr2[b1]) = tmpa1;
+        *((uint32_t *)&ptr2[b2]) = tmpa2;
+        *((uint32_t *)&ptr2[b3]) = tmpa3;
+        *((uint32_t *)&ptr2[b4]) = tmpa4;
+        // Store the values loaded at b to the a positions
+        *((uint32_t *)&ptr2[a1]) = tmpb1;
+        *((uint32_t *)&ptr2[a2]) = tmpb2;
+        *((uint32_t *)&ptr2[a3]) = tmpb3;
+        *((uint32_t *)&ptr2[a4]) = tmpb4;
+      }
+    }
+#else
+    uint16_t *ptr1 = (uint16_t *)(pSrc16 + 2 * col_id * (fftLen / 4));
+    uint16_t *ptr2 = (uint16_t *)(pDst16 + 2 * col_id * fftLen);
+    for (ic = core_id * 16; ic < MIN(core_id * 16 + 16, fftLen >> 2U);
+         ic += 4) {
+      uint32_t idx0 = ic;
+      uint32_t idx1 = ic + 1;
+      uint32_t idx2 = ic + 2;
+      uint32_t idx3 = ic + 3;
+      uint32_t idx_result0 = 0;
+      uint32_t idx_result1 = 0;
+      uint32_t idx_result2 = 0;
+      uint32_t idx_result3 = 0;
+      // Reverse the LOG2 address bits on working copies, so that the
+      // natural-order indices stay intact for the source addresses below
+      uint32_t tmp0 = idx0, tmp1 = idx1, tmp2 = idx2, tmp3 = idx3;
+      for (k = 0; k < LOG2; k++) {
+        idx_result0 = (idx_result0 << 1U) | (tmp0 & 1U);
+        idx_result1 = (idx_result1 << 1U) | (tmp1 & 1U);
+        idx_result2 = (idx_result2 << 1U) | (tmp2 & 1U);
+        idx_result3 = (idx_result3 << 1U) | (tmp3 & 1U);
+        tmp0 = tmp0 >> 1U;
+        tmp1 = tmp1 >> 1U;
+        tmp2 = tmp2 >> 1U;
+        tmp3 = tmp3 >> 1U;
+      }
+      for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+        uint32_t addr_src0 = (idx0 / 4) + (idx0 % 4) * N_BANKS;
+        uint32_t addr_src1 = (idx1 / 4) + (idx1 % 4) * N_BANKS;
+        uint32_t addr_src2 = (idx2 / 4) + (idx2 % 4) * N_BANKS;
+        uint32_t addr_src3 = (idx3 / 4) + (idx3 % 4) * N_BANKS;
+        uint32_t addr_dst0 = idx_result0;
+        uint32_t addr_dst1 = idx_result1;
+        uint32_t addr_dst2 = idx_result2;
+        uint32_t addr_dst3 = idx_result3;
+        addr_src0 += idx_row * (N_BANKS * 8);
+        addr_src1 += idx_row * (N_BANKS * 8);
+        addr_src2 += idx_row * (N_BANKS * 8);
+        addr_src3 += idx_row * (N_BANKS * 8);
+        addr_dst0 += idx_row * (N_BANKS * 8);
+        addr_dst1 += idx_row * (N_BANKS * 8);
+        addr_dst2 += idx_row * (N_BANKS * 8);
+        addr_dst3 += idx_row * (N_BANKS * 8);
+        // Copy the full complex pair (Re, Im) from the folded source
+        // location to the sequential bit-reversed destination
+        *((uint32_t *)&ptr2[addr_dst0]) = *((uint32_t *)&ptr1[addr_src0]);
+        *((uint32_t *)&ptr2[addr_dst1]) = *((uint32_t *)&ptr1[addr_src1]);
+        *((uint32_t *)&ptr2[addr_dst2]) = *((uint32_t *)&ptr1[addr_src2]);
+        *((uint32_t *)&ptr2[addr_dst3]) = *((uint32_t *)&ptr1[addr_src3]);
+      }
+    }
+#endif
+  }
+  mempool_log_partial_barrier(2, absolute_core_id, nPE);
+  return;
+}
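The software fallback above reverses the LOG2 address bits of each index with a simple shift-and-or loop. As a reference, a minimal standalone sketch of that computation (the helper name bit_reverse and the stdint.h include are illustrative, not part of the patch):

#include <stdint.h>

// Reverse the lowest log2len bits of idx, as in the #else branch of the
// scheduler's bit-reversal stage
static inline uint32_t bit_reverse(uint32_t idx, uint32_t log2len) {
  uint32_t result = 0;
  for (uint32_t k = 0; k < log2len; k++) {
    result = (result << 1U) | (idx & 1U); // shift in the LSB of idx
    idx >>= 1U;                           // consume one bit per iteration
  }
  return result;
}

// e.g. bit_reverse(1, 4) == 8 (0b0001 -> 0b1000), while
//      bit_reverse(6, 4) == 6 (0b0110 reads the same in both directions)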
diff --git a/software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h b/software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h
index 56f4f478b..e5380444c 100644
--- a/software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h
+++ b/software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h
@@ -26,14 +26,14 @@ void mempool_bitrevtable_q16s_riscv32(uint16_t *pSrc, const uint16_t bitRevLen,
 
 #ifndef ASM
 #define SWAP_ITEMS \
-  addr1 = *(v2s *)&pBitRevTab[i]; \
-  addr2 = *(v2s *)&pBitRevTab[i + 2]; \
-  addr3 = *(v2s *)&pBitRevTab[i + 4]; \
-  addr4 = *(v2s *)&pBitRevTab[i + 6]; \
-  addr1 = __SRA2(addr1, s2); \
-  addr2 = __SRA2(addr2, s2); \
-  addr3 = __SRA2(addr3, s2); \
-  addr4 = __SRA2(addr4, s2); \
+  addr1 = *(uint32_t *)&pBitRevTab[i]; \
+  addr2 = *(uint32_t *)&pBitRevTab[i + 2]; \
+  addr3 = *(uint32_t *)&pBitRevTab[i + 4]; \
+  addr4 = *(uint32_t *)&pBitRevTab[i + 6]; \
+  addr1 = __SRA2(*(v2s *)&addr1, *(v2s *)&s2); \
+  addr2 = __SRA2(*(v2s *)&addr2, *(v2s *)&s2); \
+  addr3 = __SRA2(*(v2s *)&addr3, *(v2s *)&s2); \
+  addr4 = __SRA2(*(v2s *)&addr4, *(v2s *)&s2); \
   a1 = addr1[1]; \
   a2 = addr2[1]; \
   a3 = addr3[1]; \
@@ -42,28 +42,28 @@ void mempool_bitrevtable_q16s_riscv32(uint16_t *pSrc, const uint16_t bitRevLen,
   b2 = addr2[0]; \
   b3 = addr3[0]; \
   b4 = addr4[0]; \
-  tmpa1 = *(v2s *)&pSrc[a1]; \
-  tmpa2 = *(v2s *)&pSrc[a2]; \
-  tmpa3 = *(v2s *)&pSrc[a3]; \
-  tmpa4 = *(v2s *)&pSrc[a4]; \
-  tmpb1 = *(v2s *)&pSrc[b1]; \
-  tmpb2 = *(v2s *)&pSrc[b2]; \
-  tmpb3 = *(v2s *)&pSrc[b3]; \
-  tmpb4 = *(v2s *)&pSrc[b4]; \
-  *((v2s *)&pSrc[a1]) = tmpb1; \
-  *((v2s *)&pSrc[a2]) = tmpb2; \
-  *((v2s *)&pSrc[a3]) = tmpb3; \
-  *((v2s *)&pSrc[a4]) = tmpb4; \
-  *((v2s *)&pSrc[b1]) = tmpa1; \
-  *((v2s *)&pSrc[b2]) = tmpa2; \
-  *((v2s *)&pSrc[b3]) = tmpa3; \
-  *((v2s *)&pSrc[b4]) = tmpa4;
+  tmpa1 = *(uint32_t *)&pSrc[a1]; \
+  tmpa2 = *(uint32_t *)&pSrc[a2]; \
+  tmpa3 = *(uint32_t *)&pSrc[a3]; \
+  tmpa4 = *(uint32_t *)&pSrc[a4]; \
+  tmpb1 = *(uint32_t *)&pSrc[b1]; \
+  tmpb2 = *(uint32_t *)&pSrc[b2]; \
+  tmpb3 = *(uint32_t *)&pSrc[b3]; \
+  tmpb4 = *(uint32_t *)&pSrc[b4]; \
+  *((uint32_t *)&pSrc[a1]) = tmpb1; \
+  *((uint32_t *)&pSrc[a2]) = tmpb2; \
+  *((uint32_t *)&pSrc[a3]) = tmpb3; \
+  *((uint32_t *)&pSrc[a4]) = tmpb4; \
+  *((uint32_t *)&pSrc[b1]) = tmpa1; \
+  *((uint32_t *)&pSrc[b2]) = tmpa2; \
+  *((uint32_t *)&pSrc[b3]) = tmpa3; \
+  *((uint32_t *)&pSrc[b4]) = tmpa4;
 #else
 #define SWAP_ITEMS \
-  addr1 = *(v2s *)&pBitRevTab[i]; \
-  addr2 = *(v2s *)&pBitRevTab[i + 2]; \
-  addr3 = *(v2s *)&pBitRevTab[i + 4]; \
-  addr4 = *(v2s *)&pBitRevTab[i + 6]; \
+  addr1 = *(uint32_t *)&pBitRevTab[i]; \
+  addr2 = *(uint32_t *)&pBitRevTab[i + 2]; \
+  addr3 = *(uint32_t *)&pBitRevTab[i + 4]; \
+  addr4 = *(uint32_t *)&pBitRevTab[i + 6]; \
   asm volatile("pv.sra.h %[addr1],%[addr1],%[s2];" \
                "pv.sra.h %[addr2],%[addr2],%[s2];" \
                "pv.sra.h %[addr3],%[addr3],%[s2];" \
@@ -82,30 +82,30 @@ void mempool_bitrevtable_q16s_riscv32(uint16_t *pSrc, const uint16_t bitRevLen,
                [addr3] "+&r"(addr3), [addr4] "+&r"(addr4) \
                : [s2] "r"(s2) \
                :); \
-  tmpa1 = *(v2s *)&pSrc[a1]; \
-  tmpa2 = *(v2s *)&pSrc[a2]; \
-  tmpa3 = *(v2s *)&pSrc[a3]; \
-  tmpa4 = *(v2s *)&pSrc[a4]; \
-  tmpb1 = *(v2s *)&pSrc[b1]; \
-  tmpb2 = *(v2s *)&pSrc[b2]; \
-  tmpb3 = *(v2s *)&pSrc[b3]; \
-  tmpb4 = *(v2s *)&pSrc[b4]; \
-  *((v2s *)&pSrc[a1]) = tmpb1; \
-  *((v2s *)&pSrc[a2]) = tmpb2; \
-  *((v2s *)&pSrc[a3]) = tmpb3; \
-  *((v2s *)&pSrc[a4]) = tmpb4; \
-  *((v2s *)&pSrc[b1]) = tmpa1; \
-  *((v2s *)&pSrc[b2]) = tmpa2; \
-  *((v2s *)&pSrc[b3]) = tmpa3; \
-  *((v2s *)&pSrc[b4]) = tmpa4;
+  tmpa1 = *(uint32_t *)&pSrc[a1]; \
+  tmpa2 = *(uint32_t *)&pSrc[a2]; \
+  tmpa3 = *(uint32_t *)&pSrc[a3]; \
+  tmpa4 = *(uint32_t *)&pSrc[a4]; \
+  tmpb1 = *(uint32_t *)&pSrc[b1]; \
+  tmpb2 = *(uint32_t *)&pSrc[b2]; \
+  tmpb3 = *(uint32_t *)&pSrc[b3]; \
+  tmpb4 = *(uint32_t *)&pSrc[b4]; \
+  *((uint32_t *)&pSrc[a1]) = tmpb1; \
+  *((uint32_t *)&pSrc[a2]) = tmpb2; \
+  *((uint32_t *)&pSrc[a3]) = tmpb3; \
+  *((uint32_t *)&pSrc[a4]) = tmpb4; \
+  *((uint32_t *)&pSrc[b1]) = tmpa1; \
+  *((uint32_t *)&pSrc[b2]) = tmpa2; \
+  *((uint32_t *)&pSrc[b3]) = tmpa3; \
+  *((uint32_t *)&pSrc[b4]) = tmpa4;
 #endif
 
 void mempool_bitrevtable_q16s_xpulpimg(uint16_t *pSrc, const uint16_t bitRevLen,
                                        const uint16_t *pBitRevTab) {
-  v2s addr1, addr2, addr3, addr4;
-  v2s s2 = (v2s){2, 2};
-  v2s tmpa1, tmpa2, tmpa3, tmpa4;
-  v2s tmpb1, tmpb2, tmpb3, tmpb4;
+  uint32_t addr1, addr2, addr3, addr4;
+  uint32_t s2 = 0x00020002;
+  uint32_t tmpa1, tmpa2, tmpa3, tmpa4;
+  uint32_t tmpb1, tmpb2, tmpb3, tmpb4;
   int32_t a1, a2, a3, a4;
   int32_t b1, b2, b3, b4;
   for (uint32_t i = 0; i < bitRevLen; i += 8) {
@@ -117,10 +117,10 @@ void mempool_bitrevtable_q16p_xpulpimg(uint16_t *pSrc, const uint16_t bitRevLen,
                                        const uint16_t *pBitRevTab,
                                        const uint32_t nPE) {
   uint32_t core_id = mempool_get_core_id();
-  v2s addr1, addr2, addr3, addr4;
-  v2s s2 = (v2s){2, 2};
-  v2s tmpa1, tmpa2, tmpa3, tmpa4;
-  v2s tmpb1, tmpb2, tmpb3, tmpb4;
+  uint32_t addr1, addr2, addr3, addr4;
+  uint32_t s2 = 0x00020002;
+  uint32_t tmpa1, tmpa2, tmpa3, tmpa4;
+  uint32_t tmpb1, tmpb2, tmpb3, tmpb4;
   int32_t a1, a2, a3, a4;
   int32_t b1, b2, b3, b4;
   for (uint32_t i = 8 * core_id; i < bitRevLen; i += (8 * nPE)) {
diff --git a/software/runtime/kernel/mempool_radix4_cfft_q16p.h b/software/runtime/kernel/mempool_radix4_cfft_q16p.h
index 34f338ce8..ce928be00 100644
--- a/software/runtime/kernel/mempool_radix4_cfft_q16p.h
+++ b/software/runtime/kernel/mempool_radix4_cfft_q16p.h
@@ -81,7 +81,7 @@ void mempool_radix4_cfft_q16p_xpulpimg(int16_t *pSrc16, uint32_t fftLen,
   uint32_t n1, n2, ic, i0, j, k;
   uint32_t step, steps;
 
-  /* START OF FIRST STAGE PROCESS */
+  /* START OF FIRST STAGE PROCESSING */
   n1 = fftLen;
   n2 = n1 >> 2U;
   step = (n2 + nPE - 1) / nPE;
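Throughout the q16 bitreversal rewrite above, packed index pairs are kept in plain uint32_t registers and reinterpreted as v2s vectors only where a SIMD operation is needed. A minimal sketch of that type-punning pattern (the helper name sra2_u32 and the v2s typedef are illustrative; __SRA2 is assumed to lower to pv.sra.h, as in the kernel headers):

#include <stdint.h>

typedef int16_t v2s __attribute__((vector_size(4))); // 2 x int16 SIMD vector

// Shift both halfword lanes of a packed 32-bit word right (arithmetic),
// mirroring __SRA2(*(v2s *)&addr, *(v2s *)&s2) in the rewritten SWAP_ITEMS
static inline uint32_t sra2_u32(uint32_t packed, uint32_t shift) {
  v2s v = *(v2s *)&packed; // reinterpret the word as two int16 lanes
  v2s s = *(v2s *)&shift;  // per-lane shift amounts, e.g. 0x00020002
  v2s r = v >> s;          // element-wise arithmetic shift right
  return *(uint32_t *)&r;  // move the result back to a scalar register
}

// e.g. sra2_u32(0x00080004, 0x00020002) == 0x00020001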