Skip to content

Commit

Permalink
Math: FIR: Optimize filter core function for HiFi5
Browse files Browse the repository at this point in the history
This patch optimizes the function fir_32x16_2x() in fir_hifi5.c.

- The pair of (4x) quad-MAC intrinsics AE_MULAFD32X16X2_FIR_HH() and
  AE_MULAFD32X16X2_FIR_HL() is replaced with a single 8x MAC intrinsic,
  AE_MULA2Q32X16_FIR_H().
- Since the 8x MAC does not support fractional arithmetic, a left shift
  by one is added to adjust the format to Q17.47.
- The per-sample saturation and rounding of the output is replaced with
  an instruction that rounds two 64-bit accumulators.

The MCPS saving with the FIR EQ and TDFB components seems quite small,
only 0.2 MCPS. The PCM sample data load operations seem to
dominate the MCPS load with the HiFi5 core configuration used in
simulation.

Signed-off-by: Seppo Ingalsuo <[email protected]>
  • Loading branch information
singalsu committed Mar 3, 2025
1 parent 5273054 commit 4591ac4
Showing 1 changed file with 28 additions and 45 deletions.
73 changes: 28 additions & 45 deletions src/math/fir_hifi5.c
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,6 @@ void fir_get_lrshifts(struct fir_state_32x16 *fir, int *lshift,
}
EXPORT_SYMBOL(fir_get_lrshifts);

/* HiFi EP has the following number of registers that should not be exceeded
* 4x 56 bit registers in register file Q
* 8x 48 bit registers in register file P
*/

void fir_32x16(struct fir_state_32x16 *fir, ae_int32 x, ae_int32 *y, int shift)
{
/* This function uses
Expand Down Expand Up @@ -163,31 +158,26 @@ void fir_32x16(struct fir_state_32x16 *fir, ae_int32 x, ae_int32 *y, int shift)
}
EXPORT_SYMBOL(fir_32x16);

/* HiFi EP has the following number of registers that should not be exceeded
* 4x 56 bit registers in register file Q
* 8x 48 bit registers in register file P
*/

void fir_32x16_2x(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1,
ae_int32 *y0, ae_int32 *y1, int shift)
{
/* This function uses
* 2x 56 bit registers Q,
* 4x 48 bit registers P
* 7x 64 bit AE registers
* 3x integers
* 2x address pointers,
*/
ae_f64 a;
ae_f64 b;
ae_valign u;
ae_f64 a = AE_ZERO64();
ae_f64 b = AE_ZERO64();
ae_f32x2 d0;
ae_f32x2 d1;
ae_f32x2 d2;
ae_f16x4 coefs;
int i;
ae_f32x2 *dp;
ae_f16x4 *coefp = fir->coef;
const int taps_div_4 = fir->taps >> 2;
const int inc = 2 * sizeof(int32_t);
int i;

/* Bypass samples if taps count is zero. */
if (!taps_div_4) {
Expand All @@ -201,18 +191,11 @@ void fir_32x16_2x(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1,
dp = (ae_f32x2 *)fir->rwp;
AE_S32_L_XC(x1, fir->rwp, -sizeof(int32_t));

/* Note: If the next function is converted to handle two samples
* per call the data load can be done with single instruction
* AE_LP24X2F_C(data2, dp, sizeof(ae_p24x2f));
*/
a = AE_ZERO64();
b = AE_ZERO64();

/* Prime the coefficients stream */
u = AE_LA64_PP(coefp);

/* Load two data samples and pack to d0 to data2_h and
* d1 to data2_l.
/* Load two samples, two newest samples and proceed
* to elder input samples in delay line.
*/
AE_L32X2_XC(d0, dp, inc);
for (i = 0; i < taps_div_4; i++) {
Expand All @@ -222,34 +205,34 @@ void fir_32x16_2x(struct fir_state_32x16 *fir, ae_int32 x0, ae_int32 x1,
*/
AE_LA16X4_IP(coefs, u, coefp);

/* Load two data samples. Upper part d1_h is x[n+1] and
* lower part d1_l is x[n].
/* Load two data samples more.
* d0.H is x[n] the newest sample
* d0.L is x[n-1]
* d1.H is x[n-2]
* d1.L is x[n-3]
* d2.H is x[n-4]
*/
AE_L32X2_XC(d1, dp, inc);
AE_L32X2_XC(d2, dp, inc);

/* Quad MAC (HH)
* b += d0_h * coefs_3 + d0_l * coefs_2
* a += d0_l * coefs_3 + d1_h * coefs_2
/* Calculate four FIR taps for current (x1 -> a) and previous input (x0 -> b)
* b = b + d0.H * c.3 + d0.L * c.2 + d1.H * c.1 + d1.L * c.0
* a = a + d0.L * c.3 + d1.H * c.2 + d1.L * c.1 + d2.H * c.0
*/
AE_MULAFD32X16X2_FIR_HH(b, a, d0, d1, coefs);
d0 = d1;

/* Repeat the same for next two taps and increase coefp. */
AE_L32X2_XC(d1, dp, inc);
AE_MULA2Q32X16_FIR_H(b, a, d0, d1, d2, coefs);

/* Quad MAC (HL)
* b += d0_h * coefs_1 + d0_l * coefs_0
* a += d0_l * coefs_1 + d1_h * coefs_0
*/
AE_MULAFD32X16X2_FIR_HL(b, a, d0, d1, coefs);
d0 = d1;
/* Prepare for next four taps, d2 overlaps to next loop iteration as d0 */
d0 = d2;
}

/* Do scaling shifts and store sample. */
b = AE_SLAA64S(b, shift);
a = AE_SLAA64S(a, shift);
AE_S32_L_I(AE_ROUND32F48SSYM(b), (ae_int32 *)y1, 0);
AE_S32_L_I(AE_ROUND32F48SSYM(a), (ae_int32 *)y0, 0);
/* Shift left by one Q1.31 x Q1.15 -> Q2.46 format for Q2.47 round and
* store output samples.
*/
b = AE_SLAA64S(b, shift + 1);
a = AE_SLAA64S(a, shift + 1);
d0 = AE_ROUND32X2F48SASYM(b, a);
AE_S32_H_I(d0, (ae_int32 *)y1, 0);
AE_S32_L_I(d0, (ae_int32 *)y0, 0);
}
EXPORT_SYMBOL(fir_32x16_2x);

Expand Down

0 comments on commit 4591ac4

Please sign in to comment.