Skip to content

Commit

Permalink
Trying to use fma instructions when possible
Browse files Browse the repository at this point in the history
Compilers sometimes replace vmlaq*() with fmul+fadd instead of fmla.
Trying to use vfmaq*() instead when possible.
  • Loading branch information
jmvalin committed Nov 28, 2023
1 parent 72cc88d commit db26e38
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 0 deletions.
8 changes: 8 additions & 0 deletions celt/arm/celt_neon_intr.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,14 @@ void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_va
}

#else

#if defined(__ARM_FEATURE_FMA) && defined(__ARM_ARCH_ISA_A64)
/* If we can, force the compiler to use an FMA instruction rather than break
 * vmlaq_lane_f32() into fmul/fadd. The remap to vfmaq_lane_f32() is applied
 * only when the target advertises FMA support and the A64 instruction set. */
/* Some toolchains implement the NEON intrinsics as macros in <arm_neon.h>.
 * Drop any such definition first so the #define below is not an incompatible
 * macro redefinition. */
#ifdef vmlaq_lane_f32
#undef vmlaq_lane_f32
#endif
#define vmlaq_lane_f32(a,b,c,lane) vfmaq_lane_f32(a,b,c,lane)
#endif


/*
* Function: xcorr_kernel_neon_float
* ---------------------------------
Expand Down
7 changes: 7 additions & 0 deletions celt/arm/pitch_neon_intr.c
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,13 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus

/* ========================================================================== */

#ifdef __ARM_FEATURE_FMA
/* If we can, force the compiler to use an FMA instruction rather than break
   vmlaq_f32() into fmul/fadd. Every later use of vmlaq_f32() in this
   translation unit is rewritten to vfmaq_f32() with the same arguments. */
/* Some toolchains implement the NEON intrinsics as macros in <arm_neon.h>.
   Drop any such definition first so the #define below is not an incompatible
   macro redefinition. */
#ifdef vmlaq_f32
#undef vmlaq_f32
#endif
#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c)
#endif


#ifdef OPUS_CHECK_ASM

/* This part of code simulates floating-point NEON operations. */
Expand Down
6 changes: 6 additions & 0 deletions dnn/vec_neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ static OPUS_INLINE int16x8_t vmull_high_s8(int8x16_t a, int8x16_t b) {
}
#endif

#ifdef __ARM_FEATURE_FMA
/* If we can, force the compiler to use an FMA instruction rather than break
   vmlaq_f32() into fmul/fadd. Every use of vmlaq_f32() after this point is
   rewritten to vfmaq_f32() with the same arguments. */
/* Some toolchains implement the NEON intrinsics as macros in <arm_neon.h>.
   Drop any such definition first so the #define below is not an incompatible
   macro redefinition. */
#ifdef vmlaq_f32
#undef vmlaq_f32
#endif
#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c)
#endif

#ifndef LPCNET_TEST
static inline float32x4_t exp4_approx(float32x4_t x) {
int32x4_t i;
Expand Down

0 comments on commit db26e38

Please sign in to comment.