From e2cb86ff9682ec3e1a11142e6a0376c29d1d9f8c Mon Sep 17 00:00:00 2001 From: Clayton Smith Date: Wed, 31 Jan 2024 21:54:11 -0500 Subject: [PATCH] Fix --- kernels/volk/volk_8u_x4_conv_k7_r2_8u.h | 34 +++++++++++++++---------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h index 23164117..b20fa033 100644 --- a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h +++ b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h @@ -634,8 +634,8 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y, uint16x8_t high_bits; uint32x4_t paired16; uint64x2_t paired32; + uint8x16_t paired32_new; uint8x16_t paired64; - uint16x8_t paired64_new; a71 = ((uint8x16_t*)X); s18 = *(a71); a72 = (a71 + 2); @@ -668,14 +668,18 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y, d10 = vceqq_u8(a90, m26); high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d9, 7)); paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6)); - paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 12)); - paired64_new = vreinterpretq_u16_u64(vsraq_n_u64(paired32, paired32, 24)); - s20 = ((unsigned int)vgetq_lane_u16(paired64_new, 0) << 1) | ((unsigned int)vgetq_lane_u16(paired64_new, 4) << 17); + paired32_new = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12)); + s20 = ((unsigned int)vgetq_lane_u8(paired32_new, 0) << 1) + | ((unsigned int)vgetq_lane_u8(paired32_new, 4) << 9) + | ((unsigned int)vgetq_lane_u8(paired32_new, 8) << 17) + | ((unsigned int)vgetq_lane_u8(paired32_new, 12) << 25); high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d10, 7)); paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6)); - paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 12)); - paired64_new = vreinterpretq_u16_u64(vsraq_n_u64(paired32, paired32, 24)); - s20 |= ((unsigned int)vgetq_lane_u16(paired64_new, 0)) | ((unsigned int)vgetq_lane_u16(paired64_new, 4) << 16); + 
paired32_new = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12)); + s20 |= ((unsigned int)vgetq_lane_u8(paired32_new, 0)) + | ((unsigned int)vgetq_lane_u8(paired32_new, 4) << 8) + | ((unsigned int)vgetq_lane_u8(paired32_new, 8) << 16) + | ((unsigned int)vgetq_lane_u8(paired32_new, 12) << 24); a91 = ((unsigned int*)dec); a92 = (4 * i9); a93 = (a91 + a92); @@ -710,14 +714,18 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y, d12 = vceqq_u8(a109, m30); high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d11, 7)); paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6)); - paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 12)); - paired64_new = vreinterpretq_u16_u64(vsraq_n_u64(paired32, paired32, 24)); - s26 = ((unsigned int)vgetq_lane_u16(paired64_new, 0) << 1) | ((unsigned int)vgetq_lane_u16(paired64_new, 4) << 17); + paired32_new = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12)); + s26 = ((unsigned int)vgetq_lane_u8(paired32_new, 0) << 1) + | ((unsigned int)vgetq_lane_u8(paired32_new, 4) << 9) + | ((unsigned int)vgetq_lane_u8(paired32_new, 8) << 17) + | ((unsigned int)vgetq_lane_u8(paired32_new, 12) << 25); high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d12, 7)); paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6)); - paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 12)); - paired64_new = vreinterpretq_u16_u64(vsraq_n_u64(paired32, paired32, 24)); - s26 |= ((unsigned int)vgetq_lane_u16(paired64_new, 0)) | ((unsigned int)vgetq_lane_u16(paired64_new, 4) << 16); + paired32_new = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12)); + s26 |= ((unsigned int)vgetq_lane_u8(paired32_new, 0)) + | ((unsigned int)vgetq_lane_u8(paired32_new, 4) << 8) + | ((unsigned int)vgetq_lane_u8(paired32_new, 8) << 16) + | ((unsigned int)vgetq_lane_u8(paired32_new, 12) << 24); a110 = (a93 + 1); *(a110) = s26; s28 = vzip1q_u8(a108, a109);