diff --git a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h index 33ab9676..53b366c1 100644 --- a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h +++ b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h @@ -669,18 +669,18 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y, high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d9, 7)); paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6)); paired32_new = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12)); - s20 = ((unsigned int)vgetq_lane_u8(paired32_new, 0) << 17) - | ((unsigned int)vgetq_lane_u8(paired32_new, 4) << 25) - | ((unsigned int)vgetq_lane_u8(paired32_new, 8) << 1) - | ((unsigned int)vgetq_lane_u8(paired32_new, 12) << 9); + s20 = ((unsigned int)vgetq_lane_u8(paired32_new, 0) << 9) + | ((unsigned int)vgetq_lane_u8(paired32_new, 4) << 1) + | ((unsigned int)vgetq_lane_u8(paired32_new, 8) << 25) + | ((unsigned int)vgetq_lane_u8(paired32_new, 12) << 17); printf("%08x\n", s20); high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d10, 7)); paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6)); paired32_new = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12)); - s20 |= ((unsigned int)vgetq_lane_u8(paired32_new, 0) << 16) - | ((unsigned int)vgetq_lane_u8(paired32_new, 4) << 24) - | ((unsigned int)vgetq_lane_u8(paired32_new, 8) << 0) - | ((unsigned int)vgetq_lane_u8(paired32_new, 12) << 8); + s20 |= ((unsigned int)vgetq_lane_u8(paired32_new, 0) << 8) + | ((unsigned int)vgetq_lane_u8(paired32_new, 4) << 0) + | ((unsigned int)vgetq_lane_u8(paired32_new, 8) << 24) + | ((unsigned int)vgetq_lane_u8(paired32_new, 12) << 16); printf("%08x\n", s20); a91 = ((unsigned int*)dec); a92 = (4 * i9); @@ -717,17 +717,17 @@ printf("%08x\n", s20); high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d11, 7)); paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6)); paired32_new = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12)); - s26 = ((unsigned int)vgetq_lane_u8(paired32_new, 0) << 17) - | ((unsigned int)vgetq_lane_u8(paired32_new, 4) << 25) - | ((unsigned int)vgetq_lane_u8(paired32_new, 8) << 1) - | ((unsigned int)vgetq_lane_u8(paired32_new, 12) << 9); + s26 = ((unsigned int)vgetq_lane_u8(paired32_new, 0) << 9) + | ((unsigned int)vgetq_lane_u8(paired32_new, 4) << 1) + | ((unsigned int)vgetq_lane_u8(paired32_new, 8) << 25) + | ((unsigned int)vgetq_lane_u8(paired32_new, 12) << 17); high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d12, 7)); paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6)); paired32_new = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12)); - s26 |= ((unsigned int)vgetq_lane_u8(paired32_new, 0) << 16) - | ((unsigned int)vgetq_lane_u8(paired32_new, 4) << 24) - | ((unsigned int)vgetq_lane_u8(paired32_new, 8) << 0) - | ((unsigned int)vgetq_lane_u8(paired32_new, 12) << 8); + s26 |= ((unsigned int)vgetq_lane_u8(paired32_new, 0) << 8) + | ((unsigned int)vgetq_lane_u8(paired32_new, 4) << 0) + | ((unsigned int)vgetq_lane_u8(paired32_new, 8) << 24) + | ((unsigned int)vgetq_lane_u8(paired32_new, 12) << 16); a110 = (a93 + 1); *(a110) = s26; s28 = vzip1q_u8(a108, a109);