Skip to content

Commit

Permalink
Fix
Browse files: browse the repository at this point in the history
  • Loading branch information
argilo committed Feb 1, 2024
1 parent 264ab06 commit e2cb86f
Showing 1 changed file with 21 additions and 13 deletions.
34 changes: 21 additions & 13 deletions kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
Original file line number Diff line number Diff line change
Expand Up @@ -634,8 +634,8 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
uint16x8_t high_bits;
uint32x4_t paired16;
uint64x2_t paired32;
uint8x16_t paired32_new;
uint8x16_t paired64;
uint16x8_t paired64_new;
a71 = ((uint8x16_t*)X);
s18 = *(a71);
a72 = (a71 + 2);
Expand Down Expand Up @@ -668,14 +668,18 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
d10 = vceqq_u8(a90, m26);
high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d9, 7));
paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 12));
paired64_new = vreinterpretq_u16_u64(vsraq_n_u64(paired32, paired32, 24));
s20 = ((unsigned int)vgetq_lane_u16(paired64_new, 0) << 1) | ((unsigned int)vgetq_lane_u16(paired64_new, 4) << 17);
paired32_new = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
s20 = ((unsigned int)vgetq_lane_u8(paired32_new, 0) << 1)
| ((unsigned int)vgetq_lane_u8(paired32_new, 4) << 9)
| ((unsigned int)vgetq_lane_u8(paired32_new, 8) << 17)
| ((unsigned int)vgetq_lane_u8(paired32_new, 12) << 25);
high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d10, 7));
paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 12));
paired64_new = vreinterpretq_u16_u64(vsraq_n_u64(paired32, paired32, 24));
s20 |= ((unsigned int)vgetq_lane_u16(paired64_new, 0)) | ((unsigned int)vgetq_lane_u16(paired64_new, 4) << 16);
paired32_new = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 12));
s20 |= ((unsigned int)vgetq_lane_u8(paired32_new, 0))
| ((unsigned int)vgetq_lane_u8(paired32_new, 4) << 8)
| ((unsigned int)vgetq_lane_u8(paired32_new, 8) << 16)
| ((unsigned int)vgetq_lane_u8(paired32_new, 12) << 24);
a91 = ((unsigned int*)dec);
a92 = (4 * i9);
a93 = (a91 + a92);
Expand Down Expand Up @@ -710,14 +714,18 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
d12 = vceqq_u8(a109, m30);
high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d11, 7));
paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 12));
paired64_new = vreinterpretq_u16_u64(vsraq_n_u64(paired32, paired32, 24));
s26 = ((unsigned int)vgetq_lane_u16(paired64_new, 0) << 1) | ((unsigned int)vgetq_lane_u16(paired64_new, 4) << 17);
paired32_new = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
s26 = ((unsigned int)vgetq_lane_u8(paired32_new, 0) << 1)
| ((unsigned int)vgetq_lane_u8(paired32_new, 4) << 9)
| ((unsigned int)vgetq_lane_u8(paired32_new, 8) << 17)
| ((unsigned int)vgetq_lane_u8(paired32_new, 12) << 25);
high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d12, 7));
paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 12));
paired64_new = vreinterpretq_u16_u64(vsraq_n_u64(paired32, paired32, 24));
s26 |= ((unsigned int)vgetq_lane_u16(paired64_new, 0)) | ((unsigned int)vgetq_lane_u16(paired64_new, 4) << 16);
paired32_new = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 12));
s26 |= ((unsigned int)vgetq_lane_u8(paired32_new, 0))
| ((unsigned int)vgetq_lane_u8(paired32_new, 4) << 8)
| ((unsigned int)vgetq_lane_u8(paired32_new, 8) << 16)
| ((unsigned int)vgetq_lane_u8(paired32_new, 12) << 24);
a110 = (a93 + 1);
*(a110) = s26;
s28 = vzip1q_u8(a108, a109);
Expand Down

0 comments on commit e2cb86f

Please sign in to comment.