Skip to content

Commit

Permalink
Moar
Browse files (browse the repository at this point in the history)
  • Loading branch information
argilo committed Feb 1, 2024
1 parent 210e5ae commit 264ab06
Showing 1 changed file with 14 additions and 21 deletions.
35 changes: 14 additions & 21 deletions kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -624,8 +624,7 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
short int s26, s27;
unsigned int s20;
unsigned char *a74, *a80, *b6;
short int *a110, *a111;
unsigned int *a91, *a93, *a94;
unsigned int *a110, *a91, *a93, *a94;
uint8x16_t *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98,
*a99;
uint8x16_t a105, a86;
Expand Down Expand Up @@ -676,8 +675,8 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 12));
paired64_new = vreinterpretq_u16_u64(vsraq_n_u64(paired32, paired32, 24));
s20 |= ((unsigned int)vgetq_lane_u16(paired64_new, 0)) | ((unsigned int)vgetq_lane_u8(paired64_new, 4) << 16);
a91 = ((short int*)dec);
s20 |= ((unsigned int)vgetq_lane_u16(paired64_new, 0)) | ((unsigned int)vgetq_lane_u16(paired64_new, 4) << 16);
a91 = ((unsigned int*)dec);
a92 = (4 * i9);
a93 = (a91 + a92);
*(a93) = s20;
Expand Down Expand Up @@ -709,24 +708,18 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
d11 = vceqq_u8(a108, m28);
a109 = vminq_u8(m30, m29);
d12 = vceqq_u8(a109, m30);
// s26 = _mm_movemask_epi8(vzip1q_u8(d11, d12));
high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vzip1q_u8(d11, d12), 7));
paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
s26 = vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8);

a110 = (a93 + 2);
high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d11, 7));
paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 12));
paired64_new = vreinterpretq_u16_u64(vsraq_n_u64(paired32, paired32, 24));
s26 = ((unsigned int)vgetq_lane_u16(paired64_new, 0) << 1) | ((unsigned int)vgetq_lane_u16(paired64_new, 4) << 17);
high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d12, 7));
paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 12));
paired64_new = vreinterpretq_u16_u64(vsraq_n_u64(paired32, paired32, 24));
s26 |= ((unsigned int)vgetq_lane_u16(paired64_new, 0)) | ((unsigned int)vgetq_lane_u16(paired64_new, 4) << 16);
a110 = (a93 + 1);
*(a110) = s26;
// s27 = _mm_movemask_epi8(vzip2q_u8(d11, d12));
high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vzip2q_u8(d11, d12), 7));
paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
s27 = vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8);

a111 = (a93 + 3);
*(a111) = s27;
s28 = vzip1q_u8(a108, a109);
s29 = vzip2q_u8(a108, a109);
a112 = (a95 + 2);
Expand Down

0 comments on commit 264ab06

Please sign in to comment.