Experiment

argilo · Feb 1, 2024 · 7798fd7 · 7798fd7
1 parent bd07ba2
commit 7798fd7
Showing 1 changed file with 16 additions and 19 deletions.
diff --git a/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h b/kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
@@ -621,9 +621,11 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
     for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
         unsigned char a75, a81;
         int a73, a92;
-        short int s20, s21, s26, s27;
+        short int s26, s27;
+        unsigned int s20;
         unsigned char *a74, *a80, *b6;
-        short int *a110, *a111, *a91, *a93, *a94;
+        short int *a110, *a111
+        unsigned int *a91, *a93, *a94;
         uint8x16_t *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98,
             *a99;
         uint8x16_t a105, a86;
@@ -634,6 +636,7 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
         uint32x4_t paired16;
         uint64x2_t paired32;
         uint8x16_t paired64;
+        uint16x8_t paired64_new;
         a71 = ((uint8x16_t*)X);
         s18 = *(a71);
         a72 = (a71 + 2);
@@ -664,26 +667,20 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
         d9 = vceqq_u8(a89, m24);
         a90 = vminq_u8(m26, m25);
         d10 = vceqq_u8(a90, m26);
-        // s20 = _mm_movemask_epi8(vzip1q_u8(d9, d10)); // make this work on 32-bit
-        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vzip1q_u8(d9, d10), 7));
-        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
-        paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
-        paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
-        s20 = vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8);
-
+        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d9, 7));
+        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
+        paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 12));
+        paired64_new = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 24));
+        s20 = ((unsigned int)vgetq_lane_u16(paired64_new, 0) << 1) | ((unsigned int)vgetq_lane_u16(paired64_new, 4) << 17);
+        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d10, 7));
+        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
+        paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 12));
+        paired64_new = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 24));
+        s20 |= ((unsigned int)vgetq_lane_u16(paired64_new, 0)) | ((unsigned int)vgetq_lane_u8(paired64_new, 4) << 16);
         a91 = ((short int*)dec);
-        a92 = (8 * i9);
+        a92 = (4 * i9);
         a93 = (a91 + a92);
         *(a93) = s20;
-        // s21 = _mm_movemask_epi8(vzip2q_u8(d9, d10));
-        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vzip2q_u8(d9, d10), 7));
-        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
-        paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
-        paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
-        s21 = vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8);
-
-        a94 = (a93 + 1);
-        *(a94) = s21;
         s22 = vzip1q_u8(a89, a90);
         s23 = vzip2q_u8(a89, a90);
         a95 = ((uint8x16_t*)Y);