Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
argilo committed Jan 31, 2024
1 parent d605d9a commit 5317ea2
Showing 1 changed file with 104 additions and 131 deletions.
235 changes: 104 additions & 131 deletions kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
Original file line number Diff line number Diff line change
Expand Up @@ -607,7 +607,7 @@ static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,

#if LV_HAVE_NEON

#include "volk/sse2neon/sse2neon.h"
#include <arm_neon.h>

static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
unsigned char* X,
Expand All @@ -624,47 +624,42 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
short int s20, s21, s26, s27;
unsigned char *a74, *a80, *b6;
short int *a110, *a111, *a91, *a93, *a94;
__m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
__m128i a105, a106, a86, a87;
__m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
uint8x16_t *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98,
*a99;
uint8x16_t a105, a106, a86, a87;
uint8x16_t a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
a71 = ((__m128i*)X);
a71 = ((uint8x16_t*)X);
s18 = *(a71);
a72 = (a71 + 2);
s19 = *(a72);
a73 = (4 * i9);
a74 = (syms + a73);
a75 = *(a74);
a76 = _mm_set1_epi8(a75);
a77 = ((__m128i*)Branchtab);
a76 = vdupq_n_u8(a75);
a77 = ((uint8x16_t*)Branchtab);
a78 = *(a77);
a79 = _mm_xor_si128(a76, a78);
a79 = veorq_u8(a76, a78);
b6 = (a73 + syms);
a80 = (b6 + 1);
a81 = *(a80);
a82 = _mm_set1_epi8(a81);
a82 = vdupq_n_u8(a81);
a83 = (a77 + 2);
a84 = *(a83);
a85 = _mm_xor_si128(a82, a84);
t13 = _mm_avg_epu8(a79, a85);
a86 = ((__m128i)t13);
a87 = _mm_srli_epi16(a86, 2);
a88 = ((__m128i)a87);
t14 = _mm_and_si128(
a88,
_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
t15 = _mm_subs_epu8(
_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
t14);
m23 = _mm_adds_epu8(s18, t14);
m24 = _mm_adds_epu8(s19, t15);
m25 = _mm_adds_epu8(s18, t15);
m26 = _mm_adds_epu8(s19, t14);
a89 = _mm_min_epu8(m24, m23);
d9 = _mm_cmpeq_epi8(a89, m24);
a90 = _mm_min_epu8(m26, m25);
d10 = _mm_cmpeq_epi8(a90, m26);
a85 = veorq_u8(a82, a84);
t13 = vrhaddq_u8(a79, a85);
a86 = ((uint8x16_t)t13);
t14 = vshlq_u8(a86, vdupq_n_s8(-2));
t15 = vqsubq_u8(vdupq_n_u8(63), t14);
m23 = vqaddq_u8(s18, t14);
m24 = vqaddq_u8(s19, t15);
m25 = vqaddq_u8(s18, t15);
m26 = vqaddq_u8(s19, t14);
a89 = vminq_u8(m24, m23);
d9 = vceqq_u8(a89, m24);
a90 = vminq_u8(m26, m25);
d10 = vceqq_u8(a90, m26);
s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
a91 = ((short int*)dec);
a92 = (8 * i9);
Expand All @@ -675,7 +670,7 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
*(a94) = s21;
s22 = _mm_unpacklo_epi8(a89, a90);
s23 = _mm_unpackhi_epi8(a89, a90);
a95 = ((__m128i*)Y);
a95 = ((uint8x16_t*)Y);
*(a95) = s22;
a96 = (a95 + 1);
*(a96) = s23;
Expand All @@ -685,28 +680,22 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
s25 = *(a98);
a99 = (a77 + 1);
a100 = *(a99);
a101 = _mm_xor_si128(a76, a100);
a101 = veorq_u8(a76, a100);
a102 = (a77 + 3);
a103 = *(a102);
a104 = _mm_xor_si128(a82, a103);
t16 = _mm_avg_epu8(a101, a104);
a105 = ((__m128i)t16);
a106 = _mm_srli_epi16(a105, 2);
a107 = ((__m128i)a106);
t17 = _mm_and_si128(
a107,
_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
t18 = _mm_subs_epu8(
_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
t17);
m27 = _mm_adds_epu8(s24, t17);
m28 = _mm_adds_epu8(s25, t18);
m29 = _mm_adds_epu8(s24, t18);
m30 = _mm_adds_epu8(s25, t17);
a108 = _mm_min_epu8(m28, m27);
d11 = _mm_cmpeq_epi8(a108, m28);
a109 = _mm_min_epu8(m30, m29);
d12 = _mm_cmpeq_epi8(a109, m30);
a104 = veorq_u8(a82, a103);
t16 = vrhaddq_u8(a101, a104);
a105 = ((uint8x16_t)t16);
t17 = vshlq_u8(a105, vdupq_n_s8(-2));
t18 = vqsubq_u8(vdupq_n_u8(63), t17);
m27 = vqaddq_u8(s24, t17);
m28 = vqaddq_u8(s25, t18);
m29 = vqaddq_u8(s24, t18);
m30 = vqaddq_u8(s25, t17);
a108 = vminq_u8(m28, m27);
d11 = vceqq_u8(a108, m28);
a109 = vminq_u8(m30, m29);
d12 = vceqq_u8(a109, m30);
s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
a110 = (a93 + 2);
*(a110) = s26;
Expand All @@ -720,71 +709,63 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
a113 = (a95 + 3);
*(a113) = s29;

__m128i m5, m6;
m5 = ((__m128i*)Y)[0];
m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
__m128i m7;
m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
m7 = _mm_unpacklo_epi8(m7, m7);
m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
m6 = _mm_unpacklo_epi64(m7, m7);
((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
uint8x16_t m5, m6;
m5 = ((uint8x16_t*)Y)[0];
m5 = vminq_u8(m5, ((uint8x16_t*)Y)[1]);
m5 = vminq_u8(m5, ((uint8x16_t*)Y)[2]);
m5 = vminq_u8(m5, ((uint8x16_t*)Y)[3]);
uint8x8_t m7;
m7 = vpmin_u8(vget_low_u8(m5), vget_high_u8(m5));
m7 = vpmin_u8(m7, m7);
m7 = vpmin_u8(m7, m7);
m7 = vpmin_u8(m7, m7);
m6 = vcombine_u8(m7, m7);
((uint8x16_t*)Y)[0] = vqsubq_u8(((uint8x16_t*)Y)[0], m6);
((uint8x16_t*)Y)[1] = vqsubq_u8(((uint8x16_t*)Y)[1], m6);
((uint8x16_t*)Y)[2] = vqsubq_u8(((uint8x16_t*)Y)[2], m6);
((uint8x16_t*)Y)[3] = vqsubq_u8(((uint8x16_t*)Y)[3], m6);

unsigned char a188, a194;
int a186, a205;
short int s48, s49, s54, s55;
unsigned char *a187, *a193, *b15;
short int *a204, *a206, *a207, *a223, *a224, *b16;
__m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
uint8x16_t *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
*a225, *a226;
__m128i a199, a200, a218, a219;
__m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
uint8x16_t a199, a200, a218, a219;
uint8x16_t a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
a184 = ((__m128i*)Y);
a184 = ((uint8x16_t*)Y);
s46 = *(a184);
a185 = (a184 + 2);
s47 = *(a185);
a186 = (4 * i9);
b15 = (a186 + syms);
a187 = (b15 + 2);
a188 = *(a187);
a189 = _mm_set1_epi8(a188);
a190 = ((__m128i*)Branchtab);
a189 = vdupq_n_u8(a188);
a190 = ((uint8x16_t*)Branchtab);
a191 = *(a190);
a192 = _mm_xor_si128(a189, a191);
a192 = veorq_u8(a189, a191);
a193 = (b15 + 3);
a194 = *(a193);
a195 = _mm_set1_epi8(a194);
a195 = vdupq_n_u8(a194);
a196 = (a190 + 2);
a197 = *(a196);
a198 = _mm_xor_si128(a195, a197);
t25 = _mm_avg_epu8(a192, a198);
a199 = ((__m128i)t25);
a200 = _mm_srli_epi16(a199, 2);
a201 = ((__m128i)a200);
t26 = _mm_and_si128(
a201,
_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
t27 = _mm_subs_epu8(
_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
t26);
m39 = _mm_adds_epu8(s46, t26);
m40 = _mm_adds_epu8(s47, t27);
m41 = _mm_adds_epu8(s46, t27);
m42 = _mm_adds_epu8(s47, t26);
a202 = _mm_min_epu8(m40, m39);
d17 = _mm_cmpeq_epi8(a202, m40);
a203 = _mm_min_epu8(m42, m41);
d18 = _mm_cmpeq_epi8(a203, m42);
a198 = veorq_u8(a195, a197);
t25 = vrhaddq_u8(a192, a198);
a199 = ((uint8x16_t)t25);
t26 = vshlq_u8(a199, vdupq_n_s8(-2));
t27 = vqsubq_u8(vdupq_n_u8(63), t26);
m39 = vqaddq_u8(s46, t26);
m40 = vqaddq_u8(s47, t27);
m41 = vqaddq_u8(s46, t27);
m42 = vqaddq_u8(s47, t26);
a202 = vminq_u8(m40, m39);
d17 = vceqq_u8(a202, m40);
a203 = vminq_u8(m42, m41);
d18 = vceqq_u8(a203, m42);
s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
a204 = ((short int*)dec);
a205 = (8 * i9);
Expand All @@ -796,7 +777,7 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
*(a207) = s49;
s50 = _mm_unpacklo_epi8(a202, a203);
s51 = _mm_unpackhi_epi8(a202, a203);
a208 = ((__m128i*)X);
a208 = ((uint8x16_t*)X);
*(a208) = s50;
a209 = (a208 + 1);
*(a209) = s51;
Expand All @@ -806,28 +787,22 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
s53 = *(a211);
a212 = (a190 + 1);
a213 = *(a212);
a214 = _mm_xor_si128(a189, a213);
a214 = veorq_u8(a189, a213);
a215 = (a190 + 3);
a216 = *(a215);
a217 = _mm_xor_si128(a195, a216);
t28 = _mm_avg_epu8(a214, a217);
a218 = ((__m128i)t28);
a219 = _mm_srli_epi16(a218, 2);
a220 = ((__m128i)a219);
t29 = _mm_and_si128(
a220,
_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
t30 = _mm_subs_epu8(
_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
t29);
m43 = _mm_adds_epu8(s52, t29);
m44 = _mm_adds_epu8(s53, t30);
m45 = _mm_adds_epu8(s52, t30);
m46 = _mm_adds_epu8(s53, t29);
a221 = _mm_min_epu8(m44, m43);
d19 = _mm_cmpeq_epi8(a221, m44);
a222 = _mm_min_epu8(m46, m45);
d20 = _mm_cmpeq_epi8(a222, m46);
a217 = veorq_u8(a195, a216);
t28 = vrhaddq_u8(a214, a217);
a218 = ((uint8x16_t)t28);
t29 = vshlq_u8(a218, vdupq_n_s8(-2));
t30 = vqsubq_u8(vdupq_n_u8(63), t29);
m43 = vqaddq_u8(s52, t29);
m44 = vqaddq_u8(s53, t30);
m45 = vqaddq_u8(s52, t30);
m46 = vqaddq_u8(s53, t29);
a221 = vminq_u8(m44, m43);
d19 = vceqq_u8(a221, m44);
a222 = vminq_u8(m46, m45);
d20 = vceqq_u8(a222, m46);
s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
a223 = (b16 + 6);
*(a223) = s54;
Expand All @@ -841,23 +816,21 @@ static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
a226 = (a208 + 3);
*(a226) = s57;

__m128i m12, m13;
m12 = ((__m128i*)X)[0];
m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
__m128i m14;
m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)), ((__m128i)m14)));
m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)), ((__m128i)m14)));
m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)), ((__m128i)m14)));
m14 = _mm_unpacklo_epi8(m14, m14);
m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
m13 = _mm_unpacklo_epi64(m14, m14);
((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
uint8x16_t m12, m13;
m12 = ((uint8x16_t*)X)[0];
m12 = vminq_u8(m12, ((uint8x16_t*)X)[1]);
m12 = vminq_u8(m12, ((uint8x16_t*)X)[2]);
m12 = vminq_u8(m12, ((uint8x16_t*)X)[3]);
uint8x8_t m14;
m14 = vpmin_u8(vget_low_u8(m12), vget_high_u8(m12));
m14 = vpmin_u8(m14, m14);
m14 = vpmin_u8(m14, m14);
m14 = vpmin_u8(m14, m14);
m13 = vcombine_u8(m14, m14);
((uint8x16_t*)X)[0] = vqsubq_u8(((uint8x16_t*)X)[0], m13);
((uint8x16_t*)X)[1] = vqsubq_u8(((uint8x16_t*)X)[1], m13);
((uint8x16_t*)X)[2] = vqsubq_u8(((uint8x16_t*)X)[2], m13);
((uint8x16_t*)X)[3] = vqsubq_u8(((uint8x16_t*)X)[3], m13);
}

renormalize(X);
Expand Down

0 comments on commit 5317ea2

Please sign in to comment.