Skip to content

Commit

Permalink
Simplify Spiral-generated code
Browse files Browse the repository at this point in the history
Signed-off-by: Clayton Smith <[email protected]>
  • Loading branch information
argilo committed Feb 1, 2024
1 parent cc91bd1 commit 7883a32
Showing 1 changed file with 36 additions and 76 deletions.
112 changes: 36 additions & 76 deletions kernels/volk/volk_8u_x4_conv_k7_r2_8u.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,47 +236,28 @@ static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
unsigned int excess,
unsigned char* Branchtab)
{
unsigned int i9;
for (i9 = 0; i9 < framebits + excess; i9++) {
unsigned int i;
for (i = 0; i < framebits + excess; i++) {
unsigned char* tmp;
unsigned char a75, a81;
int a73, a92;
short int s20, s21, s26, s27;
unsigned char *a74, *a80, *b6;
short int *a110, *a111, *a91, *a93, *a94;
__m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
__m128i a105, a106, a86, a87;
unsigned short* dec_short = (unsigned short*)dec;
__m128i a105, a86;
__m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
a71 = ((__m128i*)X);
s18 = *(a71);
a72 = (a71 + 2);
s19 = *(a72);
a73 = (2 * i9);
a74 = (syms + a73);
a75 = *(a74);
a76 = _mm_set1_epi8(a75);
a77 = ((__m128i*)Branchtab);
a78 = *(a77);
s19, s22, s23, s24, s25, s28, s29, t14, t15, t17, t18;

// First half of butterfly
s18 = ((__m128i*)X)[0];
s19 = ((__m128i*)X)[2];
a76 = _mm_set1_epi8(syms[2 * i]);
a78 = ((__m128i*)Branchtab)[0];
a79 = _mm_xor_si128(a76, a78);
b6 = (a73 + syms);
a80 = (b6 + 1);
a81 = *(a80);
a82 = _mm_set1_epi8(a81);
a83 = (a77 + 2);
a84 = *(a83);
a82 = _mm_set1_epi8(syms[2 * i + 1]);
a84 = ((__m128i*)Branchtab)[2];
a85 = _mm_xor_si128(a82, a84);
t13 = _mm_avg_epu8(a79, a85);
a86 = ((__m128i)t13);
a87 = _mm_srli_epi16(a86, 2);
a88 = ((__m128i)a87);
t14 = _mm_and_si128(
a88,
_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
t15 = _mm_subs_epu8(
_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
t14);
a86 = _mm_avg_epu8(a79, a85);
a88 = _mm_srli_epi16(a86, 2);
t14 = _mm_and_si128(a88, _mm_set1_epi8(63));
t15 = _mm_subs_epu8(_mm_set1_epi8(63), t14);
m23 = _mm_adds_epu8(s18, t14);
m24 = _mm_adds_epu8(s19, t15);
m25 = _mm_adds_epu8(s18, t15);
Expand All @@ -285,40 +266,24 @@ static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
d9 = _mm_cmpeq_epi8(a89, m24);
a90 = _mm_min_epu8(m26, m25);
d10 = _mm_cmpeq_epi8(a90, m26);
s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
a91 = ((short int*)dec);
a92 = (4 * i9);
a93 = (a91 + a92);
*(a93) = s20;
s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
a94 = (a93 + 1);
*(a94) = s21;
dec_short[4 * i] = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
dec_short[4 * i + 1] = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
s22 = _mm_unpacklo_epi8(a89, a90);
s23 = _mm_unpackhi_epi8(a89, a90);
a95 = ((__m128i*)Y);
*(a95) = s22;
a96 = (a95 + 1);
*(a96) = s23;
a97 = (a71 + 1);
s24 = *(a97);
a98 = (a71 + 3);
s25 = *(a98);
a99 = (a77 + 1);
a100 = *(a99);
((__m128i*)Y)[0] = s22;
((__m128i*)Y)[1] = s23;

// Second half of butterfly
s24 = ((__m128i*)X)[1];
s25 = ((__m128i*)X)[3];
a100 = ((__m128i*)Branchtab)[1];
a101 = _mm_xor_si128(a76, a100);
a102 = (a77 + 3);
a103 = *(a102);
a103 = ((__m128i*)Branchtab)[3];
a104 = _mm_xor_si128(a82, a103);
t16 = _mm_avg_epu8(a101, a104);
a105 = ((__m128i)t16);
a106 = _mm_srli_epi16(a105, 2);
a107 = ((__m128i)a106);
t17 = _mm_and_si128(
a107,
_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
t18 = _mm_subs_epu8(
_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
t17);
a105 = _mm_avg_epu8(a101, a104);
a107 = _mm_srli_epi16(a105, 2);
t17 = _mm_and_si128(a107, _mm_set1_epi8(63));
t18 = _mm_subs_epu8(_mm_set1_epi8(63), t17);
m27 = _mm_adds_epu8(s24, t17);
m28 = _mm_adds_epu8(s25, t18);
m29 = _mm_adds_epu8(s24, t18);
Expand All @@ -327,19 +292,14 @@ static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
d11 = _mm_cmpeq_epi8(a108, m28);
a109 = _mm_min_epu8(m30, m29);
d12 = _mm_cmpeq_epi8(a109, m30);
s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
a110 = (a93 + 2);
*(a110) = s26;
s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
a111 = (a93 + 3);
*(a111) = s27;
dec_short[4 * i + 2] = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
dec_short[4 * i + 3] = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
s28 = _mm_unpacklo_epi8(a108, a109);
s29 = _mm_unpackhi_epi8(a108, a109);
a112 = (a95 + 2);
*(a112) = s28;
a113 = (a95 + 3);
*(a113) = s29;
((__m128i*)Y)[2] = s28;
((__m128i*)Y)[3] = s29;

// Renormalize
__m128i m5, m6;
m5 = ((__m128i*)Y)[0];
m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
Expand Down

0 comments on commit 7883a32

Please sign in to comment.