Make use of NEON alignment hints
ccawley2011 committed Feb 19, 2025
1 parent 287c4dc commit 4a06577
Showing 4 changed files with 25 additions and 14 deletions.
14 changes: 7 additions & 7 deletions arch/arm/adler32_neon.c
@@ -11,7 +11,7 @@
 #include "adler32_p.h"

 static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
-    static const uint16_t ALIGNED_(16) taps[64] = {
+    static const uint16_t ALIGNED_(32) taps[64] = {
         64, 63, 62, 61, 60, 59, 58, 57,
         56, 55, 54, 53, 52, 51, 50, 49,
         48, 47, 46, 45, 44, 43, 42, 41,
@@ -43,7 +43,7 @@ static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
     int rem = len & 3;

     for (size_t i = 0; i < num_iter; ++i) {
-        uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);
+        uint8x16x4_t d0_d3 = vld1q_u8_x4_ex(buf, 128);

         /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
          * bit instruction, we'll have to make due summing to 16 bits first */
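
The comment in the hunk above refers to NEON's lack of a direct 8-bit to 32-bit sum. As background only (not part of this commit), a minimal sketch of the widening-sum idiom it alludes to, using standard intrinsics:

    #include <arm_neon.h>
    #include <stdint.h>

    /* Sum 16 bytes into one 32-bit value by widening in two steps:
     * 8-bit -> 16-bit pairwise add, then 16-bit -> 32-bit pairwise add. */
    static inline uint32_t sum16_widening(const uint8_t *p) {
        uint8x16_t bytes = vld1q_u8(p);       /* 16 x u8 */
        uint16x8_t s16 = vpaddlq_u8(bytes);   /* 8 x u16 pairwise sums */
        uint32x4_t s32 = vpaddlq_u16(s16);    /* 4 x u32 pairwise sums */
    #if defined(__aarch64__)
        return vaddvq_u32(s32);               /* horizontal add of 4 lanes */
    #else
        uint32x2_t lo = vadd_u32(vget_low_u32(s32), vget_high_u32(s32));
        return vget_lane_u32(vpadd_u32(lo, lo), 0);
    #endif
    }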
@@ -82,7 +82,7 @@ static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
     if (rem) {
         uint32x4_t s3acc_0 = vdupq_n_u32(0);
         while (rem--) {
-            uint8x16_t d0 = vld1q_u8(buf);
+            uint8x16_t d0 = vld1q_u8_ex(buf, 128);
             uint16x8_t adler;
             adler = vpaddlq_u8(d0);
             s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
@@ -97,8 +97,8 @@ static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
         s3acc = vaddq_u32(s3acc_0, s3acc);
     }

-    uint16x8x4_t t0_t3 = vld1q_u16_x4(taps);
-    uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32);
+    uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256);
+    uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256);

     s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
     s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
@@ -171,11 +171,11 @@ Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len)
     /* If memory is not SIMD aligned, do scalar sums to an aligned
      * offset, provided that doing so doesn't completely eliminate
      * SIMD operation. Aligned loads are still faster on ARM, even
-     * though there's no explicit aligned load instruction */
+     * when there's no explicit aligned load instruction */
     unsigned int align_offset = ((uintptr_t)buf & 15);
     unsigned int align_adj = (align_offset) ? 16 - align_offset : 0;

-    if (align_offset && len >= (16 + align_adj)) {
+    if (align_offset) {
         NEON_handle_tail(pair, buf, align_adj);
         n -= align_adj;
         done += align_adj;
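The last hunk above derives the scalar-sum adjustment from the low bits of the buffer pointer. A small standalone example of that arithmetic (the addresses are made up for illustration):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Worked example of the alignment adjustment: align_adj is how many
     * scalar bytes to consume before the pointer reaches the next
     * 16-byte boundary. */
    int main(void) {
        uintptr_t addrs[] = { 0x1000, 0x1001, 0x100f };
        for (size_t i = 0; i < sizeof(addrs) / sizeof(addrs[0]); i++) {
            unsigned int align_offset = (unsigned int)(addrs[i] & 15);
            unsigned int align_adj = align_offset ? 16 - align_offset : 0;
            printf("addr=0x%lx offset=%u adj=%u\n",
                   (unsigned long)addrs[i], align_offset, align_adj);
        }
        return 0; /* prints adj=0, 15 and 1 respectively */
    }
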
6 changes: 3 additions & 3 deletions arch/arm/chunkset_neon.c
@@ -68,12 +68,12 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
 #if defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__)
     uint8x16_t ret_vec = vld1q_u8(buf);

-    uint8x16_t perm_vec = vld1q_u8(permute_table + lut_rem.idx);
+    uint8x16_t perm_vec = vld1q_u8_ex(permute_table + lut_rem.idx, 256);
     return vqtbl1q_u8(ret_vec, perm_vec);
 #else
     uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
-    perm_vec0 = vld1_u8(permute_table + lut_rem.idx);
-    perm_vec1 = vld1_u8(permute_table + lut_rem.idx + 8);
+    perm_vec0 = vld1_u8_ex(permute_table + lut_rem.idx, 256);
+    perm_vec1 = vld1_u8_ex(permute_table + lut_rem.idx + 8, 64);
     a = vld1_u8(buf);
     b = vld1_u8(buf + 8);
     ret0 = vtbl1_u8(a, perm_vec0);
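The permute vectors loaded above feed NEON table lookups: vqtbl1q_u8 on AArch64 and vtbl1_u8 on 32-bit ARM. As an aside, a self-contained sketch of that lookup idiom, reversing 16 bytes; it is illustrative only and not taken from chunkset_neon.c:

    #include <arm_neon.h>
    #include <stdint.h>

    /* Reverse 16 bytes with a table lookup: each lane of the index vector
     * selects one source byte, just as GET_CHUNK_MAG's permute vector does. */
    static inline void reverse16(uint8_t *dst, const uint8_t *src) {
        static const uint8_t idx[16] = { 15, 14, 13, 12, 11, 10, 9, 8,
                                          7,  6,  5,  4,  3,  2, 1, 0 };
    #if defined(__aarch64__)
        uint8x16_t v = vld1q_u8(src);
        uint8x16_t perm = vld1q_u8(idx);
        vst1q_u8(dst, vqtbl1q_u8(v, perm));   /* one 16-byte lookup */
    #else
        /* 32-bit NEON: vtbl1_u8 only indexes an 8-byte table, so do two halves. */
        uint8x8_t lo = vld1_u8(src), hi = vld1_u8(src + 8);
        uint8x8_t perm_lo = vld1_u8(idx), perm_hi = vld1_u8(idx + 8);
        /* Indices 15..8 select from hi; subtract 8 so they index that half. */
        vst1_u8(dst,     vtbl1_u8(hi, vsub_u8(perm_lo, vdup_n_u8(8))));
        vst1_u8(dst + 8, vtbl1_u8(lo, perm_hi));
    #endif
    }
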
11 changes: 11 additions & 0 deletions arch/arm/neon_intrins.h
@@ -60,6 +60,17 @@ static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
     vst1q_u16(p + 24, a.val[3]);
 }
 # endif // HASLD4 check
+
+# ifndef _MSC_VER
+#  define vld1_u8_ex(p, align) vld1_u8(HINT_ALIGNED((p), (align)/8))
+#  define vld1q_u8_ex(p, align) vld1q_u8(HINT_ALIGNED((p), (align)/8))
+# endif
+# if !defined(_MSC_VER) || !defined(ARM_NEON_HASLD4)
+#  define vld1q_u8_x4_ex(p, align) vld1q_u8_x4(HINT_ALIGNED((p), (align)/8))
+#  define vld1q_u16_x4_ex(p, align) vld1q_u16_x4(HINT_ALIGNED((p), (align)/8))
+#  define vst1q_u16_x4_ex(p, a, align) vst1q_u16_x4(HINT_ALIGNED((p), (align)/8), a)
+# endif
+
 #endif

 #endif // include guard ARM_NEON_INTRINS_H
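
The new _ex wrappers take the alignment in bits and divide by 8 before handing it to HINT_ALIGNED, whose definition lives elsewhere in the tree and is not shown in this diff; the _MSC_VER guards imply that MSVC is assumed to provide the _ex load/store intrinsics natively. A minimal sketch of how such a hint can be expressed on GCC/Clang (MY_HINT_ALIGNED is a hypothetical stand-in, not necessarily zlib-ng's actual macro):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Hypothetical stand-in for HINT_ALIGNED: __builtin_assume_aligned attaches
     * an alignment promise to a pointer without otherwise changing the load. */
    #if defined(__GNUC__) || defined(__clang__)
    #  define MY_HINT_ALIGNED(p, n) __builtin_assume_aligned((p), (n))
    #else
    #  define MY_HINT_ALIGNED(p, n) (p)
    #endif

    /* Mirrors what vld1q_u8_ex(p, 128) expands to under the new macros: a plain
     * vld1q_u8 on a pointer the compiler may now treat as 16-byte aligned. */
    static inline uint8x16_t load16_aligned_hint(const uint8_t *p) {
        return vld1q_u8(MY_HINT_ALIGNED(p, 128 / 8));
    }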
8 changes: 4 additions & 4 deletions arch/arm/slide_hash_neon.c
@@ -27,12 +27,12 @@ static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize
 
     n = size / (sizeof(uint16x8_t) * 8);
     do {
-        p0 = vld1q_u16_x4(table);
-        p1 = vld1q_u16_x4(table+32);
+        p0 = vld1q_u16_x4_ex(table, 256);
+        p1 = vld1q_u16_x4_ex(table+32, 256);
         vqsubq_u16_x4_x1(p0, p0, v);
         vqsubq_u16_x4_x1(p1, p1, v);
-        vst1q_u16_x4(table, p0);
-        vst1q_u16_x4(table+32, p1);
+        vst1q_u16_x4_ex(table, p0, 256);
+        vst1q_u16_x4_ex(table+32, p1, 256);
         table += 64;
     } while (--n);
 }
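For reference, a scalar sketch of what the vectorized slide_hash_chain loop computes: every table entry is reduced by the window size, clamped at zero (vqsubq_u16 is a saturating subtract). The Pos typedef below is an assumption matching zlib-ng's 16-bit hash positions:

    #include <stdint.h>

    typedef uint16_t Pos; /* assumption: 16-bit table entry, as in zlib-ng */

    /* Scalar equivalent of the NEON loop above. */
    static void slide_hash_chain_scalar(Pos *table, uint32_t entries, uint16_t wsize) {
        for (uint32_t i = 0; i < entries; i++) {
            table[i] = (Pos)((table[i] > wsize) ? (table[i] - wsize) : 0);
        }
    }
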
