Make use of NEON alignment hints
ccawley2011 committed Feb 19, 2025
1 parent 287c4dc commit 4a06577
Showing 4 changed files with 25 additions and 14 deletions.
14 changes: 7 additions & 7 deletions arch/arm/adler32_neon.c
@@ -11,7 +11,7 @@
 #include "adler32_p.h"

 static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
-    static const uint16_t ALIGNED_(16) taps[64] = {
+    static const uint16_t ALIGNED_(32) taps[64] = {
         64, 63, 62, 61, 60, 59, 58, 57,
         56, 55, 54, 53, 52, 51, 50, 49,
         48, 47, 46, 45, 44, 43, 42, 41,
@@ -43,7 +43,7 @@ static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
     int rem = len & 3;

     for (size_t i = 0; i < num_iter; ++i) {
-        uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);
+        uint8x16x4_t d0_d3 = vld1q_u8_x4_ex(buf, 128);

         /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
          * bit instruction, we'll have to make due summing to 16 bits first */
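
The comment in the hunk above refers to NEON's lack of a direct 8-bit to 32-bit sum. As background only (not part of this commit), a minimal sketch of the widening-sum idiom it alludes to, using standard intrinsics:

    #include <arm_neon.h>
    #include <stdint.h>

    /* Sum 16 bytes into one 32-bit value by widening in two steps:
     * 8-bit -> 16-bit pairwise add, then 16-bit -> 32-bit pairwise add. */
    static inline uint32_t sum16_widening(const uint8_t *p) {
        uint8x16_t bytes = vld1q_u8(p);       /* 16 x u8 */
        uint16x8_t s16 = vpaddlq_u8(bytes);   /* 8 x u16 pairwise sums */
        uint32x4_t s32 = vpaddlq_u16(s16);    /* 4 x u32 pairwise sums */
    #if defined(__aarch64__)
        return vaddvq_u32(s32);               /* horizontal add of 4 lanes */
    #else
        uint32x2_t lo = vadd_u32(vget_low_u32(s32), vget_high_u32(s32));
        return vget_lane_u32(vpadd_u32(lo, lo), 0);
    #endif
    }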
@@ -82,7 +82,7 @@ static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
     if (rem) {
         uint32x4_t s3acc_0 = vdupq_n_u32(0);
         while (rem--) {
-            uint8x16_t d0 = vld1q_u8(buf);
+            uint8x16_t d0 = vld1q_u8_ex(buf, 128);
             uint16x8_t adler;
             adler = vpaddlq_u8(d0);
             s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
@@ -97,8 +97,8 @@ static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
         s3acc = vaddq_u32(s3acc_0, s3acc);
     }

-    uint16x8x4_t t0_t3 = vld1q_u16_x4(taps);
-    uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32);
+    uint16x8x4_t t0_t3 = vld1q_u16_x4_ex(taps, 256);
+    uint16x8x4_t t4_t7 = vld1q_u16_x4_ex(taps + 32, 256);

     s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
     s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
@@ -171,11 +171,11 @@ Z_INTERNAL uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len)
     /* If memory is not SIMD aligned, do scalar sums to an aligned
      * offset, provided that doing so doesn't completely eliminate
      * SIMD operation. Aligned loads are still faster on ARM, even
-     * though there's no explicit aligned load instruction */
+     * when there's no explicit aligned load instruction */
     unsigned int align_offset = ((uintptr_t)buf & 15);
     unsigned int align_adj = (align_offset) ? 16 - align_offset : 0;

-    if (align_offset && len >= (16 + align_adj)) {
+    if (align_offset) {
         NEON_handle_tail(pair, buf, align_adj);
         n -= align_adj;
         done += align_adj;
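The last hunk above derives the scalar-sum adjustment from the low bits of the buffer pointer. A small standalone example of that arithmetic (the addresses are made up for illustration):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Worked example of the alignment adjustment: align_adj is how many
     * scalar bytes to consume before the pointer reaches the next
     * 16-byte boundary. */
    int main(void) {
        uintptr_t addrs[] = { 0x1000, 0x1001, 0x100f };
        for (size_t i = 0; i < sizeof(addrs) / sizeof(addrs[0]); i++) {
            unsigned int align_offset = (unsigned int)(addrs[i] & 15);
            unsigned int align_adj = align_offset ? 16 - align_offset : 0;
            printf("addr=0x%lx offset=%u adj=%u\n",
                   (unsigned long)addrs[i], align_offset, align_adj);
        }
        return 0; /* prints adj=0, 15 and 1 respectively */
    }
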
6 changes: 3 additions & 3 deletions arch/arm/chunkset_neon.c
@@ -68,12 +68,12 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
 #if defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__)
     uint8x16_t ret_vec = vld1q_u8(buf);

-    uint8x16_t perm_vec = vld1q_u8(permute_table + lut_rem.idx);
+    uint8x16_t perm_vec = vld1q_u8_ex(permute_table + lut_rem.idx, 256);
     return vqtbl1q_u8(ret_vec, perm_vec);
 #else
     uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
-    perm_vec0 = vld1_u8(permute_table + lut_rem.idx);
-    perm_vec1 = vld1_u8(permute_table + lut_rem.idx + 8);
+    perm_vec0 = vld1_u8_ex(permute_table + lut_rem.idx, 256);
+    perm_vec1 = vld1_u8_ex(permute_table + lut_rem.idx + 8, 64);
     a = vld1_u8(buf);
     b = vld1_u8(buf + 8);
     ret0 = vtbl1_u8(a, perm_vec0);
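The permute vectors loaded above feed NEON table lookups: vqtbl1q_u8 on AArch64 and vtbl1_u8 on 32-bit ARM. As an aside, a self-contained sketch of that lookup idiom, reversing 16 bytes; it is illustrative only and not taken from chunkset_neon.c:

    #include <arm_neon.h>
    #include <stdint.h>

    /* Reverse 16 bytes with a table lookup: each lane of the index vector
     * selects one source byte, just as GET_CHUNK_MAG's permute vector does. */
    static inline void reverse16(uint8_t *dst, const uint8_t *src) {
        static const uint8_t idx[16] = { 15, 14, 13, 12, 11, 10, 9, 8,
                                          7,  6,  5,  4,  3,  2, 1, 0 };
    #if defined(__aarch64__)
        uint8x16_t v = vld1q_u8(src);
        uint8x16_t perm = vld1q_u8(idx);
        vst1q_u8(dst, vqtbl1q_u8(v, perm));   /* one 16-byte lookup */
    #else
        /* 32-bit NEON: vtbl1_u8 only indexes an 8-byte table, so do two halves. */
        uint8x8_t lo = vld1_u8(src), hi = vld1_u8(src + 8);
        uint8x8_t perm_lo = vld1_u8(idx), perm_hi = vld1_u8(idx + 8);
        /* Indices 15..8 select from hi; subtract 8 so they index that half. */
        vst1_u8(dst,     vtbl1_u8(hi, vsub_u8(perm_lo, vdup_n_u8(8))));
        vst1_u8(dst + 8, vtbl1_u8(lo, perm_hi));
    #endif
    }
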
11 changes: 11 additions & 0 deletions arch/arm/neon_intrins.h
@@ -60,6 +60,17 @@ static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
     vst1q_u16(p + 24, a.val[3]);
 }
 # endif // HASLD4 check
+
+# ifndef _MSC_VER
+#  define vld1_u8_ex(p, align) vld1_u8(HINT_ALIGNED((p), (align)/8))
+#  define vld1q_u8_ex(p, align) vld1q_u8(HINT_ALIGNED((p), (align)/8))
+# endif
+# if !defined(_MSC_VER) || !defined(ARM_NEON_HASLD4)
+#  define vld1q_u8_x4_ex(p, align) vld1q_u8_x4(HINT_ALIGNED((p), (align)/8))
+#  define vld1q_u16_x4_ex(p, align) vld1q_u16_x4(HINT_ALIGNED((p), (align)/8))
+#  define vst1q_u16_x4_ex(p, a, align) vst1q_u16_x4(HINT_ALIGNED((p), (align)/8), a)
+# endif
+
 #endif

 #endif // include guard ARM_NEON_INTRINS_H
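
The new _ex wrappers take the alignment in bits and divide by 8 before handing it to HINT_ALIGNED, whose definition lives elsewhere in the tree and is not shown in this diff; the _MSC_VER guards imply that MSVC is assumed to provide the _ex load/store intrinsics natively. A minimal sketch of how such a hint can be expressed on GCC/Clang (MY_HINT_ALIGNED is a hypothetical stand-in, not necessarily zlib-ng's actual macro):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Hypothetical stand-in for HINT_ALIGNED: __builtin_assume_aligned attaches
     * an alignment promise to a pointer without otherwise changing the load. */
    #if defined(__GNUC__) || defined(__clang__)
    #  define MY_HINT_ALIGNED(p, n) __builtin_assume_aligned((p), (n))
    #else
    #  define MY_HINT_ALIGNED(p, n) (p)
    #endif

    /* Mirrors what vld1q_u8_ex(p, 128) expands to under the new macros: a plain
     * vld1q_u8 on a pointer the compiler may now treat as 16-byte aligned. */
    static inline uint8x16_t load16_aligned_hint(const uint8_t *p) {
        return vld1q_u8(MY_HINT_ALIGNED(p, 128 / 8));
    }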
8 changes: 4 additions & 4 deletions arch/arm/slide_hash_neon.c
@@ -27,12 +27,12 @@ static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize
 
     n = size / (sizeof(uint16x8_t) * 8);
     do {
-        p0 = vld1q_u16_x4(table);
-        p1 = vld1q_u16_x4(table+32);
+        p0 = vld1q_u16_x4_ex(table, 256);
+        p1 = vld1q_u16_x4_ex(table+32, 256);
         vqsubq_u16_x4_x1(p0, p0, v);
         vqsubq_u16_x4_x1(p1, p1, v);
-        vst1q_u16_x4(table, p0);
-        vst1q_u16_x4(table+32, p1);
+        vst1q_u16_x4_ex(table, p0, 256);
+        vst1q_u16_x4_ex(table+32, p1, 256);
         table += 64;
     } while (--n);
 }
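For reference, a scalar sketch of what the vectorized slide_hash_chain loop computes: every table entry is reduced by the window size, clamped at zero (vqsubq_u16 is a saturating subtract). The Pos typedef below is an assumption matching zlib-ng's 16-bit hash positions:

    #include <stdint.h>

    typedef uint16_t Pos; /* assumption: 16-bit table entry, as in zlib-ng */

    /* Scalar equivalent of the NEON loop above. */
    static void slide_hash_chain_scalar(Pos *table, uint32_t entries, uint16_t wsize) {
        for (uint32_t i = 0; i < entries; i++) {
            table[i] = (Pos)((table[i] > wsize) ? (table[i] - wsize) : 0);
        }
    }
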
