diff --git a/adler32.c b/adler32.c
index c47f764586..95ac13c304 100644
--- a/adler32.c
+++ b/adler32.c
@@ -8,7 +8,7 @@
 #include "adler32_p.h"
 
 /* ========================================================================= */
-Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, uint64_t len) {
+Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
     uint32_t sum2;
     unsigned n;
 
diff --git a/adler32_fold.c b/adler32_fold.c
index adaabe0210..e2f6f9ac7d 100644
--- a/adler32_fold.c
+++ b/adler32_fold.c
@@ -9,19 +9,8 @@
 
 #include <limits.h>
 
-Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
     adler = functable.adler32(adler, src, len);
-/* Test that we don't try to copy more than actually fits in available address space */
-#if INTPTR_MAX > SSIZE_MAX
-    while (len > SSIZE_MAX) {
-        memcpy(dst, src, SSIZE_MAX);
-        dst += SSIZE_MAX;
-        src += SSIZE_MAX;
-        len -= SSIZE_MAX;
-    }
-#endif
-    if (len) {
-        memcpy(dst, src, (size_t)len);
-    }
+    memcpy(dst, src, len);
     return adler;
 }
diff --git a/adler32_fold.h b/adler32_fold.h
index bdaf2130fe..20aa1c7400 100644
--- a/adler32_fold.h
+++ b/adler32_fold.h
@@ -6,6 +6,6 @@
 #ifndef ADLER32_FOLD_H_
 #define ADLER32_FOLD_H_
 
-Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
+Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 
 #endif
diff --git a/adler32_p.h b/adler32_p.h
index 8fe41b1422..38ba2ad721 100644
--- a/adler32_p.h
+++ b/adler32_p.h
@@ -26,7 +26,7 @@ static inline uint32_t adler32_len_1(uint32_t adler, const uint8_t *buf, uint32_
     return adler | (sum2 << 16);
 }
 
-static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, uint64_t len, uint32_t sum2) {
+static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
     while (len) {
         --len;
         adler += *buf++;
@@ -38,7 +38,7 @@ static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, uint64
     return adler | (sum2 << 16);
 }
 
-static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, uint8_t *dst, uint64_t len, uint32_t sum2) {
+static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, uint8_t *dst, size_t len, uint32_t sum2) {
     while (len--) {
         *dst = *buf++;
         adler += *dst++;
@@ -50,7 +50,7 @@ static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, u
     return adler | (sum2 << 16);
 }
 
-static inline uint32_t adler32_len_64(uint32_t adler, const uint8_t *buf, uint64_t len, uint32_t sum2) {
+static inline uint32_t adler32_len_64(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
 #ifdef UNROLL_MORE
     while (len >= 16) {
         len -= 16;
diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c
index 9b9d65ddd4..c8871cc0cd 100644
--- a/arch/arm/adler32_neon.c
+++ b/arch/arm/adler32_neon.c
@@ -10,7 +10,7 @@
 #include "../../zbuild.h"
 #include "../../adler32_p.h"
 
-static void NEON_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
+static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
     static const uint16_t ALIGNED_(16) taps[64] = {
         64, 63, 62, 61, 60, 59, 58, 57,
         56, 55, 54, 53, 52, 51, 50, 49,
@@ -39,10 +39,10 @@ static void NEON_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
     uint16x8_t s2_4, s2_5, s2_6, s2_7;
     s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
 
-    uint64_t num_iter = len >> 2;
+    size_t num_iter = len >> 2;
     int rem = len & 3;
 
-    for (uint64_t i = 0; i < num_iter; ++i) {
+    for (size_t i = 0; i < num_iter; ++i) {
         uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);
 
         /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
@@ -133,7 +133,7 @@ static void NEON_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
     s[1] = vget_lane_u32(as, 1);
 }
 
-static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, uint64_t len) {
+static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
     unsigned int i;
     for (i = 0; i < len; ++i) {
         pair[0] += buf[i];
@@ -141,7 +141,7 @@ static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, uint64_t len) {
     }
 }
 
-uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, uint64_t len) {
+uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len) {
     /* split Adler-32 into component sums */
     uint32_t sum2 = (adler >> 16) & 0xffff;
     adler &= 0xffff;
diff --git a/arch/power/adler32_power8.c b/arch/power/adler32_power8.c
index 0e8e0d162c..497e2f92c8 100644
--- a/arch/power/adler32_power8.c
+++ b/arch/power/adler32_power8.c
@@ -52,7 +52,7 @@ static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsi
     return __a;
 }
 
-uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, uint64_t len) {
+uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
     uint32_t s1 = adler & 0xffff;
     uint32_t s2 = (adler >> 16) & 0xffff;
 
diff --git a/arch/power/adler32_vmx.c b/arch/power/adler32_vmx.c
index 2c8eb68270..1b648820d0 100644
--- a/arch/power/adler32_vmx.c
+++ b/arch/power/adler32_vmx.c
@@ -12,7 +12,7 @@
 
 #define vmx_zero() (vec_splat_u32(0))
 
-static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, uint64_t len) {
+static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
     unsigned int i;
     for (i = 0; i < len; ++i) {
         pair[0] += buf[i];
@@ -20,7 +20,7 @@ static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, u
     }
 }
 
-static void vmx_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
+static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
     /* Different taps for the separable components of sums */
     const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
     const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
@@ -113,7 +113,7 @@ static void vmx_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
     vec_ste(s2acc, 0, s+1);
 }
 
-uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, uint64_t len) {
+uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
     uint32_t sum2;
     uint32_t pair[16] ALIGNED_(16);
     memset(&pair[2], 0, 14);
diff --git a/arch/x86/adler32_avx2_tpl.h b/arch/x86/adler32_avx2_tpl.h
index 4ff1838de0..0b2e89be43 100644
--- a/arch/x86/adler32_avx2_tpl.h
+++ b/arch/x86/adler32_avx2_tpl.h
@@ -11,8 +11,8 @@
 #include "adler32_avx2_p.h"
 
 #ifdef X86_SSE42_ADLER32
-extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
-extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, uint64_t len);
+extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
 
 #define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
 #define sub32(a, b, c) adler32_ssse3(a, b, c)
@@ -22,9 +22,9 @@ extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, uint64_t len);
 #endif
 
 #ifdef COPY
-Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
 #else
-Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, uint64_t len) {
+Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
 #endif
     if (src == NULL) return 1L;
     if (len == 0) return adler;
@@ -61,7 +61,7 @@ Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, uint64_t le
         __m256i vs1_0 = vs1;
         __m256i vs3 = _mm256_setzero_si256();
 
-        uint64_t k = MIN(len, NMAX);
+        size_t k = MIN(len, NMAX);
         k -= k % 32;
         len -= k;
diff --git a/arch/x86/adler32_avx512_tpl.h b/arch/x86/adler32_avx512_tpl.h
index 2419865988..6ed39b45df 100644
--- a/arch/x86/adler32_avx512_tpl.h
+++ b/arch/x86/adler32_avx512_tpl.h
@@ -14,9 +14,9 @@
 #ifdef X86_AVX512_ADLER32
 
 #ifdef COPY
-Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
 #else
-Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, uint64_t len) {
+Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
 #endif
 
     if (src == NULL) return 1L;
@@ -52,7 +52,7 @@ Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, uint64_t
                                          56, 57, 58, 59, 60, 61, 62, 63, 64);
     const __m512i dot3v = _mm512_set1_epi16(1);
     const __m512i zero = _mm512_setzero_si512();
-    uint64_t k;
+    size_t k;
 
     while (len >= 64) {
         __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
diff --git a/arch/x86/adler32_avx512_vnni.c b/arch/x86/adler32_avx512_vnni.c
index ecebdec733..42a166062f 100644
--- a/arch/x86/adler32_avx512_vnni.c
+++ b/arch/x86/adler32_avx512_vnni.c
@@ -18,7 +18,7 @@
 #include "adler32_avx512_p.h"
 #include "adler32_avx2_p.h"
 
-Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, uint64_t len) {
+Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) {
     if (src == NULL) return 1L;
     if (len == 0) return adler;
 
@@ -54,7 +54,7 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, uint
     while (len >= 64) {
         vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
         vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
-        uint64_t k = MIN(len, NMAX);
+        size_t k = MIN(len, NMAX);
         k -= k % 64;
         len -= k;
         __m512i vs1_0 = vs1;
@@ -120,7 +120,7 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, uint
     return adler;
 }
 
-Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
     if (src == NULL) return 1L;
     if (len == 0) return adler;
 
@@ -151,7 +151,7 @@ Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst,
     while (len >= 32) {
         vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
         vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
-        uint64_t k = MIN(len, NMAX);
+        size_t k = MIN(len, NMAX);
         k -= k % 32;
         len -= k;
         __m256i vs1_0 = vs1;
diff --git a/arch/x86/adler32_sse42.c b/arch/x86/adler32_sse42.c
index 5e68e4c41e..ec0513409b 100644
--- a/arch/x86/adler32_sse42.c
+++ b/arch/x86/adler32_sse42.c
@@ -14,7 +14,7 @@
 
 #ifdef X86_SSE42_ADLER32
 
-Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
     uint32_t adler0, adler1;
     adler1 = (adler >> 16) & 0xffff;
     adler0 = adler & 0xffff;
@@ -31,7 +31,7 @@ Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const
     const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
     const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
     const __m128i dot3v = _mm_set1_epi16(1);
-    uint64_t k;
+    size_t k;
 
     while (len >= 16) {
diff --git a/arch/x86/adler32_ssse3.c b/arch/x86/adler32_ssse3.c
index 45fddd42af..1f4abba507 100644
--- a/arch/x86/adler32_ssse3.c
+++ b/arch/x86/adler32_ssse3.c
@@ -14,7 +14,7 @@
 
 #include <immintrin.h>
 
-Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, uint64_t len) {
+Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) {
     uint32_t sum2;
 
     /* split Adler-32 into component sums */
@@ -46,10 +46,10 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, uint64_t l
      * additions worthwhile or if it's worth it to just eat the cost of an unaligned
      * load. This is a pretty simple test, just test if 16 - the remainder + len is
      * < 16 */
-    uint64_t max_iters = NMAX;
-    uint64_t rem = (uintptr_t)buf & 15;
-    uint64_t align_offset = 16 - rem;
-    uint64_t k = 0;
+    size_t max_iters = NMAX;
+    size_t rem = (uintptr_t)buf & 15;
+    size_t align_offset = 16 - rem;
+    size_t k = 0;
     if (rem) {
         if (len < 16 + align_offset) {
             /* Let's eat the cost of this one unaligned load so that
diff --git a/cpu_features.h b/cpu_features.h
index 9d10ce7df0..72e40a1652 100644
--- a/cpu_features.h
+++ b/cpu_features.h
@@ -23,43 +23,43 @@ extern void cpu_check_features(void);
 
 /* adler32 */
-typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, uint64_t len);
+typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);
 
-extern uint32_t adler32_c(uint32_t adler, const uint8_t *buf, uint64_t len);
+extern uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
 
 #ifdef ARM_NEON_ADLER32
-extern uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, uint64_t len);
+extern uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
 #endif
 #ifdef PPC_VMX_ADLER32
-extern uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, uint64_t len);
+extern uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
 #endif
 #ifdef X86_SSSE3_ADLER32
-extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, uint64_t len);
+extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
 #endif
 #ifdef X86_AVX2_ADLER32
-extern uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, uint64_t len);
+extern uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
 #endif
 #ifdef X86_AVX512_ADLER32
-extern uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, uint64_t len);
+extern uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
 #endif
 #ifdef X86_AVX512VNNI_ADLER32
-extern uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, uint64_t len);
+extern uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
 #endif
 #ifdef POWER8_VSX_ADLER32
-extern uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, uint64_t len);
+extern uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
 #endif
 
 /* adler32 folding */
 #ifdef X86_SSE42_ADLER32
-extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
+extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 #endif
 #ifdef X86_AVX2_ADLER32
-extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
+extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 #endif
 #ifdef X86_AVX512_ADLER32
-extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
+extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 #endif
 #ifdef X86_AVX512VNNI_ADLER32
-extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
+extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 #endif
 
 /* CRC32 folding */
diff --git a/functable.c b/functable.c
index 5dd5e54d4c..260d52f2f3 100644
--- a/functable.c
+++ b/functable.c
@@ -162,7 +162,7 @@ Z_INTERNAL uint32_t longest_match_slow_stub(deflate_state *const s, Pos cur_matc
     return functable.longest_match_slow(s, cur_match);
 }
 
-Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const uint8_t *buf, uint64_t len) {
+Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const uint8_t *buf, size_t len) {
     // Initialize default
     functable.adler32 = &adler32_c;
     cpu_check_features();
@@ -202,7 +202,7 @@ Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const uint8_t *buf, uint64_t le
     return functable.adler32(adler, buf, len);
 }
 
-Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
     functable.adler32_fold_copy = &adler32_fold_copy_c;
 #if (defined X86_SSE42_ADLER32)
     if (x86_cpu_has_sse42)
diff --git a/functable.h b/functable.h
index 9b6ec3b542..531f3a1cef 100644
--- a/functable.h
+++ b/functable.h
@@ -11,8 +11,8 @@
 #include "adler32_fold.h"
 
 struct functable_s {
-    uint32_t (* adler32)            (uint32_t adler, const uint8_t *buf, uint64_t len);
-    uint32_t (* adler32_fold_copy)  (uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
+    uint32_t (* adler32)            (uint32_t adler, const uint8_t *buf, size_t len);
+    uint32_t (* adler32_fold_copy)  (uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
     uint32_t (* crc32)              (uint32_t crc, const uint8_t *buf, size_t len);
     uint32_t (* crc32_fold_reset)   (struct crc32_fold_s *crc);
     void     (* crc32_fold_copy)    (struct crc32_fold_s *crc, uint8_t *dst, const uint8_t *src, size_t len);
diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc
index 71ac72f295..f89659a7f1 100644
--- a/test/benchmarks/benchmark_adler32_copy.cc
+++ b/test/benchmarks/benchmark_adler32_copy.cc
@@ -18,7 +18,7 @@ extern "C" {
 #define MAX_RANDOM_INTS (1024 * 1024)
 #define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
 
-typedef uint32_t (*adler32_cpy_func)(uint32_t adler, unsigned char *dst, const uint8_t *buf, uint64_t len);
+typedef uint32_t (*adler32_cpy_func)(uint32_t adler, unsigned char *dst, const uint8_t *buf, size_t len);
 
 class adler32_copy: public benchmark::Fixture {
 private:
@@ -76,7 +76,7 @@ class adler32_copy: public benchmark::Fixture {
             state.SkipWithError("CPU does not support " #name); \
         } \
         Bench(state, [](uint32_t init_sum, unsigned char *dst, \
-                        const uint8_t *buf, uint64_t len) -> uint32_t { \
+                        const uint8_t *buf, size_t len) -> uint32_t { \
             memcpy(dst, buf, (size_t)len); \
             return fptr(init_sum, buf, len); \
         }); \
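
Note on the adler32_fold.c simplification: with a 64-bit len on a 32-bit target, a caller could in principle pass a length wider than size_t, which is why the old code clamped the memcpy into chunks. Once len is itself a size_t it already matches memcpy's parameter type, so the guard becomes dead code. Below is a minimal before/after sketch, not part of the patch: the helper names are hypothetical, and it uses SIZE_MAX from <stdint.h> for portability where the original guard used the POSIX SSIZE_MAX from <limits.h>.

#include <stdint.h>
#include <string.h>

/* Old shape: len may be wider than size_t, so the copy had to be
 * clamped to what a single memcpy() call can express. */
static void copy_u64(uint8_t *dst, const uint8_t *src, uint64_t len) {
    while (len > (uint64_t)SIZE_MAX) {
        memcpy(dst, src, SIZE_MAX);
        dst += SIZE_MAX;
        src += SIZE_MAX;
        len -= (uint64_t)SIZE_MAX;
    }
    memcpy(dst, src, (size_t)len);
}

/* New shape: len is a size_t, the same type memcpy() takes, so the
 * clamping loop and the narrowing cast both disappear. */
static void copy_size(uint8_t *dst, const uint8_t *src, size_t len) {
    memcpy(dst, src, len);
}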