From 06895bc1b3a3cb01a9ae663fc9db7319989025da Mon Sep 17 00:00:00 2001 From: Hans Kristian Rosbach Date: Wed, 3 Jan 2024 15:59:40 +0100 Subject: [PATCH] Move crc32 C fallbacks to arch/generic --- CMakeLists.txt | 5 +- Makefile.in | 6 +- arch/generic/Makefile.in | 16 +- arch/generic/crc32_braid_c.c | 235 ++++++++++++++++++++ crc32_fold.c => arch/generic/crc32_fold_c.c | 5 +- crc32_fold.h => arch/generic/crc32_fold_c.h | 12 +- arch/x86/crc32_pclmulqdq_tpl.h | 2 +- cpu_features.h | 2 +- crc32.h | 16 ++ crc32_braid.c | 225 ------------------- crc32_braid_comb.c | 1 - crc32_braid_p.h | 1 - deflate.h | 2 +- functable.h | 2 +- inflate.h | 2 +- win32/Makefile.a64 | 10 +- win32/Makefile.arm | 10 +- win32/Makefile.msc | 12 +- 18 files changed, 301 insertions(+), 263 deletions(-) create mode 100644 arch/generic/crc32_braid_c.c rename crc32_fold.c => arch/generic/crc32_fold_c.c (96%) rename crc32_fold.h => arch/generic/crc32_fold_c.h (68%) create mode 100644 crc32.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 536f30cd40..ea36dcd108 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -978,6 +978,7 @@ set(ZLIB_PUBLIC_HDRS ) set(ZLIB_PRIVATE_HDRS arch/generic/adler32_fold_c.h + arch/generic/crc32_fold_c.h adler32_p.h chunkset_tpl.h compare256_rle.h @@ -985,7 +986,6 @@ set(ZLIB_PRIVATE_HDRS crc32_braid_p.h crc32_braid_comb_p.h crc32_braid_tbl.h - crc32_fold.h deflate.h deflate_p.h functable.h @@ -1006,6 +1006,8 @@ set(ZLIB_PRIVATE_HDRS set(ZLIB_SRCS arch/generic/adler32_c.c arch/generic/adler32_fold_c.c + arch/generic/crc32_braid_c.c + arch/generic/crc32_fold_c.c adler32.c chunkset.c compare256.c @@ -1013,7 +1015,6 @@ set(ZLIB_SRCS cpu_features.c crc32_braid.c crc32_braid_comb.c - crc32_fold.c deflate.c deflate_fast.c deflate_huff.c diff --git a/Makefile.in b/Makefile.in index e4295902eb..6d60d8030f 100644 --- a/Makefile.in +++ b/Makefile.in @@ -76,6 +76,8 @@ pkgconfigdir = ${libdir}/pkgconfig OBJZ = \ arch/generic/adler32_c.o \ arch/generic/adler32_fold_c.o \ + 
arch/generic/crc32_braid_c.o \ + arch/generic/crc32_fold_c.o \ adler32.o \ chunkset.o \ compare256.o \ @@ -83,7 +85,6 @@ OBJZ = \ cpu_features.o \ crc32_braid.o \ crc32_braid_comb.o \ - crc32_fold.o \ deflate.o \ deflate_fast.o \ deflate_huff.o \ @@ -115,6 +116,8 @@ OBJC = $(OBJZ) $(OBJG) PIC_OBJZ = \ arch/generic/adler32_c.lo \ arch/generic/adler32_fold_c.lo \ + arch/generic/crc32_braid_c.lo \ + arch/generic/crc32_fold_c.lo \ adler32.lo \ chunkset.lo \ compare256.lo \ @@ -122,7 +125,6 @@ PIC_OBJZ = \ cpu_features.lo \ crc32_braid.lo \ crc32_braid_comb.lo \ - crc32_fold.lo \ deflate.lo \ deflate_fast.lo \ deflate_huff.lo \ diff --git a/arch/generic/Makefile.in b/arch/generic/Makefile.in index 5dc27a6aa3..31770f6ab7 100644 --- a/arch/generic/Makefile.in +++ b/arch/generic/Makefile.in @@ -14,7 +14,9 @@ TOPDIR=$(SRCTOP) all: \ adler32_c.o adler32_c.lo \ - adler32_fold_c.o adler32_fold_c.lo + adler32_fold_c.o adler32_fold_c.lo \ + crc32_braid_c.o crc32_braid_c.lo \ + crc32_fold_c.o crc32_fold_c.lo adler32_c.o: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h @@ -29,6 +31,18 @@ adler32_fold_c.o: $(SRCDIR)/adler32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/funct adler32_fold_c.lo: $(SRCDIR)/adler32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h $(SRCDIR)/adler32_fold_c.h $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c +crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c + +crc32_braid_c.lo: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c + +crc32_fold_c.o: $(SRCDIR)/crc32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h $(SRCDIR)/crc32_fold_c.h + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c + +crc32_fold_c.lo: $(SRCDIR)/crc32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h 
$(SRCDIR)/crc32_fold_c.h + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c + mostlyclean: clean clean: diff --git a/arch/generic/crc32_braid_c.c b/arch/generic/crc32_braid_c.c new file mode 100644 index 0000000000..7bf83f17ab --- /dev/null +++ b/arch/generic/crc32_braid_c.c @@ -0,0 +1,235 @@ +/* crc32_braid_c.c -- compute the CRC-32 of a data stream + * Copyright (C) 1995-2022 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + * + * This interleaved implementation of a CRC makes use of pipelined multiple + * arithmetic-logic units, commonly found in modern CPU cores. It is due to + * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution. + */ + +#include "zbuild.h" +#include "crc32_braid_p.h" +#include "crc32_braid_tbl.h" + +/* ========================================================================= */ + +/* + A CRC of a message is computed on N braids of words in the message, where + each word consists of W bytes (4 or 8). If N is 3, for example, then three + running sparse CRCs are calculated respectively on each braid, at these + indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ... + This is done starting at a word boundary, and continues until as many blocks + of N * W bytes as are available have been processed. The results are combined + into a single CRC at the end. For this code, N must be in the range 1..6 and + W must be 4 or 8. The upper limit on N can be increased if desired by adding + more #if blocks, extending the patterns apparent in the code. In addition, + crc32 tables would need to be regenerated, if the maximum N value is increased. + + N and W are chosen empirically by benchmarking the execution time on a given + processor. The choices for N and W below were based on testing on Intel Kaby + Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64 + Octeon II processors.
The Intel, AMD, and ARM processors were all fastest + with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4. + They were all tested with either gcc or clang, all using the -O3 optimization + level. Your mileage may vary. +*/ + +/* ========================================================================= */ + +#if BYTE_ORDER == LITTLE_ENDIAN +# define ZSWAPWORD(word) (word) +# define BRAID_TABLE crc_braid_table +#elif BYTE_ORDER == BIG_ENDIAN +# if W == 8 +# define ZSWAPWORD(word) ZSWAP64(word) +# elif W == 4 +# define ZSWAPWORD(word) ZSWAP32(word) +# endif +# define BRAID_TABLE crc_braid_big_table +#else +# error "No endian defined" +#endif +#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8) +#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1 + +/* ========================================================================= */ +#ifdef W +/* + Return the CRC of the W bytes in the word_t data, taking the + least-significant byte of the word as the first byte of data, without any pre + or post conditioning. This is used to combine the CRCs of each braid. + */ +#if BYTE_ORDER == LITTLE_ENDIAN +static uint32_t crc_word(z_word_t data) { + int k; + for (k = 0; k < W; k++) + data = (data >> 8) ^ crc_table[data & 0xff]; + return (uint32_t)data; +} +#elif BYTE_ORDER == BIG_ENDIAN +static z_word_t crc_word(z_word_t data) { + int k; + for (k = 0; k < W; k++) + data = (data << 8) ^ + crc_big_table[(data >> ((W - 1) << 3)) & 0xff]; + return data; +} +#endif /* BYTE_ORDER */ + +#endif /* W */ + +/* ========================================================================= */ +Z_INTERNAL uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len) { + Z_REGISTER uint32_t c; + + /* Pre-condition the CRC */ + c = (~crc) & 0xffffffff; + +#ifdef W + /* If provided enough bytes, do a braided CRC calculation. */ + if (len >= N * W + W - 1) { + size_t blks; + z_word_t const *words; + int k; + + /* Compute the CRC up to a z_word_t boundary. 
*/ + while (len && ((uintptr_t)buf & (W - 1)) != 0) { + len--; + DO1; + } + + /* Compute the CRC on as many N z_word_t blocks as are available. */ + blks = len / (N * W); + len -= blks * N * W; + words = (z_word_t const *)buf; + + z_word_t crc0, word0, comb; +#if N > 1 + z_word_t crc1, word1; +#if N > 2 + z_word_t crc2, word2; +#if N > 3 + z_word_t crc3, word3; +#if N > 4 + z_word_t crc4, word4; +#if N > 5 + z_word_t crc5, word5; +#endif +#endif +#endif +#endif +#endif + /* Initialize the CRC for each braid. */ + crc0 = ZSWAPWORD(c); +#if N > 1 + crc1 = 0; +#if N > 2 + crc2 = 0; +#if N > 3 + crc3 = 0; +#if N > 4 + crc4 = 0; +#if N > 5 + crc5 = 0; +#endif +#endif +#endif +#endif +#endif + /* Process the first blks-1 blocks, computing the CRCs on each braid independently. */ + while (--blks) { + /* Load the word for each braid into registers. */ + word0 = crc0 ^ words[0]; +#if N > 1 + word1 = crc1 ^ words[1]; +#if N > 2 + word2 = crc2 ^ words[2]; +#if N > 3 + word3 = crc3 ^ words[3]; +#if N > 4 + word4 = crc4 ^ words[4]; +#if N > 5 + word5 = crc5 ^ words[5]; +#endif +#endif +#endif +#endif +#endif + words += N; + + /* Compute and update the CRC for each word. The loop should get unrolled. 
*/ + crc0 = BRAID_TABLE[0][word0 & 0xff]; +#if N > 1 + crc1 = BRAID_TABLE[0][word1 & 0xff]; +#if N > 2 + crc2 = BRAID_TABLE[0][word2 & 0xff]; +#if N > 3 + crc3 = BRAID_TABLE[0][word3 & 0xff]; +#if N > 4 + crc4 = BRAID_TABLE[0][word4 & 0xff]; +#if N > 5 + crc5 = BRAID_TABLE[0][word5 & 0xff]; +#endif +#endif +#endif +#endif +#endif + for (k = 1; k < W; k++) { + crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff]; +#if N > 1 + crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff]; +#if N > 2 + crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff]; +#if N > 3 + crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff]; +#if N > 4 + crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff]; +#if N > 5 + crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff]; +#endif +#endif +#endif +#endif +#endif + } + } + + /* Process the last block, combining the CRCs of the N braids at the same time. */ + comb = crc_word(crc0 ^ words[0]); +#if N > 1 + comb = crc_word(crc1 ^ words[1] ^ comb); +#if N > 2 + comb = crc_word(crc2 ^ words[2] ^ comb); +#if N > 3 + comb = crc_word(crc3 ^ words[3] ^ comb); +#if N > 4 + comb = crc_word(crc4 ^ words[4] ^ comb); +#if N > 5 + comb = crc_word(crc5 ^ words[5] ^ comb); +#endif +#endif +#endif +#endif +#endif + words += N; + c = ZSWAPWORD(comb); + + /* Update the pointer to the remaining bytes to process. */ + buf = (const unsigned char *)words; + } + +#endif /* W */ + + /* Complete the computation of the CRC on any remaining bytes. */ + while (len >= 8) { + len -= 8; + DO8; + } + while (len) { + len--; + DO1; + } + + /* Return the CRC, post-conditioned. 
*/ + return c ^ 0xffffffff; +} diff --git a/crc32_fold.c b/arch/generic/crc32_fold_c.c similarity index 96% rename from crc32_fold.c rename to arch/generic/crc32_fold_c.c index 5b3c7c459f..404c6e2110 100644 --- a/crc32_fold.c +++ b/arch/generic/crc32_fold_c.c @@ -4,10 +4,9 @@ */ #include "zbuild.h" #include "functable.h" +#include "crc32.h" -#include "crc32_fold.h" - -#include +#include "crc32_fold_c.h" Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) { crc->value = CRC32_INITIAL_VALUE; diff --git a/crc32_fold.h b/arch/generic/crc32_fold_c.h similarity index 68% rename from crc32_fold.h rename to arch/generic/crc32_fold_c.h index 0d2ff66967..f68689c09e 100644 --- a/crc32_fold.h +++ b/arch/generic/crc32_fold_c.h @@ -2,16 +2,8 @@ * Copyright (C) 2021 Nathan Moinvaziri * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifndef CRC32_FOLD_H_ -#define CRC32_FOLD_H_ - -#define CRC32_FOLD_BUFFER_SIZE (16 * 4) -/* sizeof(__m128i) * (4 folds) */ - -typedef struct crc32_fold_s { - uint8_t fold[CRC32_FOLD_BUFFER_SIZE]; - uint32_t value; -} crc32_fold; +#ifndef CRC32_FOLD_C_H_ +#define CRC32_FOLD_C_H_ Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc); Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); diff --git a/arch/x86/crc32_pclmulqdq_tpl.h b/arch/x86/crc32_pclmulqdq_tpl.h index 7f29a5d0d1..8c06a60ef4 100644 --- a/arch/x86/crc32_pclmulqdq_tpl.h +++ b/arch/x86/crc32_pclmulqdq_tpl.h @@ -26,7 +26,7 @@ # include #endif -#include "crc32_fold.h" +#include "crc32.h" #include "crc32_braid_p.h" #include "x86_intrins.h" #include diff --git a/cpu_features.h b/cpu_features.h index 25c6b43914..05ee407bf5 100644 --- a/cpu_features.h +++ b/cpu_features.h @@ -6,7 +6,7 @@ #ifndef CPU_FEATURES_H_ #define CPU_FEATURES_H_ -#include "crc32_fold.h" +#include "crc32.h" #if defined(X86_FEATURES) # include "arch/x86/x86_features.h" diff --git a/crc32.h b/crc32.h new file mode 100644 index 0000000000..8c3d7a8a3e 
--- /dev/null +++ b/crc32.h @@ -0,0 +1,16 @@ +/* crc32.h -- crc32 folding interface + * Copyright (C) 2021 Nathan Moinvaziri + * For conditions of distribution and use, see copyright notice in zlib.h + */ +#ifndef CRC32_H_ +#define CRC32_H_ + +#define CRC32_FOLD_BUFFER_SIZE (16 * 4) +/* sizeof(__m128i) * (4 folds) */ + +typedef struct crc32_fold_s { + uint8_t fold[CRC32_FOLD_BUFFER_SIZE]; + uint32_t value; +} crc32_fold; + +#endif diff --git a/crc32_braid.c b/crc32_braid.c index 96754b53df..fb6e519e7a 100644 --- a/crc32_braid.c +++ b/crc32_braid.c @@ -8,9 +8,7 @@ */ #include "zbuild.h" -#include "zutil.h" #include "functable.h" -#include "crc32_braid_p.h" #include "crc32_braid_tbl.h" /* ========================================================================= */ @@ -42,226 +40,3 @@ uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t return PREFIX(crc32_z)(crc, buf, len); } #endif - -/* ========================================================================= */ - -/* - A CRC of a message is computed on N braids of words in the message, where - each word consists of W bytes (4 or 8). If N is 3, for example, then three - running sparse CRCs are calculated respectively on each braid, at these - indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ... - This is done starting at a word boundary, and continues until as many blocks - of N * W bytes as are available have been processed. The results are combined - into a single CRC at the end. For this code, N must be in the range 1..6 and - W must be 4 or 8. The upper limit on N can be increased if desired by adding - more #if blocks, extending the patterns apparent in the code. In addition, - crc32 tables would need to be regenerated, if the maximum N value is increased. - - N and W are chosen empirically by benchmarking the execution time on a given - processor. 
The choices for N and W below were based on testing on Intel Kaby - Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64 - Octeon II processors. The Intel, AMD, and ARM processors were all fastest - with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4. - They were all tested with either gcc or clang, all using the -O3 optimization - level. Your mileage may vary. -*/ - -/* ========================================================================= */ - -#if BYTE_ORDER == LITTLE_ENDIAN -# define ZSWAPWORD(word) (word) -# define BRAID_TABLE crc_braid_table -#elif BYTE_ORDER == BIG_ENDIAN -# if W == 8 -# define ZSWAPWORD(word) ZSWAP64(word) -# elif W == 4 -# define ZSWAPWORD(word) ZSWAP32(word) -# endif -# define BRAID_TABLE crc_braid_big_table -#else -# error "No endian defined" -#endif -#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8) -#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1 - -/* ========================================================================= */ -#ifdef W -/* - Return the CRC of the W bytes in the word_t data, taking the - least-significant byte of the word as the first byte of data, without any pre - or post conditioning. This is used to combine the CRCs of each braid. 
- */ -#if BYTE_ORDER == LITTLE_ENDIAN -static uint32_t crc_word(z_word_t data) { - int k; - for (k = 0; k < W; k++) - data = (data >> 8) ^ crc_table[data & 0xff]; - return (uint32_t)data; -} -#elif BYTE_ORDER == BIG_ENDIAN -static z_word_t crc_word(z_word_t data) { - int k; - for (k = 0; k < W; k++) - data = (data << 8) ^ - crc_big_table[(data >> ((W - 1) << 3)) & 0xff]; - return data; -} -#endif /* BYTE_ORDER */ - -#endif /* W */ - -/* ========================================================================= */ -Z_INTERNAL uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len) { - Z_REGISTER uint32_t c; - - /* Pre-condition the CRC */ - c = (~crc) & 0xffffffff; - -#ifdef W - /* If provided enough bytes, do a braided CRC calculation. */ - if (len >= N * W + W - 1) { - size_t blks; - z_word_t const *words; - int k; - - /* Compute the CRC up to a z_word_t boundary. */ - while (len && ((uintptr_t)buf & (W - 1)) != 0) { - len--; - DO1; - } - - /* Compute the CRC on as many N z_word_t blocks as are available. */ - blks = len / (N * W); - len -= blks * N * W; - words = (z_word_t const *)buf; - - z_word_t crc0, word0, comb; -#if N > 1 - z_word_t crc1, word1; -#if N > 2 - z_word_t crc2, word2; -#if N > 3 - z_word_t crc3, word3; -#if N > 4 - z_word_t crc4, word4; -#if N > 5 - z_word_t crc5, word5; -#endif -#endif -#endif -#endif -#endif - /* Initialize the CRC for each braid. */ - crc0 = ZSWAPWORD(c); -#if N > 1 - crc1 = 0; -#if N > 2 - crc2 = 0; -#if N > 3 - crc3 = 0; -#if N > 4 - crc4 = 0; -#if N > 5 - crc5 = 0; -#endif -#endif -#endif -#endif -#endif - /* Process the first blks-1 blocks, computing the CRCs on each braid independently. */ - while (--blks) { - /* Load the word for each braid into registers. 
*/ - word0 = crc0 ^ words[0]; -#if N > 1 - word1 = crc1 ^ words[1]; -#if N > 2 - word2 = crc2 ^ words[2]; -#if N > 3 - word3 = crc3 ^ words[3]; -#if N > 4 - word4 = crc4 ^ words[4]; -#if N > 5 - word5 = crc5 ^ words[5]; -#endif -#endif -#endif -#endif -#endif - words += N; - - /* Compute and update the CRC for each word. The loop should get unrolled. */ - crc0 = BRAID_TABLE[0][word0 & 0xff]; -#if N > 1 - crc1 = BRAID_TABLE[0][word1 & 0xff]; -#if N > 2 - crc2 = BRAID_TABLE[0][word2 & 0xff]; -#if N > 3 - crc3 = BRAID_TABLE[0][word3 & 0xff]; -#if N > 4 - crc4 = BRAID_TABLE[0][word4 & 0xff]; -#if N > 5 - crc5 = BRAID_TABLE[0][word5 & 0xff]; -#endif -#endif -#endif -#endif -#endif - for (k = 1; k < W; k++) { - crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff]; -#if N > 1 - crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff]; -#if N > 2 - crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff]; -#if N > 3 - crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff]; -#if N > 4 - crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff]; -#if N > 5 - crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff]; -#endif -#endif -#endif -#endif -#endif - } - } - - /* Process the last block, combining the CRCs of the N braids at the same time. */ - comb = crc_word(crc0 ^ words[0]); -#if N > 1 - comb = crc_word(crc1 ^ words[1] ^ comb); -#if N > 2 - comb = crc_word(crc2 ^ words[2] ^ comb); -#if N > 3 - comb = crc_word(crc3 ^ words[3] ^ comb); -#if N > 4 - comb = crc_word(crc4 ^ words[4] ^ comb); -#if N > 5 - comb = crc_word(crc5 ^ words[5] ^ comb); -#endif -#endif -#endif -#endif -#endif - words += N; - c = ZSWAPWORD(comb); - - /* Update the pointer to the remaining bytes to process. */ - buf = (const unsigned char *)words; - } - -#endif /* W */ - - /* Complete the computation of the CRC on any remaining bytes. */ - while (len >= 8) { - len -= 8; - DO8; - } - while (len) { - len--; - DO1; - } - - /* Return the CRC, post-conditioned. 
*/ - return c ^ 0xffffffff; -} diff --git a/crc32_braid_comb.c b/crc32_braid_comb.c index 75fb474258..f253ae10a2 100644 --- a/crc32_braid_comb.c +++ b/crc32_braid_comb.c @@ -7,7 +7,6 @@ * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution. */ -#include "zbuild.h" #include "zutil.h" #include "crc32_braid_p.h" #include "crc32_braid_tbl.h" diff --git a/crc32_braid_p.h b/crc32_braid_p.h index 1d8a07068a..65a535dc20 100644 --- a/crc32_braid_p.h +++ b/crc32_braid_p.h @@ -1,7 +1,6 @@ #ifndef CRC32_BRAID_P_H_ #define CRC32_BRAID_P_H_ -#include "zbuild.h" #include "zendian.h" /* Define N */ diff --git a/deflate.h b/deflate.h index 55053b7fb0..ec1519daf4 100644 --- a/deflate.h +++ b/deflate.h @@ -12,7 +12,7 @@ #include "zutil.h" #include "zendian.h" -#include "crc32_fold.h" +#include "crc32.h" /* define NO_GZIP when compiling if you want to disable gzip header and trailer creation by deflate(). NO_GZIP would be used to avoid linking in diff --git a/functable.h b/functable.h index 9f5dad9001..433f696d3c 100644 --- a/functable.h +++ b/functable.h @@ -7,7 +7,7 @@ #define FUNCTABLE_H_ #include "deflate.h" -#include "crc32_fold.h" +#include "crc32_fold_c.h" #include "adler32_fold_c.h" #ifdef ZLIB_COMPAT diff --git a/inflate.h b/inflate.h index 7a90c5ae59..1f43de297d 100644 --- a/inflate.h +++ b/inflate.h @@ -11,7 +11,7 @@ #ifndef INFLATE_H_ #define INFLATE_H_ -#include "crc32_fold.h" +#include "crc32.h" /* define NO_GZIP when compiling if you want to disable gzip header and trailer decoding by inflate(). NO_GZIP would be used to avoid linking in the crc code when it is not needed. 
diff --git a/win32/Makefile.a64 b/win32/Makefile.a64 index f099cb06b3..985b758d8f 100644 --- a/win32/Makefile.a64 +++ b/win32/Makefile.a64 @@ -52,8 +52,9 @@ OBJS = \ compress.obj \ cpu_features.obj \ crc32_braid.obj \ + crc32_braid_c.obj \ crc32_braid_comb.obj \ - crc32_fold.obj \ + crc32_fold_c.obj \ deflate.obj \ deflate_fast.obj \ deflate_huff.obj \ @@ -191,9 +192,10 @@ gzwrite.obj: $(SRCDIR)/gzwrite.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h $(SRCDIR) compress.obj: $(SRCDIR)/compress.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h -crc32_braid.obj: $(SRCDIR)/crc32_braid.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h -crc32_braid_comb.obj: $(SRCDIR)/crc32_braid_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h $(SRCDIR)/crc32_braid_comb_p.h -crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h +crc32_braid.obj: $(SRCDIR)/crc32_braid.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_braid_tbl.h +crc32_braid_c.obj: $(SRCDIR)/arch/generic/crc32_braid_c.c $(SRCDIR)/zbuild.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h +crc32_braid_comb.obj: $(SRCDIR)/crc32_braid_comb.c $(SRCDIR)/zutil.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h $(SRCDIR)/crc32_braid_comb_p.h +crc32_fold_c.obj: $(SRCDIR)/arch/generic/crc32_fold_c.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/arch/generic/crc32_fold_c.h deflate.obj: $(SRCDIR)/deflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_fast.obj: $(SRCDIR)/deflate_fast.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_huff.obj: $(SRCDIR)/deflate_huff.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h 
$(SRCDIR)/functable.h diff --git a/win32/Makefile.arm b/win32/Makefile.arm index 4cdd8952b8..5c7ae6bd17 100644 --- a/win32/Makefile.arm +++ b/win32/Makefile.arm @@ -57,8 +57,9 @@ OBJS = \ compress.obj \ cpu_features.obj \ crc32_braid.obj \ + crc32_braid_c.obj \ crc32_braid_comb.obj \ - crc32_fold.obj \ + crc32_fold_c.obj \ deflate.obj \ deflate_fast.obj \ deflate_huff.obj \ @@ -212,9 +213,10 @@ compress.obj: $(SRCDIR)/compress.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h chunkset.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h -crc32_braid.obj: $(SRCDIR)/crc32_braid.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h -crc32_braid_comb.obj: $(SRCDIR)/crc32_braid_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h $(SRCDIR)/crc32_braid_comb_p.h -crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h +crc32_braid.obj: $(SRCDIR)/crc32_braid.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_braid_tbl.h +crc32_braid_c.obj: $(SRCDIR)/arch/generic/crc32_braid_c.c $(SRCDIR)/zbuild.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h +crc32_braid_comb.obj: $(SRCDIR)/crc32_braid_comb.c $(SRCDIR)/zutil.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h $(SRCDIR)/crc32_braid_comb_p.h +crc32_fold_c.obj: $(SRCDIR)/arch/generic/crc32_fold_c.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/arch/generic/crc32_fold_c.h deflate.obj: $(SRCDIR)/deflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_fast.obj: $(SRCDIR)/deflate_fast.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_huff.obj: $(SRCDIR)/deflate_huff.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h 
diff --git a/win32/Makefile.msc b/win32/Makefile.msc index 470a379110..fb02db36b8 100644 --- a/win32/Makefile.msc +++ b/win32/Makefile.msc @@ -64,8 +64,9 @@ OBJS = \ compress.obj \ cpu_features.obj \ crc32_braid.obj \ + crc32_braid_c.obj \ crc32_braid_comb.obj \ - crc32_fold.obj \ + crc32_fold_c.obj \ crc32_pclmulqdq.obj \ deflate.obj \ deflate_fast.obj \ @@ -210,11 +211,12 @@ chunkset_avx2.obj: $(SRCDIR)/arch/x86/chunkset_avx2.c $(SRCDIR)/zbuild.h $(SRCDI chunkset_sse2.obj: $(SRCDIR)/arch/x86/chunkset_sse2.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h chunkset_ssse3.obj: $(SRCDIR)/arch/x86/chunkset_ssse3.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h -crc32_braid.obj: $(SRCDIR)/crc32_braid.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h -crc32_braid_comb.obj: $(SRCDIR)/crc32_braid_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h $(SRCDIR)/crc32_braid_comb_p.h -crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h +crc32_braid.obj: $(SRCDIR)/crc32_braid.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_braid_tbl.h +crc32_braid_c.obj: $(SRCDIR)/arch/generic/crc32_braid_c.c $(SRCDIR)/zbuild.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h +crc32_braid_comb.obj: $(SRCDIR)/crc32_braid_comb.c $(SRCDIR)/zutil.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h $(SRCDIR)/crc32_braid_comb_p.h +crc32_fold_c.obj: $(SRCDIR)/arch/generic/crc32_fold_c.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/arch/generic/crc32_fold_c.h crc32_pclmulqdq.obj: $(SRCDIR)/arch/x86/crc32_pclmulqdq.c $(SRCDIR)/arch/x86/crc32_pclmulqdq_tpl.h $(SRCDIR)/arch/x86/crc32_fold_pclmulqdq_tpl.h \ - $(SRCDIR)/crc32_fold.h $(SRCDIR)/zbuild.h + $(SRCDIR)/arch/generic/crc32_fold_c.h $(SRCDIR)/zbuild.h deflate.obj: $(SRCDIR)/deflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h 
$(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_fast.obj: $(SRCDIR)/deflate_fast.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_huff.obj: $(SRCDIR)/deflate_huff.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h