From 06895bc1b3a3cb01a9ae663fc9db7319989025da Mon Sep 17 00:00:00 2001
From: Hans Kristian Rosbach <hk-git@circlestorm.org>
Date: Wed, 3 Jan 2024 15:59:40 +0100
Subject: [PATCH] Move crc32 C fallbacks to arch/generic

---
 CMakeLists.txt                              |   5 +-
 Makefile.in                                 |   6 +-
 arch/generic/Makefile.in                    |  16 +-
 arch/generic/crc32_braid_c.c                | 235 ++++++++++++++++++++
 crc32_fold.c => arch/generic/crc32_fold_c.c |   5 +-
 crc32_fold.h => arch/generic/crc32_fold_c.h |  12 +-
 arch/x86/crc32_pclmulqdq_tpl.h              |   2 +-
 cpu_features.h                              |   2 +-
 crc32.h                                     |  16 ++
 crc32_braid.c                               | 225 -------------------
 crc32_braid_comb.c                          |   1 -
 crc32_braid_p.h                             |   1 -
 deflate.h                                   |   2 +-
 functable.h                                 |   2 +-
 inflate.h                                   |   2 +-
 win32/Makefile.a64                          |  10 +-
 win32/Makefile.arm                          |  10 +-
 win32/Makefile.msc                          |  12 +-
 18 files changed, 301 insertions(+), 263 deletions(-)
 create mode 100644 arch/generic/crc32_braid_c.c
 rename crc32_fold.c => arch/generic/crc32_fold_c.c (96%)
 rename crc32_fold.h => arch/generic/crc32_fold_c.h (68%)
 create mode 100644 crc32.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 536f30cd40..ea36dcd108 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -978,6 +978,7 @@ set(ZLIB_PUBLIC_HDRS
 )
 set(ZLIB_PRIVATE_HDRS
     arch/generic/adler32_fold_c.h
+    arch/generic/crc32_fold_c.h
     adler32_p.h
     chunkset_tpl.h
     compare256_rle.h
@@ -985,7 +986,6 @@ set(ZLIB_PRIVATE_HDRS
     crc32_braid_p.h
     crc32_braid_comb_p.h
     crc32_braid_tbl.h
-    crc32_fold.h
     deflate.h
     deflate_p.h
     functable.h
@@ -1006,6 +1006,8 @@ set(ZLIB_PRIVATE_HDRS
 set(ZLIB_SRCS
     arch/generic/adler32_c.c
     arch/generic/adler32_fold_c.c
+    arch/generic/crc32_braid_c.c
+    arch/generic/crc32_fold_c.c
     adler32.c
     chunkset.c
     compare256.c
@@ -1013,7 +1015,6 @@ set(ZLIB_SRCS
     cpu_features.c
     crc32_braid.c
     crc32_braid_comb.c
-    crc32_fold.c
     deflate.c
     deflate_fast.c
     deflate_huff.c
diff --git a/Makefile.in b/Makefile.in
index e4295902eb..6d60d8030f 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -76,6 +76,8 @@ pkgconfigdir = ${libdir}/pkgconfig
 OBJZ = \
 	arch/generic/adler32_c.o \
 	arch/generic/adler32_fold_c.o \
+	arch/generic/crc32_braid_c.o \
+	arch/generic/crc32_fold_c.o \
 	adler32.o \
 	chunkset.o \
 	compare256.o \
@@ -83,7 +85,6 @@ OBJZ = \
 	cpu_features.o \
 	crc32_braid.o \
 	crc32_braid_comb.o \
-	crc32_fold.o \
 	deflate.o \
 	deflate_fast.o \
 	deflate_huff.o \
@@ -115,6 +116,8 @@ OBJC = $(OBJZ) $(OBJG)
 PIC_OBJZ = \
 	arch/generic/adler32_c.lo \
 	arch/generic/adler32_fold_c.lo \
+	arch/generic/crc32_braid_c.lo \
+	arch/generic/crc32_fold_c.lo \
 	adler32.lo \
 	chunkset.lo \
 	compare256.lo \
@@ -122,7 +125,6 @@ PIC_OBJZ = \
 	cpu_features.lo \
 	crc32_braid.lo \
 	crc32_braid_comb.lo \
-	crc32_fold.lo \
 	deflate.lo \
 	deflate_fast.lo \
 	deflate_huff.lo \
diff --git a/arch/generic/Makefile.in b/arch/generic/Makefile.in
index 5dc27a6aa3..31770f6ab7 100644
--- a/arch/generic/Makefile.in
+++ b/arch/generic/Makefile.in
@@ -14,7 +14,9 @@ TOPDIR=$(SRCTOP)
 
 all: \
  adler32_c.o adler32_c.lo \
- adler32_fold_c.o adler32_fold_c.lo
+ adler32_fold_c.o adler32_fold_c.lo \
+ crc32_braid_c.o crc32_braid_c.lo \
+ crc32_fold_c.o crc32_fold_c.lo
 
 
 adler32_c.o: $(SRCDIR)/adler32_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
@@ -29,6 +31,18 @@ adler32_fold_c.o: $(SRCDIR)/adler32_fold_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/funct
 adler32_fold_c.lo: $(SRCDIR)/adler32_fold_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h $(SRCDIR)/adler32_fold_c.h
 	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c
 
+crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
+
+crc32_braid_c.lo: $(SRCDIR)/crc32_braid_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c
+
+crc32_fold_c.o: $(SRCDIR)/crc32_fold_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h $(SRCTOP)/crc32.h $(SRCDIR)/crc32_fold_c.h
+	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c
+
+crc32_fold_c.lo: $(SRCDIR)/crc32_fold_c.c  $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h $(SRCTOP)/crc32.h $(SRCDIR)/crc32_fold_c.h
+	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c
+
 
 mostlyclean: clean
 clean:
diff --git a/arch/generic/crc32_braid_c.c b/arch/generic/crc32_braid_c.c
new file mode 100644
index 0000000000..7bf83f17ab
--- /dev/null
+++ b/arch/generic/crc32_braid_c.c
@@ -0,0 +1,235 @@
+/* crc32_braid_c.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2022 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * This interleaved implementation of a CRC makes use of pipelined multiple
+ * arithmetic-logic units, commonly found in modern CPU cores. It is due to
+ * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
+ */
+
+#include "zbuild.h"
+#include "crc32_braid_p.h"
+#include "crc32_braid_tbl.h"
+
+/* ========================================================================= */
+
+/*
+  A CRC of a message is computed on N braids of words in the message, where
+  each word consists of W bytes (4 or 8). If N is 3, for example, then three
+  running sparse CRCs are calculated respectively on each braid, at these
+  indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
+  This is done starting at a word boundary, and continues until as many blocks
+  of N * W bytes as are available have been processed. The results are combined
+  into a single CRC at the end. For this code, N must be in the range 1..6 and
+  W must be 4 or 8. The upper limit on N can be increased if desired by adding
+  more #if blocks, extending the patterns apparent in the code. In addition,
+  crc32 tables would need to be regenerated, if the maximum N value is increased.
+
+  N and W are chosen empirically by benchmarking the execution time on a given
+  processor. The choices for N and W below were based on testing on Intel Kaby
+  Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
+  Octeon II processors. The Intel, AMD, and ARM processors were all fastest
+  with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
+  They were all tested with either gcc or clang, all using the -O3 optimization
+  level. Your mileage may vary.
+*/
+
+/* ========================================================================= */
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#  define ZSWAPWORD(word) (word)
+#  define BRAID_TABLE crc_braid_table
+#elif BYTE_ORDER == BIG_ENDIAN
+#  if W == 8
+#    define ZSWAPWORD(word) ZSWAP64(word)
+#  elif W == 4
+#    define ZSWAPWORD(word) ZSWAP32(word)
+#  endif
+#  define BRAID_TABLE crc_braid_big_table
+#else
+#  error "No endian defined"
+#endif
+#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
+#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
+
+/* ========================================================================= */
+#ifdef W
+/*
+  Return the CRC of the W bytes in the z_word_t data, taking the
+  least-significant byte of the word as the first byte of data, without any pre
+  or post conditioning. This is used to combine the CRCs of each braid.
+ */
+#if BYTE_ORDER == LITTLE_ENDIAN
+static uint32_t crc_word(z_word_t data) {
+    int k;
+    for (k = 0; k < W; k++)
+        data = (data >> 8) ^ crc_table[data & 0xff];
+    return (uint32_t)data;
+}
+#elif BYTE_ORDER == BIG_ENDIAN
+static z_word_t crc_word(z_word_t data) {
+    int k;
+    for (k = 0; k < W; k++)
+        data = (data << 8) ^
+            crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
+    return data;
+}
+#endif /* BYTE_ORDER */
+
+#endif /* W */
+
+/* ========================================================================= */
+Z_INTERNAL uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len) {
+    Z_REGISTER uint32_t c;
+
+    /* Pre-condition the CRC */
+    c = (~crc) & 0xffffffff;
+
+#ifdef W
+    /* If provided enough bytes, do a braided CRC calculation. */
+    if (len >= N * W + W - 1) {
+        size_t blks;
+        z_word_t const *words;
+        int k;
+
+        /* Compute the CRC up to a z_word_t boundary. */
+        while (len && ((uintptr_t)buf & (W - 1)) != 0) {
+            len--;
+            DO1;
+        }
+
+        /* Compute the CRC on as many N z_word_t blocks as are available. */
+        blks = len / (N * W);
+        len -= blks * N * W;
+        words = (z_word_t const *)buf;
+
+        z_word_t crc0, word0, comb;
+#if N > 1
+        z_word_t crc1, word1;
+#if N > 2
+        z_word_t crc2, word2;
+#if N > 3
+        z_word_t crc3, word3;
+#if N > 4
+        z_word_t crc4, word4;
+#if N > 5
+        z_word_t crc5, word5;
+#endif
+#endif
+#endif
+#endif
+#endif
+        /* Initialize the CRC for each braid. */
+        crc0 = ZSWAPWORD(c);
+#if N > 1
+        crc1 = 0;
+#if N > 2
+        crc2 = 0;
+#if N > 3
+        crc3 = 0;
+#if N > 4
+        crc4 = 0;
+#if N > 5
+        crc5 = 0;
+#endif
+#endif
+#endif
+#endif
+#endif
+        /* Process the first blks-1 blocks, computing the CRCs on each braid independently. */
+        while (--blks) {
+            /* Load the word for each braid into registers. */
+            word0 = crc0 ^ words[0];
+#if N > 1
+            word1 = crc1 ^ words[1];
+#if N > 2
+            word2 = crc2 ^ words[2];
+#if N > 3
+            word3 = crc3 ^ words[3];
+#if N > 4
+            word4 = crc4 ^ words[4];
+#if N > 5
+            word5 = crc5 ^ words[5];
+#endif
+#endif
+#endif
+#endif
+#endif
+            words += N;
+
+            /* Compute and update the CRC for each word. The loop should get unrolled. */
+            crc0 = BRAID_TABLE[0][word0 & 0xff];
+#if N > 1
+            crc1 = BRAID_TABLE[0][word1 & 0xff];
+#if N > 2
+            crc2 = BRAID_TABLE[0][word2 & 0xff];
+#if N > 3
+            crc3 = BRAID_TABLE[0][word3 & 0xff];
+#if N > 4
+            crc4 = BRAID_TABLE[0][word4 & 0xff];
+#if N > 5
+            crc5 = BRAID_TABLE[0][word5 & 0xff];
+#endif
+#endif
+#endif
+#endif
+#endif
+            for (k = 1; k < W; k++) {
+                crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff];
+#if N > 1
+                crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff];
+#if N > 2
+                crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff];
+#if N > 3
+                crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff];
+#if N > 4
+                crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff];
+#if N > 5
+                crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff];
+#endif
+#endif
+#endif
+#endif
+#endif
+            }
+        }
+
+        /* Process the last block, combining the CRCs of the N braids at the same time. */
+        comb = crc_word(crc0 ^ words[0]);
+#if N > 1
+        comb = crc_word(crc1 ^ words[1] ^ comb);
+#if N > 2
+        comb = crc_word(crc2 ^ words[2] ^ comb);
+#if N > 3
+        comb = crc_word(crc3 ^ words[3] ^ comb);
+#if N > 4
+        comb = crc_word(crc4 ^ words[4] ^ comb);
+#if N > 5
+        comb = crc_word(crc5 ^ words[5] ^ comb);
+#endif
+#endif
+#endif
+#endif
+#endif
+        words += N;
+        c = ZSWAPWORD(comb);
+
+        /* Update the pointer to the remaining bytes to process. */
+        buf = (const unsigned char *)words;
+    }
+
+#endif /* W */
+
+    /* Complete the computation of the CRC on any remaining bytes. */
+    while (len >= 8) {
+        len -= 8;
+        DO8;
+    }
+    while (len) {
+        len--;
+        DO1;
+    }
+
+    /* Return the CRC, post-conditioned. */
+    return c ^ 0xffffffff;
+}
diff --git a/crc32_fold.c b/arch/generic/crc32_fold_c.c
similarity index 96%
rename from crc32_fold.c
rename to arch/generic/crc32_fold_c.c
index 5b3c7c459f..404c6e2110 100644
--- a/crc32_fold.c
+++ b/arch/generic/crc32_fold_c.c
@@ -4,10 +4,9 @@
  */
 #include "zbuild.h"
 #include "functable.h"
+#include "crc32.h"
 
-#include "crc32_fold.h"
-
-#include <limits.h>
+#include "crc32_fold_c.h"
 
 Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
     crc->value = CRC32_INITIAL_VALUE;
diff --git a/crc32_fold.h b/arch/generic/crc32_fold_c.h
similarity index 68%
rename from crc32_fold.h
rename to arch/generic/crc32_fold_c.h
index 0d2ff66967..f68689c09e 100644
--- a/crc32_fold.h
+++ b/arch/generic/crc32_fold_c.h
@@ -2,16 +2,8 @@
  * Copyright (C) 2021 Nathan Moinvaziri
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
-#ifndef CRC32_FOLD_H_
-#define CRC32_FOLD_H_
-
-#define CRC32_FOLD_BUFFER_SIZE (16 * 4)
-/* sizeof(__m128i) * (4 folds) */
-
-typedef struct crc32_fold_s {
-    uint8_t fold[CRC32_FOLD_BUFFER_SIZE];
-    uint32_t value;
-} crc32_fold;
+#ifndef CRC32_FOLD_C_H_
+#define CRC32_FOLD_C_H_
 
 Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc);
 Z_INTERNAL void     crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
diff --git a/arch/x86/crc32_pclmulqdq_tpl.h b/arch/x86/crc32_pclmulqdq_tpl.h
index 7f29a5d0d1..8c06a60ef4 100644
--- a/arch/x86/crc32_pclmulqdq_tpl.h
+++ b/arch/x86/crc32_pclmulqdq_tpl.h
@@ -26,7 +26,7 @@
 #  include <immintrin.h>
 #endif
 
-#include "crc32_fold.h"
+#include "crc32.h"
 #include "crc32_braid_p.h"
 #include "x86_intrins.h"
 #include <assert.h>
diff --git a/cpu_features.h b/cpu_features.h
index 25c6b43914..05ee407bf5 100644
--- a/cpu_features.h
+++ b/cpu_features.h
@@ -6,7 +6,7 @@
 #ifndef CPU_FEATURES_H_
 #define CPU_FEATURES_H_
 
-#include "crc32_fold.h"
+#include "crc32.h"
 
 #if defined(X86_FEATURES)
 #  include "arch/x86/x86_features.h"
diff --git a/crc32.h b/crc32.h
new file mode 100644
index 0000000000..8c3d7a8a3e
--- /dev/null
+++ b/crc32.h
@@ -0,0 +1,16 @@
+/* crc32.h -- crc32 folding interface
+ * Copyright (C) 2021 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef CRC32_H_
+#define CRC32_H_
+
+#define CRC32_FOLD_BUFFER_SIZE (16 * 4)
+/* sizeof(__m128i) * (4 folds) */
+
+typedef struct crc32_fold_s {
+    uint8_t fold[CRC32_FOLD_BUFFER_SIZE];
+    uint32_t value;
+} crc32_fold;
+
+#endif
diff --git a/crc32_braid.c b/crc32_braid.c
index 96754b53df..fb6e519e7a 100644
--- a/crc32_braid.c
+++ b/crc32_braid.c
@@ -8,9 +8,7 @@
  */
 
 #include "zbuild.h"
-#include "zutil.h"
 #include "functable.h"
-#include "crc32_braid_p.h"
 #include "crc32_braid_tbl.h"
 
 /* ========================================================================= */
@@ -42,226 +40,3 @@ uint32_t Z_EXPORT PREFIX(crc32)(uint32_t crc, const unsigned char *buf, uint32_t
     return PREFIX(crc32_z)(crc, buf, len);
 }
 #endif
-
-/* ========================================================================= */
-
-/*
-  A CRC of a message is computed on N braids of words in the message, where
-  each word consists of W bytes (4 or 8). If N is 3, for example, then three
-  running sparse CRCs are calculated respectively on each braid, at these
-  indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
-  This is done starting at a word boundary, and continues until as many blocks
-  of N * W bytes as are available have been processed. The results are combined
-  into a single CRC at the end. For this code, N must be in the range 1..6 and
-  W must be 4 or 8. The upper limit on N can be increased if desired by adding
-  more #if blocks, extending the patterns apparent in the code. In addition,
-  crc32 tables would need to be regenerated, if the maximum N value is increased.
-
-  N and W are chosen empirically by benchmarking the execution time on a given
-  processor. The choices for N and W below were based on testing on Intel Kaby
-  Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
-  Octeon II processors. The Intel, AMD, and ARM processors were all fastest
-  with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
-  They were all tested with either gcc or clang, all using the -O3 optimization
-  level. Your mileage may vary.
-*/
-
-/* ========================================================================= */
-
-#if BYTE_ORDER == LITTLE_ENDIAN
-#  define ZSWAPWORD(word) (word)
-#  define BRAID_TABLE crc_braid_table
-#elif BYTE_ORDER == BIG_ENDIAN
-#  if W == 8
-#    define ZSWAPWORD(word) ZSWAP64(word)
-#  elif W == 4
-#    define ZSWAPWORD(word) ZSWAP32(word)
-#  endif
-#  define BRAID_TABLE crc_braid_big_table
-#else
-#  error "No endian defined"
-#endif
-#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
-#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
-
-/* ========================================================================= */
-#ifdef W
-/*
-  Return the CRC of the W bytes in the word_t data, taking the
-  least-significant byte of the word as the first byte of data, without any pre
-  or post conditioning. This is used to combine the CRCs of each braid.
- */
-#if BYTE_ORDER == LITTLE_ENDIAN
-static uint32_t crc_word(z_word_t data) {
-    int k;
-    for (k = 0; k < W; k++)
-        data = (data >> 8) ^ crc_table[data & 0xff];
-    return (uint32_t)data;
-}
-#elif BYTE_ORDER == BIG_ENDIAN
-static z_word_t crc_word(z_word_t data) {
-    int k;
-    for (k = 0; k < W; k++)
-        data = (data << 8) ^
-            crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
-    return data;
-}
-#endif /* BYTE_ORDER */
-
-#endif /* W */
-
-/* ========================================================================= */
-Z_INTERNAL uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len) {
-    Z_REGISTER uint32_t c;
-
-    /* Pre-condition the CRC */
-    c = (~crc) & 0xffffffff;
-
-#ifdef W
-    /* If provided enough bytes, do a braided CRC calculation. */
-    if (len >= N * W + W - 1) {
-        size_t blks;
-        z_word_t const *words;
-        int k;
-
-        /* Compute the CRC up to a z_word_t boundary. */
-        while (len && ((uintptr_t)buf & (W - 1)) != 0) {
-            len--;
-            DO1;
-        }
-
-        /* Compute the CRC on as many N z_word_t blocks as are available. */
-        blks = len / (N * W);
-        len -= blks * N * W;
-        words = (z_word_t const *)buf;
-
-        z_word_t crc0, word0, comb;
-#if N > 1
-        z_word_t crc1, word1;
-#if N > 2
-        z_word_t crc2, word2;
-#if N > 3
-        z_word_t crc3, word3;
-#if N > 4
-        z_word_t crc4, word4;
-#if N > 5
-        z_word_t crc5, word5;
-#endif
-#endif
-#endif
-#endif
-#endif
-        /* Initialize the CRC for each braid. */
-        crc0 = ZSWAPWORD(c);
-#if N > 1
-        crc1 = 0;
-#if N > 2
-        crc2 = 0;
-#if N > 3
-        crc3 = 0;
-#if N > 4
-        crc4 = 0;
-#if N > 5
-        crc5 = 0;
-#endif
-#endif
-#endif
-#endif
-#endif
-        /* Process the first blks-1 blocks, computing the CRCs on each braid independently. */
-        while (--blks) {
-            /* Load the word for each braid into registers. */
-            word0 = crc0 ^ words[0];
-#if N > 1
-            word1 = crc1 ^ words[1];
-#if N > 2
-            word2 = crc2 ^ words[2];
-#if N > 3
-            word3 = crc3 ^ words[3];
-#if N > 4
-            word4 = crc4 ^ words[4];
-#if N > 5
-            word5 = crc5 ^ words[5];
-#endif
-#endif
-#endif
-#endif
-#endif
-            words += N;
-
-            /* Compute and update the CRC for each word. The loop should get unrolled. */
-            crc0 = BRAID_TABLE[0][word0 & 0xff];
-#if N > 1
-            crc1 = BRAID_TABLE[0][word1 & 0xff];
-#if N > 2
-            crc2 = BRAID_TABLE[0][word2 & 0xff];
-#if N > 3
-            crc3 = BRAID_TABLE[0][word3 & 0xff];
-#if N > 4
-            crc4 = BRAID_TABLE[0][word4 & 0xff];
-#if N > 5
-            crc5 = BRAID_TABLE[0][word5 & 0xff];
-#endif
-#endif
-#endif
-#endif
-#endif
-            for (k = 1; k < W; k++) {
-                crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff];
-#if N > 1
-                crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff];
-#if N > 2
-                crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff];
-#if N > 3
-                crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff];
-#if N > 4
-                crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff];
-#if N > 5
-                crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff];
-#endif
-#endif
-#endif
-#endif
-#endif
-            }
-        }
-
-        /* Process the last block, combining the CRCs of the N braids at the same time. */
-        comb = crc_word(crc0 ^ words[0]);
-#if N > 1
-        comb = crc_word(crc1 ^ words[1] ^ comb);
-#if N > 2
-        comb = crc_word(crc2 ^ words[2] ^ comb);
-#if N > 3
-        comb = crc_word(crc3 ^ words[3] ^ comb);
-#if N > 4
-        comb = crc_word(crc4 ^ words[4] ^ comb);
-#if N > 5
-        comb = crc_word(crc5 ^ words[5] ^ comb);
-#endif
-#endif
-#endif
-#endif
-#endif
-        words += N;
-        c = ZSWAPWORD(comb);
-
-        /* Update the pointer to the remaining bytes to process. */
-        buf = (const unsigned char *)words;
-    }
-
-#endif /* W */
-
-    /* Complete the computation of the CRC on any remaining bytes. */
-    while (len >= 8) {
-        len -= 8;
-        DO8;
-    }
-    while (len) {
-        len--;
-        DO1;
-    }
-
-    /* Return the CRC, post-conditioned. */
-    return c ^ 0xffffffff;
-}
diff --git a/crc32_braid_comb.c b/crc32_braid_comb.c
index 75fb474258..f253ae10a2 100644
--- a/crc32_braid_comb.c
+++ b/crc32_braid_comb.c
@@ -7,7 +7,6 @@
  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
  */
 
-#include "zbuild.h"
 #include "zutil.h"
 #include "crc32_braid_p.h"
 #include "crc32_braid_tbl.h"
diff --git a/crc32_braid_p.h b/crc32_braid_p.h
index 1d8a07068a..65a535dc20 100644
--- a/crc32_braid_p.h
+++ b/crc32_braid_p.h
@@ -1,7 +1,6 @@
 #ifndef CRC32_BRAID_P_H_
 #define CRC32_BRAID_P_H_
 
-#include "zbuild.h"
 #include "zendian.h"
 
 /* Define N */
diff --git a/deflate.h b/deflate.h
index 55053b7fb0..ec1519daf4 100644
--- a/deflate.h
+++ b/deflate.h
@@ -12,7 +12,7 @@
 
 #include "zutil.h"
 #include "zendian.h"
-#include "crc32_fold.h"
+#include "crc32.h"
 
 /* define NO_GZIP when compiling if you want to disable gzip header and
    trailer creation by deflate().  NO_GZIP would be used to avoid linking in
diff --git a/functable.h b/functable.h
index 9f5dad9001..433f696d3c 100644
--- a/functable.h
+++ b/functable.h
@@ -7,7 +7,7 @@
 #define FUNCTABLE_H_
 
 #include "deflate.h"
-#include "crc32_fold.h"
+#include "crc32_fold_c.h"
 #include "adler32_fold_c.h"
 
 #ifdef ZLIB_COMPAT
diff --git a/inflate.h b/inflate.h
index 7a90c5ae59..1f43de297d 100644
--- a/inflate.h
+++ b/inflate.h
@@ -11,7 +11,7 @@
 #ifndef INFLATE_H_
 #define INFLATE_H_
 
-#include "crc32_fold.h"
+#include "crc32.h"
 
 /* define NO_GZIP when compiling if you want to disable gzip header and trailer decoding by inflate().
    NO_GZIP would be used to avoid linking in the crc code when it is not needed.
diff --git a/win32/Makefile.a64 b/win32/Makefile.a64
index f099cb06b3..985b758d8f 100644
--- a/win32/Makefile.a64
+++ b/win32/Makefile.a64
@@ -52,8 +52,9 @@ OBJS = \
 	compress.obj \
 	cpu_features.obj \
 	crc32_braid.obj \
+	crc32_braid_c.obj \
 	crc32_braid_comb.obj \
-	crc32_fold.obj \
+	crc32_fold_c.obj \
 	deflate.obj \
 	deflate_fast.obj \
 	deflate_huff.obj \
@@ -191,9 +192,10 @@ gzwrite.obj: $(SRCDIR)/gzwrite.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h $(SRCDIR)
 compress.obj: $(SRCDIR)/compress.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
 uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
 cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
-crc32_braid.obj: $(SRCDIR)/crc32_braid.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h
-crc32_braid_comb.obj: $(SRCDIR)/crc32_braid_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h $(SRCDIR)/crc32_braid_comb_p.h
-crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h
+crc32_braid.obj: $(SRCDIR)/crc32_braid.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_braid_tbl.h
+crc32_braid_c.obj: $(SRCDIR)/arch/generic/crc32_braid_c.c $(SRCDIR)/zbuild.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h
+crc32_braid_comb.obj: $(SRCDIR)/crc32_braid_comb.c $(SRCDIR)/zutil.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h $(SRCDIR)/crc32_braid_comb_p.h
+crc32_fold_c.obj: $(SRCDIR)/arch/generic/crc32_fold_c.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/crc32.h $(SRCDIR)/arch/generic/crc32_fold_c.h
 deflate.obj: $(SRCDIR)/deflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
 deflate_fast.obj: $(SRCDIR)/deflate_fast.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
 deflate_huff.obj: $(SRCDIR)/deflate_huff.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
diff --git a/win32/Makefile.arm b/win32/Makefile.arm
index 4cdd8952b8..5c7ae6bd17 100644
--- a/win32/Makefile.arm
+++ b/win32/Makefile.arm
@@ -57,8 +57,9 @@ OBJS = \
 	compress.obj \
 	cpu_features.obj \
 	crc32_braid.obj \
+	crc32_braid_c.obj \
 	crc32_braid_comb.obj \
-	crc32_fold.obj \
+	crc32_fold_c.obj \
 	deflate.obj \
 	deflate_fast.obj \
 	deflate_huff.obj \
@@ -212,9 +213,10 @@ compress.obj: $(SRCDIR)/compress.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
 uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
 chunkset.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
 cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
-crc32_braid.obj: $(SRCDIR)/crc32_braid.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h
-crc32_braid_comb.obj: $(SRCDIR)/crc32_braid_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h $(SRCDIR)/crc32_braid_comb_p.h
-crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h
+crc32_braid.obj: $(SRCDIR)/crc32_braid.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_braid_tbl.h
+crc32_braid_c.obj: $(SRCDIR)/arch/generic/crc32_braid_c.c $(SRCDIR)/zbuild.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h
+crc32_braid_comb.obj: $(SRCDIR)/crc32_braid_comb.c $(SRCDIR)/zutil.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h $(SRCDIR)/crc32_braid_comb_p.h
+crc32_fold_c.obj: $(SRCDIR)/arch/generic/crc32_fold_c.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/crc32.h $(SRCDIR)/arch/generic/crc32_fold_c.h
 deflate.obj: $(SRCDIR)/deflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
 deflate_fast.obj: $(SRCDIR)/deflate_fast.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
 deflate_huff.obj: $(SRCDIR)/deflate_huff.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
diff --git a/win32/Makefile.msc b/win32/Makefile.msc
index 470a379110..fb02db36b8 100644
--- a/win32/Makefile.msc
+++ b/win32/Makefile.msc
@@ -64,8 +64,9 @@ OBJS = \
 	compress.obj \
 	cpu_features.obj \
 	crc32_braid.obj \
+	crc32_braid_c.obj \
 	crc32_braid_comb.obj \
-	crc32_fold.obj \
+	crc32_fold_c.obj \
 	crc32_pclmulqdq.obj \
 	deflate.obj \
 	deflate_fast.obj \
@@ -210,11 +211,12 @@ chunkset_avx2.obj: $(SRCDIR)/arch/x86/chunkset_avx2.c $(SRCDIR)/zbuild.h $(SRCDI
 chunkset_sse2.obj: $(SRCDIR)/arch/x86/chunkset_sse2.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
 chunkset_ssse3.obj: $(SRCDIR)/arch/x86/chunkset_ssse3.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
 cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
-crc32_braid.obj: $(SRCDIR)/crc32_braid.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h
-crc32_braid_comb.obj: $(SRCDIR)/crc32_braid_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h $(SRCDIR)/crc32_braid_comb_p.h
-crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h
+crc32_braid.obj: $(SRCDIR)/crc32_braid.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_braid_tbl.h
+crc32_braid_c.obj: $(SRCDIR)/arch/generic/crc32_braid_c.c $(SRCDIR)/zbuild.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h
+crc32_braid_comb.obj: $(SRCDIR)/crc32_braid_comb.c $(SRCDIR)/zutil.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h $(SRCDIR)/crc32_braid_comb_p.h
+crc32_fold_c.obj: $(SRCDIR)/arch/generic/crc32_fold_c.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/crc32.h $(SRCDIR)/arch/generic/crc32_fold_c.h
 crc32_pclmulqdq.obj: $(SRCDIR)/arch/x86/crc32_pclmulqdq.c $(SRCDIR)/arch/x86/crc32_pclmulqdq_tpl.h $(SRCDIR)/arch/x86/crc32_fold_pclmulqdq_tpl.h \
-				 $(SRCDIR)/crc32_fold.h $(SRCDIR)/zbuild.h
+				 $(SRCDIR)/crc32.h $(SRCDIR)/arch/generic/crc32_fold_c.h $(SRCDIR)/zbuild.h
 deflate.obj: $(SRCDIR)/deflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
 deflate_fast.obj: $(SRCDIR)/deflate_fast.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
 deflate_huff.obj: $(SRCDIR)/deflate_huff.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h