Skip to content

Commit

Permalink
Move crc32 C fallbacks to arch/generic
Browse files Browse the repository at this point in the history
  • Loading branch information
Dead2 committed Jan 19, 2024
1 parent 4e132cc commit 06895bc
Show file tree
Hide file tree
Showing 18 changed files with 301 additions and 263 deletions.
5 changes: 3 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -978,14 +978,14 @@ set(ZLIB_PUBLIC_HDRS
)
set(ZLIB_PRIVATE_HDRS
arch/generic/adler32_fold_c.h
arch/generic/crc32_fold_c.h
adler32_p.h
chunkset_tpl.h
compare256_rle.h
cpu_features.h
crc32_braid_p.h
crc32_braid_comb_p.h
crc32_braid_tbl.h
crc32_fold.h
deflate.h
deflate_p.h
functable.h
Expand All @@ -1006,14 +1006,15 @@ set(ZLIB_PRIVATE_HDRS
set(ZLIB_SRCS
arch/generic/adler32_c.c
arch/generic/adler32_fold_c.c
arch/generic/crc32_braid_c.c
arch/generic/crc32_fold_c.c
adler32.c
chunkset.c
compare256.c
compress.c
cpu_features.c
crc32_braid.c
crc32_braid_comb.c
crc32_fold.c
deflate.c
deflate_fast.c
deflate_huff.c
Expand Down
6 changes: 4 additions & 2 deletions Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,15 @@ pkgconfigdir = ${libdir}/pkgconfig
OBJZ = \
arch/generic/adler32_c.o \
arch/generic/adler32_fold_c.o \
arch/generic/crc32_braid_c.o \
arch/generic/crc32_fold_c.o \
adler32.o \
chunkset.o \
compare256.o \
compress.o \
cpu_features.o \
crc32_braid.o \
crc32_braid_comb.o \
crc32_fold.o \
deflate.o \
deflate_fast.o \
deflate_huff.o \
Expand Down Expand Up @@ -115,14 +116,15 @@ OBJC = $(OBJZ) $(OBJG)
PIC_OBJZ = \
arch/generic/adler32_c.lo \
arch/generic/adler32_fold_c.lo \
arch/generic/crc32_braid_c.lo \
arch/generic/crc32_fold_c.lo \
adler32.lo \
chunkset.lo \
compare256.lo \
compress.lo \
cpu_features.lo \
crc32_braid.lo \
crc32_braid_comb.lo \
crc32_fold.lo \
deflate.lo \
deflate_fast.lo \
deflate_huff.lo \
Expand Down
16 changes: 15 additions & 1 deletion arch/generic/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ TOPDIR=$(SRCTOP)

all: \
adler32_c.o adler32_c.lo \
adler32_fold_c.o adler32_fold_c.lo
adler32_fold_c.o adler32_fold_c.lo \
crc32_braid_c.o crc32_braid_c.lo \
crc32_fold_c.o crc32_fold_c.lo


adler32_c.o: $(SRCDIR)/adler32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/adler32_p.h
Expand All @@ -29,6 +31,18 @@ adler32_fold_c.o: $(SRCDIR)/adler32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/funct
adler32_fold_c.lo: $(SRCDIR)/adler32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h $(SRCDIR)/adler32_fold_c.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_fold_c.c

crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c

crc32_braid_c.lo: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c

crc32_fold_c.o: $(SRCDIR)/crc32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h $(SRCDIR)/crc32_fold_c.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c

crc32_fold_c.lo: $(SRCDIR)/crc32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h $(SRCDIR)/crc32_fold_c.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c


mostlyclean: clean
clean:
Expand Down
235 changes: 235 additions & 0 deletions arch/generic/crc32_braid_c.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
/* crc32_braid.c -- compute the CRC-32 of a data stream
* Copyright (C) 1995-2022 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
* This interleaved implementation of a CRC makes use of pipelined multiple
* arithmetic-logic units, commonly found in modern CPU cores. It is due to
* Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
*/

#include "zbuild.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"

/* ========================================================================= */

/*
A CRC of a message is computed on N braids of words in the message, where
each word consists of W bytes (4 or 8). If N is 3, for example, then three
running sparse CRCs are calculated respectively on each braid, at these
indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
This is done starting at a word boundary, and continues until as many blocks
of N * W bytes as are available have been processed. The results are combined
into a single CRC at the end. For this code, N must be in the range 1..6 and
W must be 4 or 8. The upper limit on N can be increased if desired by adding
more #if blocks, extending the patterns apparent in the code. In addition,
crc32 tables would need to be regenerated, if the maximum N value is increased.
N and W are chosen empirically by benchmarking the execution time on a given
processor. The choices for N and W below were based on testing on Intel Kaby
Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
Octeon II processors. The Intel, AMD, and ARM processors were all fastest
with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
They were all tested with either gcc or clang, all using the -O3 optimization
level. Your mileage may vary.
*/

/* ========================================================================= */

#if BYTE_ORDER == LITTLE_ENDIAN
# define ZSWAPWORD(word) (word)
# define BRAID_TABLE crc_braid_table
#elif BYTE_ORDER == BIG_ENDIAN
# if W == 8
# define ZSWAPWORD(word) ZSWAP64(word)
# elif W == 4
# define ZSWAPWORD(word) ZSWAP32(word)
# endif
# define BRAID_TABLE crc_braid_big_table
#else
# error "No endian defined"
#endif
#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1

/* ========================================================================= */
#ifdef W
/*
Return the CRC of the W bytes in the word_t data, taking the
least-significant byte of the word as the first byte of data, without any pre
or post conditioning. This is used to combine the CRCs of each braid.
*/
#if BYTE_ORDER == LITTLE_ENDIAN
static uint32_t crc_word(z_word_t data) {
int k;
for (k = 0; k < W; k++)
data = (data >> 8) ^ crc_table[data & 0xff];
return (uint32_t)data;
}
#elif BYTE_ORDER == BIG_ENDIAN
static z_word_t crc_word(z_word_t data) {
int k;
for (k = 0; k < W; k++)
data = (data << 8) ^
crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
return data;
}
#endif /* BYTE_ORDER */

#endif /* W */

/* ========================================================================= */
Z_INTERNAL uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len) {
Z_REGISTER uint32_t c;

/* Pre-condition the CRC */
c = (~crc) & 0xffffffff;

#ifdef W
/* If provided enough bytes, do a braided CRC calculation. */
if (len >= N * W + W - 1) {
size_t blks;
z_word_t const *words;
int k;

/* Compute the CRC up to a z_word_t boundary. */
while (len && ((uintptr_t)buf & (W - 1)) != 0) {
len--;
DO1;
}

/* Compute the CRC on as many N z_word_t blocks as are available. */
blks = len / (N * W);
len -= blks * N * W;
words = (z_word_t const *)buf;

z_word_t crc0, word0, comb;
#if N > 1
z_word_t crc1, word1;
#if N > 2
z_word_t crc2, word2;
#if N > 3
z_word_t crc3, word3;
#if N > 4
z_word_t crc4, word4;
#if N > 5
z_word_t crc5, word5;
#endif
#endif
#endif
#endif
#endif
/* Initialize the CRC for each braid. */
crc0 = ZSWAPWORD(c);
#if N > 1
crc1 = 0;
#if N > 2
crc2 = 0;
#if N > 3
crc3 = 0;
#if N > 4
crc4 = 0;
#if N > 5
crc5 = 0;
#endif
#endif
#endif
#endif
#endif
/* Process the first blks-1 blocks, computing the CRCs on each braid independently. */
while (--blks) {
/* Load the word for each braid into registers. */
word0 = crc0 ^ words[0];
#if N > 1
word1 = crc1 ^ words[1];
#if N > 2
word2 = crc2 ^ words[2];
#if N > 3
word3 = crc3 ^ words[3];
#if N > 4
word4 = crc4 ^ words[4];
#if N > 5
word5 = crc5 ^ words[5];
#endif
#endif
#endif
#endif
#endif
words += N;

/* Compute and update the CRC for each word. The loop should get unrolled. */
crc0 = BRAID_TABLE[0][word0 & 0xff];
#if N > 1
crc1 = BRAID_TABLE[0][word1 & 0xff];
#if N > 2
crc2 = BRAID_TABLE[0][word2 & 0xff];
#if N > 3
crc3 = BRAID_TABLE[0][word3 & 0xff];
#if N > 4
crc4 = BRAID_TABLE[0][word4 & 0xff];
#if N > 5
crc5 = BRAID_TABLE[0][word5 & 0xff];
#endif
#endif
#endif
#endif
#endif
for (k = 1; k < W; k++) {
crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff];
#if N > 1
crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff];
#if N > 2
crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff];
#if N > 3
crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff];
#if N > 4
crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff];
#if N > 5
crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff];
#endif
#endif
#endif
#endif
#endif
}
}

/* Process the last block, combining the CRCs of the N braids at the same time. */
comb = crc_word(crc0 ^ words[0]);
#if N > 1
comb = crc_word(crc1 ^ words[1] ^ comb);
#if N > 2
comb = crc_word(crc2 ^ words[2] ^ comb);
#if N > 3
comb = crc_word(crc3 ^ words[3] ^ comb);
#if N > 4
comb = crc_word(crc4 ^ words[4] ^ comb);
#if N > 5
comb = crc_word(crc5 ^ words[5] ^ comb);
#endif
#endif
#endif
#endif
#endif
words += N;
c = ZSWAPWORD(comb);

/* Update the pointer to the remaining bytes to process. */
buf = (const unsigned char *)words;
}

#endif /* W */

/* Complete the computation of the CRC on any remaining bytes. */
while (len >= 8) {
len -= 8;
DO8;
}
while (len) {
len--;
DO1;
}

/* Return the CRC, post-conditioned. */
return c ^ 0xffffffff;
}
5 changes: 2 additions & 3 deletions crc32_fold.c → arch/generic/crc32_fold_c.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,9 @@
*/
#include "zbuild.h"
#include "functable.h"
#include "crc32.h"

#include "crc32_fold.h"

#include <limits.h>
#include "crc32_fold_c.h"

Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
crc->value = CRC32_INITIAL_VALUE;
Expand Down
12 changes: 2 additions & 10 deletions crc32_fold.h → arch/generic/crc32_fold_c.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,8 @@
* Copyright (C) 2021 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef CRC32_FOLD_H_
#define CRC32_FOLD_H_

#define CRC32_FOLD_BUFFER_SIZE (16 * 4)
/* sizeof(__m128i) * (4 folds) */

typedef struct crc32_fold_s {
uint8_t fold[CRC32_FOLD_BUFFER_SIZE];
uint32_t value;
} crc32_fold;
#ifndef CRC32_FOLD_C_H_
#define CRC32_FOLD_C_H_

Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc);
Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
Expand Down
2 changes: 1 addition & 1 deletion arch/x86/crc32_pclmulqdq_tpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# include <immintrin.h>
#endif

#include "crc32_fold.h"
#include "crc32.h"
#include "crc32_braid_p.h"
#include "x86_intrins.h"
#include <assert.h>
Expand Down
2 changes: 1 addition & 1 deletion cpu_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#ifndef CPU_FEATURES_H_
#define CPU_FEATURES_H_

#include "crc32_fold.h"
#include "crc32.h"

#if defined(X86_FEATURES)
# include "arch/x86/x86_features.h"
Expand Down
16 changes: 16 additions & 0 deletions crc32.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/* crc32.h -- crc32 folding interface
* Copyright (C) 2021 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef CRC32_H_
#define CRC32_H_

#define CRC32_FOLD_BUFFER_SIZE (16 * 4)
/* sizeof(__m128i) * (4 folds) */

typedef struct crc32_fold_s {
uint8_t fold[CRC32_FOLD_BUFFER_SIZE];
uint32_t value;
} crc32_fold;

#endif
Loading

0 comments on commit 06895bc

Please sign in to comment.