Skip to content

Commit

Permalink
Move update_hash(), insert_string() and quick_insert_string() out of functable
Browse files Browse the repository at this point in the history

and remove SSE4.2 and ACLE optimizations. The functable overhead is higher
than the benefit from using optimized functions.
  • Loading branch information
Dead2 committed Feb 23, 2024
1 parent ef2f8d5 commit 9953f12
Show file tree
Hide file tree
Showing 23 changed files with 44 additions and 155 deletions.
8 changes: 4 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -653,7 +653,7 @@ if(WITH_OPTIM)
check_acle_compiler_flag()
if(HAVE_ACLE_FLAG)
add_definitions(-DARM_ACLE)
set(ACLE_SRCS ${ARCHDIR}/crc32_acle.c ${ARCHDIR}/insert_string_acle.c)
set(ACLE_SRCS ${ARCHDIR}/crc32_acle.c)
set_property(SOURCE ${ACLE_SRCS} PROPERTY COMPILE_FLAGS "${ACLEFLAG} ${NOLTOFLAG}")
list(APPEND ZLIB_ARCH_SRCS ${ACLE_SRCS})
add_feature_info(ACLE_CRC 1 "Support ACLE optimized CRC hash generation, using \"${ACLEFLAG}\"")
Expand Down Expand Up @@ -857,8 +857,8 @@ if(WITH_OPTIM)
check_sse42_intrinsics()
if(HAVE_SSE42_INTRIN AND WITH_SSSE3)
add_definitions(-DX86_SSE42)
set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c ${ARCHDIR}/insert_string_sse42.c)
add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized CRC hash generation, using \"${SSE42FLAG}\"")
set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c)
add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized adler32 hash generation, using \"${SSE42FLAG}\"")
list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS})
set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}")
else()
Expand Down Expand Up @@ -1034,7 +1034,6 @@ set(ZLIB_SRCS
arch/generic/compare256_c.c
arch/generic/crc32_braid_c.c
arch/generic/crc32_fold_c.c
arch/generic/insert_string_c.c
arch/generic/slide_hash_c.c
adler32.c
compress.c
Expand All @@ -1053,6 +1052,7 @@ set(ZLIB_SRCS
infback.c
inflate.c
inftrees.c
insert_string.c
insert_string_roll.c
trees.c
uncompr.c
Expand Down
4 changes: 2 additions & 2 deletions Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ OBJZ = \
arch/generic/compare256_c.o \
arch/generic/crc32_braid_c.o \
arch/generic/crc32_fold_c.o \
arch/generic/insert_string_c.o \
arch/generic/slide_hash_c.o \
adler32.o \
compress.o \
Expand All @@ -99,6 +98,7 @@ OBJZ = \
infback.o \
inflate.o \
inftrees.o \
insert_string.o \
insert_string_roll.o \
trees.o \
uncompr.o \
Expand All @@ -120,7 +120,6 @@ PIC_OBJZ = \
arch/generic/compare256_c.lo \
arch/generic/crc32_braid_c.lo \
arch/generic/crc32_fold_c.lo \
arch/generic/insert_string_c.lo \
arch/generic/slide_hash_c.lo \
adler32.lo \
compress.lo \
Expand All @@ -139,6 +138,7 @@ PIC_OBJZ = \
infback.lo \
inflate.lo \
inftrees.lo \
insert_string.lo \
insert_string_roll.lo \
trees.lo \
uncompr.lo \
Expand Down
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ Features
* Support for CPU intrinsics when available
* Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX
* CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
* Hash table implementation using CRC32-C intrinsics on x86 and ARM
* Slide hash implementations using SSE2, AVX2, ARMv6, Neon, VMX & VSX
* Compare256 implementations using SSE2, AVX2, Neon, POWER9 & RVV
* Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX
Expand Down
7 changes: 0 additions & 7 deletions arch/arm/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ all: \
crc32_acle.o crc32_acle.lo \
slide_hash_neon.o slide_hash_neon.lo \
slide_hash_armv6.o slide_hash_armv6.lo \
insert_string_acle.o insert_string_acle.lo

adler32_neon.o:
$(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
Expand Down Expand Up @@ -69,12 +68,6 @@ slide_hash_armv6.o:
slide_hash_armv6.lo:
$(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c

insert_string_acle.o:
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c

insert_string_acle.lo:
$(CC) $(SFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c

mostlyclean: clean
clean:
rm -f *.o *.lo *~
Expand Down
4 changes: 0 additions & 4 deletions arch/arm/arm_functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,6 @@ void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);

#ifdef ARM_ACLE
uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len);

void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
uint32_t update_hash_acle(uint32_t h, uint32_t val);
#endif

#ifdef ARM_SIMD
Expand Down
24 changes: 0 additions & 24 deletions arch/arm/insert_string_acle.c

This file was deleted.

7 changes: 0 additions & 7 deletions arch/generic/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ all: \
compare256_c.o compare256_c.lo \
crc32_braid_c.o crc32_braid_c.lo \
crc32_fold_c.o crc32_fold_c.lo \
insert_string_c.o insert_string_c.lo \
slide_hash_c.o slide_hash_c.lo


Expand Down Expand Up @@ -59,12 +58,6 @@ crc32_fold_c.o: $(SRCDIR)/crc32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable
crc32_fold_c.lo: $(SRCDIR)/crc32_fold_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/functable.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_c.c

insert_string_c.o: $(SRCDIR)/insert_string_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h $(SRCTOP)/insert_string_tpl.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_c.c

insert_string_c.lo: $(SRCDIR)/insert_string_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h $(SRCTOP)/insert_string_tpl.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_c.c

slide_hash_c.o: $(SRCDIR)/slide_hash_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/deflate.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_c.c

Expand Down
3 changes: 0 additions & 3 deletions arch/generic/generic_functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,7 @@ uint32_t compare256_unaligned_16(const uint8_t *src0, const uint8_t *src1);

typedef void (*slide_hash_func)(deflate_state *s);

void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
void slide_hash_c(deflate_state *s);
uint32_t update_hash_c(deflate_state *const s, uint32_t h, uint32_t val);

uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
# if defined(UNALIGNED_OK) && BYTE_ORDER == LITTLE_ENDIAN
Expand Down
7 changes: 0 additions & 7 deletions arch/x86/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ all: \
chunkset_ssse3.o chunkset_ssse3.lo \
compare256_avx2.o compare256_avx2.lo \
compare256_sse2.o compare256_sse2.lo \
insert_string_sse42.o insert_string_sse42.lo \
crc32_pclmulqdq.o crc32_pclmulqdq.lo \
crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
slide_hash_avx2.o slide_hash_avx2.lo \
Expand Down Expand Up @@ -77,12 +76,6 @@ compare256_sse2.o:
compare256_sse2.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c

insert_string_sse42.o:
$(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c

insert_string_sse42.lo:
$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c

crc32_pclmulqdq.o:
$(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c

Expand Down
24 changes: 0 additions & 24 deletions arch/x86/insert_string_sse42.c

This file was deleted.

3 changes: 0 additions & 3 deletions arch/x86/x86_functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,6 @@ void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);

#ifdef X86_SSE42
uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
void insert_string_sse42(deflate_state *const s, const uint32_t str, uint32_t count);
Pos quick_insert_string_sse42(deflate_state *const s, const uint32_t str);
uint32_t update_hash_sse42(uint32_t h, uint32_t val);
#endif

#ifdef X86_AVX2
Expand Down
8 changes: 4 additions & 4 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -1503,8 +1503,8 @@ case "${ARCH}" in
if test ${HAVE_SSE42_INTRIN} -eq 1; then
CFLAGS="${CFLAGS} -DX86_SSE42"
SFLAGS="${SFLAGS} -DX86_SSE42"
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_sse42.o insert_string_sse42.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_sse42.lo insert_string_sse42.lo"
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_sse42.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_sse42.lo"
fi

check_pclmulqdq_intrinsics
Expand Down Expand Up @@ -1695,8 +1695,8 @@ EOF
CFLAGS="${CFLAGS} -DARM_ACLE"
SFLAGS="${SFLAGS} -DARM_ACLE"

ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_acle.o insert_string_acle.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_acle.lo insert_string_acle.lo"
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_acle.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_acle.lo"
fi
fi

Expand Down
10 changes: 3 additions & 7 deletions deflate.c
Original file line number Diff line number Diff line change
Expand Up @@ -120,10 +120,6 @@ static void lm_set_level (deflate_state *s, int level);
static void lm_init (deflate_state *s);
Z_INTERNAL unsigned read_buf (PREFIX3(stream) *strm, unsigned char *buf, unsigned size);

extern uint32_t update_hash_roll (uint32_t h, uint32_t val);
extern void insert_string_roll (deflate_state *const s, uint32_t str, uint32_t count);
extern Pos quick_insert_string_roll(deflate_state *const s, uint32_t str);

/* ===========================================================================
* Local data
*/
Expand Down Expand Up @@ -1144,9 +1140,9 @@ static void lm_set_level(deflate_state *s, int level) {
s->insert_string = &insert_string_roll;
s->quick_insert_string = &quick_insert_string_roll;
} else {
s->update_hash = functable.update_hash;
s->insert_string = functable.insert_string;
s->quick_insert_string = functable.quick_insert_string;
s->update_hash = update_hash;
s->insert_string = insert_string;
s->quick_insert_string = quick_insert_string;
}

s->level = level;
Expand Down
8 changes: 8 additions & 0 deletions deflate.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,14 @@ typedef uint32_t (* update_hash_cb) (uint32_t h, uint32_t val);
typedef void (* insert_string_cb) (deflate_state *const s, uint32_t str, uint32_t count);
typedef Pos (* quick_insert_string_cb)(deflate_state *const s, uint32_t str);

uint32_t update_hash (uint32_t h, uint32_t val);
void insert_string (deflate_state *const s, uint32_t str, uint32_t count);
Pos quick_insert_string (deflate_state *const s, uint32_t str);

uint32_t update_hash_roll (uint32_t h, uint32_t val);
void insert_string_roll (deflate_state *const s, uint32_t str, uint32_t count);
Pos quick_insert_string_roll(deflate_state *const s, uint32_t str);

struct ALIGNED_(16) internal_state {
PREFIX3(stream) *strm; /* pointer back to this zlib stream */
unsigned char *pending_buf; /* output still pending */
Expand Down
6 changes: 3 additions & 3 deletions deflate_fast.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ Z_INTERNAL block_state deflate_fast(deflate_state *s, int flush) {
* dictionary, and set hash_head to the head of the hash chain:
*/
if (s->lookahead >= WANT_MIN_MATCH) {
hash_head = functable.quick_insert_string(s, s->strstart);
hash_head = quick_insert_string(s, s->strstart);
dist = (int64_t)s->strstart - hash_head;

/* Find the longest match, discarding those <= prev_length.
Expand Down Expand Up @@ -71,11 +71,11 @@ Z_INTERNAL block_state deflate_fast(deflate_state *s, int flush) {
match_len--; /* string at strstart already in table */
s->strstart++;

functable.insert_string(s, s->strstart, match_len);
insert_string(s, s->strstart, match_len);
s->strstart += match_len;
} else {
s->strstart += match_len;
functable.quick_insert_string(s, s->strstart + 2 - STD_MIN_MATCH);
quick_insert_string(s, s->strstart + 2 - STD_MIN_MATCH);

/* If lookahead < STD_MIN_MATCH, ins_h is garbage, but it does not
* matter since it will be recomputed at next deflate call.
Expand Down
16 changes: 8 additions & 8 deletions deflate_medium.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ static void insert_match(deflate_state *s, struct match match) {
if (UNLIKELY(match.match_length > 0)) {
if (match.strstart >= match.orgstart) {
if (match.strstart + match.match_length - 1 >= match.orgstart) {
functable.insert_string(s, match.strstart, match.match_length);
insert_string(s, match.strstart, match.match_length);
} else {
functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
}
match.strstart += match.match_length;
match.match_length = 0;
Expand All @@ -72,12 +72,12 @@ static void insert_match(deflate_state *s, struct match match) {

if (LIKELY(match.strstart >= match.orgstart)) {
if (LIKELY(match.strstart + match.match_length - 1 >= match.orgstart)) {
functable.insert_string(s, match.strstart, match.match_length);
insert_string(s, match.strstart, match.match_length);
} else {
functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
insert_string(s, match.strstart, match.orgstart - match.strstart + 1);
}
} else if (match.orgstart < match.strstart + match.match_length) {
functable.insert_string(s, match.orgstart, match.strstart + match.match_length - match.orgstart);
insert_string(s, match.orgstart, match.strstart + match.match_length - match.orgstart);
}
match.strstart += match.match_length;
match.match_length = 0;
Expand All @@ -86,7 +86,7 @@ static void insert_match(deflate_state *s, struct match match) {
match.match_length = 0;

if (match.strstart >= (STD_MIN_MATCH - 2))
functable.quick_insert_string(s, match.strstart + 2 - STD_MIN_MATCH);
quick_insert_string(s, match.strstart + 2 - STD_MIN_MATCH);

/* If lookahead < WANT_MIN_MATCH, ins_h is garbage, but it does not
* matter since it will be recomputed at next deflate call.
Expand Down Expand Up @@ -199,7 +199,7 @@ Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) {
} else {
hash_head = 0;
if (s->lookahead >= WANT_MIN_MATCH) {
hash_head = functable.quick_insert_string(s, s->strstart);
hash_head = quick_insert_string(s, s->strstart);
}

current_match.strstart = (uint16_t)s->strstart;
Expand Down Expand Up @@ -235,7 +235,7 @@ Z_INTERNAL block_state deflate_medium(deflate_state *s, int flush) {
/* now, look ahead one */
if (LIKELY(!early_exit && s->lookahead > MIN_LOOKAHEAD && (uint32_t)(current_match.strstart + current_match.match_length) < (s->window_size - MIN_LOOKAHEAD))) {
s->strstart = current_match.strstart + current_match.match_length;
hash_head = functable.quick_insert_string(s, s->strstart);
hash_head = quick_insert_string(s, s->strstart);

next_match.strstart = (uint16_t)s->strstart;
next_match.orgstart = next_match.strstart;
Expand Down
2 changes: 1 addition & 1 deletion deflate_quick.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ Z_INTERNAL block_state deflate_quick(deflate_state *s, int flush) {
}

if (LIKELY(s->lookahead >= WANT_MIN_MATCH)) {
hash_head = functable.quick_insert_string(s, s->strstart);
hash_head = quick_insert_string(s, s->strstart);
dist = (int64_t)s->strstart - hash_head;

if (dist <= MAX_DIST(s) && dist > 0) {
Expand Down
Loading

0 comments on commit 9953f12

Please sign in to comment.