Skip to content

Commit

Permalink
Use FNV1a for string hashing
Browse files Browse the repository at this point in the history
The existing X31 hash propagates bits fairly slowly, resulting in
a poor distribution of keys if most of the differences in strings
are at the end.  Fix by using FNV1a instead, which costs about the
same to compute but distributes keys much more effectively.

Also adds a kh_stats() function to khash, which produces a histogram
of probe chain lengths, and a khash test framework.  The test
program can also be used to benchmark insertion and lookup
times.
  • Loading branch information
daviesrob committed Jul 8, 2024
1 parent b8145e6 commit 816cc14
Show file tree
Hide file tree
Showing 4 changed files with 592 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ shlib-exports-*.txt
/test/test_index
/test/test_introspection
/test/test_kfunc
/test/test_khash
/test/test_kstring
/test/test_mod
/test/test_nibbles
Expand Down
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ BUILT_TEST_PROGRAMS = \
test/test_expr \
test/test_faidx \
test/test_kfunc \
test/test_khash \
test/test_kstring \
test/test_mod \
test/test_nibbles \
Expand Down Expand Up @@ -605,6 +606,7 @@ check test: all $(HTSCODECS_TEST_TARGETS)
test/hts_endian
test/test_expr
test/test_kfunc
test/test_khash
test/test_kstring
test/test_nibbles -v
test/test_str2int
Expand Down Expand Up @@ -669,6 +671,9 @@ test/test_faidx: test/test_faidx.o libhts.a
test/test_kfunc: test/test_kfunc.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/test_kfunc.o libhts.a -lz $(LIBS) -lpthread

test/test_khash: test/test_khash.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/test_khash.o libhts.a -lz $(LIBS) -lpthread

test/test_kstring: test/test_kstring.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/test_kstring.o libhts.a -lz $(LIBS) -lpthread

Expand Down Expand Up @@ -778,6 +783,7 @@ test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_fa
test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(hfile_internal_h)
test/test_expr.o: test/test_expr.c config.h $(htslib_hts_expr_h)
test/test_kfunc.o: test/test_kfunc.c config.h $(htslib_kfunc_h)
test/test_khash.o: test/test_khash.c config.h $(htslib_khash_h) $(htslib_kroundup_h)
test/test_kstring.o: test/test_kstring.c config.h $(htslib_kstring_h)
test/test_mod.o: test/test_mod.c config.h $(htslib_sam_h)
test/test_nibbles.o: test/test_nibbles.c config.h $(htslib_sam_h) $(sam_internal_h)
Expand Down
87 changes: 83 additions & 4 deletions htslib/khash.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/* The MIT License
Copyright (c) 2008, 2009, 2011 by Attractive Chaos <[email protected]>
Copyright (C) 2014-2015, 2018 Genome Research Ltd.
Copyright (C) 2014-2015, 2018, 2024 Genome Research Ltd.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
Expand Down Expand Up @@ -356,7 +356,39 @@ static const double __ac_HASH_UPPER = 0.77;
__ac_set_isdel_true(h->flags, x); \
--h->size; \
} \
}
} \
/* kh_stats_<name>: gather occupancy statistics for hash table h.       */ \
/* Counts empty and deleted bins into *empty and *deleted, and builds   */ \
/* a histogram where entry d is the number of stored keys that sit d    */ \
/* probe steps away from the bin their hash maps to.                    */ \
/* Returns 0 on success, storing the heap-allocated histogram in        */ \
/* *hist_out (the caller must free it) and its length in *hist_size.    */ \
/* Returns -1 if memory allocation fails; *hist_out is then not written */ \
/* and *hist_size stays 0.                                              */ \
SCOPE int kh_stats_##name(kh_##name##_t *h, khint_t *empty, \
                          khint_t *deleted, khint_t *hist_size, \
                          khint_t **hist_out) \
{ \
    khint_t i, *hist = NULL, dist_max = 0, k, dist, step; \
    khint_t mask = h->n_buckets - 1; \
    *empty = *deleted = *hist_size = 0; \
    /* Start with one zeroed counter, used for distance 0 */ \
    hist = (khint_t *) calloc(1, sizeof(*hist)); \
    if (!hist) { return -1; } \
    for (i = kh_begin(h); i < kh_end(h); ++i) { \
        if (__ac_isempty(h->flags, i)) { (*empty)++; continue; } \
        if (__ac_isdel(h->flags, i)) { (*deleted)++; continue; } \
        /* Walk from the hashed bin, with an increasing step, until */ \
        /* bin i is reached; dist counts the steps taken.           */ \
        k = __hash_func(h->keys[i]) & mask; \
        dist = 0; \
        step = 0; \
        while (k != i) { \
            dist++; \
            k = (k + (++step)) & mask; \
        } \
        /* Grow only when this distance exceeds every one seen so far; */ \
        /* equal distances already have a counter.                     */ \
        if (dist_max < dist) { \
            khint_t *new_hist = (khint_t *) realloc(hist, sizeof(*new_hist) * (dist + 1)); \
            if (!new_hist) { free(hist); return -1; } \
            /* realloc leaves the added tail uninitialised */ \
            for (k = dist_max + 1; k <= dist; k++) new_hist[k] = 0; \
            hist = new_hist; \
            dist_max = dist; \
        } \
        hist[dist]++; \
    } \
    *hist_out = hist; \
    *hist_size = dist_max + 1; \
    return 0; \
}

#define KHASH_DECLARE(name, khkey_t, khval_t) \
__KHASH_TYPE(name, khkey_t, khval_t) \
Expand Down Expand Up @@ -391,6 +423,7 @@ static const double __ac_HASH_UPPER = 0.77;
@abstract 64-bit integer comparison function
*/
#define kh_int64_hash_equal(a, b) ((a) == (b))

/*! @function
@abstract const char* hash function
@param s Pointer to a null terminated string
Expand All @@ -402,12 +435,28 @@ static kh_inline khint_t __ac_X31_hash_string(const char *s)
if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
return h;
}

/*! @function
  @abstract     FNV-1a hash of a nul-terminated string
  @param  s     Pointer to a nul-terminated string
  @return       The hash value
 */
static kh_inline khint_t __ac_FNV1a_hash_string(const char *s)
{
    /* Standard 32-bit FNV offset basis and prime */
    const khint_t fnv_basis = 2166136261;
    const khint_t fnv_prime = 16777619;
    khint_t hash = fnv_basis;
    while (*s != '\0') {
        /* FNV-1a order: fold the byte in first, then multiply */
        hash ^= (uint8_t) *s++;
        hash *= fnv_prime;
    }
    return hash;
}

/*! @function
@abstract Another interface to const char* hash function
@param key Pointer to a nul terminated string [const char*]
@return The hash value [khint_t]
*/
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
#define kh_str_hash_func(key) __ac_FNV1a_hash_string(key)

/*! @function
@abstract Const char* comparison function
*/
Expand All @@ -426,12 +475,29 @@ static kh_inline khint_t __ac_X31_hash_kstring(const kstring_t ks)
h = (h << 5) - h + (khint_t)ks.s[i];
return h;
}

/*! @function
  @abstract     FNV-1a hash of a kstring
  @param  ks    The kstring to hash, passed by value; exactly ks.l bytes
                of ks.s are hashed, so no nul terminator is needed
  @return       The hash value
 */
static kh_inline khint_t __ac_FNV1a_hash_kstring(const kstring_t ks)
{
    /* Standard 32-bit FNV offset basis and prime */
    const khint_t fnv_basis = 2166136261;
    const khint_t fnv_prime = 16777619;
    khint_t hash = fnv_basis;
    size_t pos = 0;
    while (pos < ks.l) {
        /* FNV-1a order: fold the byte in first, then multiply */
        hash ^= (uint8_t) ks.s[pos++];
        hash *= fnv_prime;
    }
    return hash;
}

/*! @function
@abstract Interface to kstring hash function.
@param key The kstring to hash [kstring_t]; permits hashing on non-nul terminated strings.
@return The hash value [khint_t]
*/
#define kh_kstr_hash_func(key) __ac_X31_hash_kstring(key)
#define kh_kstr_hash_func(key) __ac_FNV1a_hash_kstring(key)
/*! @function
@abstract kstring comparison function
*/
Expand Down Expand Up @@ -604,6 +670,19 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key)
code; \
} }

/*! @function
@abstract Gather hash table statistics
@discussion Forwards to the kh_stats_<name> function generated for the
hash table type. On success the histogram is returned in a newly
allocated array which the caller must release with free(). On failure
@p hist is not written, @p hist_size is 0, and @p empty / @p deleted may
hold partial counts.
@param name Name of the hash table [symbol]
@param h Pointer to the hash table [khash_t(name)*]
@param empty[out] Number of empty hash bins [khint_t*]
@param deleted[out] Number of hash bins with the deleted flag [khint_t*]
@param hist_size[out] Size of @p hist array, i.e. the longest probe
distance found plus one [khint_t*]
@param hist[out] Probe count histogram; entry d is the number of
stored keys found d probe steps from the bin their hash maps to [khint_t**]
@return 0 on success; -1 on failure (memory allocation failed)
*/
#define kh_stats(name, h, empty, deleted, hist_size, hist) \
kh_stats_##name(h, empty, deleted, hist_size, hist)

/* More convenient interfaces */

/*! @function
Expand Down
Loading

0 comments on commit 816cc14

Please sign in to comment.